v4.19.13 snapshot.
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 0000000..08f4fd7
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1,3 @@
+vsyscall.lds
+vsyscall_32.lds
+vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 0000000..8824d01
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux kernel.
+#
+
+# Extra build targets for this directory: the boot head objects, early
+# platform helpers and the preprocessed vmlinux linker script.
+extra-y := head_$(BITS).o
+extra-y += head$(BITS).o
+extra-y += ebda.o
+extra-y += platform-quirks.o
+extra-y += vmlinux.lds
+
+# Undefine the $(UTS_MACHINE) macro while preprocessing vmlinux.lds
+# (presumably so the machine name is not macro-expanded inside the linker
+# script -- confirm against the linker script before relying on this).
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_tsc.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
+endif
+
+# Objects excluded from KASAN instrumentation.
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
+
+# Objects flagged as non-standard for object-file validation.
+OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
+
+# ftrace_$(BITS).o is only flagged when frame pointers are enabled.
+ifdef CONFIG_FRAME_POINTER
+OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
+endif
+
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT := n
+
+# irq.o gets an additional include path pointing at include/asm/trace.
+CFLAGS_irq.o := -I$(src)/../include/asm/trace
+
+# Core objects. $(BITS) selects the 32-bit or 64-bit flavour of a file.
+obj-y := process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT) += signal_compat.o
+obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-y += probe_roms.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-y += bootflag.o e820.o
+obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o hw_breakpoint.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
+obj-y += pci-iommu_table.o
+obj-y += resource.o
+obj-y += irqflags.o
+
+obj-y += process.o
+obj-y += fpu/
+obj-y += ptrace.o
+# tls.o is listed for both 32-bit kernels and kernels with IA32 emulation.
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_IA32_EMULATION) += tls.o
+obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
+obj-$(CONFIG_ISA_DMA_API) += i8237.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += cpu/
+obj-y += acpi/
+obj-y += reboot.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_PCI) += early-quirks.o
+# apm.o is a composite object; its only member is apm_32.o.
+apm-y := apm_32.o
+obj-$(CONFIG_APM) += apm.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
+obj-$(CONFIG_X86_TSC) += tsc_sync.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_MPPARSE) += mpparse.o
+obj-y += apic/
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+# ftrace.o is listed under several tracing options below.
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-y += kprobes/
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
+obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+
+# Timers
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
+
+obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
+
+# Guest / paravirtualization support
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+
+obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
+
+obj-$(CONFIG_EISA) += eisa.o
+obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
+
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+obj-y += sysfb.o
+obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
+obj-$(CONFIG_EFI) += sysfb_efi.o
+
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
+
+# Stack unwinder implementations, one object per unwinder option.
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
+
+###
+# 64 bit specific files
+# (only evaluated when CONFIG_X86_64 is exactly "y")
+ifeq ($(CONFIG_X86_64),y)
+ obj-$(CONFIG_AUDIT) += audit_64.o
+
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
+ obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+
+ obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
+ obj-y += vsmp_64.o
+endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
new file mode 100644
index 0000000..f1bb57b
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+# The wakeup object is flagged as non-standard for object-file validation.
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
+obj-$(CONFIG_ACPI) += boot.o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
+
+# cstate.o is built in whenever CONFIG_ACPI_PROCESSOR is non-empty,
+# i.e. for both "y" and "m".
+ifneq ($(CONFIG_ACPI_PROCESSOR),)
+obj-y += cstate.o
+endif
+
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 0000000..bb8d300
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,54 @@
+/*
+ * Arch-specific APEI-related functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+/*
+ * arch_apei_enable_cmcff - disable the MC banks that a HEST entry reports
+ * in firmware first mode.
+ * @hest_hdr: HEST entry; treated as a struct acpi_hest_ia_corrected
+ * @data: not used by this function
+ *
+ * Returns 0 if the entry is not enabled. Returns 1 otherwise; a non-zero
+ * value indicates that we are done parsing HEST.
+ *
+ * Without CONFIG_X86_MCE the function only returns 1.
+ */
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+	struct acpi_hest_ia_corrected *cmc =
+		(struct acpi_hest_ia_corrected *)hest_hdr;
+	struct acpi_hest_ia_error_bank *banks;
+	int n;
+
+	if (!cmc->enabled)
+		return 0;
+
+	/*
+	 * Only act when HEST flags firmware first mode and actually lists
+	 * MC banks; in every other case fall through to "done parsing".
+	 */
+	if ((cmc->flags & ACPI_HEST_FIRMWARE_FIRST) &&
+	    cmc->num_hardware_banks) {
+		pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+		/* The bank descriptors start right behind the cmc structure. */
+		banks = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+		for (n = 0; n < cmc->num_hardware_banks; n++)
+			mce_disable_bank(banks[n].bank_number);
+	}
+#endif
+	return 1;
+}
+
+/*
+ * arch_apei_report_mem_error - hand a memory error record to the MCE code.
+ * @sev: severity, passed through unchanged
+ * @mem_err: memory error section, passed through unchanged
+ *
+ * Calls apei_mce_report_mem_error() when CONFIG_X86_MCE is set and does
+ * nothing otherwise.
+ */
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
new file mode 100644
index 0000000..3b20607
--- /dev/null
+++ b/arch/x86/kernel/acpi/boot.c
@@ -0,0 +1,1773 @@
+/*
+ * boot.c - Architecture-Specific Low-Level ACPI Boot Support
+ *
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/efi.h>
+#include <linux/cpumask.h>
+#include <linux/export.h>
+#include <linux/dmi.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/efi-bgrt.h>
+#include <linux/serial_core.h>
+
+#include <asm/e820/api.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
+#include <asm/pgtable.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/io.h>
+#include <asm/mpspec.h>
+#include <asm/smp.h>
+#include <asm/i8259.h>
+
+#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
+static int __initdata acpi_force = 0;
+int acpi_disabled;
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef CONFIG_X86_64
+# include <asm/proto.h>
+#endif /* CONFIG_X86_64 */
+
+/* Prefix placed in front of the printk() messages emitted by this file. */
+#define PREFIX "ACPI: "
+
+int acpi_noirq; /* skip ACPI IRQ initialization */
+int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
+EXPORT_SYMBOL(acpi_pci_disabled);
+
+int acpi_lapic;
+int acpi_ioapic; /* set to 1 by acpi_set_irq_model_ioapic() */
+int acpi_strict;
+int acpi_disable_cmcff;
+
+/* ACPI SCI override configuration */
+u8 acpi_sci_flags __initdata; /* trigger/polarity bits applied by acpi_sci_ioapic_setup() */
+u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ; /* GSI stashed by acpi_sci_ioapic_setup() */
+int acpi_skip_timer_override __initdata; /* non-zero: ignore the BIOS IRQ0 override */
+int acpi_use_timer_override __initdata;
+int acpi_fix_pin2_polarity __initdata; /* non-zero: clear polarity bits of an IRQ0 -> GSI 2 override */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Local APIC address; updated from the MADT and from LAPIC address override entries. */
+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Locks related to IOAPIC hotplug
+ * Hotplug side:
+ * ->device_hotplug_lock
+ * ->acpi_ioapic_lock
+ * ->ioapic_lock
+ * Interrupt mapping side:
+ * ->acpi_ioapic_lock
+ * ->ioapic_mutex
+ * ->ioapic_lock
+ *
+ * In this file acpi_ioapic_lock is held around the GSI map/unmap calls
+ * and around IOAPIC register/unregister/registered queries.
+ */
+static DEFINE_MUTEX(acpi_ioapic_lock);
+#endif
+
+/* --------------------------------------------------------------------------
+ Boot-time Configuration
+ -------------------------------------------------------------------------- */
+
+/*
+ * The default interrupt routing model is PIC (8259). This gets
+ * overridden if IOAPICs are enumerated (below).
+ */
+enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+
+
+/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ *
+ * The table is indexed by ISA IRQ number. mp_override_legacy_irq()
+ * rewrites entries and marks a displaced identity mapping with
+ * INVALID_ACPI_IRQ; acpi_isa_irq_to_gsi() reads it.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Thin wrapper around early_memremap(): a zero physical address or a
+ * zero size is rejected with NULL instead of being mapped.
+ */
+void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
+{
+	return (phys && size) ? early_memremap(phys, size) : NULL;
+}
+
+/*
+ * Counterpart of __acpi_map_table(): releases the mapping through
+ * early_memunmap(). A NULL mapping or a zero size is silently ignored.
+ */
+void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
+{
+	if (map && size)
+		early_memunmap(map, size);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * acpi_parse_madt - table handler for the MADT header.
+ * @table: the MADT
+ *
+ * Records a non-zero local APIC address from the table and runs the
+ * default OEM check on the table's OEM ids.
+ *
+ * Returns 0 on success, -EINVAL if the boot CPU has no APIC feature and
+ * -ENODEV if @table is NULL.
+ */
+static int __init acpi_parse_madt(struct acpi_table_header *table)
+{
+	struct acpi_table_madt *madt = (struct acpi_table_madt *)table;
+
+	if (!boot_cpu_has(X86_FEATURE_APIC))
+		return -EINVAL;
+
+	if (!madt) {
+		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+		return -ENODEV;
+	}
+
+	/* A zero address keeps whatever acpi_lapic_addr already holds. */
+	if (madt->address) {
+		acpi_lapic_addr = (u64) madt->address;
+
+		printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
+		       madt->address);
+	}
+
+	default_acpi_madt_oem_check(madt->header.oem_id,
+				    madt->header.oem_table_id);
+
+	return 0;
+}
+
+/**
+ * acpi_register_lapic - register a local APIC and obtain its logical CPU
+ * @id: local APIC id to register
+ * @acpiid: ACPI id to record for the resulting CPU
+ * @enabled: non-zero if the CPU is enabled
+ *
+ * Disabled CPUs are only counted in disabled_cpus and not registered.
+ *
+ * Returns the value of generic_processor_info(); when that is >= 0 it is
+ * the logical CPU number and @acpiid is stored for it. Returns -EINVAL
+ * for an @id that is too big or for a disabled CPU.
+ */
+static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
+{
+	unsigned int version;
+	int logical;
+
+	if (id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+		return -EINVAL;
+	}
+
+	if (!enabled) {
+		++disabled_cpus;
+		return -EINVAL;
+	}
+
+	/* Use version 0 while the boot CPU's APIC id is still unset (-1U). */
+	version = (boot_cpu_physical_apicid != -1U) ? boot_cpu_apic_version : 0;
+
+	logical = generic_processor_info(id, version);
+	if (logical >= 0)
+		early_per_cpu(x86_cpu_to_acpiid, logical) = acpiid;
+
+	return logical;
+}
+
+/*
+ * acpi_parse_x2apic - MADT handler for local x2APIC entries.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise. Without
+ * CONFIG_X86_X2APIC the entry is only reported as ignored.
+ */
+static int __init
+acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_local_x2apic *entry =
+		(struct acpi_madt_local_x2apic *)header;
+#ifdef CONFIG_X86_X2APIC
+	u32 x2id;
+	u8 is_enabled;
+#endif
+
+	if (BAD_MADT_ENTRY(entry, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+#ifdef CONFIG_X86_X2APIC
+	x2id = entry->local_apic_id;
+	is_enabled = entry->lapic_flags & ACPI_MADT_ENABLED;
+
+	/* An all-ones id marks an invalid entry; skip it quietly. */
+	if (x2id == 0xffffffff)
+		return 0;
+
+	/*
+	 * Disabled CPUs are passed on as well so that they get counted.
+	 * This allows sizing cpus_possible_map more accurately and avoids
+	 * preallocating memory for all NR_CPUS when CPU hotplug is used.
+	 */
+	if (apic->apic_id_valid(x2id)) {
+		acpi_register_lapic(x2id, entry->uid, is_enabled);
+		return 0;
+	}
+
+	/* Only complain about an unusable id if the CPU was enabled. */
+	if (is_enabled)
+		pr_warn(PREFIX "x2apic entry ignored\n");
+#else
+	printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+#endif
+
+	return 0;
+}
+
+/*
+ * acpi_parse_lapic - MADT handler for local APIC entries.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_lapic(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_local_apic *entry =
+		(struct acpi_madt_local_apic *)header;
+
+	if (BAD_MADT_ENTRY(entry, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/*
+	 * Id 0xff is invalid and skipped. Everything else is handed on,
+	 * disabled CPUs included, so that they get counted. This allows
+	 * sizing cpus_possible_map more accurately and avoids
+	 * preallocating memory for all NR_CPUS when CPU hotplug is used.
+	 */
+	if (entry->id != 0xff)
+		acpi_register_lapic(entry->id,			/* APIC ID */
+				    entry->processor_id,	/* ACPI ID */
+				    entry->lapic_flags & ACPI_MADT_ENABLED);
+
+	return 0;
+}
+
+/*
+ * acpi_parse_sapic - MADT handler for local SAPIC entries.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_local_sapic *entry =
+		(struct acpi_madt_local_sapic *)header;
+	int sapic_id;
+
+	if (BAD_MADT_ENTRY(entry, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/* The APIC id is built from the id (upper byte) and the eid. */
+	sapic_id = (entry->id << 8) | entry->eid;
+
+	acpi_register_lapic(sapic_id,			/* APIC ID */
+			    entry->processor_id,	/* ACPI ID */
+			    entry->lapic_flags & ACPI_MADT_ENABLED);
+
+	return 0;
+}
+
+/*
+ * acpi_parse_lapic_addr_ovr - MADT handler for the local APIC address
+ * override entry; replaces acpi_lapic_addr with the entry's address.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_lapic_addr_ovr(struct acpi_subtable_header *header,
+			  const unsigned long end)
+{
+	struct acpi_madt_local_apic_override *ovr =
+		(struct acpi_madt_local_apic_override *)header;
+
+	if (BAD_MADT_ENTRY(ovr, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	acpi_lapic_addr = ovr->address;
+
+	return 0;
+}
+
+/*
+ * acpi_parse_x2apic_nmi - MADT handler for local x2APIC NMI entries.
+ * The entry is only printed and checked: an NMI that is not wired to
+ * LINT 1 triggers a warning.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+		      const unsigned long end)
+{
+	struct acpi_madt_local_x2apic_nmi *nmi =
+		(struct acpi_madt_local_x2apic_nmi *)header;
+
+	if (BAD_MADT_ENTRY(nmi, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (nmi->lint != 1)
+		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+	return 0;
+}
+
+/*
+ * acpi_parse_lapic_nmi - MADT handler for local APIC NMI entries.
+ * The entry is only printed and checked: an NMI that is not wired to
+ * LINT 1 triggers a warning.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_lapic_nmi(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_local_apic_nmi *nmi =
+		(struct acpi_madt_local_apic_nmi *)header;
+
+	if (BAD_MADT_ENTRY(nmi, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	if (nmi->lint != 1)
+		printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+	return 0;
+}
+
+#endif /*CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+/* Source bus number stored for ISA interrupt entries. */
+#define MP_ISA_BUS 0
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+					 u8 trigger, u32 gsi);
+
+/*
+ * mp_override_legacy_irq - route a legacy (ISA) IRQ to a GSI.
+ * @bus_irq: ISA IRQ number, must be below NR_IRQS_LEGACY
+ * @polarity: polarity value stored in the interrupt entry
+ * @trigger: trigger value stored in the interrupt entry
+ * @gsi: GSI the IRQ is routed to
+ *
+ * Registers the IOAPIC interrupt entry and, if that worked, updates
+ * isa_irq_to_gsi[] accordingly.
+ */
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+					  u32 gsi)
+{
+	int rc;
+
+	/* Reject anything outside of the legacy IRQ range. */
+	if (bus_irq >= NR_IRQS_LEGACY) {
+		pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq);
+		return;
+	}
+
+	/*
+	 * TBD: Faulty timer entries erroneously set the trigger to level,
+	 *      which results in a HUGE increase of timer interrupts. Use
+	 *      trigger value 1 for IRQ 0 instead of 3.
+	 */
+	if (!bus_irq && trigger == 3)
+		trigger = 1;
+
+	rc = mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
+	if (rc < 0)
+		return;
+
+	/*
+	 * If the target gsi is itself a legacy IRQ that still has its
+	 * default identity mapping, invalidate that mapping. Otherwise two
+	 * entries would carry the same GSI and acpi_isa_irq_to_gsi() may
+	 * give a wrong result.
+	 */
+	if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+		isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;
+
+	isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+/*
+ * mp_config_acpi_gsi - save an MP-table style interrupt entry for a PCI
+ * device's GSI.
+ * @dev: device the GSI belongs to; anything but a PCI device is ignored
+ * @gsi: global system interrupt
+ * @trigger: compared against ACPI_EDGE_SENSITIVE
+ * @polarity: compared against ACPI_ACTIVE_HIGH
+ *
+ * Does nothing without CONFIG_X86_MPPARSE, when acpi_ioapic is not set
+ * or when @dev is not a PCI device.
+ *
+ * Returns 0, or the negative result of mp_find_ioapic() if no IOAPIC
+ * serves @gsi (no entry is saved in that case).
+ */
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+			      int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mpc_intsrc mp_irq;
+	struct pci_dev *pdev;
+	unsigned char number;
+	unsigned int devfn;
+	int ioapic;
+	u8 pin;
+
+	if (!acpi_ioapic)
+		return 0;
+	if (!dev || !dev_is_pci(dev))
+		return 0;
+
+	/*
+	 * Bail out before the IOAPIC index is used for any lookup, the
+	 * same way mp_register_ioapic_irq() does.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return ioapic;
+
+	pdev = to_pci_dev(dev);
+	number = pdev->bus->number;
+	devfn = pdev->devfn;
+	pin = pdev->pin;
+	/* print the entry should happen on mptable identically */
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.srcbus = number;
+	/* device number (bits 3..7 of devfn) in bits 2..6, pin - 1 in bits 0..1 */
+	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	mp_irq.dstapic = mpc_ioapic_id(ioapic);
+	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+	mp_save_irq(&mp_irq);
+#endif
+	return 0;
+}
+
+/*
+ * mp_register_ioapic_irq - save an interrupt entry for an ISA bus IRQ.
+ * @bus_irq: source bus IRQ
+ * @polarity: stored in bits 0..1 of the entry's irqflag
+ * @trigger: stored in irqflag, shifted left by two
+ * @gsi: GSI used to find the destination IOAPIC and pin
+ *
+ * Returns 0 on success or the negative result of mp_find_ioapic().
+ */
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+					 u8 trigger, u32 gsi)
+{
+	struct mpc_intsrc entry;
+	int apic_idx;
+
+	/* Convert 'gsi' to 'ioapic.pin'(INTIN#) */
+	apic_idx = mp_find_ioapic(gsi);
+	if (apic_idx < 0) {
+		pr_warn("Failed to find ioapic for gsi : %u\n", gsi);
+		return apic_idx;
+	}
+
+	entry.type = MP_INTSRC;
+	entry.irqtype = mp_INT;
+	entry.irqflag = (trigger << 2) | polarity;
+	entry.srcbus = MP_ISA_BUS;
+	entry.srcbusirq = bus_irq;
+	entry.dstapic = mpc_ioapic_id(apic_idx);
+	entry.dstirq = mp_find_ioapic_pin(apic_idx, gsi);
+
+	mp_save_irq(&entry);
+
+	return 0;
+}
+
+/*
+ * acpi_parse_ioapic - MADT handler for IOAPIC entries; registers the
+ * IOAPIC through mp_register_ioapic().
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_ioapic(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_io_apic *entry = (struct acpi_madt_io_apic *)header;
+	struct ioapic_domain_cfg cfg = {
+		.type = IOAPIC_DOMAIN_DYNAMIC,
+		.ops = &mp_ioapic_irqdomain_ops,
+	};
+
+	if (BAD_MADT_ENTRY(entry, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/*
+	 * An IOAPIC whose GSI base lies in the legacy IRQ range hosts
+	 * legacy IRQs and gets statically assigned IRQ numbers.
+	 */
+	if (entry->global_irq_base < nr_legacy_irqs())
+		cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+	mp_register_ioapic(entry->id, entry->address, entry->global_irq_base,
+			   &cfg);
+
+	return 0;
+}
+
+/*
+ * acpi_sci_ioapic_setup - apply an Interrupt Source Override to the ACPI SCI.
+ * @bus_irq: source IRQ of the override
+ * @polarity: polarity from the override, 0 selects the SCI default
+ * @trigger: trigger from the override, 0 selects the SCI default
+ * @gsi: GSI the SCI is routed to
+ */
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
+{
+	/* "Bus conforming" values: the SCI is level triggered, active low. */
+	if (!trigger)
+		trigger = 3;
+	if (!polarity)
+		polarity = 3;
+
+	/* Bits given via acpi_sci= on the command line take precedence. */
+	if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
+		trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2;
+	if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
+		polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
+
+	if (bus_irq >= NR_IRQS_LEGACY)
+		mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
+	else
+		mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+
+	acpi_penalize_sci_irq(bus_irq, trigger, polarity);
+
+	/*
+	 * Remember the GSI: it shows that an override was seen and is
+	 * needed for the later update of acpi_gbl_FADT.
+	 */
+	acpi_sci_override_gsi = gsi;
+}
+
+/*
+ * acpi_parse_int_src_ovr - MADT handler for Interrupt Source Override
+ * entries.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_int_src_ovr(struct acpi_subtable_header *header,
+		       const unsigned long end)
+{
+	struct acpi_madt_interrupt_override *ovr =
+		(struct acpi_madt_interrupt_override *)header;
+
+	if (BAD_MADT_ENTRY(ovr, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/* An override of the SCI goes through its dedicated setup. */
+	if (ovr->source_irq == acpi_gbl_FADT.sci_interrupt) {
+		acpi_sci_ioapic_setup(ovr->source_irq,
+				      ovr->inti_flags & ACPI_MADT_POLARITY_MASK,
+				      (ovr->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      ovr->global_irq);
+		return 0;
+	}
+
+	/* IRQ0 override that was asked to be skipped. */
+	if (!ovr->source_irq && acpi_skip_timer_override) {
+		printk(PREFIX "BIOS IRQ0 override ignored.\n");
+		return 0;
+	}
+
+	/* IRQ0 -> GSI 2 with polarity bits set: clear them if requested. */
+	if (!ovr->source_irq && ovr->global_irq == 2 &&
+	    acpi_fix_pin2_polarity &&
+	    (ovr->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+		ovr->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
+		printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+	}
+
+	mp_override_legacy_irq(ovr->source_irq,
+			       ovr->inti_flags & ACPI_MADT_POLARITY_MASK,
+			       (ovr->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+			       ovr->global_irq);
+
+	return 0;
+}
+
+/*
+ * acpi_parse_nmi_src - MADT handler for NMI source entries. The entry
+ * is validated and printed; nothing else is done with it.
+ *
+ * Returns -EINVAL for a malformed entry and 0 otherwise.
+ */
+static int __init
+acpi_parse_nmi_src(struct acpi_subtable_header *header, const unsigned long end)
+{
+	struct acpi_madt_nmi_source *src =
+		(struct acpi_madt_nmi_source *)header;
+
+	if (BAD_MADT_ENTRY(src, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	/* TBD: Support NMI source entries? */
+
+	return 0;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * acpi_pic_sci_set_trigger()
+ *
+ * Use the ELCR to set the PIC-mode trigger type for the SCI.
+ *
+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
+ * it may require Edge Trigger -- use "acpi_sci=edge"
+ *
+ * Ports 0x4d0-0x4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
+ * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
+ * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ *
+ * @irq: IRQ number of the SCI
+ * @trigger: 1 selects edge, 3 selects level; other values leave the
+ *           SCI's bit as computed below
+ */
+
+void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
+{
+	unsigned int sci_bit = 1 << irq;
+	unsigned int current_elcr, wanted_elcr;
+
+	/* Real old ELCR mask */
+	current_elcr = inb(0x4d0) | (inb(0x4d1) << 8);
+
+	/*
+	 * When ACPI sets up the PCI IRQs, start from a cleared ELCR: the
+	 * bits get set correctly as PCI irq routing is enabled. With
+	 * acpi_noirq the current contents are kept.
+	 */
+	wanted_elcr = acpi_noirq ? current_elcr : 0;
+
+	/* The SCI is not in the PCI routing tables, so handle it here. */
+	if (trigger == 1)		/* Edge - clear */
+		wanted_elcr &= ~sci_bit;
+	else if (trigger == 3)		/* Level - set */
+		wanted_elcr |= sci_bit;
+
+	if (current_elcr == wanted_elcr)
+		return;
+
+	printk(PREFIX "setting ELCR to %04x (from %04x)\n",
+	       wanted_elcr, current_elcr);
+	outb(wanted_elcr, 0x4d0);
+	outb(wanted_elcr >> 8, 0x4d1);
+}
+
+/*
+ * acpi_gsi_to_irq - translate a GSI into a Linux IRQ number.
+ * @gsi: global system interrupt
+ * @irqp: receives the IRQ number on success
+ *
+ * In the PIC model the GSI is the IRQ. Otherwise the override
+ * trigger/polarity of the GSI is looked up and the GSI is registered.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * acpi_get_override_irq() or the negative result of acpi_register_gsi().
+ */
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
+{
+	int ret, trig, pol, virq;
+
+	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+		*irqp = gsi;
+		return 0;
+	}
+
+	ret = acpi_get_override_irq(gsi, &trig, &pol);
+	if (ret)
+		return ret;
+
+	/* Non-zero trigger means level, non-zero polarity means active low. */
+	virq = acpi_register_gsi(NULL, gsi,
+				 trig ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+				 pol ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH);
+	if (virq < 0)
+		return virq;
+
+	*irqp = virq;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
+
+/*
+ * acpi_isa_irq_to_gsi - look up the GSI an ISA IRQ is routed to.
+ * @isa_irq: ISA IRQ number
+ * @gsi: receives the GSI on success
+ *
+ * Returns 0 on success and -1 if @isa_irq is not a legacy IRQ or has no
+ * valid mapping.
+ */
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= nr_legacy_irqs())
+		return -1;
+
+	if (isa_irq_to_gsi[isa_irq] == INVALID_ACPI_IRQ)
+		return -1;
+
+	*gsi = isa_irq_to_gsi[isa_irq];
+	return 0;
+}
+
+/*
+ * acpi_register_gsi_pic - GSI registration for the PIC interrupt model.
+ * @dev: not used
+ * @gsi: global system interrupt
+ * @trigger: ACPI trigger mode
+ * @polarity: not used
+ *
+ * Returns @gsi unchanged. With CONFIG_PCI a level-sensitive GSI is
+ * additionally passed to elcr_set_level_irq().
+ */
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+#ifdef CONFIG_PCI
+ /*
+ * Make sure all (legacy) PCI IRQs are set as level-triggered.
+ */
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ elcr_set_level_irq(gsi);
+#endif
+
+ return gsi;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * acpi_register_gsi_ioapic - GSI registration for the IOAPIC interrupt model.
+ * @dev: device requesting the interrupt, may be NULL
+ * @gsi: global system interrupt
+ * @trigger: ACPI_EDGE_SENSITIVE or ACPI_LEVEL_SENSITIVE
+ * @polarity: ACPI_ACTIVE_HIGH or ACPI_ACTIVE_LOW
+ *
+ * Returns the result of mp_map_gsi_to_irq() when CONFIG_X86_IO_APIC is
+ * set, otherwise @gsi.
+ */
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+				    int trigger, int polarity)
+{
+	int irq = gsi;
+#ifdef CONFIG_X86_IO_APIC
+	struct irq_alloc_info info;
+	int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	/*
+	 * ioapic_set_alloc_attr() takes 0/1 flags. They live in their own
+	 * locals so that @trigger and @polarity keep the caller's ACPI_*
+	 * values: mp_config_acpi_gsi() below compares its arguments with
+	 * ACPI_EDGE_SENSITIVE and ACPI_ACTIVE_HIGH, and handing it the 0/1
+	 * flags makes it record the wrong trigger mode.
+	 */
+	int ioapic_trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+	int ioapic_polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+
+	ioapic_set_alloc_attr(&info, node, ioapic_trigger, ioapic_polarity);
+
+	mutex_lock(&acpi_ioapic_lock);
+	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (irq >= 0 && enable_update_mptable &&
+	    gsi != acpi_gbl_FADT.sci_interrupt)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+	mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+	return irq;
+}
+
+/*
+ * acpi_unregister_gsi_ioapic - undo the IRQ mapping of a GSI.
+ * @gsi: global system interrupt
+ *
+ * Looks the GSI up without allocating and unmaps the IRQ if the lookup
+ * yields a positive number. Does nothing without CONFIG_X86_IO_APIC.
+ */
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+	int virq;
+
+	mutex_lock(&acpi_ioapic_lock);
+	virq = mp_map_gsi_to_irq(gsi, 0, NULL);
+	if (virq > 0)
+		mp_unmap_irq(virq);
+	mutex_unlock(&acpi_ioapic_lock);
+#endif
+}
+#endif
+
+/*
+ * Hooks behind acpi_register_gsi()/acpi_unregister_gsi(). They start out
+ * with the PIC implementation and no unregister hook;
+ * acpi_set_irq_model_ioapic() installs the IOAPIC implementations.
+ */
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
+
+/*
+ * Low-level suspend hook: x86_acpi_suspend_lowlevel with
+ * CONFIG_ACPI_SLEEP, otherwise left without an initializer.
+ */
+#ifdef CONFIG_ACPI_SLEEP
+int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
+#else
+int (*acpi_suspend_lowlevel)(void);
+#endif
+
+/*
+ * acpi_register_gsi - register a GSI through the current
+ * __acpi_register_gsi hook; all arguments are passed through unchanged.
+ *
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ return __acpi_register_gsi(dev, gsi, trigger, polarity);
+}
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
+
+/*
+ * acpi_unregister_gsi - unregister a GSI through the
+ * __acpi_unregister_gsi hook. A NULL hook makes this a no-op.
+ */
+void acpi_unregister_gsi(u32 gsi)
+{
+ if (__acpi_unregister_gsi)
+ __acpi_unregister_gsi(gsi);
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Switch the interrupt model to IOAPIC: update acpi_irq_model, install
+ * the IOAPIC GSI register/unregister hooks and set acpi_ioapic.
+ */
+static void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+#endif
+
+/*
+ * ACPI based hotplug support for CPU
+ */
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
+
+/*
+ * acpi_map_cpu2node - bind a CPU and its APIC id to the NUMA node that
+ * ACPI reports for @handle.
+ *
+ * Nothing is changed when no node is reported or without
+ * CONFIG_ACPI_NUMA. Always returns 0.
+ */
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+	int node = acpi_get_node(handle);
+
+	if (node == NUMA_NO_NODE)
+		return 0;
+
+	set_apicid_to_node(physid, node);
+	numa_set_node(cpu, node);
+#endif
+	return 0;
+}
+
+/*
+ * acpi_map_cpu - map a processor to a logical CPU.
+ * @handle: ACPI handle of the processor
+ * @physid: physical (APIC) id of the processor
+ * @acpi_id: ACPI id of the processor
+ * @pcpu: receives the logical CPU number on success
+ *
+ * Returns 0 on success or the negative result of acpi_register_lapic().
+ */
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+		 int *pcpu)
+{
+	int logical = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
+
+	if (logical < 0) {
+		pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
+		return logical;
+	}
+
+	acpi_processor_set_pdc(handle);
+	acpi_map_cpu2node(handle, logical, physid);
+
+	*pcpu = logical;
+	return 0;
+}
+EXPORT_SYMBOL(acpi_map_cpu);
+
+/*
+ * acpi_unmap_cpu - drop the mapping of a logical CPU.
+ * @cpu: logical CPU number
+ *
+ * Clears the NUMA node of the CPU's APIC id (CONFIG_ACPI_NUMA only),
+ * resets the CPU's APIC id to -1, marks the CPU as not present and
+ * decrements num_processors. Always returns 0.
+ */
+int acpi_unmap_cpu(int cpu)
+{
+#ifdef CONFIG_ACPI_NUMA
+	/* Must run before the APIC id is reset below. */
+	set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
+	per_cpu(x86_cpu_to_apicid, cpu) = -1;
+	set_cpu_present(cpu, false);
+	num_processors--;
+
+	return 0;
+}
+EXPORT_SYMBOL(acpi_unmap_cpu);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+
+/*
+ * acpi_register_ioapic - register a hot-added IOAPIC.
+ * @handle: ACPI handle of the IOAPIC device
+ * @phys_addr: physical address of the IOAPIC
+ * @gsi_base: first GSI served by the IOAPIC
+ *
+ * The IOAPIC id comes from acpi_get_ioapic_id(); if that fails, the
+ * device's _UID is used instead.
+ *
+ * Returns the result of mp_register_ioapic(), -EINVAL if no id could be
+ * determined, or -ENOSYS without CONFIG_ACPI_HOTPLUG_IOAPIC.
+ */
+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
+{
+	int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+	struct ioapic_domain_cfg cfg = {
+		.type = IOAPIC_DOMAIN_DYNAMIC,
+		.ops = &mp_ioapic_irqdomain_ops,
+	};
+	u64 addr;
+	int id = acpi_get_ioapic_id(handle, gsi_base, &addr);
+
+	if (id < 0) {
+		unsigned long long uid;
+
+		/* Fall back to the _UID of the device. */
+		if (ACPI_FAILURE(acpi_evaluate_integer(handle, METHOD_NAME__UID,
+						       NULL, &uid))) {
+			acpi_handle_warn(handle, "failed to get IOAPIC ID.\n");
+			return -EINVAL;
+		}
+		id = (int)uid;
+	}
+
+	mutex_lock(&acpi_ioapic_lock);
+	ret = mp_register_ioapic(id, phys_addr, gsi_base, &cfg);
+	mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+	return ret;
+}
+EXPORT_SYMBOL(acpi_register_ioapic);
+
+/*
+ * acpi_unregister_ioapic - unregister the IOAPIC serving @gsi_base.
+ * @handle: not used
+ * @gsi_base: GSI base associated with the IOAPIC
+ *
+ * Returns the result of mp_unregister_ioapic(), or -ENOSYS without
+ * CONFIG_ACPI_HOTPLUG_IOAPIC.
+ */
+int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
+{
+	int rc = -ENOSYS;
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+	mutex_lock(&acpi_ioapic_lock);
+	rc = mp_unregister_ioapic(gsi_base);
+	mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+	return rc;
+}
+EXPORT_SYMBOL(acpi_unregister_ioapic);
+
+/**
+ * acpi_ioapic_registered - Check whether the IOAPIC associated with
+ *			    @gsi_base has been registered
+ * @handle:	ACPI handle of the IOAPIC device
+ * @gsi_base:	GSI base associated with the IOAPIC
+ *
+ * Assume caller holds some type of lock to serialize acpi_ioapic_registered()
+ * with acpi_register_ioapic()/acpi_unregister_ioapic().
+ *
+ * Returns the result of mp_ioapic_registered(), or 0 without
+ * CONFIG_ACPI_HOTPLUG_IOAPIC.
+ */
+int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
+{
+	int registered = 0;
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+	mutex_lock(&acpi_ioapic_lock);
+	registered = mp_ioapic_registered(gsi_base);
+	mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+	return registered;
+}
+
+/*
+ * acpi_parse_sbf - table handler that saves the CMOS index from the
+ * boot table in sbf_port. Always returns 0.
+ */
+static int __init acpi_parse_sbf(struct acpi_table_header *table)
+{
+	/* Save CMOS port */
+	sbf_port = ((struct acpi_table_boot *)table)->cmos_index;
+
+	return 0;
+}
+
+#ifdef CONFIG_HPET_TIMER
+#include <asm/hpet.h>
+
+/* Resource built by acpi_parse_hpet(); stays NULL if no HPET is set up. */
+static struct resource *hpet_res __initdata;
+
+/*
+ * acpi_parse_hpet - table handler for the ACPI HPET table.
+ * @table: the HPET table
+ *
+ * Sets hpet_address and hpet_blockid from the table and allocates
+ * hpet_res describing the HPET's memory range.
+ *
+ * Returns -1 if the HPET is not located in memory space and 0 otherwise.
+ * An invalid base address is reported, and no resource is set up for it.
+ */
+static int __init acpi_parse_hpet(struct acpi_table_header *table)
+{
+	struct acpi_table_hpet *tbl = (struct acpi_table_hpet *)table;
+
+	if (tbl->address.space_id != ACPI_SPACE_MEM) {
+		printk(KERN_WARNING PREFIX "HPET timers must be located in "
+		       "memory.\n");
+		return -1;
+	}
+
+	hpet_blockid = tbl->sequence;
+	hpet_address = tbl->address.address;
+
+	/*
+	 * Some broken BIOSes advertise HPET at 0x0. We really do not
+	 * want to allocate a resource there.
+	 */
+	if (!hpet_address) {
+		printk(KERN_WARNING PREFIX
+		       "HPET id: %#x base: %#lx is invalid\n",
+		       tbl->id, hpet_address);
+		return 0;
+	}
+#ifdef CONFIG_X86_64
+	/*
+	 * Some even more broken BIOSes advertise HPET at
+	 * 0xfed0000000000000 instead of 0xfed00000. This is only fixed up
+	 * when the user forced the HPET; otherwise the address is dropped.
+	 */
+	if (hpet_address == 0xfed0000000000000UL) {
+		if (hpet_force_user) {
+			printk(KERN_WARNING PREFIX
+			       "HPET id: %#x base: 0xfed0000000000000 fixed up "
+			       "to 0xfed00000.\n", tbl->id);
+			hpet_address >>= 32;
+		} else {
+			printk(KERN_WARNING PREFIX "HPET id: %#x "
+			       "base: 0xfed0000000000000 is bogus\n "
+			       "try hpet=force on the kernel command line to "
+			       "fix it up to 0xfed00000.\n", tbl->id);
+			hpet_address = 0;
+			return 0;
+		}
+	}
+#endif
+	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+	       tbl->id, hpet_address);
+
+	/*
+	 * Allocate the HPET firmware resource together with room for its
+	 * name; it is added to the resource tree during the lateinit
+	 * timeframe.
+	 */
+#define HPET_RESOURCE_NAME_SIZE 9
+	hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+
+	/* The name buffer sits directly behind the struct resource. */
+	hpet_res->name = (void *)(hpet_res + 1);
+	snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
+		 tbl->sequence);
+
+	hpet_res->flags = IORESOURCE_MEM;
+	hpet_res->start = hpet_address;
+	hpet_res->end = hpet_address + (1 * 1024) - 1;
+
+	return 0;
+}
+
+/*
+ * hpet_insert_resource - late initcall that inserts the HPET resource
+ * into the iomem resource tree.
+ *
+ * Returns 1 if no HPET resource was set up, otherwise the result of
+ * insert_resource().
+ */
+static __init int hpet_insert_resource(void)
+{
+	return hpet_res ? insert_resource(&iomem_resource, hpet_res) : 1;
+}
+
+late_initcall(hpet_insert_resource);
+
+#else
+#define acpi_parse_hpet NULL
+#endif
+
+/*
+ * acpi_parse_fadt - table handler for the FADT.
+ * @table: not used; the data is read from acpi_gbl_FADT
+ *
+ * Translates the FADT boot flags into x86_platform.legacy settings and,
+ * with CONFIG_X86_PM_TIMER, determines the PM timer I/O port.
+ * Always returns 0.
+ */
+static int __init acpi_parse_fadt(struct acpi_table_header *table)
+{
+	bool fadt_v2 = acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID;
+
+	if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
+		pr_debug("ACPI: no legacy devices present\n");
+		x86_platform.legacy.devices.pnpbios = 0;
+	}
+
+	/* The 8042 flag is only evaluated from FADT revision 2 on. */
+	if (fadt_v2 &&
+	    !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+	    x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+		pr_debug("ACPI: i8042 controller is absent\n");
+		x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+	}
+
+	if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+		pr_debug("ACPI: not registering RTC platform device\n");
+		x86_platform.legacy.rtc = 0;
+	}
+
+	if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
+		pr_debug("ACPI: probing for VGA not safe\n");
+		x86_platform.legacy.no_vga = 1;
+	}
+
+#ifdef CONFIG_X86_PM_TIMER
+	/* detect the location of the ACPI PM Timer */
+	if (!fadt_v2) {
+		/* FADT rev. 1 */
+		pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
+	} else {
+		/* FADT rev. 2: only a timer in system I/O space is usable */
+		if (acpi_gbl_FADT.xpm_timer_block.space_id !=
+		    ACPI_ADR_SPACE_SYSTEM_IO)
+			return 0;
+
+		pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;
+		/*
+		 * "X" fields are optional extensions to the original V1.0
+		 * fields, so fall back to the V1.0 field if the
+		 * corresponding X field is zero.
+		 */
+		if (!pmtmr_ioport)
+			pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
+	}
+
+	if (pmtmr_ioport)
+		printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
+		       pmtmr_ioport);
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Parse LAPIC entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+
+static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
+{
+	int nr_ovr;
+
+	if (!boot_cpu_has(X86_FEATURE_APIC))
+		return -ENODEV;
+
+	/*
+	 * The LAPIC address comes from the MADT header (32-bit value) and may
+	 * be replaced by a LAPIC_ADDR_OVR entry (64-bit value); walk those
+	 * entries before registering the address.
+	 */
+	nr_ovr = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+				       acpi_parse_lapic_addr_ovr, 0);
+	if (nr_ovr >= 0) {
+		register_lapic_address(acpi_lapic_addr);
+		return nr_ovr;
+	}
+
+	printk(KERN_ERR PREFIX
+	       "Error parsing LAPIC address override entry\n");
+	return nr_ovr;
+}
+
+/*
+ * Enumerate the local SAPIC/APIC/x2APIC entries of the MADT, followed by the
+ * local (x2)APIC NMI entries.
+ *
+ * Returns 0 on success, -ENODEV when the CPU has no APIC or the MADT lists no
+ * local APIC entry of any kind, and otherwise the negative value reported by
+ * the table walk that failed.
+ */
+static int __init acpi_parse_madt_lapic_entries(void)
+{
+	int count;
+	int x2count = 0;
+	int ret;
+	struct acpi_subtable_proc madt_proc[2];
+
+	if (!boot_cpu_has(X86_FEATURE_APIC))
+		return -ENODEV;
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
+				      acpi_parse_sapic, MAX_LOCAL_APIC);
+
+	if (!count) {
+		/* No SAPIC entries: walk LAPIC and x2APIC entries in one pass. */
+		memset(madt_proc, 0, sizeof(madt_proc));
+		madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+		madt_proc[0].handler = acpi_parse_lapic;
+		madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+		madt_proc[1].handler = acpi_parse_x2apic;
+		ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+				sizeof(struct acpi_table_madt),
+				madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+		if (ret < 0) {
+			printk(KERN_ERR PREFIX
+			       "Error parsing LAPIC/X2APIC entries\n");
+			return ret;
+		}
+
+		count = madt_proc[0].count;
+		x2count = madt_proc[1].count;
+	}
+	if (!count && !x2count) {
+		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return -ENODEV;
+	} else if (count < 0 || x2count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		/*
+		 * Hand back the value that is actually negative; returning
+		 * count unconditionally could report 0 or a positive entry
+		 * count when only x2count signals the failure.
+		 */
+		return count < 0 ? count : x2count;
+	}
+
+	x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+					acpi_parse_x2apic_nmi, 0);
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+				      acpi_parse_lapic_nmi, 0);
+	if (count < 0 || x2count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		/* Same rule as above: never turn an x2APIC NMI error into 0. */
+		return count < 0 ? count : x2count;
+	}
+	return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Create identity-mapped MP interrupt source entries for the legacy ISA IRQs
+ * that are not already described in mp_irqs[] (for example by a MADT
+ * interrupt source override).
+ */
+static void __init mp_config_acpi_legacy_irqs(void)
+{
+	struct mpc_intsrc mp_irq;
+	int isa_irq;
+
+#ifdef CONFIG_EISA
+	/* Fabricate the legacy ISA bus (bus #31). */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
+	pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs());
+
+	/* Default configuration for the legacy IRQs, unless overridden. */
+	for (isa_irq = 0; isa_irq < nr_legacy_irqs(); isa_irq++) {
+		bool claimed = false;
+		unsigned int dstapic;
+		int ioapic, pin, idx;
+		u32 gsi;
+
+		/* Which GSI does this ISA IRQ map to? */
+		if (acpi_isa_irq_to_gsi(isa_irq, &gsi))
+			continue;
+
+		/* Which IOAPIC (and pin on it) serves that GSI? */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mpc_ioapic_id(ioapic);
+
+		/*
+		 * Skip the IRQ when an existing entry already covers either
+		 * this ISA IRQ or this IOAPIC pin.
+		 */
+		for (idx = 0; idx < mp_irq_entries && !claimed; idx++) {
+			const struct mpc_intsrc *irq = &mp_irqs[idx];
+
+			claimed = (irq->srcbus == MP_ISA_BUS &&
+				   irq->srcbusirq == isa_irq) ||
+				  (irq->dstapic == dstapic &&
+				   irq->dstirq == pin);
+		}
+
+		if (claimed) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n",
+			       isa_irq);
+			continue;
+		}
+
+		mp_irq.type = MP_INTSRC;
+		mp_irq.irqtype = mp_INT;
+		mp_irq.irqflag = 0;		/* Conforming */
+		mp_irq.srcbus = MP_ISA_BUS;
+		mp_irq.srcbusirq = isa_irq;	/* Identity mapped */
+		mp_irq.dstapic = dstapic;
+		mp_irq.dstirq = pin;
+
+		mp_save_irq(&mp_irq);
+	}
+}
+
+/*
+ * Walk the IO-APIC related MADT entries: IO-APICs, interrupt source
+ * overrides and NMI sources, filling in the identity legacy mappings in
+ * between.
+ * returns 0 on success, < 0 on error
+ */
+static int __init acpi_parse_madt_ioapic_entries(void)
+{
+	int count;
+
+	/*
+	 * The ACPI interpreter is required to complete interrupt setup, so
+	 * when it is off the IO-APICs are not enumerated through ACPI: MPS
+	 * handles them if present, otherwise the system stays in PIC mode.
+	 * A CPU without an APIC is rejected the same way.
+	 */
+	if (acpi_disabled || acpi_noirq || !boot_cpu_has(X86_FEATURE_APIC))
+		return -ENODEV;
+
+	/* "noapic" boot option: don't look for IO-APICs */
+	if (skip_ioapic_setup) {
+		printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
+		       "due to 'noapic' option.\n");
+		return -ENODEV;
+	}
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+				      MAX_IO_APICS);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+		return count;
+	}
+	if (count == 0) {
+		printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+		return -ENODEV;
+	}
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+				      acpi_parse_int_src_ovr, nr_irqs);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX
+		       "Error parsing interrupt source overrides entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return count;
+	}
+
+	/*
+	 * When the BIOS supplied no INT_SRC_OVR for the SCI, act as if it had
+	 * so that the SCI flags get set. Hardware reduced platforms are
+	 * excluded.
+	 */
+	if (!acpi_gbl_reduced_hardware &&
+	    acpi_sci_override_gsi == INVALID_ACPI_IRQ)
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
+
+	/* Fill in identity legacy mappings where no override */
+	mp_config_acpi_legacy_irqs();
+
+	count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+				      acpi_parse_nmi_src, nr_irqs);
+	if (count < 0) {
+		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+		/* TBD: Cleanup to allow fallback to MPS */
+		return count;
+	}
+
+	return 0;
+}
+#else
+/* Stub for builds without CONFIG_X86_IO_APIC: always reports failure (-1). */
+static inline int acpi_parse_madt_ioapic_entries(void)
+{
+ return -1;
+}
+#endif /* !CONFIG_X86_IO_APIC */
+
+/*
+ * Early MADT pass: parse the MADT and the LAPIC address override entries.
+ * Compiles to an empty function without CONFIG_X86_LOCAL_APIC.
+ */
+static void __init early_acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	int error;
+
+	/* Nothing more to do unless acpi_table_parse() returns 0 for the MADT. */
+	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt))
+		return;
+
+	/* Parse MADT LAPIC entries */
+	error = early_acpi_parse_madt_lapic_addr_ovr();
+	if (error == 0) {
+		acpi_lapic = 1;
+		smp_found_config = 1;
+	} else if (error == -EINVAL) {
+		/* Dell Precision Workstation 410, 610 come here. */
+		printk(KERN_ERR PREFIX
+		       "Invalid BIOS MADT, disabling ACPI\n");
+		disable_acpi();
+	}
+#endif
+}
+
+/*
+ * Full MADT pass: enumerate the LAPIC entries and, if that works, the IO-APIC
+ * entries, updating acpi_lapic, smp_found_config and the IRQ model on the
+ * way. Compiles to an empty function without CONFIG_X86_LOCAL_APIC.
+ */
+static void __init acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+	int error;
+
+	if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
+		/*
+		 * ACPI found no MADT, and so ACPI wants UP PIC mode.
+		 * In the event an MPS table was found, forget it.
+		 * Boot with "acpi=off" to use MPS on such a system.
+		 */
+		if (smp_found_config) {
+			printk(KERN_WARNING PREFIX
+			       "No APIC-table, disabling MPS\n");
+			smp_found_config = 0;
+		}
+	} else {
+		/* Parse MADT LAPIC entries */
+		error = acpi_parse_madt_lapic_entries();
+		if (error == 0) {
+			acpi_lapic = 1;
+
+			/* Parse MADT IO-APIC entries */
+			mutex_lock(&acpi_ioapic_lock);
+			error = acpi_parse_madt_ioapic_entries();
+			mutex_unlock(&acpi_ioapic_lock);
+			if (error == 0) {
+				acpi_set_irq_model_ioapic();
+
+				smp_found_config = 1;
+			}
+		}
+		if (error == -EINVAL) {
+			/* Dell Precision Workstation 410, 610 come here. */
+			printk(KERN_ERR PREFIX
+			       "Invalid BIOS MADT, disabling ACPI\n");
+			disable_acpi();
+		}
+	}
+
+	/*
+	 * ACPI supports both logical (e.g. Hyper-Threading) and physical
+	 * processors, where MPS only supports physical.
+	 */
+	if (acpi_lapic) {
+		if (acpi_ioapic)
+			printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+			       "information\n");
+		else
+			printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+			       "configuration information\n");
+	}
+#endif
+}
+
+/*
+ * DMI callback: switch to acpi=noirq for the matched system unless ACPI was
+ * forced on the command line. Always returns 0.
+ */
+static int __init disable_acpi_irq(const struct dmi_system_id *d)
+{
+	if (acpi_force)
+		return 0;
+
+	printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", d->ident);
+	acpi_noirq_set();
+	return 0;
+}
+
+/*
+ * DMI callback: switch to pci=noacpi for the matched system unless ACPI was
+ * forced on the command line. Always returns 0.
+ */
+static int __init disable_acpi_pci(const struct dmi_system_id *d)
+{
+	if (acpi_force)
+		return 0;
+
+	printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", d->ident);
+	acpi_disable_pci();
+	return 0;
+}
+
+/*
+ * DMI callback: turn ACPI off for the matched system. With acpi_force set
+ * only a warning is printed. Always returns 0.
+ */
+static int __init dmi_disable_acpi(const struct dmi_system_id *d)
+{
+	if (acpi_force) {
+		printk(KERN_NOTICE
+		       "Warning: DMI blacklist says broken, but acpi forced\n");
+		return 0;
+	}
+
+	printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
+	disable_acpi();
+	return 0;
+}
+
+/*
+ * Force ignoring BIOS IRQ0 override
+ */
+static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
+{
+	/* Already requested earlier: stay silent and change nothing. */
+	if (acpi_skip_timer_override)
+		return 0;
+
+	pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident);
+	acpi_skip_timer_override = 1;
+	return 0;
+}
+
+/*
+ * ACPI offers an alternative platform interface model that removes
+ * ACPI hardware requirements for platforms that do not implement
+ * the PC Architecture.
+ *
+ * We initialize the Hardware-reduced ACPI model here:
+ */
+void __init acpi_generic_reduced_hw_init(void)
+{
+	/*
+	 * Hardware reduced ACPI mode: replace the legacy PIC with the null
+	 * implementation and turn the timer and pre-vector x86_init hooks
+	 * into no-ops.
+	 */
+	legacy_pic = &null_legacy_pic;
+	x86_init.irqs.pre_vector_init = x86_init_noop;
+	x86_init.timers.timer_init = x86_init_noop;
+}
+
+/* Run the platform's reduced-hardware early init hook when ACPI asks for it. */
+static void __init acpi_reduced_hw_init(void)
+{
+	if (!acpi_gbl_reduced_hardware)
+		return;
+
+	x86_init.acpi.reduced_hw_early_init();
+}
+
+/*
+ * If your system is blacklisted here, but you find that acpi=force
+ * works for you, please contact linux-acpi@vger.kernel.org
+ */
+/*
+ * Early DMI quirk table, passed to dmi_check_system() from
+ * acpi_boot_table_init(). Each entry names the callback to run for a
+ * matching system; the trailing empty entry ends the list.
+ */
+static const struct dmi_system_id acpi_dmi_table[] __initconst = {
+ /*
+ * Boxes that need ACPI disabled
+ */
+ {
+ .callback = dmi_disable_acpi,
+ .ident = "IBM Thinkpad",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
+ },
+ },
+
+ /*
+ * Boxes that need ACPI PCI IRQ routing disabled
+ */
+ {
+ .callback = disable_acpi_irq,
+ .ident = "ASUS A7V",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
+ DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
+ /* newer BIOS, Revision 1011, does work */
+ DMI_MATCH(DMI_BIOS_VERSION,
+ "ASUS A7V ACPI BIOS Revision 1007"),
+ },
+ },
+ {
+ /*
+ * Latest BIOS for IBM 600E (1.16) has bad pcinum
+ * for LPC bridge, which is needed for the PCI
+ * interrupt links to work. DSDT fix is in bug 5966.
+ * 2645, 2646 model numbers are shared with 600/600E/600X
+ */
+ .callback = disable_acpi_irq,
+ .ident = "IBM Thinkpad 600 Series 2645",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2645"),
+ },
+ },
+ {
+ .callback = disable_acpi_irq,
+ .ident = "IBM Thinkpad 600 Series 2646",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2646"),
+ },
+ },
+ /*
+ * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
+ */
+ { /* _BBN 0 bug */
+ .callback = disable_acpi_pci,
+ .ident = "ASUS PR-DLS",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
+ DMI_MATCH(DMI_BIOS_VERSION,
+ "ASUS PR-DLS ACPI BIOS Revision 1010"),
+ DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
+ },
+ },
+ {
+ .callback = disable_acpi_pci,
+ .ident = "Acer TravelMate 36x Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+ },
+ },
+ {}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+/*
+ * Late DMI quirk table, passed to dmi_check_system() from acpi_boot_init().
+ * Every entry uses dmi_ignore_irq0_timer_override(); the trailing empty
+ * entry ends the list.
+ */
+static const struct dmi_system_id acpi_dmi_table_late[] __initconst = {
+ /*
+ * HP laptops which use a DSDT reporting as HP/SB400/10000,
+ * which includes some code which overrides all temperature
+ * trip points to 16C if the INTIN2 input of the I/O APIC
+ * is enabled. This input is incorrectly designated the
+ * ISA IRQ 0 via an interrupt source override even though
+ * it is wired to the output of the master 8259A and INTIN0
+ * is not connected at all. Force ignoring the BIOS IRQ0
+ * override in those cases.
+ */
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP nx6115 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6125 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6325 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP 6715b laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
+ {}
+};
+
+/*
+ * acpi_boot_table_init() and acpi_boot_init()
+ * called from setup_arch(), always.
+ * 1. checksums all tables
+ * 2. enumerates lapics
+ * 3. enumerates io-apics
+ *
+ * acpi_table_init() is separate to allow reading SRAT without
+ * other side effects.
+ *
+ * side effects of acpi_boot_init:
+ * acpi_lapic = 1 if LAPIC found
+ * acpi_ioapic = 1 if IOAPIC found
+ * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
+ * if acpi_blacklisted() acpi_disabled = 1;
+ * acpi_irq_model=...
+ * ...
+ */
+
+void __init acpi_boot_table_init(void)
+{
+	/* Early DMI quirks may already disable ACPI. */
+	dmi_check_system(acpi_dmi_table);
+
+	if (acpi_disabled)
+		return;
+
+	/* Initialize the ACPI boot-time table parser. */
+	if (acpi_table_init()) {
+		disable_acpi();
+		return;
+	}
+
+	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+	/* The blacklist may disable ACPI entirely, unless acpi=force is set. */
+	if (!acpi_blacklisted())
+		return;
+
+	if (acpi_force) {
+		printk(KERN_WARNING PREFIX "acpi=force override\n");
+		return;
+	}
+
+	printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+	disable_acpi();
+}
+
+/*
+ * Early ACPI boot step: process the MADT (if present) and run the
+ * hardware-reduced ACPI initialization. Returns 1 when ACPI is disabled,
+ * 0 otherwise.
+ */
+int __init early_acpi_boot_init(void)
+{
+	if (!acpi_disabled) {
+		/* Multiple APIC Description Table (MADT), if present */
+		early_acpi_process_madt();
+
+		/* Hardware-reduced ACPI mode initialization */
+		acpi_reduced_hw_init();
+
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Main ACPI boot-time table pass: run the late DMI quirks, then parse the
+ * BOOT, FADT, MADT, HPET and (with CONFIG_ACPI_BGRT) BGRT tables, select the
+ * ACPI PCI init hook and parse the SPCR table.
+ *
+ * Returns 1 when ACPI is disabled (nothing but the DMI quirks has run),
+ * 0 otherwise. Return values of the individual table parsers are ignored.
+ */
+int __init acpi_boot_init(void)
+{
+ /* those are executed after early-quirks are executed */
+ dmi_check_system(acpi_dmi_table_late);
+
+ /*
+ * If acpi_disabled, bail out
+ */
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+ /*
+ * set sci_int and PM timer address
+ */
+ acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
+
+ /*
+ * Process the Multiple APIC Description Table (MADT), if present
+ */
+ acpi_process_madt();
+
+ acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT))
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
+
+ /* Use ACPI for PCI init unless ACPI interrupt routing is disabled */
+ if (!acpi_noirq)
+ x86_init.pci.init = pci_acpi_init;
+
+ /* Do not enable ACPI SPCR console by default */
+ acpi_parse_spcr(earlycon_acpi_spcr_enable, false);
+ return 0;
+}
+
+/*
+ * Handler for the "acpi=" early parameter.
+ *
+ * Returns 0 for a recognised value and -EINVAL when the value is missing or
+ * unknown.
+ */
+static int __init parse_acpi(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/* "acpi=off" disables both ACPI table parsing and interpreter */
+	if (!strcmp(arg, "off")) {
+		disable_acpi();
+		return 0;
+	}
+
+	/* acpi=force to over-ride black-list */
+	if (!strcmp(arg, "force")) {
+		acpi_force = 1;
+		acpi_disabled = 0;
+		return 0;
+	}
+
+	/* acpi=strict disables out-of-spec workarounds */
+	if (!strcmp(arg, "strict")) {
+		acpi_strict = 1;
+		return 0;
+	}
+
+	/* acpi=rsdt use RSDT instead of XSDT */
+	if (!strcmp(arg, "rsdt")) {
+		acpi_gbl_do_not_use_xsdt = TRUE;
+		return 0;
+	}
+
+	/* "acpi=noirq" disables ACPI interrupt routing */
+	if (!strcmp(arg, "noirq")) {
+		acpi_noirq_set();
+		return 0;
+	}
+
+	/* "acpi=copy_dsdt" copies the DSDT */
+	if (!strcmp(arg, "copy_dsdt")) {
+		acpi_gbl_copy_dsdt_locally = 1;
+		return 0;
+	}
+
+	/* "acpi=nocmcff" disables FF mode for corrected errors */
+	if (!strcmp(arg, "nocmcff")) {
+		acpi_disable_cmcff = 1;
+		return 0;
+	}
+
+	/* Core will printk when we return error. */
+	return -EINVAL;
+}
+early_param("acpi", parse_acpi);
+
+/*
+ * FIXME: Using pci= for an ACPI parameter is a travesty.
+ *
+ * Handler for the "pci=" early parameter: only "noacpi" is acted upon, every
+ * other value (or none) is ignored. Always returns 0.
+ */
+static int __init parse_pci(char *arg)
+{
+	if (!arg)
+		return 0;
+
+	if (!strcmp(arg, "noacpi"))
+		acpi_disable_pci();
+
+	return 0;
+}
+early_param("pci", parse_pci);
+
+/*
+ * Warn when ACPI (or ACPI IRQ routing) is disabled on a kernel that has local
+ * APIC support but was built without the MPS table parser.
+ *
+ * Returns 1 after printing that warning, 0 in every other case (including all
+ * configurations where the check is compiled out).
+ */
+int __init acpi_mps_check(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+/* mptable code is not built-in*/
+ if (acpi_disabled || acpi_noirq) {
+ printk(KERN_WARNING "MPS support code is not built-in.\n"
+ "Using acpi=off or acpi=noirq or pci=noacpi "
+ "may have problem\n");
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * "acpi_skip_timer_override" early parameter: set the flag of the same name.
+ * The argument is ignored; always returns 0.
+ */
+static int __init parse_acpi_skip_timer_override(char *arg)
+{
+ acpi_skip_timer_override = 1;
+ return 0;
+}
+early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+
+/*
+ * "acpi_use_timer_override" early parameter: set the flag of the same name.
+ * The argument is ignored; always returns 0.
+ */
+static int __init parse_acpi_use_timer_override(char *arg)
+{
+ acpi_use_timer_override = 1;
+ return 0;
+}
+early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * Handler for the "acpi_sci=" early parameter. "edge"/"level" replace the
+ * trigger field of acpi_sci_flags, "high"/"low" replace its polarity field;
+ * the other field is left as it was.
+ *
+ * Returns 0 on success, -EINVAL for a missing or unknown value.
+ */
+static int __init setup_acpi_sci(char *s)
+{
+	int field_mask, field_val;
+
+	if (!s)
+		return -EINVAL;
+
+	if (strcmp(s, "edge") == 0) {
+		field_mask = ACPI_MADT_TRIGGER_MASK;
+		field_val = ACPI_MADT_TRIGGER_EDGE;
+	} else if (strcmp(s, "level") == 0) {
+		field_mask = ACPI_MADT_TRIGGER_MASK;
+		field_val = ACPI_MADT_TRIGGER_LEVEL;
+	} else if (strcmp(s, "high") == 0) {
+		field_mask = ACPI_MADT_POLARITY_MASK;
+		field_val = ACPI_MADT_POLARITY_ACTIVE_HIGH;
+	} else if (strcmp(s, "low") == 0) {
+		field_mask = ACPI_MADT_POLARITY_MASK;
+		field_val = ACPI_MADT_POLARITY_ACTIVE_LOW;
+	} else {
+		return -EINVAL;
+	}
+
+	/* Swap in the selected field, keep every other bit. */
+	acpi_sci_flags = field_val | (acpi_sci_flags & ~field_mask);
+	return 0;
+}
+early_param("acpi_sci", setup_acpi_sci);
+
+/*
+ * Atomically update the lock word at @lock: the two low bits are replaced so
+ * that bit 1 is set and bit 0 takes the previous value of bit 1; all higher
+ * bits are preserved. The cmpxchg() loop retries until the update has been
+ * applied to an unchanged word.
+ *
+ * Returns -1 when the new word is below 3 (i.e. exactly 2: bit 1 set, bit 0
+ * and all higher bits clear), 0 otherwise.
+ *
+ * NOTE(review): bits 0 and 1 look like the "pending" and "owned" bits of the
+ * ACPI FACS global lock, which would make -1 mean "acquired" and 0 mean
+ * "owned by someone else, pending set" - confirm against the ACPI spec.
+ */
+int __acpi_acquire_global_lock(unsigned int *lock)
+{
+ unsigned int old, new, val;
+ do {
+ old = *lock;
+ /* set bit 1, copy the old bit 1 into bit 0 */
+ new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+ val = cmpxchg(lock, old, new);
+ } while (unlikely (val != old));
+ return (new < 3) ? -1 : 0;
+}
+
+/*
+ * Atomically clear the two low bits of the lock word at @lock, retrying the
+ * cmpxchg() until it has been applied to an unchanged word.
+ *
+ * Returns the previous value of bit 0 (0 or 1).
+ *
+ * NOTE(review): bit 0 looks like the ACPI global lock "pending" bit, so a
+ * non-zero return would tell the caller that another party is waiting -
+ * confirm against the ACPI spec.
+ */
+int __acpi_release_global_lock(unsigned int *lock)
+{
+ unsigned int old, new, val;
+ do {
+ old = *lock;
+ new = old & ~0x3;
+ val = cmpxchg(lock, old, new);
+ } while (unlikely (val != old));
+ return old & 0x1;
+}
+
+/*
+ * Add the range [@addr, @addr + @size) to the e820 table as E820_TYPE_ACPI
+ * and then call e820__update_table_print().
+ */
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+ e820__range_add(addr, size, E820_TYPE_ACPI);
+ e820__update_table_print();
+}
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
new file mode 100644
index 0000000..6fb478b
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc_msr.c
@@ -0,0 +1,58 @@
+/*
+ * cppc_msr.c: MSR Interface for CPPC
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+/* Report that FFH access to CPPC registers is supported: always true here. */
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+/*
+ * Read the MSR reg->address on CPU @cpunum and store in *@val the bit field
+ * described by reg->bit_offset/reg->bit_width, shifted down to bit 0.
+ *
+ * Returns 0 on success or the error from rdmsrl_safe_on_cpu(); on error *@val
+ * is not post-processed.
+ */
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+	u64 field_mask;
+	int err;
+
+	err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+	if (err)
+		return err;
+
+	field_mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+				 reg->bit_offset);
+	*val = (*val & field_mask) >> reg->bit_offset;
+
+	return 0;
+}
+
+/*
+ * Read-modify-write the MSR reg->address on CPU @cpunum: only the bit field
+ * described by reg->bit_offset/reg->bit_width is replaced by @val, the
+ * remaining bits keep the value just read.
+ *
+ * Returns 0 on success, or the error from the failing MSR read or write.
+ */
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+	u64 cur, field_mask;
+	int err;
+
+	err = rdmsrl_safe_on_cpu(cpunum, reg->address, &cur);
+	if (err)
+		return err;
+
+	field_mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+				 reg->bit_offset);
+
+	/* Position the new value inside the field and merge it in. */
+	cur = (cur & ~field_mask) | ((val << reg->bit_offset) & field_mask);
+
+	return wrmsrl_safe_on_cpu(cpunum, reg->address, cur);
+}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
new file mode 100644
index 0000000..158ad14
--- /dev/null
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2005 Intel Corporation
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added _PDC for SMP C-states on Intel CPUs
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+
+#include <acpi/processor.h>
+#include <asm/mwait.h>
+#include <asm/special_insns.h>
+
+/*
+ * Initialize bm_flags based on the CPU cache properties
+ * On SMP it depends on cache configuration
+ * - When cache is not shared among all CPUs, we flush cache
+ * before entering C3.
+ * - When cache is shared among all CPUs, we use bm_check
+ * mechanism as in UP case
+ *
+ * This routine is called only after all the CPUs are online
+ */
+void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
+					unsigned int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	const bool intel = c->x86_vendor == X86_VENDOR_INTEL;
+
+	/*
+	 * bm_check is used when only one CPU is online, and on Intel CPUs:
+	 * today all MP CPUs that support C3 share cache, and caches should
+	 * not be flushed by software while entering a C3 type state.
+	 */
+	flags->bm_check = (num_online_cpus() == 1 || intel) ? 1 : 0;
+
+	/*
+	 * On all recent Intel platforms ARB_DISABLE is a nop, so clear
+	 * bm_control to indicate that ARB_DISABLE is not required while
+	 * entering a C3 type state. The test below selects Intel families
+	 * above 0xf, and family 6 from model 0x0f onwards.
+	 */
+	if (intel && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
+		flags->bm_control = 0;
+}
+EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
+
+/* The code below handles cstate entry with monitor-mwait pair on Intel*/
+
+/*
+ * Per-CPU table of MWAIT arguments, indexed by the ACPI C-state index
+ * (cx->index): eax and ecx are the values later handed to
+ * mwait_idle_with_hints() by acpi_processor_ffh_cstate_enter().
+ */
+struct cstate_entry {
+ struct {
+ unsigned int eax;
+ unsigned int ecx;
+ } states[ACPI_PROCESSOR_MAX_POWER];
+};
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
+
+/* Set to 1 per MWAIT C-state type once its use has been logged. */
+static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
+
+/* reg->bit_offset value required by acpi_processor_ffh_cstate_probe() */
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+/*
+ * Check, on the CPU this runs on, whether the MWAIT hint stored in
+ * cx->address is backed by what CPUID leaf CPUID_MWAIT_LEAF reports.
+ *
+ * @_cx: the struct acpi_processor_cx to check, passed as void * so that the
+ *       function can be used as a work_on_cpu() callback.
+ *
+ * Returns 0 and fills in cx->desc when the C-state can be entered with
+ * MWAIT, -1 otherwise.
+ */
+static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
+{
+	struct acpi_processor_cx *cx = _cx;
+	unsigned int eax, ebx, ecx, edx;
+	unsigned int edx_part;
+	unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+	unsigned int num_cstate_subtype;
+
+	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+	/* Check whether this particular cx_type (in CST) is supported or not */
+	cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
+		       MWAIT_CSTATE_MASK) + 1;
+
+	/*
+	 * cstate_type is used both as a shift count on the 32-bit EDX value
+	 * (in units of MWAIT_SUBSTATE_SIZE bits) and as an index into
+	 * mwait_supported[]. Reject hints that would shift by the full
+	 * register width or more, or index past the array, instead of
+	 * invoking undefined behaviour or writing out of bounds.
+	 */
+	if (cstate_type >= ARRAY_SIZE(mwait_supported) ||
+	    cstate_type * MWAIT_SUBSTATE_SIZE >= 8 * sizeof(edx)) {
+		pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x out of range\n",
+			cx->address);
+		return -1;
+	}
+
+	edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+	num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+	/* If the HW does not support any sub-states in this C-state */
+	if (num_cstate_subtype == 0) {
+		pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n",
+			cx->address, edx_part);
+		return -1;
+	}
+
+	/* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
+		return -1;
+
+	/* Log the use of MWAIT once per C-state type. */
+	if (!mwait_supported[cstate_type]) {
+		mwait_supported[cstate_type] = 1;
+		printk(KERN_DEBUG
+		       "Monitor-Mwait will be used to enter C-%d state\n",
+		       cx->type);
+	}
+	snprintf(cx->desc,
+		 ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x",
+		 cx->address);
+
+	return 0;
+}
+
+/*
+ * Probe whether C-state @cx of CPU @cpu can be entered through MWAIT and, if
+ * so, record the MWAIT arguments in that CPU's cstate_entry slot.
+ *
+ * Returns 0 on success and -1 when the per-CPU table is missing, the CPU's
+ * cpuid_level is below CPUID_MWAIT_LEAF, @reg does not describe a native
+ * C-state beyond halt, or the per-CPU check fails.
+ */
+int acpi_processor_ffh_cstate_probe(unsigned int cpu,
+		struct acpi_processor_cx *cx, struct acpi_power_register *reg)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct cstate_entry *entry;
+	long ret;
+
+	if (!cpu_cstate_entry)
+		return -1;
+	if (c->cpuid_level < CPUID_MWAIT_LEAF)
+		return -1;
+	if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
+		return -1;
+
+	/* Start from a cleared slot; it is only filled in on success. */
+	entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+	entry->states[cx->index].eax = 0;
+	entry->states[cx->index].ecx = 0;
+
+	/* The CPUID check has to run on the CPU being probed. */
+	ret = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
+	if (!ret) {
+		/* Use the hint in CST */
+		entry->states[cx->index].eax = cx->address;
+		entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
+	}
+
+	/*
+	 * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+	 * then we should skip checking BM_STS for this C-state.
+	 * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+	 */
+	if (c->x86_vendor == X86_VENDOR_INTEL && (reg->access_size & 0x2) == 0)
+		cx->bm_sts_skip = 1;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
+
+/*
+ * Enter C-state @cx on the current CPU using the MWAIT arguments stored for
+ * cx->index in this CPU's cstate_entry.
+ */
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+	unsigned int this_cpu = smp_processor_id();
+	struct cstate_entry *entry = per_cpu_ptr(cpu_cstate_entry, this_cpu);
+
+	mwait_idle_with_hints(entry->states[cx->index].eax,
+			      entry->states[cx->index].ecx);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
+
+/*
+ * Allocate the per-CPU cstate_entry table on Intel and AMD boot CPUs.
+ * Returns -1 for any other vendor, 0 otherwise; the allocation result is not
+ * checked here.
+ */
+static int __init ffh_cstate_init(void)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+	case X86_VENDOR_AMD:
+		break;
+	default:
+		return -1;
+	}
+
+	cpu_cstate_entry = alloc_percpu(struct cstate_entry);
+	return 0;
+}
+
+/*
+ * Free the per-CPU cstate_entry table and clear the pointer, so that
+ * acpi_processor_ffh_cstate_probe() rejects further requests.
+ */
+static void __exit ffh_cstate_exit(void)
+{
+ free_percpu(cpu_cstate_entry);
+ cpu_cstate_entry = NULL;
+}
+
+arch_initcall(ffh_cstate_init);
+__exitcall(ffh_cstate_exit);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644
index 0000000..f1915b7
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ * Copyright (C) 2001-2003 Patrick Mochel
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/realmode.h>
+
+#include <linux/ftrace.h>
+#include "../../realmode/rm/wakeup.h"
+#include "sleep.h"
+
+unsigned long acpi_realmode_flags;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+static char temp_stack[4096];
+#endif
+
+/**
+ * x86_acpi_enter_sleep_state - enter sleep state
+ * @state: Sleep state to enter.
+ *
+ * Wrapper around acpi_enter_sleep_state() to be called by assembly.
+ *
+ * Return: the status returned by acpi_enter_sleep_state().
+ */
+acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state)
+{
+ return acpi_enter_sleep_state(state);
+}
+
+/**
+ * x86_acpi_suspend_lowlevel - save kernel state
+ *
+ * Fill in the real-mode wakeup header (video mode, control registers, MSR
+ * values and the matching "restore" behavior flags, magic numbers), set up
+ * the variables used on the wakeup path, and call do_suspend_lowlevel() with
+ * function-graph tracing paused.
+ *
+ * Return: -EINVAL if the wakeup header signature does not match,
+ * 0 otherwise.
+ */
+int x86_acpi_suspend_lowlevel(void)
+{
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
+
+ /* Refuse to continue with a header that is not the expected one. */
+ if (header->signature != WAKEUP_HEADER_SIGNATURE) {
+ printk(KERN_ERR "wakeup header does not match\n");
+ return -EINVAL;
+ }
+
+ header->video_mode = saved_video_mode;
+
+ /* Behavior bits are accumulated below, one per value to restore. */
+ header->pmode_behavior = 0;
+
+#ifndef CONFIG_64BIT
+ native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
+
+ /*
+ * We have to check that we can write back the value, and not
+ * just read it. At least on 90 nm Pentium M (Family 6, Model
+ * 13), reading an invalid MSR is not guaranteed to trap, see
+ * Erratum X4 in "Intel Pentium M Processor on 90 nm Process
+ * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
+ * nm process with 512-KB L2 Cache Specification Update".
+ */
+ if (!rdmsr_safe(MSR_EFER,
+ &header->pmode_efer_low,
+ &header->pmode_efer_high) &&
+ !wrmsr_safe(MSR_EFER,
+ header->pmode_efer_low,
+ header->pmode_efer_high))
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
+#endif /* !CONFIG_64BIT */
+
+ header->pmode_cr0 = read_cr0();
+ /* CR4 is only recorded when this CPU's cpuid_level is not negative. */
+ if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
+ header->pmode_cr4 = __read_cr4();
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
+ }
+ /* Same read-then-write-back probe as for EFER above. */
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high) &&
+ !wrmsr_safe(MSR_IA32_MISC_ENABLE,
+ header->pmode_misc_en_low,
+ header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
+ header->realmode_flags = acpi_realmode_flags;
+ header->real_magic = 0x12345678;
+
+#ifndef CONFIG_64BIT
+ /* 32-bit: resume enters wakeup_pmode_return on initial_page_table. */
+ header->pmode_entry = (u32)&wakeup_pmode_return;
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
+ saved_magic = 0x12345678;
+#else /* CONFIG_64BIT */
+#ifdef CONFIG_SMP
+ /* Point the startup variables at a temporary stack and this CPU's data. */
+ initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
+ early_gdt_descr.address =
+ (unsigned long)get_cpu_gdt_rw(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
+#endif
+ initial_code = (unsigned long)wakeup_long64;
+ saved_magic = 0x123456789abcdef0L;
+#endif /* CONFIG_64BIT */
+
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
+ do_suspend_lowlevel();
+ unpause_graph_tracing();
+ return 0;
+}
+
+/*
+ * Handler for the "acpi_sleep=" boot parameter: a list of keywords separated
+ * by commas (blanks and tabs after a comma are skipped). Keywords are
+ * recognised by prefix; unknown ones are ignored.
+ *
+ * Always returns 1.
+ */
+static int __init acpi_sleep_setup(char *str)
+{
+	while ((str != NULL) && (*str != '\0')) {
+		if (strncmp(str, "s3_bios", 7) == 0)
+			acpi_realmode_flags |= 1;
+		if (strncmp(str, "s3_mode", 7) == 0)
+			acpi_realmode_flags |= 2;
+		if (strncmp(str, "s3_beep", 7) == 0)
+			acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+		if (strncmp(str, "s4_nohwsig", 10) == 0)
+			acpi_no_s4_hw_signature();
+#endif
+		/*
+		 * "nonvs" is a prefix of "nonvs_s3", so the longer keyword
+		 * has to be tested first; otherwise "nonvs_s3" would also
+		 * trigger the plain "nonvs" action.
+		 */
+		if (strncmp(str, "nonvs_s3", 8) == 0)
+			acpi_nvs_nosave_s3();
+		else if (strncmp(str, "nonvs", 5) == 0)
+			acpi_nvs_nosave();
+		if (strncmp(str, "old_ordering", 12) == 0)
+			acpi_old_suspend_ordering();
+		if (strncmp(str, "nobl", 4) == 0)
+			acpi_sleep_no_blacklist();
+
+		/* Advance to the keyword after the next comma, if any. */
+		str = strchr(str, ',');
+		if (str != NULL)
+			str += strspn(str, ", \t");
+	}
+	return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
new file mode 100644
index 0000000..fbb60ca
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Variables and functions used by the code in sleep.c
+ */
+
+#include <asm/realmode.h>
+
+extern unsigned long saved_video_mode;
+/*
+ * Magic cookie: the suspend path stores a known constant here and the wakeup
+ * code in wakeup_32.S/wakeup_64.S only continues the resume if it reads the
+ * same constant back.
+ */
+extern long saved_magic;
+
+/*
+ * Label of the 32-bit protected-mode resume entry point in wakeup_32.S;
+ * C code only takes its address.
+ */
+extern int wakeup_pmode_return;
+
+extern u8 wake_sleep_flags;
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+/* 64-bit resume entry point, implemented in wakeup_64.S */
+extern void wakeup_long64(void);
+
+/*
+ * Assembly routine (wakeup_32.S/wakeup_64.S) that saves the CPU state and
+ * calls x86_acpi_enter_sleep_state(3).
+ */
+extern void do_suspend_lowlevel(void);
+
+extern int x86_acpi_suspend_lowlevel(void);
+
+/* Called from do_suspend_lowlevel with the target sleep state as argument */
+acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
new file mode 100644
index 0000000..0c26b1b
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -0,0 +1,99 @@
+ .text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+
+# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+
+ .code32
+ ALIGN
+
+/*
+ * wakeup_pmode_return - 32-bit protected-mode resume entry point.
+ *
+ * Loads the kernel/user data selectors, reloads the IDT and LDT from the
+ * values stored by save_registers, reloads %cs with a far jump and rewrites
+ * %cr3 with its current value.  It then restores the saved stack pointer,
+ * compares saved_magic with 0x12345678 and jumps to the address held in
+ * saved_eip.  If the magic value does not match, execution spins forever in
+ * bogus_magic.
+ */
+ENTRY(wakeup_pmode_return)
+wakeup_pmode_return:
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %fs
+ movw %ax, %gs
+
+ movw $__USER_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
+ # reload the idt and ldt (no lgdt is issued here), then reload %cs
+ # with a far jump
+ lidt saved_idt
+ lldt saved_ldt
+ ljmp $(__KERNEL_CS), $1f
+1:
+ /* writing %cr3 back to itself flushes the non-global TLB entries */
+ movl %cr3, %eax
+ movl %eax, %cr3
+ wbinvd
+
+ # and restore the stack ... but you need gdt for this to work
+ movl saved_context_esp, %esp
+
+ /* refuse to continue unless the suspend path left its magic cookie */
+ movl %cs:saved_magic, %eax
+ cmpl $0x12345678, %eax
+ jne bogus_magic
+
+ # jump to place where we left off
+ movl saved_eip, %eax
+ jmp *%eax
+
+/* Magic mismatch: hang here rather than resume with untrusted state */
+bogus_magic:
+ jmp bogus_magic
+
+
+
+/*
+ * save_registers - record the state needed to resume after suspend.
+ *
+ * Stores the IDT, LDT and task register, the callee-saved registers
+ * (%ebx, %ebp, %esi, %edi), EFLAGS and the stack pointer, and sets
+ * saved_eip to ret_point so that wakeup_pmode_return resumes there.
+ */
+save_registers:
+ sidt saved_idt
+ sldt saved_ldt
+ str saved_tss
+
+ /* 4(%esp) skips our return address: the caller's %esp after we return */
+ leal 4(%esp), %eax
+ movl %eax, saved_context_esp
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+ /* EFLAGS cannot be stored directly; go through the stack */
+ pushfl
+ popl saved_context_eflags
+
+ movl $ret_point, saved_eip
+ ret
+
+
+/*
+ * restore_registers - reload %ebp, %ebx, %esi, %edi and EFLAGS from the
+ * values stored by save_registers.  The stack pointer is not touched here.
+ */
+restore_registers:
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+ /* EFLAGS is reloaded through the stack */
+ pushl saved_context_eflags
+ popfl
+ ret
+
+/*
+ * do_suspend_lowlevel - save the CPU state and enter ACPI sleep state 3.
+ *
+ * After a successful suspend, execution continues at ret_point via
+ * wakeup_pmode_return (saved_eip).  If x86_acpi_enter_sleep_state returns
+ * instead, the same ret_point code is run to restore the state.
+ */
+ENTRY(do_suspend_lowlevel)
+ call save_processor_state
+ call save_registers
+ /* x86_acpi_enter_sleep_state(3): argument passed on the stack */
+ pushl $3
+ call x86_acpi_enter_sleep_state
+ addl $4, %esp
+
+# In case of S3 failure, we'll emerge here. Jump
+# to ret_point to recover
+ jmp ret_point
+ .p2align 4,,7
+ret_point:
+ call restore_registers
+ call restore_processor_state
+ ret
+
+.data
+ALIGN
+/* cookie compared against 0x12345678 by wakeup_pmode_return */
+ENTRY(saved_magic) .long 0
+/* address wakeup_pmode_return jumps to; set by save_registers */
+ENTRY(saved_eip) .long 0
+
+# saved registers
+/* storage for sidt/sldt/str in save_registers */
+saved_idt: .long 0,0
+saved_ldt: .long 0
+saved_tss: .long 0
+
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
new file mode 100644
index 0000000..50b8ed0
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -0,0 +1,136 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+
+# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+
+.code64
+ /*
+ * Hooray, we are in Long 64-bit mode (but still running in low memory)
+ */
+/*
+ * wakeup_long64 - 64-bit resume entry point.
+ *
+ * Verifies saved_magic, loads __KERNEL_DS into every data segment register,
+ * restores %rsp, %rbx, %rdi, %rsi and %rbp from the saved_* variables
+ * written by do_suspend_lowlevel and jumps to the address in saved_rip.
+ * A magic mismatch ends in an endless loop at bogus_64_magic.
+ */
+ENTRY(wakeup_long64)
+ /* refuse to continue unless the suspend path left its magic cookie */
+ movq saved_magic, %rax
+ movq $0x123456789abcdef0, %rdx
+ cmpq %rdx, %rax
+ jne bogus_64_magic
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movq saved_rsp, %rsp
+
+ movq saved_rbx, %rbx
+ movq saved_rdi, %rdi
+ movq saved_rsi, %rsi
+ movq saved_rbp, %rbp
+
+ /* continue where do_suspend_lowlevel asked us to (.Lresume_point) */
+ movq saved_rip, %rax
+ jmp *%rax
+ENDPROC(wakeup_long64)
+
+/* Magic mismatch: hang here rather than resume with untrusted state */
+bogus_64_magic:
+ jmp bogus_64_magic
+
+/*
+ * do_suspend_lowlevel - save the CPU state and enter ACPI sleep state 3.
+ *
+ * The general purpose registers and RFLAGS are stored in saved_context, and
+ * the registers wakeup_long64 needs are additionally stored in the saved_*
+ * variables together with the resume address .Lresume_point.  Execution
+ * reaches .Lresume_point either through wakeup_long64 after a real resume or
+ * directly if x86_acpi_enter_sleep_state returns.  There the control
+ * registers, RFLAGS and general purpose registers are reloaded from
+ * saved_context before tail-jumping to restore_processor_state.
+ */
+ENTRY(do_suspend_lowlevel)
+ FRAME_BEGIN
+ /* NOTE(review): the 8-byte adjustment presumably keeps the stack aligned for the call - confirm */
+ subq $8, %rsp
+ xorl %eax, %eax
+ call save_processor_state
+
+ /* snapshot the registers into saved_context (offsets from asm-offsets.h) */
+ movq $saved_context, %rax
+ movq %rsp, pt_regs_sp(%rax)
+ movq %rbp, pt_regs_bp(%rax)
+ movq %rsi, pt_regs_si(%rax)
+ movq %rdi, pt_regs_di(%rax)
+ movq %rbx, pt_regs_bx(%rax)
+ movq %rcx, pt_regs_cx(%rax)
+ movq %rdx, pt_regs_dx(%rax)
+ movq %r8, pt_regs_r8(%rax)
+ movq %r9, pt_regs_r9(%rax)
+ movq %r10, pt_regs_r10(%rax)
+ movq %r11, pt_regs_r11(%rax)
+ movq %r12, pt_regs_r12(%rax)
+ movq %r13, pt_regs_r13(%rax)
+ movq %r14, pt_regs_r14(%rax)
+ movq %r15, pt_regs_r15(%rax)
+ /* RFLAGS goes through the stack */
+ pushfq
+ popq pt_regs_flags(%rax)
+
+ /* where wakeup_long64 will jump to */
+ movq $.Lresume_point, saved_rip(%rip)
+
+ /* the subset of registers that wakeup_long64 reloads itself */
+ movq %rsp, saved_rsp
+ movq %rbp, saved_rbp
+ movq %rbx, saved_rbx
+ movq %rdi, saved_rdi
+ movq %rsi, saved_rsi
+
+ /* undo the subq above, then call x86_acpi_enter_sleep_state(3) */
+ addq $8, %rsp
+ movl $3, %edi
+ xorl %eax, %eax
+ call x86_acpi_enter_sleep_state
+ /* in case something went wrong, restore the machine status and go on */
+ jmp .Lresume_point
+
+ .align 4
+.Lresume_point:
+ /* We don't restore %rax, it must be 0 anyway */
+ movq $saved_context, %rax
+ /* control registers first, using %rbx as scratch (reloaded below) */
+ movq saved_context_cr4(%rax), %rbx
+ movq %rbx, %cr4
+ movq saved_context_cr3(%rax), %rbx
+ movq %rbx, %cr3
+ movq saved_context_cr2(%rax), %rbx
+ movq %rbx, %cr2
+ movq saved_context_cr0(%rax), %rbx
+ movq %rbx, %cr0
+ pushq pt_regs_flags(%rax)
+ popfq
+ /* %rsp comes back as it was saved, i.e. still including the subq $8 */
+ movq pt_regs_sp(%rax), %rsp
+ movq pt_regs_bp(%rax), %rbp
+ movq pt_regs_si(%rax), %rsi
+ movq pt_regs_di(%rax), %rdi
+ movq pt_regs_bx(%rax), %rbx
+ movq pt_regs_cx(%rax), %rcx
+ movq pt_regs_dx(%rax), %rdx
+ movq pt_regs_r8(%rax), %r8
+ movq pt_regs_r9(%rax), %r9
+ movq pt_regs_r10(%rax), %r10
+ movq pt_regs_r11(%rax), %r11
+ movq pt_regs_r12(%rax), %r12
+ movq pt_regs_r13(%rax), %r13
+ movq pt_regs_r14(%rax), %r14
+ movq pt_regs_r15(%rax), %r15
+
+#ifdef CONFIG_KASAN
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
+ xorl %eax, %eax
+ /* drop the 8 bytes reserved at entry, then tail-call the C restore code */
+ addq $8, %rsp
+ FRAME_END
+ jmp restore_processor_state
+ENDPROC(do_suspend_lowlevel)
+
+.data
+/* registers written by do_suspend_lowlevel and reloaded by wakeup_long64 */
+ENTRY(saved_rbp) .quad 0
+ENTRY(saved_rsi) .quad 0
+ENTRY(saved_rdi) .quad 0
+ENTRY(saved_rbx) .quad 0
+
+/* resume address and stack pointer used by wakeup_long64 */
+ENTRY(saved_rip) .quad 0
+ENTRY(saved_rsp) .quad 0
+
+/* cookie compared against 0x123456789abcdef0 by wakeup_long64 */
+ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 0000000..b9d5e7c
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,831 @@
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/stringify.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/memory.h>
+#include <linux/stop_machine.h>
+#include <linux/slab.h>
+#include <linux/kdebug.h>
+#include <asm/text-patching.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+
+/* Set to 1 at the end of alternative_instructions(), once boot-time patching is done */
+int __read_mostly alternatives_patched;
+
+EXPORT_SYMBOL_GPL(alternatives_patched);
+
+/* Size of the on-stack buffers used to assemble a patched instruction */
+#define MAX_PATCH_LEN (255-1)
+
+/* Non-zero enables the DPRINTK()/DUMP_BYTES() debug output in this file */
+static int __initdata_or_module debug_alternative;
+
+/* "debug-alternative" command line option: turn on patching debug output */
+static int __init debug_alt(char *str)
+{
+ debug_alternative = 1;
+ return 1;
+}
+__setup("debug-alternative", debug_alt);
+
+/* Non-zero prevents alternative_instructions() from patching to UP code */
+static int noreplace_smp;
+
+/* "noreplace-smp" command line option: keep the SMP lock prefixes in place */
+static int __init setup_noreplace_smp(char *str)
+{
+ noreplace_smp = 1;
+ return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+/*
+ * Debug print helper: emits a KERN_DEBUG line prefixed with the name of the
+ * calling function, but only when debug_alternative is set.  A newline is
+ * appended to @fmt, so callers should not supply one.
+ */
+#define DPRINTK(fmt, args...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+/*
+ * Hex-dump @len bytes of @buf on a single KERN_DEBUG line, preceded by the
+ * formatted prefix @fmt.  Does nothing when debug_alternative is clear or
+ * when @len is zero.  Note that @buf and @len are evaluated more than once.
+ */
+#define DUMP_BYTES(buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG fmt, ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
+} while (0)
+
+/*
+ * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
+ * that correspond to that nop. Getting from one nop to the next, we
+ * add to the array the offset that is equal to the sum of all sizes of
+ * nops preceding the one we are after.
+ *
+ * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
+ * nice symmetry of sizes of the previous nops.
+ *
+ * Every *_nops[] lookup table below is indexed by NOP length in bytes:
+ * entry 0 is NULL, entries 1..8 point at the NOP of that length and the
+ * last entry points at the 5-byte atomic NOP.
+ */
+/* Generic NOPs; only built for 32-bit kernels */
+#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
+static const unsigned char intelnops[] =
+{
+ GENERIC_NOP1,
+ GENERIC_NOP2,
+ GENERIC_NOP3,
+ GENERIC_NOP4,
+ GENERIC_NOP5,
+ GENERIC_NOP6,
+ GENERIC_NOP7,
+ GENERIC_NOP8,
+ GENERIC_NOP5_ATOMIC
+};
+static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ intelnops,
+ intelnops + 1,
+ intelnops + 1 + 2,
+ intelnops + 1 + 2 + 3,
+ intelnops + 1 + 2 + 3 + 4,
+ intelnops + 1 + 2 + 3 + 4 + 5,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+/* K8 NOPs */
+#ifdef K8_NOP1
+static const unsigned char k8nops[] =
+{
+ K8_NOP1,
+ K8_NOP2,
+ K8_NOP3,
+ K8_NOP4,
+ K8_NOP5,
+ K8_NOP6,
+ K8_NOP7,
+ K8_NOP8,
+ K8_NOP5_ATOMIC
+};
+static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ k8nops,
+ k8nops + 1,
+ k8nops + 1 + 2,
+ k8nops + 1 + 2 + 3,
+ k8nops + 1 + 2 + 3 + 4,
+ k8nops + 1 + 2 + 3 + 4 + 5,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+/* K7 NOPs; only built for 32-bit kernels */
+#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
+static const unsigned char k7nops[] =
+{
+ K7_NOP1,
+ K7_NOP2,
+ K7_NOP3,
+ K7_NOP4,
+ K7_NOP5,
+ K7_NOP6,
+ K7_NOP7,
+ K7_NOP8,
+ K7_NOP5_ATOMIC
+};
+static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ k7nops,
+ k7nops + 1,
+ k7nops + 1 + 2,
+ k7nops + 1 + 2 + 3,
+ k7nops + 1 + 2 + 3 + 4,
+ k7nops + 1 + 2 + 3 + 4 + 5,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+/* P6 NOPs */
+#ifdef P6_NOP1
+static const unsigned char p6nops[] =
+{
+ P6_NOP1,
+ P6_NOP2,
+ P6_NOP3,
+ P6_NOP4,
+ P6_NOP5,
+ P6_NOP6,
+ P6_NOP7,
+ P6_NOP8,
+ P6_NOP5_ATOMIC
+};
+static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ p6nops,
+ p6nops + 1,
+ p6nops + 1 + 2,
+ p6nops + 1 + 2 + 3,
+ p6nops + 1 + 2 + 3 + 4,
+ p6nops + 1 + 2 + 3 + 4 + 5,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+/* Initialize these to a safe default */
+#ifdef CONFIG_X86_64
+const unsigned char * const *ideal_nops = p6_nops;
+#else
+const unsigned char * const *ideal_nops = intel_nops;
+#endif
+
+/*
+ * Select the NOP table best suited to the boot CPU and publish it through
+ * ideal_nops.  The choice depends on the vendor, family and model recorded
+ * in boot_cpu_data and on a few CPU feature flags.
+ */
+void __init arch_init_ideal_nops(void)
+{
+	const unsigned int vendor = boot_cpu_data.x86_vendor;
+	const unsigned int family = boot_cpu_data.x86;
+	const unsigned int model = boot_cpu_data.x86_model;
+
+	if (vendor == X86_VENDOR_INTEL) {
+		/*
+		 * Due to a decoder implementation quirk, some
+		 * specific Intel CPUs actually perform better with
+		 * the "k8_nops" than with the SDM-recommended NOPs.
+		 */
+		const bool k8_preferred = family == 6 &&
+					  model >= 0x0f && model < 0x30 &&
+					  model != 0x1c &&
+					  model != 0x26 &&
+					  model != 0x27;
+
+		if (k8_preferred) {
+			ideal_nops = k8_nops;
+		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+			ideal_nops = p6_nops;
+		} else {
+#ifdef CONFIG_X86_64
+			ideal_nops = k8_nops;
+#else
+			ideal_nops = intel_nops;
+#endif
+		}
+		return;
+	}
+
+	/* AMD families above 0xf use the P6 NOPs */
+	if (vendor == X86_VENDOR_AMD && family > 0xf) {
+		ideal_nops = p6_nops;
+		return;
+	}
+
+	/* Remaining AMD parts and every other vendor */
+#ifdef CONFIG_X86_64
+	ideal_nops = k8_nops;
+#else
+	if (boot_cpu_has(X86_FEATURE_K8))
+		ideal_nops = k8_nops;
+	else if (boot_cpu_has(X86_FEATURE_K7))
+		ideal_nops = k7_nops;
+	else
+		ideal_nops = intel_nops;
+#endif
+}
+
+/*
+ * Fill @len bytes at @insns with NOPs taken from ideal_nops, using chunks of
+ * at most ASM_NOP_MAX bytes.  Intended for filling a buffer that is then
+ * text_poke'd as a whole.
+ */
+static void __init_or_module add_nops(void *insns, unsigned int len)
+{
+	u8 *dst = insns;
+	unsigned int remaining;
+
+	for (remaining = len; remaining > 0; ) {
+		unsigned int chunk;
+
+		chunk = remaining > ASM_NOP_MAX ? ASM_NOP_MAX : remaining;
+		memcpy(dst, ideal_nops[chunk], chunk);
+		dst += chunk;
+		remaining -= chunk;
+	}
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
+void *text_poke_early(void *addr, const void *opcode, size_t len);
+
+/*
+ * Tell whether @opcode is the first byte of a JMP with an 8-bit (0xeb) or a
+ * 32-bit (0xe9) relative displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+	switch (opcode) {
+	case 0xeb:	/* JMP rel8 */
+	case 0xe9:	/* JMP rel32 */
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * recompute_jump - re-encode a 5-byte replacement JMP for its new location.
+ * @a: alternative entry being applied
+ * @orig_insn: address of the original instruction, where the JMP will live
+ * @repl_insn: address at which the replacement instruction is stored
+ * @insnbuf: buffer holding a copy of the replacement; rewritten in place
+ *
+ * Does nothing unless the replacement is exactly 5 bytes long.  Otherwise the
+ * 32-bit displacement found in @insnbuf is turned into an absolute target
+ * and the jump is re-encoded relative to @orig_insn, either as a 2-byte JMP
+ * (0xeb) followed by 3 bytes of NOPs or as a 5-byte JMP (0xe9).
+ */
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+ u8 *next_rip, *tgt_rip;
+ s32 n_dspl, o_dspl;
+ int repl_len;
+
+ if (a->replacementlen != 5)
+ return;
+
+ /* displacement as encoded at the replacement's storage location */
+ o_dspl = *(s32 *)(insnbuf + 1);
+
+ /* next_rip of the replacement JMP */
+ next_rip = repl_insn + a->replacementlen;
+ /* target rip of the replacement JMP */
+ tgt_rip = next_rip + o_dspl;
+ /* distance from the start of the original instruction to the target */
+ n_dspl = tgt_rip - orig_insn;
+
+ DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+
+ if (tgt_rip - orig_insn >= 0) {
+ /* forward jump: short form if it fits once the 2-byte length is subtracted */
+ if (n_dspl - 2 <= 127)
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ /* negative offset */
+ } else {
+ /*
+ * NOTE(review): n_dspl is negative on this path while the masked
+ * value lies in 0..255, so this test looks like it can never
+ * succeed and backward jumps always get the 5-byte form - confirm.
+ */
+ if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ }
+
+two_byte_jmp:
+ /* displacement is relative to the end of the 2-byte instruction */
+ n_dspl -= 2;
+
+ insnbuf[0] = 0xeb;
+ insnbuf[1] = (s8)n_dspl;
+ /* pad the remaining 3 bytes of the 5-byte slot */
+ add_nops(insnbuf + 2, 3);
+
+ repl_len = 2;
+ goto done;
+
+five_byte_jmp:
+ /* displacement is relative to the end of the 5-byte instruction */
+ n_dspl -= 5;
+
+ insnbuf[0] = 0xe9;
+ *(s32 *)&insnbuf[1] = n_dspl;
+
+ repl_len = 5;
+
+done:
+
+ DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+ n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+/*
+ * optimize_nops - replace single-byte NOP padding with the ideal NOPs.
+ * @a: alternative entry describing the patch site
+ * @instr: address of the original instruction
+ *
+ * The padding occupies the last @a->padlen bytes of the @a->instrlen bytes
+ * at @instr.  If every padding byte is a one-byte NOP (0x90), the padding is
+ * rewritten using add_nops(); otherwise the site is left untouched.
+ *
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+{
+	/* The padding sits at the end of the instruction, not at its start */
+	u8 *pad = instr + (a->instrlen - a->padlen);
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < a->padlen; i++) {
+		if (pad[i] != 0x90)
+			return;
+	}
+
+	local_irq_save(flags);
+	add_nops(pad, a->padlen);
+	local_irq_restore(flags);
+
+	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
+		   instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ *
+ * @start and @end delimit the table of alt_instr entries to process; @end is
+ * exclusive.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
+ */
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ struct alt_instr *a;
+ u8 *instr, *replacement;
+ u8 insnbuf[MAX_PATCH_LEN];
+
+ DPRINTK("alt table %px, -> %px", start, end);
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite previously scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
+ for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
+ /* both offsets are relative to the address of the field holding them */
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
+ /* feature absent: keep the original code, only tidy up its padding */
+ if (!boot_cpu_has(a->cpuid)) {
+ if (a->padlen > 1)
+ optimize_nops(a, instr);
+
+ continue;
+ }
+
+ DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, a->instrlen,
+ replacement, a->replacementlen, a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
+
+ /*
+ * 0xe8 is a relative CALL; fix the offset.
+ *
+ * Instruction length is checked before the opcode to avoid
+ * accessing uninitialized bytes for zero-length replacements.
+ */
+ if (a->replacementlen == 5 && *insnbuf == 0xe8) {
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+ DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+ *(s32 *)(insnbuf + 1),
+ (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ }
+
+ /* relative JMPs need their displacement re-encoded as well */
+ if (a->replacementlen && is_jmp(replacement[0]))
+ recompute_jump(a, instr, replacement, insnbuf);
+
+ /* pad a shorter replacement with NOPs up to the original length */
+ if (a->instrlen > a->replacementlen) {
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+ DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
+
+ text_poke_early(instr, insnbuf, insnbuf_sz);
+ }
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Walk the self-relative offsets in [@start, @end) and, for every referenced
+ * byte that lies inside [@text, @text_end) and currently holds a DS segment
+ * override prefix (0x3e), patch it into a lock prefix (0xf0).  Zero offsets
+ * are skipped.
+ */
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
+{
+	const unsigned char lock_prefix[] = { 0xf0 };
+	const s32 *entry;
+
+	for (entry = start; entry < end; entry++) {
+		u8 *insn = (u8 *)entry + *entry;
+		bool in_text = insn >= text && insn < text_end;
+
+		/* turn DS segment override prefix into lock prefix */
+		if (*entry && in_text && *insn == 0x3e)
+			text_poke(insn, lock_prefix, 1);
+	}
+}
+
+/*
+ * Walk the self-relative offsets in [@start, @end) and, for every referenced
+ * byte that lies inside [@text, @text_end) and currently holds a lock prefix
+ * (0xf0), patch it into a DS segment override prefix (0x3e).  Zero offsets
+ * are skipped.
+ */
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
+{
+	const unsigned char ds_prefix[] = { 0x3E };
+	const s32 *entry;
+
+	for (entry = start; entry < end; entry++) {
+		u8 *insn = (u8 *)entry + *entry;
+		bool in_text = insn >= text && insn < text_end;
+
+		/* turn lock prefix into DS segment override prefix */
+		if (*entry && in_text && *insn == 0xf0)
+			text_poke(insn, ds_prefix, 1);
+	}
+}
+
+/*
+ * Book-keeping for one image (the core kernel or a module) whose lock
+ * prefixes were patched out and may have to be patched back in.
+ */
+struct smp_alt_module {
+ /* owning module (NULL for the core kernel); lookup key on removal */
+ struct module *mod;
+ /* name used in debug output only */
+ char *name;
+
+ /* ptrs to lock prefixes */
+ const s32 *locks;
+ const s32 *locks_end;
+
+ /* .text segment, needed to avoid patching init code ;) */
+ u8 *text;
+ u8 *text_end;
+
+ /* link in smp_alt_modules */
+ struct list_head next;
+};
+/* All registered smp_alt_module records; modified under text_mutex */
+static LIST_HEAD(smp_alt_modules);
+static bool uniproc_patched = false; /* protected by text_mutex */
+
+/*
+ * alternatives_smp_module_add - patch an image to UP code and remember it.
+ * @mod: owning module, or NULL for the core kernel
+ * @name: name used in debug output
+ * @locks: start of the table of self-relative lock prefix offsets
+ * @locks_end: end of that table (exclusive)
+ * @text: start of the text range that may be patched
+ * @text_end: end of that text range (exclusive)
+ *
+ * Does nothing unless the kernel is currently patched for uniprocessor
+ * operation.  Otherwise the lock prefixes of the image are replaced by DS
+ * segment override prefixes.  When more than one CPU is possible, the image
+ * is also recorded in smp_alt_modules so that alternatives_enable_smp() can
+ * restore the lock prefixes later; if that record cannot be allocated, the
+ * image is left running the SMP code.
+ */
+void __init_or_module alternatives_smp_module_add(struct module *mod,
+						  char *name,
+						  void *locks, void *locks_end,
+						  void *text, void *text_end)
+{
+	struct smp_alt_module *smp;
+
+	mutex_lock(&text_mutex);
+	if (!uniproc_patched)
+		goto unlock;
+
+	if (num_possible_cpus() == 1)
+		/* Don't bother remembering, we'll never have to undo it. */
+		goto smp_unlock;
+
+	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+	if (!smp)
+		/* we'll run the (safe but slow) SMP code then ... */
+		goto unlock;
+
+	smp->mod = mod;
+	smp->name = name;
+	smp->locks = locks;
+	smp->locks_end = locks_end;
+	smp->text = text;
+	smp->text_end = text_end;
+	/* DPRINTK() appends the newline itself */
+	DPRINTK("locks %p -> %p, text %p -> %p, name %s",
+		smp->locks, smp->locks_end,
+		smp->text, smp->text_end, smp->name);
+
+	list_add_tail(&smp->next, &smp_alt_modules);
+smp_unlock:
+	alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
+	mutex_unlock(&text_mutex);
+}
+
+/*
+ * Drop the smp_alt_modules record that belongs to @mod, if there is one.
+ * Only the first matching record is removed and freed.
+ */
+void __init_or_module alternatives_smp_module_del(struct module *mod)
+{
+	struct smp_alt_module *entry;
+	struct smp_alt_module *found = NULL;
+
+	mutex_lock(&text_mutex);
+	list_for_each_entry(entry, &smp_alt_modules, next) {
+		if (entry->mod == mod) {
+			found = entry;
+			break;
+		}
+	}
+	if (found) {
+		list_del(&found->next);
+		kfree(found);
+	}
+	mutex_unlock(&text_mutex);
+}
+
+/*
+ * Switch from the uniprocessor-patched code back to SMP code: clear
+ * X86_FEATURE_UP and restore the lock prefixes of every recorded image.
+ * Nothing is patched if the kernel is not currently UP-patched.
+ */
+void alternatives_enable_smp(void)
+{
+	struct smp_alt_module *entry;
+
+	/* Why bother if there are no other CPUs? */
+	BUG_ON(num_possible_cpus() == 1);
+
+	mutex_lock(&text_mutex);
+	if (!uniproc_patched)
+		goto out;
+
+	pr_info("switching to SMP code\n");
+	BUG_ON(num_online_cpus() != 1);
+	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+	clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
+	list_for_each_entry(entry, &smp_alt_modules, next)
+		alternatives_smp_lock(entry->locks, entry->locks_end,
+				      entry->text, entry->text_end);
+	uniproc_patched = false;
+out:
+	mutex_unlock(&text_mutex);
+}
+
+/*
+ * Return 1 if a lock prefix location of any recorded image falls inside
+ * [@start, @end), i.e. the range is reserved for SMP-alternatives;
+ * return 0 otherwise.
+ * Must hold text_mutex.
+ */
+int alternatives_text_reserved(void *start, void *end)
+{
+	struct smp_alt_module *entry;
+	u8 *range_start = start;
+	u8 *range_end = end;
+
+	lockdep_assert_held(&text_mutex);
+
+	list_for_each_entry(entry, &smp_alt_modules, next) {
+		const s32 *off;
+
+		/* Only look at images whose text overlaps the range */
+		if (entry->text <= range_end && entry->text_end >= range_start) {
+			for (off = entry->locks; off < entry->locks_end; off++) {
+				const u8 *site = (const u8 *)off + *off;
+
+				if (site >= range_start && site < range_end)
+					return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PARAVIRT
+/*
+ * Run the paravirt patcher over every patch site in [@start, @end).  For each
+ * site the original bytes are copied into a scratch buffer, handed to
+ * pv_init_ops.patch(), padded with NOPs up to the site length and written
+ * back with text_poke_early().
+ */
+void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
+				     struct paravirt_patch_site *end)
+{
+	struct paravirt_patch_site *site;
+	char insnbuf[MAX_PATCH_LEN];
+
+	for (site = start; site < end; site++) {
+		unsigned int patched;
+
+		BUG_ON(site->len > MAX_PATCH_LEN);
+
+		/* Start from the bytes that are currently at the site */
+		memcpy(insnbuf, site->instr, site->len);
+		patched = pv_init_ops.patch(site->instrtype, site->clobbers,
+					    insnbuf,
+					    (unsigned long)site->instr,
+					    site->len);
+
+		BUG_ON(patched > site->len);
+
+		/* Whatever the patcher did not use becomes NOPs */
+		add_nops(insnbuf + patched, site->len - patched);
+		text_poke_early(site->instr, insnbuf, site->len);
+	}
+}
+extern struct paravirt_patch_site __start_parainstructions[],
+ __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
+/*
+ * Boot-time entry point for code patching: applies the core kernel's
+ * alternatives, optionally patches the kernel to uniprocessor code, applies
+ * the paravirt patches and finally sets alternatives_patched.  NMIs are
+ * stopped for the duration of the patching.
+ */
+void __init alternative_instructions(void)
+{
+ /* The patching is not fully atomic, so try to avoid local interruptions
+ that might execute the to be patched code.
+ Other CPUs are not running. */
+ stop_nmi();
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than a unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during code
+ * patching.
+ */
+
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+#ifdef CONFIG_SMP
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
+ alternatives_smp_module_add(NULL, "core kernel",
+ __smp_locks, __smp_locks_end,
+ _text, _etext);
+ }
+
+ /* the lock table is only needed if the UP patching may be undone later */
+ if (!uniproc_patched || num_possible_cpus() == 1)
+ free_init_pages("SMP alternatives",
+ (unsigned long)__smp_locks,
+ (unsigned long)__smp_locks_end);
+#endif
+
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ restart_nmi();
+ alternatives_patched = 1;
+}
+
+/**
+ * text_poke_early - Update instructions on a live kernel at boot time
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread must be currently preempted in the middle of these
+ * instructions. And on the local CPU you need to be protected against NMI or
+ * MCE handlers seeing an inconsistent instruction while you patch.
+ *
+ * The bytes are written directly through @addr with interrupts disabled.
+ * Returns @addr.
+ */
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
+{
+ unsigned long flags;
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ local_irq_restore(flags);
+ sync_core();
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+ return addr;
+}
+
+/**
+ * text_poke - Update instructions on a live kernel
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * The write goes through temporary fixmap mappings of the page containing
+ * @addr and of the page that follows it, not through @addr itself.  The
+ * caller must hold text_mutex.  Returns @addr.
+ */
+void *text_poke(void *addr, const void *opcode, size_t len)
+{
+ unsigned long flags;
+ char *vaddr;
+ struct page *pages[2];
+ int i;
+
+ /*
+ * While boot memory allocator is running we cannot use struct
+ * pages as they are not yet initialized.
+ */
+ BUG_ON(!after_bootmem);
+
+ lockdep_assert_held(&text_mutex);
+
+ /* look up the page holding addr and the one after it */
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ BUG_ON(!pages[0]);
+ local_irq_save(flags);
+ set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
+ if (pages[1])
+ set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
+ vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
+ /* write at the same offset within the page as addr has */
+ memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
+ clear_fixmap(FIX_TEXT_POKE0);
+ if (pages[1])
+ clear_fixmap(FIX_TEXT_POKE1);
+ local_flush_tlb();
+ sync_core();
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+ /* verify through the regular mapping that the new bytes are in place */
+ for (i = 0; i < len; i++)
+ BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+ local_irq_restore(flags);
+ return addr;
+}
+
+/* on_each_cpu() callback used by text_poke_bp(): run sync_core(); @info is unused */
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+static bool bp_patching_in_progress;
+static void *bp_int3_handler, *bp_int3_addr;
+
+/*
+ * Breakpoint hook for text_poke_bp().  Returns 1 after redirecting @regs->ip
+ * to bp_int3_handler when the trap is a kernel-mode hit on the breakpoint
+ * that text_poke_bp() has installed; returns 0 in every other case and
+ * leaves @regs untouched.
+ */
+int poke_int3_handler(struct pt_regs *regs)
+{
+	/*
+	 * Having observed our INT3 instruction, we now must observe
+	 * bp_patching_in_progress.
+	 *
+	 *	in_progress = TRUE		INT3
+	 *	WMB				RMB
+	 *	write INT3			if (in_progress)
+	 *
+	 * Idem for bp_int3_handler.
+	 */
+	smp_rmb();
+
+	if (likely(!bp_patching_in_progress))
+		return 0;
+
+	if (!user_mode(regs) && regs->ip == (unsigned long)bp_int3_addr) {
+		/* set up the specified breakpoint handler */
+		regs->ip = (unsigned long) bp_int3_handler;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * text_poke_bp() -- update instructions on live kernel on SMP
+ * @addr:	address to patch
+ * @opcode:	opcode of new instruction
+ * @len:	length to copy
+ * @handler:	address to jump to when the temporary breakpoint is hit
+ *
+ * Modify multi-byte instruction by using int3 breakpoint on SMP.
+ * We completely avoid stop_machine() here, and achieve the
+ * synchronization using int3 breakpoint.
+ *
+ * The way it is done:
+ *	- add a int3 trap to the address that will be patched
+ *	- sync cores
+ *	- update all but the first byte of the patched range
+ *	- sync cores
+ *	- replace the first byte (int3) by the first byte of
+ *	  replacing opcode
+ *	- sync cores
+ *
+ * The caller must hold text_mutex.  Returns @addr.
+ */
+void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+{
+	unsigned char int3 = 0xcc;
+
+	/* Check the locking rule before any shared state is modified */
+	lockdep_assert_held(&text_mutex);
+
+	bp_int3_handler = handler;
+	/* regs->ip points just past the int3 byte when the trap is taken */
+	bp_int3_addr = (u8 *)addr + sizeof(int3);
+	bp_patching_in_progress = true;
+
+	/*
+	 * Corresponding read barrier in int3 notifier for making sure the
+	 * in_progress and handler are correctly ordered wrt. patching.
+	 */
+	smp_wmb();
+
+	text_poke(addr, &int3, sizeof(int3));
+
+	on_each_cpu(do_sync_core, NULL, 1);
+
+	/*
+	 * Compare instead of subtracting: @len is unsigned, so
+	 * "len - sizeof(int3) > 0" would wrap around for len == 0.
+	 */
+	if (len > sizeof(int3)) {
+		/* patch all but the first byte */
+		text_poke((char *)addr + sizeof(int3),
+			  (const char *) opcode + sizeof(int3),
+			  len - sizeof(int3));
+		/*
+		 * According to Intel, this core syncing is very likely
+		 * not necessary and we'd be safe even without it. But
+		 * better safe than sorry (plus there's not only Intel).
+		 */
+		on_each_cpu(do_sync_core, NULL, 1);
+	}
+
+	/* patch the first byte */
+	text_poke(addr, opcode, sizeof(int3));
+
+	on_each_cpu(do_sync_core, NULL, 1);
+	/*
+	 * sync_core() implies an smp_mb() and orders this store against
+	 * the writing of the new instruction.
+	 */
+	bp_patching_in_progress = false;
+
+	return addr;
+}
+
+
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
new file mode 100644
index 0000000..f299d8a
--- /dev/null
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -0,0 +1,891 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ *
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows to use PCI devices that only support 32bit addresses on systems
+ * with more than 4GB.
+ *
+ * See Documentation/DMA-API-HOWTO.txt for the interface specification.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU General Public License v2 only.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitmap.h>
+#include <linux/kdebug.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <linux/syscore_ops.h>
+#include <linux/io.h>
+#include <linux/gfp.h>
+#include <linux/atomic.h>
+#include <linux/dma-direct.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/set_memory.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static unsigned long iommu_bus_base; /* GART remapping area (physical) */
+static unsigned long iommu_size; /* size of remapping area bytes */
+static unsigned long iommu_pages; /* .. and in pages */
+
+static u32 *iommu_gatt_base; /* Remapping table */
+
+static dma_addr_t bad_dma_addr;
+
+/*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when a mapping is reused. With it true the GART is
+ * flushed for every mapping. Problem is that doing the lazy flush seems
+ * to trigger bugs with some popular PCI cards, in particular 3ware (but
+ * has also been seen with Qlogic at least).
+ */
+static int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area: */
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+/* Guarded by iommu_bitmap_lock: */
+static unsigned long *iommu_gart_bitmap;
+
+static u32 gart_unmapped_entry;
+
+/*
+ * GART page table entry helpers. GPTE_ENCODE keeps address bits 12-31 in
+ * place, moves the bits above bit 31 down so they start at bit 4, and ORs in
+ * GPTE_VALID and GPTE_COHERENT. GPTE_DECODE restores bits 12-31 and moves
+ * entry bits 4-11 back up to address bits 32-39.
+ */
+#define GPTE_VALID 1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+ (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit; /* protected by iommu_bitmap_lock */
+static bool need_flush; /* global flush state. set for each gart wrap */
+
+/*
+ * Reserve @size consecutive pages in the GART allocation bitmap, passing the
+ * device's DMA segment boundary and @align_mask to iommu_area_alloc().
+ *
+ * The search starts at next_bit and is retried once from bit 0 if it fails.
+ * need_flush is set when the search had to restart, when the cursor reaches
+ * the end of the area, or when iommu_fullflush is set.
+ *
+ * Returns the index of the first reserved page, or -1 (as unsigned long) if
+ * no range could be found.
+ */
+static unsigned long alloc_iommu(struct device *dev, int size,
+				 unsigned long align_mask)
+{
+	unsigned long seg_mask = dma_get_seg_boundary(dev);
+	unsigned long first_index, seg_pages;
+	unsigned long irq_flags;
+	unsigned long slot;
+
+	first_index = ALIGN(iommu_bus_base & seg_mask, PAGE_SIZE) >> PAGE_SHIFT;
+	seg_pages = ALIGN((u64)seg_mask + 1, PAGE_SIZE) >> PAGE_SHIFT;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, irq_flags);
+
+	slot = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
+				size, first_index, seg_pages, align_mask);
+	if (slot == -1) {
+		/* Nothing after next_bit: search again from the start. */
+		need_flush = true;
+		slot = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
+					size, first_index, seg_pages,
+					align_mask);
+	}
+
+	if (slot != -1) {
+		next_bit = slot + size;
+		if (next_bit >= iommu_pages) {
+			next_bit = 0;
+			need_flush = true;
+		}
+	}
+
+	if (iommu_fullflush)
+		need_flush = true;
+
+	spin_unlock_irqrestore(&iommu_bitmap_lock, irq_flags);
+
+	return slot;
+}
+
+/*
+ * Release @size bitmap entries starting at @offset. When the released range
+ * starts at or after next_bit, next_bit is moved to the end of that range.
+ */
+static void free_iommu(unsigned long offset, int size)
+{
+	unsigned long range_end = offset + size;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, irq_flags);
+
+	bitmap_clear(iommu_gart_bitmap, offset, size);
+	if (next_bit <= offset)
+		next_bit = range_end;
+
+	spin_unlock_irqrestore(&iommu_bitmap_lock, irq_flags);
+}
+
+/*
+ * Flush the GART if an allocation has requested it. need_flush is tested and
+ * cleared under iommu_bitmap_lock so that multiple flushers do not race on
+ * the global flush state.
+ */
+static void flush_gart(void)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&iommu_bitmap_lock, irq_flags);
+	if (!need_flush)
+		goto unlock;
+
+	amd_flush_garts();
+	need_flush = false;
+unlock:
+	spin_unlock_irqrestore(&iommu_bitmap_lock, irq_flags);
+}
+
+#ifdef CONFIG_IOMMU_LEAK
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static int leak_trace;
+static int iommu_leak_pages = 20;
+
+/* Dump the current stack and the DMA debug mappings, on the first call only. */
+static void dump_leak(void)
+{
+	static int dumped;
+
+	if (!dumped) {
+		dumped = 1;
+
+		show_stack(NULL, NULL);
+		debug_dma_dump_mappings(NULL);
+	}
+}
+#endif
+
+/*
+ * Report that no IOMMU space was available for a mapping of @size bytes
+ * requested for @dev with DMA direction @dir.
+ *
+ * Always logs an error. When @size is larger than the EMERGENCY_PAGES
+ * reserve, it panics for to-device, from-device and bidirectional transfers.
+ * With CONFIG_IOMMU_LEAK it also calls dump_leak().
+ */
+static void iommu_full(struct device *dev, size_t size, int dir)
+{
+	/*
+	 * Ran out of IOMMU space for this operation. This is very bad.
+	 * Unfortunately the drivers cannot handle this operation properly.
+	 * Return some non mapped prereserved space in the aperture and
+	 * let the Northbridge deal with it. This will result in garbage
+	 * in the IO operation. When the size exceeds the prereserved space
+	 * memory corruption will occur or random memory will be DMAed
+	 * out. Hopefully no network devices use single mappings that big.
+	 */
+
+	/* size is a size_t, so print it with %zu. */
+	dev_err(dev, "PCI-DMA: Out of IOMMU space for %zu bytes\n", size);
+
+	if (size > PAGE_SIZE*EMERGENCY_PAGES) {
+		if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+			panic("PCI-DMA: Memory would be corrupted\n");
+		/*
+		 * No KERN_ERR prefix: panic() takes a plain format string, so
+		 * a log-level marker would end up inside the message text.
+		 */
+		if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+			panic("PCI-DMA: Random memory would be DMAed\n");
+	}
+#ifdef CONFIG_IOMMU_LEAK
+	dump_leak();
+#endif
+}
+
+/*
+ * Return non-zero if the range must go through the IOMMU: either force_iommu
+ * is set or dma_capable() says @dev cannot reach it directly.
+ */
+static inline int
+need_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+	if (force_iommu)
+		return 1;
+
+	return !dma_capable(dev, addr, size);
+}
+
+/*
+ * Like need_iommu() but ignoring force_iommu: non-zero only if dma_capable()
+ * reports that @dev cannot reach the range directly.
+ */
+static inline int
+nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+	return dma_capable(dev, addr, size) ? 0 : 1;
+}
+
+/*
+ * Map one physically contiguous area into the IOMMU.
+ * The caller decides whether the IOMMU is needed and does the flush.
+ *
+ * Returns the bus address inside the GART area. Returns bad_dma_addr if the
+ * area ends above GART_MAX_PHYS_ADDR, or if no GART space is left and @dev
+ * cannot reach @phys_mem directly; in the latter case it may panic first if
+ * panic_on_overflow is set. If no GART space is left but @dev can reach the
+ * memory directly, @phys_mem itself is returned.
+ */
+static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+				size_t size, int dir, unsigned long align_mask)
+{
+	unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
+	/* Offset within the first page; unchanged by page-sized increments. */
+	dma_addr_t page_off = phys_mem & ~PAGE_MASK;
+	unsigned long first_page;
+	u32 *gpte;
+	int i;
+
+	if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+		return bad_dma_addr;
+
+	first_page = alloc_iommu(dev, npages, align_mask);
+	if (first_page == -1) {
+		if (!nonforced_iommu(dev, phys_mem, size))
+			return phys_mem;
+		if (panic_on_overflow)
+			panic("dma_map_area overflow %lu bytes\n", size);
+		iommu_full(dev, size, dir);
+		return bad_dma_addr;
+	}
+
+	gpte = &iommu_gatt_base[first_page];
+	for (i = 0; i < npages; i++, phys_mem += PAGE_SIZE)
+		gpte[i] = GPTE_ENCODE(phys_mem);
+
+	return iommu_bus_base + first_page*PAGE_SIZE + page_off;
+}
+
+/*
+ * Map @size bytes at @offset inside @page for DMA by @dev.
+ *
+ * A NULL @dev is replaced by x86_dma_fallback_dev. When need_iommu() says no
+ * remapping is required the physical address is returned unchanged;
+ * otherwise the area is mapped through the GART and the GART is flushed.
+ */
+static dma_addr_t gart_map_page(struct device *dev, struct page *page,
+				unsigned long offset, size_t size,
+				enum dma_data_direction dir,
+				unsigned long attrs)
+{
+	phys_addr_t paddr = page_to_phys(page) + offset;
+	unsigned long bus;
+
+	if (!dev)
+		dev = &x86_dma_fallback_dev;
+
+	if (need_iommu(dev, paddr, size)) {
+		bus = dma_map_area(dev, paddr, size, dir, 0);
+		flush_gart();
+		return bus;
+	}
+
+	return paddr;
+}
+
+/*
+ * Free a DMA mapping.
+ *
+ * Addresses within the first EMERGENCY_PAGES pages of the IOMMU area, or
+ * outside the area altogether, are ignored. Otherwise the covered GART
+ * entries are reset to gart_unmapped_entry and the bitmap range is released.
+ */
+static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			    size_t size, enum dma_data_direction dir,
+			    unsigned long attrs)
+{
+	unsigned long first_page;
+	int npages;
+	int n;
+
+	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE)
+		return;
+	if (dma_addr >= iommu_bus_base + iommu_size)
+		return;
+
+	first_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
+
+	n = 0;
+	while (n < npages) {
+		iommu_gatt_base[first_page + n] = gart_unmapped_entry;
+		n++;
+	}
+
+	free_iommu(first_page, npages);
+}
+
+/*
+ * Unmap the entries of a scatterlist one by one via gart_unmap_page().
+ * Stops at the first entry whose dma_length or length is zero.
+ */
+static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+			  enum dma_data_direction dir, unsigned long attrs)
+{
+	struct scatterlist *entry;
+	int idx;
+
+	for_each_sg(sg, entry, nents, idx) {
+		if (entry->dma_length && entry->length)
+			gart_unmap_page(dev, entry->dma_address,
+					entry->dma_length, dir, 0);
+		else
+			break;
+	}
+}
+
+/*
+ * Fallback for dma_map_sg in case of overflow: map every entry on its own
+ * and only through the GART when the device cannot reach it directly.
+ *
+ * Returns @nents on success. If an entry cannot be mapped, the entries mapped
+ * so far are unmapped, sg[0].dma_length is zeroed and 0 is returned. The GART
+ * is flushed in both cases.
+ */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+			       int nents, int dir)
+{
+	struct scatterlist *s;
+	int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+	pr_debug("dma_map_sg overflow\n");
+#endif
+
+	for_each_sg(sg, s, nents, i) {
+		unsigned long addr = sg_phys(s);
+
+		if (nonforced_iommu(dev, addr, s->length)) {
+			addr = dma_map_area(dev, addr, s->length, dir, 0);
+			if (addr == bad_dma_addr)
+				goto unwind;
+		}
+		s->dma_address = addr;
+		s->dma_length = s->length;
+	}
+	flush_gart();
+
+	return nents;
+
+unwind:
+	if (i > 0)
+		gart_unmap_sg(dev, sg, i, dir, 0);
+	sg[0].dma_length = 0;
+	flush_gart();
+
+	return 0;
+}
+
+/*
+ * Map @nelems scatterlist entries starting at @start into one contiguous
+ * GART range of @pages pages and describe the result in @sout.
+ *
+ * Returns 0 on success or -1 if no GART range could be allocated.
+ */
+static int __dma_map_cont(struct device *dev, struct scatterlist *start,
+			  int nelems, struct scatterlist *sout,
+			  unsigned long pages)
+{
+	unsigned long iommu_start = alloc_iommu(dev, pages, 0);
+	unsigned long gart_page = iommu_start;
+	struct scatterlist *s;
+	int i;
+
+	if (iommu_start == -1)
+		return -1;
+
+	for_each_sg(start, s, nelems, i) {
+		/* Read before sout is written, in case sout is this entry. */
+		unsigned long phys = s->dma_address;
+		unsigned long left;
+
+		/* Only the first entry may carry an offset. */
+		BUG_ON(s != start && s->offset);
+
+		if (s != start) {
+			sout->dma_length += s->length;
+		} else {
+			sout->dma_address = iommu_bus_base +
+					    gart_page*PAGE_SIZE + s->offset;
+			sout->dma_length = s->length;
+		}
+
+		for (left = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
+		     left; left--) {
+			iommu_gatt_base[gart_page++] = GPTE_ENCODE(phys);
+			phys += PAGE_SIZE;
+		}
+	}
+	/* The entries must have used exactly the pages that were reserved. */
+	BUG_ON(gart_page - iommu_start != pages);
+
+	return 0;
+}
+
+/*
+ * Map a run of scatterlist entries into @sout. With @need set the run goes
+ * through the GART via __dma_map_cont(); otherwise the run must be a single
+ * entry, whose address and length are copied to @sout unchanged.
+ */
+static inline int
+dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
+	     struct scatterlist *sout, unsigned long pages, int need)
+{
+	if (need)
+		return __dma_map_cont(dev, start, nelems, sout, pages);
+
+	BUG_ON(nelems != 1);
+	sout->dma_address = start->dma_address;
+	sout->dma_length = start->length;
+
+	return 0;
+}
+
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page aligned sizes into a continuous mapping.
+ *
+ * A NULL @dev is replaced by x86_dma_fallback_dev.
+ *
+ * Returns the number of DMA segments written to the list, or 0 if @nents is
+ * 0 or the mapping failed. On failure (unless panic_on_overflow triggers a
+ * panic) every entry's dma_address is set to bad_dma_addr.
+ */
+static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *s, *ps, *start_sg, *sgmap;
+ int need = 0, nextneed, i, out, start;
+ unsigned long pages = 0;
+ unsigned int seg_size;
+ unsigned int max_seg_size;
+
+ if (nents == 0)
+ return 0;
+
+ if (!dev)
+ dev = &x86_dma_fallback_dev;
+
+ /*
+ * start/start_sg mark the first entry of the run not yet mapped, sgmap
+ * is the output entry the next run is written to, and seg_size/pages
+ * accumulate the byte and page totals of the current run.
+ */
+ out = 0;
+ start = 0;
+ start_sg = sg;
+ sgmap = sg;
+ seg_size = 0;
+ max_seg_size = dma_get_max_seg_size(dev);
+ ps = NULL; /* shut up gcc */
+
+ for_each_sg(sg, s, nents, i) {
+ dma_addr_t addr = sg_phys(s);
+
+ s->dma_address = addr;
+ BUG_ON(s->length == 0);
+
+ nextneed = need_iommu(dev, addr, s->length);
+
+ /* Handle the previous not yet processed entries */
+ if (i > start) {
+ /*
+ * Can only merge when the last chunk ends on a
+ * page boundary and the new one doesn't have an
+ * offset.
+ */
+ if (!iommu_merge || !nextneed || !need || s->offset ||
+ (s->length + seg_size > max_seg_size) ||
+ (ps->offset + ps->length) % PAGE_SIZE) {
+ /* Cannot merge: map the pending run and start a new one at s. */
+ if (dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need) < 0)
+ goto error;
+ out++;
+
+ seg_size = 0;
+ sgmap = sg_next(sgmap);
+ pages = 0;
+ start = i;
+ start_sg = s;
+ }
+ }
+
+ seg_size += s->length;
+ need = nextneed;
+ pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
+ ps = s;
+ }
+ /* Map the final run left over after the loop. */
+ if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ goto error;
+ out++;
+ flush_gart();
+ /* Fewer segments than entries: zero the length of the next output entry. */
+ if (out < nents) {
+ sgmap = sg_next(sgmap);
+ sgmap->dma_length = 0;
+ }
+ return out;
+
+error:
+ /* Undo the segments mapped so far before trying the fallback. */
+ flush_gart();
+ gart_unmap_sg(dev, sg, out, dir, 0);
+
+ /* When it was forced or merged try again in a dumb way */
+ if (force_iommu || iommu_merge) {
+ out = dma_map_sg_nonforce(dev, sg, nents, dir);
+ if (out > 0)
+ return out;
+ }
+ if (panic_on_overflow)
+ panic("dma_map_sg: overflow on %lu pages\n", pages);
+
+ iommu_full(dev, pages << PAGE_SHIFT, dir);
+ for_each_sg(sg, s, nents, i)
+ s->dma_address = bad_dma_addr;
+ return 0;
+}
+
+/*
+ * Allocate a coherent buffer with dma_direct_alloc() and, when force_iommu is
+ * set and the device's coherent mask is wider than 24 bits, remap it through
+ * the GART with an alignment mask of (1UL << get_order(size)) - 1.
+ *
+ * Returns the buffer's virtual address, or NULL if the allocation or the
+ * GART mapping failed (the buffer is freed again in the latter case).
+ */
+static void *
+gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+		    gfp_t flag, unsigned long attrs)
+{
+	void *vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs);
+
+	if (!vaddr)
+		return vaddr;
+	if (!force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24))
+		return vaddr;
+
+	*dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
+				 DMA_BIDIRECTIONAL,
+				 (1UL << get_order(size)) - 1);
+	flush_gart();
+
+	if (likely(*dma_addr != bad_dma_addr))
+		return vaddr;
+
+	dma_direct_free(dev, size, vaddr, *dma_addr, attrs);
+	return NULL;
+}
+
+/*
+ * Free a coherent mapping: drop the GART mapping for @dma_addr (a no-op in
+ * gart_unmap_page() when the address lies outside the remapping area), then
+ * hand the buffer back with dma_direct_free().
+ */
+static void
+gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr, unsigned long attrs)
+{
+ gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
+ dma_direct_free(dev, size, vaddr, dma_addr, attrs);
+}
+
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_addr);
+}
+
+static int no_agp;
+
+/*
+ * Settle the size of the IOMMU area for an aperture at @aper of @aper_size
+ * bytes and return it (it is also left in iommu_size).
+ *
+ * If no size was set beforehand, the whole aperture is used when no_agp is
+ * set and half of it otherwise. The size is then reduced by the distance
+ * from aper + iommu_size up to the next PMD_PAGE_SIZE boundary. A warning is
+ * printed when less than 64MB remain.
+ */
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{
+	unsigned long end;
+
+	if (!iommu_size)
+		iommu_size = no_agp ? aper_size : aper_size / 2;
+
+	end = aper + iommu_size;
+	iommu_size -= round_up(end, PMD_PAGE_SIZE) - end;
+
+	if (iommu_size < 64*1024*1024)
+		pr_warning(
+			"PCI-DMA: Warning: Small IOMMU %luMB."
+			" Consider increasing the AGP aperture in BIOS\n",
+			iommu_size >> 20);
+
+	return iommu_size;
+}
+
+/*
+ * Read the GART aperture base and size from the config space of @dev.
+ *
+ * Stores the aperture size in bytes in *@size and returns the aperture base,
+ * or 0 if the size computes to 0 or the aperture would end above 4GB.
+ */
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
+{
+	unsigned aper_size = 0, aper_base_32, aper_order;
+	u64 aper_base;
+
+	pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+	pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
+	aper_order = (aper_order >> 1) & 7;
+
+	aper_base = aper_base_32 & 0x7fff;
+	aper_base <<= 25;
+
+	/*
+	 * Shift an unsigned constant: as a signed int, orders 6 and 7 would
+	 * shift 32MB past INT_MAX, which is undefined. Unsigned arithmetic
+	 * gives 2GB for order 6 and wraps to 0 for order 7, which the
+	 * !aper_size test below rejects.
+	 */
+	aper_size = (32U * 1024 * 1024) << aper_order;
+	if (aper_base + aper_size > 0x100000000UL || !aper_size)
+		aper_base = 0;
+
+	*size = aper_size;
+	return aper_base;
+}
+
+/*
+ * Enable GART translation on every cached northbridge, pointing each at the
+ * physical address of agp_gatt_table, then flush the GART TLBs. Does nothing
+ * without the AMD_NB_GART feature.
+ */
+static void enable_gart_translations(void)
+{
+	int node;
+
+	if (!amd_nb_has_feature(AMD_NB_GART))
+		return;
+
+	for (node = 0; node < amd_nb_num(); node++)
+		enable_gart_translation(node_to_amd_nb(node)->misc,
+					__pa(agp_gatt_table));
+
+	/* Flush the GART-TLB to remove stale entries */
+	amd_flush_garts();
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+/*
+ * Remember the aperture order and allocation so that
+ * gart_fixup_northbridges() can restore them on resume, and arm that fixup.
+ */
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+	aperture_order = aper_order;
+	aperture_alloc = aper_alloc;
+	fix_up_north_bridges = true;
+}
+
+/*
+ * Restore the saved aperture order and base on every northbridge. Only acts
+ * when set_up_gart_resume() armed the fixup and the AMD_NB_GART feature is
+ * present.
+ */
+static void gart_fixup_northbridges(void)
+{
+	int node;
+
+	if (!fix_up_north_bridges || !amd_nb_has_feature(AMD_NB_GART))
+		return;
+
+	pr_info("PCI-DMA: Restoring GART aperture settings\n");
+
+	for (node = 0; node < amd_nb_num(); node++) {
+		struct pci_dev *misc = node_to_amd_nb(node)->misc;
+
+		/*
+		 * Don't enable translations just yet. That is the next
+		 * step. Restore the pre-suspend aperture settings.
+		 */
+		gart_set_size_and_enable(misc, aperture_order);
+		pci_write_config_dword(misc, AMD64_GARTAPERTUREBASE,
+				       aperture_alloc >> 25);
+	}
+}
+
+/*
+ * Resume callback (installed in gart_syscore_ops): restore the aperture
+ * settings if required, then re-enable GART translations.
+ */
+static void gart_resume(void)
+{
+ pr_info("PCI-DMA: Resuming GART IOMMU\n");
+
+ gart_fixup_northbridges();
+
+ enable_gart_translations();
+}
+
+/* Syscore hooks for the GART; registered by init_amd_gatt(). Resume only. */
+static struct syscore_ops gart_syscore_ops = {
+ .resume = gart_resume,
+
+};
+
+/*
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.
+ *
+ * Requires every northbridge to report the same non-zero aperture base and
+ * size. Fills in @info->aper_base and @info->aper_size (in MB), allocates an
+ * uncached, zeroed GATT and publishes it in agp_gatt_table.
+ *
+ * Returns 0 on success, -1 if the apertures are missing or inconsistent.
+ * Panics if the GATT cannot be allocated or made uncacheable.
+ */
+static __init int init_amd_gatt(struct agp_kern_info *info)
+{
+ unsigned aper_size, gatt_size, new_aper_size;
+ unsigned aper_base, new_aper_base;
+ struct pci_dev *dev;
+ void *gatt;
+ int i;
+
+ pr_info("PCI-DMA: Disabling AGP.\n");
+
+ aper_size = aper_base = info->aper_size = 0;
+ dev = NULL;
+ /* The first northbridge sets the reference; all others must match it. */
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
+ new_aper_base = read_aperture(dev, &new_aper_size);
+ if (!new_aper_base)
+ goto nommu;
+
+ if (!aper_base) {
+ aper_size = new_aper_size;
+ aper_base = new_aper_base;
+ }
+ if (aper_size != new_aper_size || aper_base != new_aper_base)
+ goto nommu;
+ }
+ if (!aper_base)
+ goto nommu;
+
+ info->aper_base = aper_base;
+ info->aper_size = aper_size >> 20;
+
+ /* One u32 GATT entry per aperture page. */
+ gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
+ gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(gatt_size));
+ if (!gatt)
+ panic("Cannot allocate GATT table");
+ if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
+ panic("Could not set GART PTEs to uncacheable pages");
+
+ agp_gatt_table = gatt;
+
+ register_syscore_ops(&gart_syscore_ops);
+
+ flush_gart();
+
+ pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
+ aper_base, aper_size>>10);
+
+ return 0;
+
+ nommu:
+ /* Should not happen anymore */
+ pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+ "falling back to iommu=soft.\n");
+ return -1;
+}
+
+/*
+ * DMA operations backed by the GART; installed as dma_ops at the end of
+ * gart_iommu_init().
+ */
+static const struct dma_map_ops gart_dma_ops = {
+ .map_sg = gart_map_sg,
+ .unmap_sg = gart_unmap_sg,
+ .map_page = gart_map_page,
+ .unmap_page = gart_unmap_page,
+ .alloc = gart_alloc_coherent,
+ .free = gart_free_coherent,
+ .mapping_error = gart_mapping_error,
+ .dma_supported = dma_direct_supported,
+};
+
+/*
+ * Clear the GARTEN bit in the aperture control register of every
+ * northbridge. Skipped when AGP is in use (no_agp not set) or the
+ * AMD_NB_GART feature is absent.
+ */
+static void gart_iommu_shutdown(void)
+{
+	int node;
+
+	/* don't shutdown it if there is AGP installed */
+	if (!no_agp || !amd_nb_has_feature(AMD_NB_GART))
+		return;
+
+	for (node = 0; node < amd_nb_num(); node++) {
+		struct pci_dev *misc = node_to_amd_nb(node)->misc;
+		u32 ctl;
+
+		pci_read_config_dword(misc, AMD64_GARTAPERTURECTL, &ctl);
+		ctl &= ~GARTEN;
+		pci_write_config_dword(misc, AMD64_GARTAPERTURECTL, ctl);
+	}
+}
+
+/*
+ * Set up the GART as an IOMMU and install gart_dma_ops.
+ *
+ * Does nothing without the AMD_NB_GART feature. Also backs out (warning when
+ * max_pfn is above MAX_DMA32_PFN) if the IOMMU is disabled, is not needed,
+ * has no aperture, or the private GATT setup fails. Every path returns 0;
+ * allocation failures panic.
+ */
+int __init gart_iommu_init(void)
+{
+ struct agp_kern_info info;
+ unsigned long iommu_start;
+ unsigned long aper_base, aper_size;
+ unsigned long start_pfn, end_pfn;
+ unsigned long scratch;
+ long i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
+
+#ifndef CONFIG_AGP_AMD64
+ no_agp = 1;
+#else
+ /* Makefile puts PCI initialization via subsys_initcall first. */
+ /* Add other AMD AGP bridge drivers here */
+ no_agp = no_agp ||
+ (agp_amd64_init() < 0) ||
+ (agp_copy_info(agp_bridge, &info) < 0);
+#endif
+
+ if (no_iommu ||
+ (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
+ !gart_iommu_aperture ||
+ (no_agp && init_amd_gatt(&info) < 0)) {
+ if (max_pfn > MAX_DMA32_PFN) {
+ pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warning("falling back to iommu=soft.\n");
+ }
+ return 0;
+ }
+
+ /* need to map that range */
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ pr_info("PCI-DMA: using GART IOMMU.\n");
+ iommu_size = check_iommu_size(info.aper_base, aper_size);
+ iommu_pages = iommu_size >> PAGE_SHIFT;
+
+ /* One bit per IOMMU page, hence iommu_pages/8 bytes. */
+ iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(iommu_pages/8));
+ if (!iommu_gart_bitmap)
+ panic("Cannot allocate iommu bitmap\n");
+
+#ifdef CONFIG_IOMMU_LEAK
+ if (leak_trace) {
+ int ret;
+
+ ret = dma_debug_resize_entries(iommu_pages);
+ if (ret)
+ pr_debug("PCI-DMA: Cannot trace all the entries\n");
+ }
+#endif
+
+ /*
+ * Out of IOMMU space handling.
+ * Reserve some invalid pages at the beginning of the GART.
+ */
+ bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+
+ pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+ iommu_size >> 20);
+
+ /* The IOMMU area occupies the last iommu_size bytes of the aperture. */
+ agp_memory_reserved = iommu_size;
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+ /*
+ * Unmap the IOMMU part of the GART. The alias of the page is
+ * always mapped with cache enabled and there is no full cache
+ * coherency across the GART remapping. The unmapping avoids
+ * automatic prefetches from the CPU allocating cache lines in
+ * there. All CPU accesses are done via the direct mapping to
+ * the backing memory. The GART address is only used by PCI
+ * devices.
+ */
+ set_memory_np((unsigned long)__va(iommu_bus_base),
+ iommu_size >> PAGE_SHIFT);
+ /*
+ * Tricky. The GART table remaps the physical memory range,
+ * so the CPU won't notice potential aliases and if the memory
+ * is remapped to UC later on, we might surprise the PCI devices
+ * with a stray writeout of a cacheline. So play it sure and
+ * do an explicit, full-scale wbinvd() _after_ having marked all
+ * the pages as Not-Present:
+ */
+ wbinvd();
+
+ /*
+ * Now all caches are flushed and we can safely enable
+ * GART hardware. Doing it early leaves the possibility
+ * of stale cache entries that can lead to GART PTE
+ * errors.
+ */
+ enable_gart_translations();
+
+ /*
+ * Try to workaround a bug (thanks to BenH):
+ * Set unmapped entries to a scratch page instead of 0.
+ * Any prefetches that hit unmapped entries won't get a bus abort
+ * then. (P2P bridge may be prefetching on DMA reads).
+ */
+ scratch = get_zeroed_page(GFP_KERNEL);
+ if (!scratch)
+ panic("Cannot allocate iommu scratch page");
+ gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
+ /* The first EMERGENCY_PAGES entries are left untouched. */
+ for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
+ iommu_gatt_base[i] = gart_unmapped_entry;
+
+ flush_gart();
+ dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ swiotlb = 0;
+
+ return 0;
+}
+
+/*
+ * Parse one GART option string @p.
+ *
+ * Recognised forms: "leak[=N]" (CONFIG_IOMMU_LEAK only), a leading number
+ * (stored in iommu_size), "fullflush", "nofullflush", "noagp", "noaperture",
+ * "force", "allowed" and "memaper[=N]". The checks run in sequence on the
+ * same pointer, which "leak" and get_option() may have advanced.
+ */
+void __init gart_parse_options(char *p)
+{
+ int arg;
+
+#ifdef CONFIG_IOMMU_LEAK
+ if (!strncmp(p, "leak", 4)) {
+ leak_trace = 1;
+ p += 4;
+ if (*p == '=')
+ ++p;
+ if (isdigit(*p) && get_option(&p, &arg))
+ iommu_leak_pages = arg;
+ }
+#endif
+ if (isdigit(*p) && get_option(&p, &arg))
+ iommu_size = arg;
+ if (!strncmp(p, "fullflush", 9))
+ iommu_fullflush = 1;
+ if (!strncmp(p, "nofullflush", 11))
+ iommu_fullflush = 0;
+ if (!strncmp(p, "noagp", 5))
+ no_agp = 1;
+ if (!strncmp(p, "noaperture", 10))
+ fix_aperture = 0;
+ /* duplicated from pci-dma.c */
+ if (!strncmp(p, "force", 5))
+ gart_iommu_aperture_allowed = 1;
+ if (!strncmp(p, "allowed", 7))
+ gart_iommu_aperture_allowed = 1;
+ if (!strncmp(p, "memaper", 7)) {
+ fallback_aper_force = 1;
+ p += 7;
+ /* An order is only parsed when given as "memaper=N". */
+ if (*p == '=') {
+ ++p;
+ if (get_option(&p, &arg))
+ fallback_aper_order = arg;
+ }
+ }
+}
+IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 0000000..b481b95
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,461 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <asm/amd_nb.h>
+
+#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
+#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
+#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
+
+/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+static DEFINE_MUTEX(smn_mutex);
+
+static u32 *flush_words;
+
+/* PCI IDs used by amd_cache_northbridges() to find each node's ->root device. */
+static const struct pci_device_id amd_root_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
+ {}
+};
+
+#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704
+
+/*
+ * PCI IDs of the northbridge "misc" devices. amd_cache_northbridges() counts
+ * matches against this table to size the northbridge array and fills ->misc
+ * from it; early_is_amd_nb() also searches it.
+ */
+const struct pci_device_id amd_nb_misc_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
+ {}
+};
+EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
+
+/* PCI IDs used by amd_cache_northbridges() to find each node's ->link device. */
+static const struct pci_device_id amd_nb_link_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
+ {}
+};
+
+/*
+ * NOTE(review): not referenced in this part of the file. The three fields
+ * are presumably bus, first device and device limit -- confirm against
+ * struct amd_nb_bus_dev_range. The list ends with an empty entry.
+ */
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
+static struct amd_northbridge_info amd_northbridges;
+
+/* Return the number of northbridges cached by amd_cache_northbridges(). */
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+/* Return true if every bit of @feature is set in the northbridge flags. */
+bool amd_nb_has_feature(unsigned int feature)
+{
+	return (amd_northbridges.flags & feature) == feature;
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+/*
+ * Return the cached northbridge descriptor for @node, or NULL if @node is
+ * negative or not below the number of cached northbridges.
+ */
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+	/*
+	 * The explicit lower bound matters: @node is a signed int, and the
+	 * count (handed out as u16 by amd_nb_num()) is compared as an int, so
+	 * a negative node would pass an upper-bound-only check and index
+	 * before the start of the array.
+	 */
+	if (node < 0 || node >= amd_northbridges.num)
+		return NULL;
+
+	return &amd_northbridges.nb[node];
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
+
+/*
+ * Continue the PCI device walk after @dev (NULL starts from the beginning)
+ * and return the next device that matches @ids, or NULL when the walk ends.
+ */
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+					const struct pci_device_id *ids)
+{
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		if (pci_match_id(ids, dev))
+			break;
+	}
+
+	return dev;
+}
+
+/*
+ * Read or write one SMN register through the root device of @node: the SMN
+ * @address is written to config offset 0x60, then the data is transferred
+ * through offset 0x64. Both accesses happen under smn_mutex.
+ *
+ * Returns 0 on success, -ENODEV if @node is out of range or has no root
+ * device, or the error from the PCI config access.
+ */
+static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
+{
+	struct pci_dev *root;
+	int err;
+
+	if (node >= amd_northbridges.num)
+		return -ENODEV;
+
+	root = node_to_amd_nb(node)->root;
+	if (!root)
+		return -ENODEV;
+
+	mutex_lock(&smn_mutex);
+
+	err = pci_write_config_dword(root, 0x60, address);
+	if (err) {
+		pr_warn("Error programming SMN address 0x%x.\n", address);
+	} else {
+		if (write)
+			err = pci_write_config_dword(root, 0x64, *value);
+		else
+			err = pci_read_config_dword(root, 0x64, value);
+
+		if (err)
+			pr_warn("Error %s SMN address 0x%x.\n",
+				(write ? "writing to" : "reading from"), address);
+	}
+
+	mutex_unlock(&smn_mutex);
+
+	return err;
+}
+
+/* Read the SMN register at @address on @node into *@value; see __amd_smn_rw(). */
+int amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ return __amd_smn_rw(node, address, value, false);
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+/* Write @value to the SMN register at @address on @node; see __amd_smn_rw(). */
+int amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): Constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
+ * and FICAD HI registers but so far we only need the LO register.
+ *
+ * The FICAA value is written to config offset 0x5C of the node's link device
+ * and the result is read from offset 0x98 into *@lo, both under smn_mutex.
+ *
+ * Returns 0 on success, -ENODEV if @node is out of range or has no link
+ * device, or the error from the PCI config access.
+ */
+int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+	struct pci_dev *link_dev;
+	u32 ficaa;
+	int err;
+
+	if (node >= amd_northbridges.num)
+		return -ENODEV;
+
+	link_dev = node_to_amd_nb(node)->link;
+	if (!link_dev)
+		return -ENODEV;
+
+	ficaa = 1 | (reg & 0x3FC) | ((func & 0x7) << 11) | (instance_id << 16);
+
+	mutex_lock(&smn_mutex);
+
+	err = pci_write_config_dword(link_dev, 0x5C, ficaa);
+	if (err) {
+		pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+	} else {
+		err = pci_read_config_dword(link_dev, 0x98, lo);
+		if (err)
+			pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+	}
+
+	mutex_unlock(&smn_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(amd_df_indirect_read);
+
+/*
+ * Discover the AMD northbridges and cache their PCI devices and features.
+ *
+ * Counts the devices matching amd_nb_misc_ids, allocates one descriptor per
+ * match, fills in each descriptor's root, misc and link device, and sets the
+ * GART and L3 feature flags. A second call is a no-op once northbridges
+ * have been cached.
+ *
+ * Returns 0 on success (or if already cached), -ENODEV if no northbridge was
+ * found, -ENOMEM if the descriptor array could not be allocated.
+ */
+int amd_cache_northbridges(void)
+{
+ u16 i = 0;
+ struct amd_northbridge *nb;
+ struct pci_dev *root, *misc, *link;
+
+ if (amd_northbridges.num)
+ return 0;
+
+ misc = NULL;
+ while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+ i++;
+
+ if (!i)
+ return -ENODEV;
+
+ nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
+ return -ENOMEM;
+
+ amd_northbridges.nb = nb;
+ amd_northbridges.num = i;
+
+ /* Walk the three ID tables in step, one match of each per node. */
+ link = misc = root = NULL;
+ for (i = 0; i != amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->root = root =
+ next_northbridge(root, amd_root_ids);
+ node_to_amd_nb(i)->misc = misc =
+ next_northbridge(misc, amd_nb_misc_ids);
+ node_to_amd_nb(i)->link = link =
+ next_northbridge(link, amd_nb_link_ids);
+ }
+
+ if (amd_gart_present())
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ /*
+ * Check for L3 cache presence.
+ */
+ if (!cpuid_edx(0x80000006))
+ return 0;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_stepping >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ /* L3 cache partitioning is supported on family 0x15 */
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
+
+/*
+ * Check whether @device, a combined ID with the vendor in the low 16 bits
+ * and the device in the high 16 bits, is listed in amd_nb_misc_ids.
+ *
+ * Ignores subdevice/subvendor but as far as I can figure out
+ * they're useless anyways
+ */
+bool __init early_is_amd_nb(u32 device)
+{
+	u32 vendor_id = device & 0xffff;
+	u32 device_id = device >> 16;
+	const struct pci_device_id *id;
+
+	for (id = amd_nb_misc_ids; id->vendor; id++) {
+		if (id->vendor == vendor_id && id->device == device_id)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Fill @res with the MMCONFIG range taken from MSR_FAM10H_MMIO_CONF_BASE.
+ *
+ * Returns @res, or NULL if the boot CPU is not AMD, is older than family
+ * 0x10, or the MSR's enable bit is clear.
+ */
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+	unsigned int busrange_bits;
+	u64 conf, base;
+
+	/* assume all cpus from fam10h have mmconfig */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+	    boot_cpu_data.x86 < 0x10)
+		return NULL;
+
+	rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, conf);
+
+	/* mmconfig is not enabled */
+	if (!(conf & FAM10H_MMIO_CONF_ENABLE))
+		return NULL;
+
+	base = conf & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+	busrange_bits = (conf >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+			FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+	res->flags = IORESOURCE_MEM;
+	res->start = base;
+	res->end = base + (1ULL<<(busrange_bits + 20)) - 1;
+
+	return res;
+}
+
+/*
+ * Return the 4-bit subcache mask of @cpu's core, read from config register
+ * 0x1d4 of its northbridge's link device.
+ *
+ * Returns 0 when L3 partitioning is not supported or no northbridge link
+ * device is cached for the CPU.
+ */
+int amd_get_subcaches(int cpu)
+{
+	struct amd_northbridge *nb;
+	unsigned int mask;
+
+	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+		return 0;
+
+	/*
+	 * Look the northbridge up only after the feature check, and do not
+	 * dereference it blindly: node_to_amd_nb() returns NULL for a node
+	 * that is not cached.
+	 */
+	nb = node_to_amd_nb(amd_get_nb_id(cpu));
+	if (!nb || !nb->link)
+		return 0;
+
+	pci_read_config_dword(nb->link, 0x1d4, &mask);
+
+	/* Each core owns one nibble of the register. */
+	return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf;
+}
+
+/*
+ * Program the 4-bit subcache @mask for @cpu's core into config register
+ * 0x1d4 of its northbridge's link device, switching BAN mode (bits 0x180000
+ * of misc register 0x1b8) off while any subcache is disabled and back to its
+ * saved state once the register returns to its first-read value.
+ *
+ * Returns 0 on success, -EINVAL if L3 partitioning is unsupported, @mask is
+ * larger than 0xf, or no northbridge is cached for the CPU.
+ */
+int amd_set_subcaches(int cpu, unsigned long mask)
+{
+	static unsigned int reset, ban;
+	struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
+	unsigned int reg;
+	int cuid;
+
+	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
+		return -EINVAL;
+
+	/* node_to_amd_nb() returns NULL for a node that is not cached. */
+	if (!nb)
+		return -EINVAL;
+
+	/* if necessary, collect reset state of L3 partitioning and BAN mode */
+	if (reset == 0) {
+		pci_read_config_dword(nb->link, 0x1d4, &reset);
+		pci_read_config_dword(nb->misc, 0x1b8, &ban);
+		ban &= 0x180000;
+	}
+
+	/* deactivate BAN mode if any subcaches are to be disabled */
+	if (mask != 0xf) {
+		pci_read_config_dword(nb->misc, 0x1b8, &reg);
+		pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
+	}
+
+	cuid = cpu_data(cpu).cpu_core_id;
+	mask <<= 4 * cuid;
+	mask |= (0xf ^ (1 << cuid)) << 26;
+
+	pci_write_config_dword(nb->link, 0x1d4, mask);
+
+	/* reset BAN mode if L3 partitioning returned to reset state */
+	pci_read_config_dword(nb->link, 0x1d4, &reg);
+	if (reg == reset) {
+		pci_read_config_dword(nb->misc, 0x1b8, &reg);
+		reg &= ~0x180000;
+		pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
+	}
+
+	return 0;
+}
+
+/*
+ * Cache config register 0x9c of every northbridge's misc device in
+ * flush_words[] for amd_flush_garts(). If the array cannot be allocated,
+ * the AMD_NB_GART feature flag is cleared instead.
+ */
+static void amd_cache_gart(void)
+{
+	u16 node;
+
+	if (!amd_nb_has_feature(AMD_NB_GART))
+		return;
+
+	flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
+	if (flush_words) {
+		for (node = 0; node != amd_northbridges.num; node++)
+			pci_read_config_dword(node_to_amd_nb(node)->misc, 0x9c,
+					      &flush_words[node]);
+		return;
+	}
+
+	amd_northbridges.flags &= ~AMD_NB_GART;
+	pr_notice("Cannot initialize GART flush words, GART support disabled\n");
+}
+
+/*
+ * Flush the GART TLB on every cached northbridge.
+ *
+ * Writes the cached flush word with bit 0 set to config register 0x9c of
+ * each misc device, then polls each register until bit 0 reads back clear.
+ * Does nothing without the AMD_NB_GART feature.
+ */
+void amd_flush_garts(void)
+{
+ int flushed, i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(gart_lock);
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
+ spin_lock_irqsave(&gart_lock, flags);
+ flushed = 0;
+ /* Start the flush on all northbridges first, then wait for each. */
+ for (i = 0; i < amd_northbridges.num; i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
+ flushed++;
+ }
+ for (i = 0; i < amd_northbridges.num; i++) {
+ u32 w;
+ /* Make sure the hardware actually executed the flush */
+ for (;;) {
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
+ 0x9c, &w);
+ if (!(w & 1))
+ break;
+ cpu_relax();
+ }
+ }
+ spin_unlock_irqrestore(&gart_lock, flags);
+ if (!flushed)
+ pr_notice("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(amd_flush_garts);
+
+/*
+ * Per-CPU part of the erratum 688 fix: set bits 3 and 14 in MSR_AMD64_IC_CFG.
+ * Run on every CPU by fix_erratum_688(); @info is unused.
+ */
+static void __fix_erratum_688(void *info)
+{
+#define MSR_AMD64_IC_CFG 0xC0011021
+
+ msr_set_bit(MSR_AMD64_IC_CFG, 3);
+ msr_set_bit(MSR_AMD64_IC_CFG, 14);
+}
+
+/*
+ * Apply erratum 688 fix so machines without a BIOS fix work.
+ *
+ * Only acts on family 0x14 with at least one cached northbridge, and only
+ * when bit 2 of config register 0x164 on node 0's link device is clear.
+ */
+static __init void fix_erratum_688(void)
+{
+	struct pci_dev *link_dev;
+	u32 reg;
+
+	if (boot_cpu_data.x86 != 0x14 || !amd_northbridges.num)
+		return;
+
+	link_dev = node_to_amd_nb(0)->link;
+	if (!link_dev || pci_read_config_dword(link_dev, 0x164, &reg))
+		return;
+
+	if (reg & BIT(2))
+		return;
+
+	on_each_cpu(__fix_erratum_688, NULL, 0);
+
+	pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n");
+}
+
+/*
+ * Initcall: cache the northbridges, cache the GART flush words, then apply
+ * the erratum 688 workaround. The return value of amd_cache_northbridges()
+ * is not checked; this function always returns 0.
+ */
+static __init int init_amd_nbs(void)
+{
+ amd_cache_northbridges();
+ amd_cache_gart();
+
+ fix_erratum_688();
+
+ return 0;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 0000000..65721dc
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,404 @@
+/*
+ * apb_timer.c: Driver for Langwell APB timers
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * Langwell is the south complex of Intel Moorestown MID platform. There are
+ * eight external timers in total that can be used by the operating system.
+ * The timer information, such as frequency and addresses, is provided to the
+ * OS via SFI tables.
+ * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
+ * individual redirection table entries (RTE).
+ * Unlike HPET, there is no master counter, therefore one of the timers is
+ * used as clocksource. The overall allocation looks like:
+ * - timer 0 - NR_CPUs for per cpu timer
+ * - one timer for clocksource
+ * - one timer for watchdog driver.
+ * It is also worth noting that APB timer does not support true one-shot mode,
+ * free-running mode will be used here to emulate one-shot mode.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC
+ * timer, but by default APB timer has higher rating than local APIC timers.
+ */
+
+#include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/sfi.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#include <asm/fixmap.h>
+#include <asm/apb_timer.h>
+#include <asm/intel-mid.h>
+#include <asm/time.h>
+
+#define APBT_CLOCKEVENT_RATING 110
+#define APBT_CLOCKSOURCE_RATING 250
+
+#define APBT_CLOCKEVENT0_NUM (0)
+#define APBT_CLOCKSOURCE_NUM (2)
+
+static phys_addr_t apbt_address;
+static int apb_timer_block_enabled;
+static void __iomem *apbt_virt_address;
+
+/*
+ * Common DW APB timer info
+ */
+static unsigned long apbt_freq;
+
+struct apbt_dev {
+ struct dw_apb_clock_event_device *timer; /* NULL until the clockevent has been created */
+ unsigned int num; /* timer index; scaled by APBTMRS_REG_SIZE to locate its registers */
+ int cpu; /* CPU the timer IRQ affinity is set to */
+ unsigned int irq; /* IRQ number copied from the SFI timer table entry */
+ char name[10]; /* "apbt<N>", filled in by apbt_time_init() */
+};
+
+static struct dw_apb_clocksource *clocksource_apbt;
+
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
+{
+ return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; /* mapped base + num * per-timer size */
+}
+
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
+/* Map the APBT registers (once) and create the clocksource; panics if the map or the clocksource lookup fails. */
+static inline void apbt_set_mapping(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ int phy_cs_timer_id = 0;
+
+ if (apbt_virt_address) {
+ pr_debug("APBT base already mapped\n");
+ return;
+ }
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return; /* NOTE: returns without mapping anything or creating the clocksource */
+ }
+ apbt_address = (phys_addr_t)mtmr->phys_addr;
+ if (!apbt_address) {
+ printk(KERN_WARNING "No timer base from SFI, use default\n");
+ apbt_address = APBT_DEFAULT_BASE;
+ }
+ apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ if (!apbt_virt_address) {
+ pr_debug("Failed mapping APBT phy address at %lu\n",\
+ (unsigned long)apbt_address);
+ goto panic_noapbt;
+ }
+ apbt_freq = mtmr->freq_hz; /* frequency comes from the same SFI entry as the base */
+ sfi_free_mtmr(mtmr);
+
+ /* Now figure out the physical timer id for clocksource device */
+ mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
+ if (mtmr == NULL)
+ goto panic_noapbt;
+
+ /* Timer id = (low byte of the entry's physical address) / APBTMRS_REG_SIZE */
+ pr_debug("Use timer %d for clocksource\n",
+ (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+ APBTMRS_REG_SIZE;
+
+ clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+ "apbt0", apbt_virt_address + phy_cs_timer_id *
+ APBTMRS_REG_SIZE, apbt_freq);
+ return; /* the clocksource's SFI entry is not passed to sfi_free_mtmr() here */
+
+panic_noapbt:
+ panic("Failed to setup APB system timer\n");
+
+}
+
+static inline void apbt_clear_mapping(void)
+{
+ iounmap(apbt_virt_address); /* apbt_time_init() can get here with a NULL mapping */
+ apbt_virt_address = NULL; /* lets apbt_set_mapping() map again on a later call */
+}
+/* Create and register the current CPU's APB clockevent; returns 0, or -ENODEV if SFI has no MTMR. */
+static int __init apbt_clockevent_register(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev);
+
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); /* only tested for presence; freed before returning */
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return -ENODEV;
+ }
+
+ adev->num = smp_processor_id();
+ adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+ intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
+ APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, /* rating lowered by 100 in LAPIC_APBT mode */
+ adev_virt_addr(adev), 0, apbt_freq);
+ /* Firmware does EOI handling for us. */
+ adev->timer->eoi = NULL; /* NOTE(review): adev->timer is not NULL-checked before this */
+
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
+ global_clock_event = &adev->timer->ced;
+ printk(KERN_DEBUG "%s clockevent registered as global\n",
+ global_clock_event->name);
+ }
+
+ dw_apb_clockevent_register(adev->timer);
+
+ sfi_free_mtmr(mtmr);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); /* clear no flags, set IRQ_MOVE_PCNTXT */
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* target the timer IRQ at adev->cpu only */
+}
+
+/* Should be called on the CPU being set up: registers that CPU's own APB clockevent. */
+void apbt_setup_secondary_clock(void)
+{
+ struct apbt_dev *adev;
+ int cpu;
+
+ /* Don't register boot CPU clockevent */
+ cpu = smp_processor_id();
+ if (!cpu)
+ return;
+
+ adev = this_cpu_ptr(&cpu_apbt_dev);
+ if (!adev->timer) { /* no clockevent yet for this CPU: create one */
+ adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+ APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+ adev->irq, apbt_freq);
+ adev->timer->eoi = NULL;
+ } else { /* clockevent left over from an earlier online period: resume it */
+ dw_apb_clockevent_resume(adev->timer);
+ }
+
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+ cpu, adev->name, adev->cpu);
+
+ apbt_setup_irq(adev);
+ dw_apb_clockevent_register(adev->timer);
+
+ return;
+}
+
+/*
+ * This handler processes CPU hotplug "dead" events. In case of S0i3, nonboot
+ * CPUs are disabled/enabled frequently; for performance reasons we keep the
+ * per-CPU timer IRQ registered so that we don't need free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable percpu clockevent device
+ * without the notifier chain. currently, cpu 0 may get interrupts from other
+ * cpu timers during the offline process due to the ordering of notification.
+ * the extra interrupt is harmless.
+ */
+static int apbt_cpu_dead(unsigned int cpu)
+{
+ struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+ dw_apb_clockevent_pause(adev->timer); /* the dead CPU's timer is always paused */
+ if (system_state == SYSTEM_RUNNING) { /* offline while running: pause only */
+ pr_debug("skipping APBT CPU %u offline\n", cpu);
+ } else { /* any other system state: stop the clockevent as well */
+ pr_debug("APBT clockevent for cpu %u offline\n", cpu);
+ dw_apb_clockevent_stop(adev->timer);
+ }
+ return 0;
+}
+
+static __init int apbt_late_init(void)
+{
+ if (apb_timer_block_enabled &&
+ intel_mid_timer_options != INTEL_MID_TIMER_LAPIC_APBT)
+ return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead",
+ NULL, apbt_cpu_dead); /* hook the CPU-dead callback */
+ return 0; /* APB block disabled or LAPIC_APBT mode: nothing to hook */
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {} /* !CONFIG_SMP build: empty stub */
+
+#endif /* CONFIG_SMP */
+/* Start the APBT clocksource, check that it counts, then register it; returns 0 (panics if stuck). */
+static int apbt_clocksource_register(void)
+{
+ u64 start, now;
+ u64 t1;
+
+ /* Start the counter, use timer 2 as source, timer 0/1 for event */
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ /* Verify whether apbt counter works */
+ t1 = dw_apb_clocksource_read(clocksource_apbt);
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ now = rdtsc();
+ } while ((now - start) < 200000UL);
+
+ /* APBT is the only always on clocksource, it has to work! */
+ if (t1 == dw_apb_clocksource_read(clocksource_apbt)) /* same value after the delay */
+ panic("APBT counter not counting. APBT disabled\n");
+
+ dw_apb_clocksource_register(clocksource_apbt);
+
+ return 0; /* the only return value this function produces */
+}
+
+/*
+ * Early setup the APBT timer, only use timer 0 for booting then switch to
+ * per CPU timer if possible.
+ * Returns nothing: on success apb_timer_block_enabled is set and, on SMP
+ * without the LAPIC_APBT option, the per CPU timer data is filled in.
+ * panic if set up failed, this is the only platform timer on Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ struct sfi_timer_table_entry *p_mtmr;
+ struct apbt_dev *adev;
+#endif
+
+ if (apb_timer_block_enabled)
+ return;
+ apbt_set_mapping();
+ if (!apbt_virt_address) /* mapping can return without having mapped anything */
+ goto out_noapbt;
+ /*
+ * Read the frequency and check for a sane value, for ESL model
+ * we extend the possible clock range to allow time scaling.
+ */
+
+ if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+ pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
+ goto out_noapbt;
+ }
+ if (apbt_clocksource_register()) {
+ pr_debug("APBT has failed to register clocksource\n");
+ goto out_noapbt;
+ }
+ if (!apbt_clockevent_register())
+ apb_timer_block_enabled = 1;
+ else {
+ pr_debug("APBT has failed to register clockevent\n");
+ goto out_noapbt;
+ }
+#ifdef CONFIG_SMP
+ /* kernel cmdline disable apb timer, so we will use lapic timers */
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
+ printk(KERN_INFO "apbt: disabled per cpu timer\n");
+ return;
+ }
+ pr_debug("%s: %u CPUs online\n", __func__, num_online_cpus());
+ if (num_possible_cpus() <= sfi_mtimer_num)
+ apbt_num_timers_used = num_possible_cpus();
+ else
+ apbt_num_timers_used = 1; /* too few SFI timers for one per possible CPU */
+ pr_debug("%s: %u APB timers used\n", __func__, apbt_num_timers_used);
+
+ /* here we set up per CPU timer data structure */
+ for (i = 0; i < apbt_num_timers_used; i++) {
+ adev = &per_cpu(cpu_apbt_dev, i);
+ adev->num = i;
+ adev->cpu = i;
+ p_mtmr = sfi_get_mtmr(i);
+ if (p_mtmr)
+ adev->irq = p_mtmr->irq;
+ else
+ printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+ snprintf(adev->name, sizeof(adev->name), "apbt%d", i); /* size already covers the NUL */
+ }
+#endif
+
+ return;
+
+out_noapbt:
+ apbt_clear_mapping();
+ apb_timer_block_enabled = 0;
+ panic("failed to enable APB timer\n");
+}
+
+/* called before apb_timer_enable, use early map; returns the TSC frequency in kHz, or 0 on failure */
+unsigned long apbt_quick_calibrate(void)
+{
+ int i, scale;
+ u64 old, new;
+ u64 t1, t2;
+ unsigned long khz = 0;
+ u32 loop, shift;
+
+ apbt_set_mapping();
+ if (!clocksource_apbt) /* mapping can return without creating the clocksource */
+ return 0;
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ /* check if the timer can count down, otherwise return */
+ old = dw_apb_clocksource_read(clocksource_apbt);
+ i = 10000;
+ while (--i) {
+ if (old != dw_apb_clocksource_read(clocksource_apbt))
+ break;
+ }
+ if (!i) /* value never changed in 9999 reads: counter is stuck */
+ return 0;
+
+ /* count 16 ms */
+ loop = (apbt_freq / 1000) << 4;
+
+ /* restart the timer to ensure it won't get to 0 in the calibration */
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ old = dw_apb_clocksource_read(clocksource_apbt);
+ old += loop; /* target reading: now + 16 ms worth of timer ticks */
+
+ t1 = rdtsc();
+
+ do {
+ new = dw_apb_clocksource_read(clocksource_apbt);
+ } while (new < old);
+
+ t2 = rdtsc();
+
+ shift = 5;
+ if (unlikely(loop >> shift == 0)) {
+ printk(KERN_INFO
+ "APBT TSC calibration failed, not enough resolution\n");
+ return 0;
+ }
+ scale = (int)div_u64((t2 - t1), loop >> shift); /* TSC cycles per timer tick, scaled by 2^shift */
+ khz = (scale * (apbt_freq / 1000)) >> shift; /* times ticks per ms, unscaled: TSC cycles per ms */
+ printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+ return khz;
+}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 0000000..2c4d5ec
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Firmware replacement code.
+ *
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set too small aperture.
+ *
+ * If all fails map the aperture over some low memory. This is cheaper than
+ * doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#define pr_fmt(fmt) "AGP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/suspend.h>
+#include <asm/e820/api.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+#include <linux/crash_dump.h>
+
+/*
+ * Using 512M as goal, in case kexec will load kernel_big
+ * that will do the in-place decompress, and could overlap
+ * with the gart aperture that is used.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area becomes e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+ * So don't use the memory below 512M for the gart iommu; leave that
+ * space for kernel code to be safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
+
+int gart_iommu_aperture;
+int gart_iommu_aperture_disabled __initdata;
+int gart_iommu_aperture_allowed __initdata;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata;
+
+int fix_aperture __initdata = 1;
+
+#ifdef CONFIG_PROC_VMCORE
+/*
+ * If the first kernel maps the aperture over e820 RAM, the kdump kernel will
+ * use the same range because it will remain configured in the northbridge.
+ * Trying to dump this area via /proc/vmcore may crash the machine, so exclude
+ * it from vmcore.
+ */
+static unsigned long aperture_pfn_start, aperture_page_count;
+
+static int gart_oldmem_pfn_is_ram(unsigned long pfn)
+{
+ return likely(!(pfn >= aperture_pfn_start && /* non-zero unless pfn lies inside the aperture */
+ pfn < aperture_pfn_start + aperture_page_count));
+}
+/* Record the aperture's pfn range (32MB << aper_order bytes at aper_base) and register the vmcore filter. */
+static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
+{
+ aperture_pfn_start = aper_base >> PAGE_SHIFT;
+ aperture_page_count = (32ULL << 20 << aper_order) >> PAGE_SHIFT; /* 64-bit math: an int shift goes negative at order 6 */
+ WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); /* warns on a non-zero return */
+}
+#else
+static void exclude_from_vmcore(u64 aper_base, u32 aper_order)
+{
+} /* empty stub used when CONFIG_PROC_VMCORE is not set */
+#endif
+
+/* This code runs before the PCI subsystem is initialized, so just
+ access the northbridge directly. */
+/* Find and reserve a size-aligned RAM block in [GART_MIN_ADDR, GART_MAX_ADDR); returns its address or 0. */
+static u32 __init allocate_aperture(void)
+{
+ u32 aper_size;
+ unsigned long addr;
+
+ /* aper_size should <= 1G */
+ if (fallback_aper_order > 5)
+ fallback_aper_order = 5;
+ aper_size = (32 * 1024 * 1024) << fallback_aper_order;
+
+ /*
+ * Aperture has to be naturally aligned. This means a 2GB aperture
+ * won't have much chance of finding a place in the lower 4GB of
+ * memory. Unfortunately we cannot move it up because that would
+ * make the IOMMU useless.
+ */
+ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
+ aper_size, aper_size); /* the size is also passed as the alignment */
+ if (!addr) {
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10); /* NOTE(review): addr is 0 here, so the range printed starts at 0 */
+ return 0;
+ }
+ memblock_reserve(addr, aper_size);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
+ register_nosave_region(addr >> PAGE_SHIFT,
+ (addr+aper_size) >> PAGE_SHIFT);
+
+ return (u32)addr; /* search window ends at GART_MAX_ADDR (1ULL << 32) */
+}
+
+
+/* Find a PCI capability: returns its config-space offset, or 0 when it is not found */
+static u32 __init find_cap(int bus, int slot, int func, int cap)
+{
+ int hops = 0;
+ u8 ptr, cap_id;
+ u16 status = read_pci_config_16(bus, slot, func, PCI_STATUS);
+
+ /* Status register advertises no capability list: nothing to search. */
+ if ((status & PCI_STATUS_CAP_LIST) == 0)
+ return 0;
+
+ /* Follow at most 48 links; a pointer below 0x40 ends the walk. */
+ ptr = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
+ while (hops++ < 48 && ptr >= 0x40) {
+ ptr &= ~3; /* clear the two low bits before using the pointer */
+ cap_id = read_pci_config_byte(bus, slot, func, ptr + PCI_CAP_LIST_ID);
+ if (cap_id == 0xff)
+ return 0; /* all-ones ID: give up */
+ if (cap_id == cap)
+ return ptr;
+ ptr = read_pci_config_byte(bus, slot, func,
+ ptr + PCI_CAP_LIST_NEXT);
+ }
+ return 0;
+}
+
+/* Read a standard AGPv3 bridge header; returns the aperture base, or 0 if it is unusable */
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
+{
+ u32 apsize;
+ u32 apsizereg;
+ int nbits;
+ u32 aper_low, aper_hi;
+ u64 aper;
+ u32 old_order;
+
+ pr_info("pci 0000:%02x:%02x.%d: AGP bridge\n", bus, slot, func);
+ apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
+ if (apsizereg == 0xffff) { /* 16-bit read: all ones is 0xffff, never 0xffffffff */
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
+ return 0; /* *order is left as the caller passed it */
+ }
+
+ /* old_order could be the value from NB gart setting */
+ old_order = *order;
+
+ apsize = apsizereg & 0xfff;
+ /* Some BIOS use weird encodings not in the AGPv3 table. */
+ if (apsize & 0xff)
+ apsize |= 0xf00;
+ nbits = hweight16(apsize);
+ *order = 7 - nbits; /* aperture size is 32MB << *order */
+ if ((int)*order < 0) /* < 32MB */
+ *order = 0;
+
+ aper_low = read_pci_config(bus, slot, func, 0x10);
+ aper_hi = read_pci_config(bus, slot, func, 0x14);
+ aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+ /*
+ * On some sick chips, APSIZE is 0. It means it wants 4G
+ * so let double check that order, and lets trust AMD NB settings:
+ */
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
+ if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
+ *order = old_order;
+ }
+
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
+
+ if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
+ return 0;
+ return (u32)aper;
+}
+
+/*
+ * Look for an AGP bridge. Windows only expects the aperture in the
+ * AGP bridge and some BIOS forget to initialize the Northbridge too.
+ * Work around this here.
+ *
+ * Do a PCI bus scan by hand because we're running before the PCI
+ * subsystem.
+ *
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
+ * generically. It's probably overkill to always scan all slots because
+ * the AGP bridges should always be on their own bus on the HT hierarchy,
+ * but do it here for future safety.
+ */
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+ int bus, slot, func;
+
+ /* Poor man's PCI discovery */
+ for (bus = 0; bus < 256; bus++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class, cap;
+ u8 type;
+ class = read_pci_config(bus, slot, func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff) /* all ones: skip the slot's remaining functions */
+ break;
+
+ switch (class >> 16) {
+ case PCI_CLASS_BRIDGE_HOST:
+ case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+ /* AGP bridge? */
+ cap = find_cap(bus, slot, func,
+ PCI_CAP_ID_AGP);
+ if (!cap)
+ break; /* leaves the switch only; the scan continues */
+ *valid_agp = 1;
+ return read_agp(bus, slot, func, cap,
+ order); /* the first bridge with an AGP capability decides the result */
+ }
+
+ /* No multi-function device? */
+ type = read_pci_config_byte(bus, slot, func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+ pr_info("No AGP bridge found\n");
+
+ return 0; /* *valid_agp and *order are left untouched on this path */
+}
+
+static bool gart_fix_e820 __initdata = true;
+
+static int __init parse_gart_mem(char *p)
+{
+ return kstrtobool(p, &gart_fix_e820); /* "gart_fix_e820=" handler: parser result is returned as is */
+}
+early_param("gart_fix_e820", parse_gart_mem);
+
+void __init early_gart_iommu_check(void)
+{
+ /*
+ * The GART may already be enabled, especially after kexec/kdump where
+ * the previous kernel turned it on. A memset called by
+ * allocate_aperture/__alloc_bootmem_nopanic can then cause a restart,
+ * or the second kernel may place the GART hole elsewhere and use as
+ * RAM a range that the GART set up by the first kernel still uses,
+ * or the BIOS may have forgotten to mark the range reserved.
+ * Try to update e820 so that the region is reserved, then disable the
+ * GART on every northbridge unless a valid AGP bridge was found.
+ */
+ u32 agp_aper_order = 0;
+ int i, fix, slot, valid_agp = 0;
+ u32 ctl;
+ u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
+ u64 aper_base = 0, last_aper_base = 0;
+ int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
+
+ if (!amd_gart_present())
+ return;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* This is mostly duplicate of iommu_hole_init */
+ search_agp_bridge(&agp_aper_order, &valid_agp); /* only valid_agp is used afterwards */
+
+ fix = 0;
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ aper_enabled = ctl & GARTEN;
+ aper_order = (ctl >> 1) & 7; /* 3-bit field: aperture is 32MB << order */
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ if (last_valid) {
+ if ((aper_order != last_aper_order) ||
+ (aper_base != last_aper_base) ||
+ (aper_enabled != last_aper_enabled)) {
+ fix = 1; /* northbridges disagree with each other */
+ break; /* leaves the slot loop only */
+ }
+ }
+
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ last_aper_enabled = aper_enabled;
+ last_valid = 1;
+ }
+ }
+
+ if (!fix && !aper_enabled) /* consistent and disabled: nothing to do */
+ return;
+
+ if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
+ fix = 1;
+
+ if (gart_fix_e820 && !fix && aper_enabled) {
+ if (e820__mapped_any(aper_base, aper_base + aper_size,
+ E820_TYPE_RAM)) {
+ /* reserve it, so we can reuse it in second kernel */
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
+ e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED);
+ e820__update_table_print();
+ }
+ }
+
+ if (valid_agp)
+ return;
+
+ /* disable them all at first */
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { /* stop at the entry whose dev_limit is 0 */
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ }
+ }
+
+}
+
+static int __initdata printed_gart_size_msg;
+/* Check the northbridge GART apertures and fix them up if needed. Returns 1 if an aperture is in place, 0 if none, -ENODEV if GART is absent or disabled. */
+int __init gart_iommu_hole_init(void)
+{
+ u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
+ u64 aper_base, last_aper_base = 0;
+ int fix, slot, valid_agp = 0;
+ int i, node;
+
+ if (!amd_gart_present())
+ return -ENODEV;
+
+ if (gart_iommu_aperture_disabled || !fix_aperture ||
+ !early_pci_allowed())
+ return -ENODEV;
+
+ pr_info("Checking aperture...\n");
+
+ if (!fallback_aper_force)
+ agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
+ fix = 0;
+ node = 0;
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { /* stop at the entry whose dev_limit is 0 */
+ int bus;
+ int dev_base, dev_limit;
+ u32 ctl;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
+
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+
+ /*
+ * Before we do anything else disable the GART. It may
+ * still be enabled if we boot into a crash-kernel here.
+ * Reconfiguring the GART while it is enabled could have
+ * unknown side-effects.
+ */
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+ aper_order = (ctl >> 1) & 7; /* 3-bit field: aperture is 32MB << order */
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
+ node++;
+
+ if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+ if (valid_agp && agp_aper_base &&
+ agp_aper_base == aper_base &&
+ agp_aper_order == aper_order) {
+ /* the same between two setting from NB and agp */
+ if (!no_iommu &&
+ max_pfn > MAX_DMA32_PFN &&
+ !printed_gart_size_msg) {
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
+ printed_gart_size_msg = 1;
+ }
+ } else {
+ fix = 1;
+ goto out;
+ }
+ }
+
+ if ((last_aper_order && aper_order != last_aper_order) ||
+ (last_aper_base && aper_base != last_aper_base)) {
+ fix = 1; /* northbridges disagree with each other */
+ goto out;
+ }
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ }
+ }
+
+out:
+ if (!fix && !fallback_aper_force) {
+ if (last_aper_base) {
+ /*
+ * If this is the kdump kernel, the first kernel
+ * may have allocated the range over its e820 RAM
+ * and fixed up the northbridge
+ */
+ exclude_from_vmcore(last_aper_base, last_aper_order);
+
+ return 1; /* existing northbridge aperture is kept as is */
+ }
+ return 0;
+ }
+
+ if (!fallback_aper_force) {
+ aper_alloc = agp_aper_base;
+ aper_order = agp_aper_order;
+ }
+
+ if (aper_alloc) {
+ /* Got the aperture from the AGP bridge */
+ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ force_iommu ||
+ valid_agp ||
+ fallback_aper_force) {
+ pr_info("Your BIOS doesn't leave an aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
+
+ aper_order = fallback_aper_order;
+ aper_alloc = allocate_aperture();
+ if (!aper_alloc) {
+ /*
+ * Could disable AGP and IOMMU here, but it's
+ * probably not worth it. But the later users
+ * cannot deal with bad apertures and turning
+ * on the aperture over memory causes very
+ * strange problems, so it's better to panic
+ * early.
+ */
+ panic("Not enough memory for aperture");
+ }
+ } else {
+ return 0; /* no aperture available and none of the conditions above asks for one */
+ }
+
+ /*
+ * If this is the kdump kernel _and_ the first kernel did not
+ * configure the aperture in the northbridge, this range may
+ * overlap with the first kernel's memory. We can't access the
+ * range through vmcore even though it should be part of the dump.
+ */
+ exclude_from_vmcore(aper_alloc, aper_order);
+
+ /* Fix up the north bridges */
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { /* stop at the entry whose dev_limit is 0 */
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = aper_order << 1;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+ }
+ }
+
+ set_up_gart_resume(aper_order, aper_alloc);
+
+ return 1; /* every northbridge now carries the chosen order and base */
+}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
new file mode 100644
index 0000000..a6fcaf1
--- /dev/null
+++ b/arch/x86/kernel/apic/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for local APIC drivers and for the IO-APIC code
+#
+
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particular, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o
+obj-y += hw_nmi.o
+
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_PCI_MSI) += msi.o
+obj-$(CONFIG_SMP) += ipi.o
+
+ifeq ($(CONFIG_X86_64),y)
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
+obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
+endif
+
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+
+# For 32bit, probe_32 need to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
new file mode 100644
index 0000000..84132ed
--- /dev/null
+++ b/arch/x86/kernel/apic/apic.c
@@ -0,0 +1,2766 @@
+/*
+ * Local APIC handling, local APIC timers
+ *
+ * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively.
+ * Maciej W. Rozycki : Various updates and fixes.
+ * Mikael Pettersson : Power Management for UP-APIC.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
+#include <linux/dmar.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/trace/irq_vectors.h>
+#include <asm/irq_remapping.h>
+#include <asm/perf_event.h>
+#include <asm/x86_init.h>
+#include <asm/pgalloc.h>
+#include <linux/atomic.h>
+#include <asm/mpspec.h>
+#include <asm/i8259.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/mtrr.h>
+#include <asm/time.h>
+#include <asm/smp.h>
+#include <asm/mce.h>
+#include <asm/tsc.h>
+#include <asm/hypervisor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/irq_regs.h>
+
+unsigned int num_processors;
+
+unsigned disabled_cpus;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+
+u8 boot_cpu_apic_version;
+
+/*
+ * The highest APIC ID seen during enumeration.
+ */
+static unsigned int max_physical_apicid;
+
+/*
+ * Bitmask of physically existing CPUs:
+ */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+
+/*
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
+ */
+static int apic_extnmi = APIC_EXTNMI_BSP;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
+
+#ifdef CONFIG_X86_32
+
+/*
+ * On x86_32, the mapping between cpu and logical apicid may vary
+ * depending on apic in use. The following early percpu variable is
+ * used for the mapping. This is where the behaviors of x86_64 and 32
+ * actually diverge. Let's keep it ugly for now.
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
+
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+/* Select the IMCR through port 0x22, then store @mode through port 0x23. */
+static inline void imcr_set_mode(u8 mode)
+{
+	outb(0x70, 0x22);
+	outb(mode, 0x23);
+}
+
+/* Leave PIC Mode: NMI and 8259 INTR go through the APIC. */
+static inline void imcr_pic_to_apic(void)
+{
+	imcr_set_mode(0x01);
+}
+
+/* Return to PIC Mode: NMI and 8259 INTR go directly to the BSP. */
+static inline void imcr_apic_to_pic(void)
+{
+	imcr_set_mode(0x00);
+}
+#endif
+
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+
+/*
+ * "lapic" early parameter.
+ *
+ * Without a value it force-enables the local APIC, which only takes effect
+ * on CONFIG_X86_32. A value starting with "notscdeadline" clears the
+ * TSC deadline timer CPU capability. Always returns 0.
+ */
+static int __init parse_lapic(char *arg)
+{
+	if (!arg) {
+		if (IS_ENABLED(CONFIG_X86_32))
+			force_enable_local_apic = 1;
+		return 0;
+	}
+
+	if (strncmp(arg, "notscdeadline", 13) == 0)
+		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+
+	return 0;
+}
+early_param("lapic", parse_lapic);
+
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+ apic_calibrate_pmtmr = 1; /* remember that "apicpmtimer" was given on the command line */
+ notsc_setup(NULL); /* NOTE(review): presumably the same effect as the "notsc" option - confirm */
+ return 0; /* @s is never looked at */
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+unsigned long mp_lapic_addr;
+int disable_apic;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int disable_apic_timer __initdata;
+/* Local APIC timer works in C2 */
+int local_apic_timer_c2_ok;
+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+unsigned int apic_verbosity;
+
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
+static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+unsigned int lapic_timer_frequency = 0;
+
+static void apic_pm_activate(void);
+
+static unsigned long apic_phys;
+
+/*
+ * Get the LAPIC version: read the APIC_LVR register and return the
+ * field that GET_APIC_VERSION() extracts from it.
+ */
+static inline int lapic_get_version(void)
+{
+ return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check, if the APIC is integrated or a separate chip.
+ * The answer is whatever APIC_INTEGRATED() yields for the LAPIC version.
+ */
+static inline int lapic_is_integrated(void)
+{
+ return APIC_INTEGRATED(lapic_get_version());
+}
+
+/*
+ * Tell a modern APIC from a first generation one.
+ *
+ * Returns non-zero for AMD CPUs of family 0xf and later (their APIC version
+ * numbers are old, so the CPU is checked instead) and for any APIC whose
+ * version is at least 0x14.
+ */
+static int modern_apic(void)
+{
+	bool amd_fam_f_plus = boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+			      boot_cpu_data.x86 >= 0xf;
+
+	if (!amd_fam_f_plus)
+		return lapic_get_version() >= 0x14;
+
+	return 1;
+}
+
+/*
+ * Point the global 'apic' driver at apic_noop and log the switch.
+ * Right after this call the APIC is NOOP driven, so apic->write/read
+ * do not do anything.
+ */
+static void __init apic_disable(void)
+{
+ pr_info("APIC: switched to apic NOOP\n");
+ apic = &apic_noop;
+}
+
+/* Spin, without any timeout, until APIC_ICR no longer reports APIC_ICR_BUSY. */
+void native_apic_wait_icr_idle(void)
+{
+	for (;;) {
+		if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY))
+			break;
+		cpu_relax();
+	}
+}
+
+/*
+ * Bounded variant of native_apic_wait_icr_idle().
+ *
+ * Polls APIC_ICR up to 1001 times, 100us apart, counting every retry in the
+ * icr_read_retry_count irq statistic. Returns 0 once the busy bit is clear,
+ * or the still-set APIC_ICR_BUSY bit if the polls are exhausted.
+ */
+u32 native_safe_apic_wait_icr_idle(void)
+{
+	u32 busy;
+	int retries = 0;
+
+	for (;;) {
+		busy = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+		if (!busy)
+			break;
+
+		inc_irq_stat(icr_read_retry_count);
+		udelay(100);
+
+		if (retries++ >= 1000)
+			break;
+	}
+
+	return busy;
+}
+
+void native_apic_icr_write(u32 low, u32 id)
+{
+ unsigned long flags;
+ /*
+ * Write the destination built from @id to APIC_ICR2 and then @low to
+ * APIC_ICR, with local interrupts disabled around the two writes.
+ */
+ local_irq_save(flags);
+ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); /* destination first */
+ apic_write(APIC_ICR, low); /* then the low word */
+ local_irq_restore(flags);
+}
+
+/*
+ * Return both ICR registers as one 64-bit value: APIC_ICR2 (read first) in
+ * the upper 32 bits, APIC_ICR in the lower 32 bits.
+ */
+u64 native_apic_icr_read(void)
+{
+	u64 hi = apic_read(APIC_ICR2);
+	u64 lo = apic_read(APIC_ICR);
+
+	return (hi << 32) | lo;
+}
+
+#ifdef CONFIG_X86_32
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ *
+ * Return: 0xff when modern_apic() reports a modern APIC, 0xf otherwise.
+ */
+int get_physical_broadcast(void)
+{
+	if (modern_apic())
+		return 0xff;
+
+	return 0xf;
+}
+#endif
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ *
+ * Return: the MAXLVT field of APIC_LVR for an integrated APIC (always the
+ * case in 64bit mode), or 2 for an 82489DX, which does not report the
+ * number of LVT entries.
+ */
+int lapic_get_maxlvt(void)
+{
+	if (!lapic_is_integrated())
+		return 2;
+
+	return GET_APIC_MAXLVT(apic_read(APIC_LVR));
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor */
+#define APIC_DIVISOR 16
+#define TSC_DIVISOR 8
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * @clocks: timeout in APIC bus clocks. It is only used in periodic mode,
+ * where clocks / APIC_DIVISOR is written to APIC_TMICT.
+ * @oneshot: zero selects periodic mode. Non-zero selects one-shot mode, or
+ * TSC-deadline mode when the boot CPU has
+ * X86_FEATURE_TSC_DEADLINE_TIMER.
+ * @irqen: zero leaves the timer LVT entry masked.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
+{
+ unsigned int lvtt_value, tmp_value;
+
+ lvtt_value = LOCAL_TIMER_VECTOR; /* start from the bare timer vector, then OR in mode bits */
+ if (!oneshot)
+ lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+
+ if (!lapic_is_integrated())
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); /* only for a non-integrated APIC */
+
+ if (!irqen)
+ lvtt_value |= APIC_LVT_MASKED;
+
+ apic_write(APIC_LVTT, lvtt_value);
+
+ if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ /*
+ * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+ * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+ * According to Intel, MFENCE can do the serialization here.
+ */
+ asm volatile("mfence" : : : "memory");
+
+ printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
+ return; /* deadline mode: neither the divider nor the initial count is programmed */
+ }
+
+ /*
+ * Divide PICLK by 16: read APIC_TDCR, clear the APIC_TDR_DIV_1 and
+ * APIC_TDR_DIV_TMBASE bits, set APIC_TDR_DIV_16 and write it back.
+ */
+ tmp_value = apic_read(APIC_TDCR);
+ apic_write(APIC_TDCR,
+ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+ APIC_TDR_DIV_16);
+
+ if (!oneshot)
+ apic_write(APIC_TMICT, clocks / APIC_DIVISOR); /* the initial count is only loaded in periodic mode */
+}
+
+/*
+ * Setup extended LVT, AMD specific
+ *
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
+ *
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determines the offsets, it is not
+ * necessarily a BIOS bug.
+ */
+
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
+
+/*
+ * An extended LVT entry may go from @old to @new if the old entry is masked,
+ * if the new value is exactly APIC_EILVT_MASKED (the entry is being freed),
+ * or if the new value, ignoring its mask bit, equals the old one.
+ * Returns 1 if the change is allowed, 0 otherwise.
+ */
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
+{
+	if (old & APIC_EILVT_MASKED)
+		return 1;
+
+	if (new == APIC_EILVT_MASKED)
+		return 1;
+
+	return (new & ~APIC_EILVT_MASKED) == old;
+}
+
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
+{
+ unsigned int rsvd, vector;
+ /*
+ * Record @new in eilvt_offsets[@offset], the software-side bookkeeping
+ * of extended LVT offsets. Returns @new once the slot holds it, the
+ * existing reservation when that one may not be replaced, or ~0 when
+ * @offset is not below APIC_EILVT_NR_MAX.
+ */
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]);
+ do {
+ vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */
+ if (vector && !eilvt_entry_is_changeable(vector, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); /* yields the value found in the slot */
+ } while (rsvd != new); /* loop until the slot is observed to hold @new */
+
+ rsvd &= ~APIC_EILVT_MASKED;
+ if (rsvd && rsvd != vector) /* log only when the unmasked value differs from the last one seen before the exchange */
+ pr_info("LVT offset %d assigned for vector 0x%02x\n",
+ offset, rsvd);
+
+ return new;
+}
+
+/*
+ * Program the extended LVT register at @offset with @vector, @msg_type and
+ * @mask. If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
+ *
+ * Returns 0 after the register was written, -EINVAL if the offset is
+ * already reserved for a different entry, or -EBUSY if the value currently
+ * in the register may not be replaced by the new one.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
+{
+	const unsigned long lvt_reg = APIC_EILVTn(offset);
+	const unsigned int entry = (mask << 16) | (msg_type << 8) | vector;
+	const unsigned int hw_entry = apic_read(lvt_reg);
+	const unsigned int owner = reserve_eilvt_offset(offset, entry);
+
+	/* The software bookkeeping holds something else for this offset */
+	if (owner != entry) {
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on another cpu\n",
+		       smp_processor_id(), lvt_reg, offset, entry, owner);
+		return -EINVAL;
+	}
+
+	/* The register on this CPU already carries a conflicting entry */
+	if (!eilvt_entry_is_changeable(hw_entry, entry)) {
+		pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+		       "vector 0x%x, but the register is already in use for "
+		       "vector 0x%x on this cpu\n",
+		       smp_processor_id(), lvt_reg, offset, entry, hw_entry);
+		return -EBUSY;
+	}
+
+	apic_write(lvt_reg, entry);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
+
+/*
+ * Program the next event, relative to now: @delta is written to
+ * APIC_TMICT. @evt is not used. Always returns 0.
+ */
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ apic_write(APIC_TMICT, delta);
+ return 0;
+}
+
+/*
+ * Program the next event in TSC-deadline mode: the deadline written to
+ * MSR_IA32_TSC_DEADLINE is the current TSC plus @delta scaled by
+ * TSC_DIVISOR. @evt is not used. Always returns 0.
+ */
+static int lapic_next_deadline(unsigned long delta,
+			       struct clock_event_device *evt)
+{
+	const u64 now = rdtsc();
+	const u64 deadline = now + (u64)delta * TSC_DIVISOR;
+
+	wrmsrl(MSR_IA32_TSC_DEADLINE, deadline);
+	return 0;
+}
+
+/*
+ * Stop the local APIC timer: set APIC_LVT_MASKED and LOCAL_TIMER_VECTOR in
+ * APIC_LVTT and write 0 to APIC_TMICT. Nothing is touched when the device
+ * only serves as a dummy for broadcast. Always returns 0.
+ */
+static int lapic_timer_shutdown(struct clock_event_device *evt)
+{
+	if (!(evt->features & CLOCK_EVT_FEAT_DUMMY)) {
+		unsigned int lvtt = apic_read(APIC_LVTT);
+
+		apic_write(APIC_LVTT,
+			   lvtt | APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+		apic_write(APIC_TMICT, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * Common part of the periodic and one-shot state callbacks: program the
+ * timer LVT from lapic_timer_frequency with interrupts enabled, in the mode
+ * chosen by @oneshot. A device that only serves as a dummy for broadcast is
+ * left alone. Always returns 0.
+ */
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+	if (!(evt->features & CLOCK_EVT_FEAT_DUMMY))
+		__setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+
+	return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, false); /* oneshot=false: periodic mode */
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, true); /* oneshot=true: one-shot (or TSC-deadline) mode */
+}
+
+/*
+ * Local APIC timer broadcast function: send LOCAL_TIMER_VECTOR as an IPI
+ * to the CPUs in @mask through the current apic driver. The body is empty
+ * when CONFIG_SMP is not set.
+ */
+static void lapic_timer_broadcast(const struct cpumask *mask)
+{
+#ifdef CONFIG_SMP
+ apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+#endif
+}
+
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ *
+ * lapic_clockevent is the template that setup_APIC_timer() copies into the
+ * per-CPU lapic_events device before registering it. It starts out flagged
+ * as CLOCK_EVT_FEAT_DUMMY; the calibration/boot setup code clears that flag
+ * once the timer is usable.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_state_oneshot_stopped = lapic_timer_shutdown, /* same callback as a full shutdown */
+ .set_next_event = lapic_next_event, /* swapped for lapic_next_deadline in setup_APIC_timer() on TSC-deadline CPUs */
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100, /* raised to 150 in setup_APIC_timer() when the CPU has X86_FEATURE_ARAT */
+ .irq = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events); /* per-CPU copies of the template above */
+
+/*
+ * Entry builders for deadline_match[]: Intel, family 6, any feature.
+ * The _FUNC form stores the address of a function returning the microcode
+ * revision, the _REV form stores the revision itself as an immediate.
+ */
+#define DEADLINE_MODEL_MATCH_FUNC(cpu_model, func)		\
+	{							\
+		.vendor		= X86_VENDOR_INTEL,		\
+		.family		= 6,				\
+		.model		= cpu_model,			\
+		.feature	= X86_FEATURE_ANY,		\
+		.driver_data	= (unsigned long)&func		\
+	}
+
+#define DEADLINE_MODEL_MATCH_REV(cpu_model, rev)		\
+	{							\
+		.vendor		= X86_VENDOR_INTEL,		\
+		.family		= 6,				\
+		.model		= cpu_model,			\
+		.feature	= X86_FEATURE_ANY,		\
+		.driver_data	= (unsigned long)rev		\
+	}
+
+/*
+ * Microcode revision compared against by apic_check_deadline_errata() for
+ * INTEL_FAM6_HASWELL_X, selected by the boot CPU stepping. ~0U is returned
+ * for steppings that are not listed.
+ */
+static u32 hsx_deadline_rev(void)
+{
+	const unsigned int stepping = boot_cpu_data.x86_stepping;
+
+	if (stepping == 0x02)
+		return 0x3a; /* EP */
+	if (stepping == 0x04)
+		return 0x0f; /* EX */
+
+	return ~0U;
+}
+
+/*
+ * Microcode revision compared against by apic_check_deadline_errata() for
+ * INTEL_FAM6_BROADWELL_XEON_D, selected by the boot CPU stepping. ~0U is
+ * returned for steppings that are not listed.
+ */
+static u32 bdx_deadline_rev(void)
+{
+	const unsigned int stepping = boot_cpu_data.x86_stepping;
+
+	if (stepping == 0x02)
+		return 0x00000011;
+	if (stepping == 0x03)
+		return 0x0700000e;
+	if (stepping == 0x04)
+		return 0x0f00000c;
+	if (stepping == 0x05)
+		return 0x0e000003;
+
+	return ~0U;
+}
+
+/*
+ * Microcode revision compared against by apic_check_deadline_errata() for
+ * INTEL_FAM6_SKYLAKE_X, selected by the boot CPU stepping. Steppings above
+ * 4 yield 0, so any microcode revision passes; steppings below 3 yield ~0U.
+ */
+static u32 skx_deadline_rev(void)
+{
+	const unsigned int stepping = boot_cpu_data.x86_stepping;
+
+	if (stepping == 0x03)
+		return 0x01000136;
+	if (stepping == 0x04)
+		return 0x02000014;
+
+	return stepping > 4 ? 0 : ~0U;
+}
+
+static const struct x86_cpu_id deadline_match[] = { /* CPU models checked by apic_check_deadline_errata(); driver_data is a revision or a function returning one */
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52),
+
+ {}, /* empty terminating entry */
+};
+
+/*
+ * Clear X86_FEATURE_TSC_DEADLINE_TIMER when the boot CPU is listed in
+ * deadline_match[] and its microcode is older than the revision recorded
+ * for it. Nothing is checked when the feature is absent or when running
+ * under a hypervisor.
+ */
+static void apic_check_deadline_errata(void)
+{
+	const struct x86_cpu_id *match;
+	u32 min_rev;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+		return;
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return;
+
+	match = x86_match_cpu(deadline_match);
+	if (!match)
+		return;
+
+	/*
+	 * Function pointers will have the MSB set due to address layout,
+	 * immediate revisions will not.
+	 */
+	if ((long)match->driver_data >= 0)
+		min_rev = (u32)match->driver_data;
+	else
+		min_rev = ((u32 (*)(void))(match->driver_data))();
+
+	if (boot_cpu_data.microcode < min_rev) {
+		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+		pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
+		       "please update microcode to version: 0x%x (or later)\n", min_rev);
+	}
+}
+
+/*
+ * Set up the local APIC timer for the current CPU.
+ *
+ * The per-CPU device is a copy of the lapic_clockevent template (adjusted
+ * first if the CPU has ARAT), bound to this CPU. With the TSC deadline
+ * timer it is renamed, loses the PERIODIC and DUMMY features and is
+ * registered with a TSC-derived frequency; otherwise it is registered as is.
+ */
+static void setup_APIC_timer(void)
+{
+	struct clock_event_device *dev = this_cpu_ptr(&lapic_events);
+
+	if (this_cpu_has(X86_FEATURE_ARAT)) {
+		/* Make LAPIC timer preferrable over percpu HPET */
+		lapic_clockevent.rating = 150;
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
+	}
+
+	memcpy(dev, &lapic_clockevent, sizeof(*dev));
+	dev->cpumask = cpumask_of(smp_processor_id());
+
+	if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+		clockevents_register_device(dev);
+		return;
+	}
+
+	dev->name = "lapic-deadline";
+	dev->features &= ~(CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY);
+	dev->set_next_event = lapic_next_deadline;
+	clockevents_config_and_register(dev, tsc_khz * (1000 / TSC_DIVISOR),
+					0xF, ~0UL);
+}
+
+/*
+ * Per-CPU worker of lapic_update_tsc_freq(): hand the recalibrated TSC
+ * frequency to this CPU's clockevent device. Does nothing on a CPU without
+ * the TSC deadline timer. @info is not used.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+	struct clock_event_device *dev = this_cpu_ptr(&lapic_events);
+
+	if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+		clockevents_update_freq(dev, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+ /*
+ * The clockevent device's ->mult and ->shift can both be
+ * changed. In order to avoid races, schedule the frequency
+ * update code on each CPU. __lapic_update_tsc_freq() itself
+ * skips CPUs without X86_FEATURE_TSC_DEADLINE_TIMER.
+ */
+ on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
+}
+
+/*
+ * In this functions we calibrate APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once since we want to have local timer
+ * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a wrap
+ * around to find out, that a tick has elapsed. I have a box, where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS (HZ/10) /* handler invocations between the first and the second sample */
+
+static __initdata int lapic_cal_loops = -1; /* invocation counter of lapic_cal_handler(); samples are taken at 0 and LAPIC_CAL_LOOPS */
+static __initdata long lapic_cal_t1, lapic_cal_t2; /* APIC_TMCCT readings at the two sample points */
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; /* TSC readings (0 without X86_FEATURE_TSC) */
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; /* acpi_pm_read_early() readings; pm2 is overrun-corrected */
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* jiffies at the two sample points */
+
+/*
+ * Temporary interrupt handler used during calibration.
+ *
+ * Every invocation reads the APIC current count, the PM timer and (if
+ * available) the TSC, and bumps lapic_cal_loops. The readings are stored as
+ * the first sample when the counter was 0 and as the second sample when it
+ * was LAPIC_CAL_LOOPS; all other invocations discard them. @dev is not used.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
+{
+	const long apic_count = apic_read(APIC_TMCCT);
+	unsigned long pm_count = acpi_pm_read_early();
+	const unsigned long long tsc_count =
+		boot_cpu_has(X86_FEATURE_TSC) ? rdtsc() : 0;
+	const int loop = lapic_cal_loops++;
+
+	if (loop == 0) {
+		lapic_cal_t1 = apic_count;
+		lapic_cal_tsc1 = tsc_count;
+		lapic_cal_pm1 = pm_count;
+		lapic_cal_j1 = jiffies;
+	} else if (loop == LAPIC_CAL_LOOPS) {
+		lapic_cal_t2 = apic_count;
+		lapic_cal_tsc2 = tsc_count;
+		/* The PM timer went past its overrun point since sample one */
+		if (pm_count < lapic_cal_pm1)
+			pm_count += ACPI_PM_OVRRUN;
+		lapic_cal_pm2 = pm_count;
+		lapic_cal_j2 = jiffies;
+	}
+}
+
+/*
+ * Cross-check the APIC calibration against the PM timer.
+ *
+ * @deltapm:  PM timer ticks counted during the calibration window
+ * @delta:    APIC timer delta, rescaled in place if the window was off
+ * @deltatsc: TSC delta, rescaled in place likewise (only with a TSC)
+ *
+ * If @deltapm is within 1% of the tick count expected for 100ms, the
+ * deltas are left alone. Otherwise both are scaled by expected/@deltapm.
+ *
+ * Returns 0 when the PM timer could be used as reference, -1 when it is
+ * not configured or @deltapm is zero.
+ */
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+{
+	const long pm_expected = PMTMR_TICKS_PER_SEC / 10;
+	const long pm_slack = pm_expected / 100;
+	unsigned long pm_mult;
+	u64 scaled;
+
+#ifndef CONFIG_X86_PM_TIMER
+	return -1;
+#endif
+
+	apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+
+	/* Check, if the PM timer is available */
+	if (deltapm == 0)
+		return -1;
+
+	pm_mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+	if (deltapm > (pm_expected - pm_slack) &&
+	    deltapm < (pm_expected + pm_slack)) {
+		apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+		return 0;
+	}
+
+	/* Length of the window in ms as seen by the PM timer */
+	scaled = (((u64)deltapm) * pm_mult) >> 22;
+	do_div(scaled, 1000000);
+	pr_warning("APIC calibration not consistent "
+		   "with PM-Timer: %ldms instead of 100ms\n",(long)scaled);
+
+	/* Correct the lapic counter value */
+	scaled = (((u64)(*delta)) * pm_expected);
+	do_div(scaled, deltapm);
+	pr_info("APIC delta adjusted to PM-Timer: "
+		"%lu (%ld)\n", (unsigned long)scaled, *delta);
+	*delta = (long)scaled;
+
+	/* Correct the tsc counter value */
+	if (boot_cpu_has(X86_FEATURE_TSC)) {
+		scaled = (((u64)(*deltatsc)) * pm_expected);
+		do_div(scaled, deltapm);
+		apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
+			    "PM-Timer: %lu (%ld)\n",
+			    (unsigned long)scaled, *deltatsc);
+		*deltatsc = (long)scaled;
+	}
+
+	return 0;
+}
+
+/*
+ * Calibrate the local APIC timer on the boot CPU.
+ *
+ * Nothing is measured when the boot CPU has the TSC deadline timer, or when
+ * lapic_timer_frequency is already non-zero on entry; in the latter case
+ * lapic_clockevent is filled in from that value.
+ *
+ * Otherwise the event handler of global_clock_event is temporarily replaced
+ * by lapic_cal_handler() and the APIC count is sampled over LAPIC_CAL_LOOPS
+ * invocations. The result is cross-checked against the PM timer; when that
+ * is not possible, the APIC timer is run in periodic mode and checked
+ * against jiffies.
+ *
+ * Returns 0 on success, -1 if the measured frequency is too low or the
+ * jiffies verification fails. Local interrupts are enabled on every return
+ * path that follows a measurement.
+ */
+static int __init calibrate_APIC_clock(void)
+{
+	struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+	void (*real_handler)(struct clock_event_device *dev);
+	unsigned long deltaj;
+	long delta, deltatsc;
+	int pm_referenced = 0;
+
+	/*
+	 * check if lapic timer has already been calibrated by platform
+	 * specific routine, such as tsc calibration code. if so, we just fill
+	 * in the clockevent structure and return.
+	 */
+
+	if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+		return 0;
+	} else if (lapic_timer_frequency) {
+		/* lapic_timer_frequency is unsigned int, hence %u */
+		apic_printk(APIC_VERBOSE, "lapic timer already calibrated %u\n",
+				lapic_timer_frequency);
+		lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
+					TICK_NSEC, lapic_clockevent.shift);
+		lapic_clockevent.max_delta_ns =
+			clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+		lapic_clockevent.max_delta_ticks = 0x7FFFFF;
+		lapic_clockevent.min_delta_ns =
+			clockevent_delta2ns(0xF, &lapic_clockevent);
+		lapic_clockevent.min_delta_ticks = 0xF;
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+		return 0;
+	}
+
+	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+		    "calibrating APIC timer ...\n");
+
+	local_irq_disable();
+
+	/* Replace the global interrupt handler */
+	real_handler = global_clock_event->event_handler;
+	global_clock_event->event_handler = lapic_cal_handler;
+
+	/*
+	 * Setup the APIC counter to maximum. There is no way the lapic
+	 * can underflow in the 100ms detection time frame
+	 */
+	__setup_APIC_LVTT(0xffffffff, 0, 0);
+
+	/* Let the interrupts run */
+	local_irq_enable();
+
+	while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+		cpu_relax();
+
+	local_irq_disable();
+
+	/* Restore the real event handler */
+	global_clock_event->event_handler = real_handler;
+
+	/* Build delta t1-t2 as apic timer counts down */
+	delta = lapic_cal_t1 - lapic_cal_t2;
+	apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+	deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+
+	/* we trust the PM based calibration if possible */
+	pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+					&delta, &deltatsc);
+
+	/* Calculate the scaled math multiplication factor */
+	lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
+				       lapic_clockevent.shift);
+	lapic_clockevent.max_delta_ns =
+		clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+	lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
+	lapic_clockevent.min_delta_ns =
+		clockevent_delta2ns(0xF, &lapic_clockevent);
+	lapic_clockevent.min_delta_ticks = 0xF;
+
+	lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+
+	apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+	apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
+	apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+		    lapic_timer_frequency);
+
+	if (boot_cpu_has(X86_FEATURE_TSC)) {
+		apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+			    "%ld.%04ld MHz.\n",
+			    (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+			    (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+	}
+
+	apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+		    "%u.%04u MHz.\n",
+		    lapic_timer_frequency / (1000000 / HZ),
+		    lapic_timer_frequency % (1000000 / HZ));
+
+	/*
+	 * Do a sanity check on the APIC calibration result
+	 */
+	if (lapic_timer_frequency < (1000000 / HZ)) {
+		local_irq_enable();
+		pr_warning("APIC frequency too slow, disabling apic timer\n");
+		return -1;
+	}
+
+	levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+	/*
+	 * PM timer calibration failed or not turned on
+	 * so lets try APIC timer based calibration
+	 */
+	if (!pm_referenced) {
+		apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+		/*
+		 * Setup the apic timer manually
+		 */
+		levt->event_handler = lapic_cal_handler;
+		lapic_timer_set_periodic(levt);
+		lapic_cal_loops = -1;
+
+		/* Let the interrupts run */
+		local_irq_enable();
+
+		while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+			cpu_relax();
+
+		/* Stop the lapic timer */
+		local_irq_disable();
+		lapic_timer_shutdown(levt);
+
+		/* Jiffies delta */
+		deltaj = lapic_cal_j2 - lapic_cal_j1;
+		apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+		/* Check, if the jiffies result is consistent */
+		if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
+			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+		else
+			levt->features |= CLOCK_EVT_FEAT_DUMMY;
+	}
+	local_irq_enable();
+
+	if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
+		pr_warning("APIC timer disabled due to verification failure\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ *
+ * When the timer is disabled (disable_apic_timer) or calibrate_APIC_clock()
+ * fails, the lapic device is only registered if more than one CPU is
+ * possible. On success CLOCK_EVT_FEAT_DUMMY is cleared from the template
+ * before the device is registered.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+ /*
+ * The local apic timer can be disabled via the kernel
+ * commandline or from the CPU detection code. Register the lapic
+ * timer as a dummy clock event source on SMP systems, so the
+ * broadcast mechanism is used. On UP systems simply ignore it.
+ */
+ if (disable_apic_timer) {
+ pr_info("Disabling APIC timer\n");
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1) {
+ lapic_clockevent.mult = 1;
+ setup_APIC_timer();
+ }
+ return;
+ }
+
+ if (calibrate_APIC_clock()) { /* non-zero means the calibration failed */
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1)
+ setup_APIC_timer();
+ return;
+ }
+
+ /*
+ * If nmi_watchdog is set to IO_APIC, we need the
+ * PIT/HPET going. Otherwise register lapic as a dummy
+ * device.
+ *
+ * NOTE(review): the statement below only clears
+ * CLOCK_EVT_FEAT_DUMMY; no nmi_watchdog check is visible here.
+ */
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+ /* Setup the lapic or request the broadcast */
+ setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
+}
+
+void setup_secondary_APIC_clock(void)
+{
+ setup_APIC_timer(); /* register this CPU's copy of the lapic_clockevent template */
+ amd_e400_c1e_apic_setup(); /* same follow-up call as at the end of setup_boot_APIC_clock() */
+}
+
+/*
+ * The guts of the apic timer interrupt: account the interrupt and run the
+ * event handler of this CPU's lapic clockevent device.
+ */
+static void local_apic_timer_interrupt(void)
+{
+	struct clock_event_device *dev = this_cpu_ptr(&lapic_events);
+
+	if (dev->event_handler) {
+		/*
+		 * the NMI deadlock-detector uses this.
+		 */
+		inc_irq_stat(apic_timer_irqs);
+
+		dev->event_handler(dev);
+		return;
+	}
+
+	/*
+	 * Normally we should not be here till LAPIC has been initialized but
+	 * in some cases like kdump, its possible that there is a pending LAPIC
+	 * timer interrupt from previous kernel's context and is delivered in
+	 * new kernel the moment interrupts are enabled.
+	 *
+	 * Interrupts are enabled early and LAPIC is setup much later, hence
+	 * its possible that we get here without an event handler. Discard
+	 * the interrupt as spurious and switch the timer off.
+	 */
+	pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
+		   smp_processor_id());
+	lapic_timer_shutdown(dev);
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ *
+ * @regs is installed with set_irq_regs() while the handler runs; the
+ * previous value is put back before returning.
+ */
+__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ *
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ entering_ack_irq();
+ trace_local_timer_entry(LOCAL_TIMER_VECTOR);
+ local_apic_timer_interrupt(); /* the actual work, bracketed by the entry/exit tracepoints */
+ trace_local_timer_exit(LOCAL_TIMER_VECTOR);
+ exiting_irq();
+
+ set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL; /* unconditionally rejected; @multiplier is ignored */
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called, when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
+ * leftovers during boot.
+ *
+ * The LVT entries are handled in two passes: first the mask bit is added to
+ * each entry's current value, then the entries are overwritten with
+ * APIC_LVT_MASKED alone. Returns early without touching anything when
+ * neither x2apic_mode nor apic_phys is set.
+ */
+void clear_local_APIC(void)
+{
+ int maxlvt;
+ u32 v;
+
+ /* APIC hasn't been mapped yet */
+ if (!x2apic_mode && !apic_phys)
+ return;
+
+ maxlvt = lapic_get_maxlvt();
+ /*
+ * Masking an LVT entry can trigger a local APIC error
+ * if the vector is zero. Mask LVTERR first to prevent this.
+ */
+ if (maxlvt >= 3) {
+ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+ }
+ /*
+ * Careful: we have to set masks only first to deassert
+ * any level-triggered sources.
+ */
+ v = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT1);
+ apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
+ if (maxlvt >= 4) {
+ v = apic_read(APIC_LVTPC);
+ apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
+ }
+
+ /* lets not touch this if we didn't frob it */
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5) {
+ v = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+ }
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED)) /* written only if not already masked */
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
+ /*
+ * Clean APIC state for other OSs:
+ */
+ apic_write(APIC_LVTT, APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
+ apic_write(APIC_LVT1, APIC_LVT_MASKED);
+ if (maxlvt >= 3)
+ apic_write(APIC_LVTERR, APIC_LVT_MASKED);
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, APIC_LVT_MASKED);
+
+ /* Integrated APIC (!82489DX) ? */
+ if (lapic_is_integrated()) {
+ if (maxlvt > 3)
+ /* Clear ESR due to Pentium errata 3AP and 11AP */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR); /* read happens regardless of maxlvt; the value is discarded */
+ }
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ *
+ * Clears the local APIC state and then drops APIC_SPIV_APIC_ENABLED from
+ * the APIC_SPIV register. On 32-bit, if the kernel had enabled the APIC
+ * through the APIC base MSR, the enable bit in that MSR is cleared again.
+ * Does nothing while the APIC has not been mapped.
+ */
+void disable_local_APIC(void)
+{
+	unsigned int spiv;
+
+	/* APIC hasn't been mapped yet */
+	if (!(x2apic_mode || apic_phys))
+		return;
+
+	clear_local_APIC();
+
+	/*
+	 * Disable APIC (implies clearing of registers
+	 * for 82489DX!).
+	 */
+	spiv = apic_read(APIC_SPIV);
+	apic_write(APIC_SPIV, spiv & ~APIC_SPIV_APIC_ENABLED);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
+	 * restore the disabled state.
+	 */
+	if (enabled_via_apicbase) {
+		unsigned int lo, hi;
+
+		rdmsr(MSR_IA32_APICBASE, lo, hi);
+		lo &= ~MSR_IA32_APICBASE_ENABLE;
+		wrmsr(MSR_IA32_APICBASE, lo, hi);
+	}
+#endif
+}
+
+/*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
+ * not power-off. Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ *
+ * Runs with local interrupts disabled and restores the previous interrupt
+ * state afterwards. Returns right away when the CPU has no APIC feature
+ * and apic_from_smp_config() is false.
+ */
+void lapic_shutdown(void)
+{
+ unsigned long flags;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
+ return;
+
+ local_irq_save(flags);
+
+#ifdef CONFIG_X86_32
+ if (!enabled_via_apicbase)
+ clear_local_APIC(); /* 32-bit, APIC not enabled by the kernel: only clear the LVT state */
+ else
+#endif
+ disable_local_APIC(); /* without CONFIG_X86_32 this call is unconditional */
+
+
+ local_irq_restore(flags);
+}
+
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
+void __init sync_Arb_IDs(void)
+{
+ /*
+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1. Also not
+ * needed on AMD. In both cases return without touching the ICR.
+ */
+ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return;
+
+ /*
+ * Wait for idle, then write an INIT / level-triggered / all-incl. ICR command.
+ */
+ apic_wait_icr_idle();
+
+ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC |
+ APIC_INT_LEVELTRIG | APIC_DM_INIT);
+}
+
+enum apic_intr_mode_id apic_intr_mode; /* assigned in apic_intr_mode_init() */
+
+static int __init apic_intr_mode_select(void)
+{
+ /* Pick the interrupt delivery mode (APIC_* id). Kernel option first: */
+ if (disable_apic) {
+ pr_info("APIC disabled via kernel command line\n");
+ return APIC_PIC;
+ }
+
+ /* Check BIOS */
+#ifdef CONFIG_X86_64
+ /* On 64-bit, the APIC must be integrated, Check local APIC only */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ disable_apic = 1;
+ pr_info("APIC disabled by BIOS\n");
+ return APIC_PIC;
+ }
+#else
+ /* On 32-bit, the APIC may be integrated APIC or 82489DX */
+
+ /* Neither 82489DX nor integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
+ disable_apic = 1;
+ return APIC_PIC;
+ }
+
+ /* If the BIOS pretends there is an integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) &&
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
+ disable_apic = 1;
+ pr_err(FW_BUG "Local APIC %d not detected, force emulation\n",
+ boot_cpu_physical_apicid);
+ return APIC_PIC;
+ }
+#endif
+
+ /* Check MP table or ACPI MADT configuration */
+ if (!smp_found_config) {
+ disable_ioapic_support(); /* both virtual wire results run without IO-APIC */
+ if (!acpi_lapic) {
+ pr_info("APIC: ACPI MADT or MP tables are not detected\n");
+ return APIC_VIRTUAL_WIRE_NO_CONFIG;
+ }
+ return APIC_VIRTUAL_WIRE;
+ }
+
+#ifdef CONFIG_SMP
+ /* If SMP should be disabled, then really disable it! */
+ if (!setup_max_cpus) {
+ pr_info("APIC: SMP mode deactivated\n");
+ return APIC_SYMMETRIC_IO_NO_ROUTING;
+ }
+
+ if (read_apic_id() != boot_cpu_physical_apicid) {
+ panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
+ read_apic_id(), boot_cpu_physical_apicid);
+ /* Or can we switch back to PIC here? */
+ }
+#endif
+
+ return APIC_SYMMETRIC_IO; /* none of the fallbacks above applied */
+}
+
+/*
+ * An initial setup of the virtual wire mode: ExtINT on LVT0, NMI on LVT1.
+ */
+void __init init_bsp_APIC(void)
+{
+ unsigned int value;
+
+ /*
+ * Don't do the setup now if we have a SMP BIOS as the
+ * through-I/O-APIC virtual wire mode might be active.
+ */
+ if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
+ return;
+
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+
+ /*
+ * Enable APIC.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+
+#ifdef CONFIG_X86_32
+ /* This bit is reserved on P4/Xeon and should be cleared */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 15))
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+ else /* the else-branch is the first statement after #endif */
+#endif
+ value |= APIC_SPIV_FOCUS_DISABLED;
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV, value);
+
+ /*
+ * Set up the virtual wire mode.
+ */
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ value = APIC_DM_NMI;
+ if (!lapic_is_integrated()) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE) /* keep the LVT1 NMI entry masked */
+ value |= APIC_LVT_MASKED;
+ apic_write(APIC_LVT1, value);
+}
+
+/* Select the BSP's interrupt delivery mode, then set up the APIC unless in PIC mode */
+void __init apic_intr_mode_init(void)
+{
+ bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
+
+ apic_intr_mode = apic_intr_mode_select();
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ pr_info("APIC: Keep in PIC mode(8259)\n");
+ return; /* PIC mode: apic_bsp_setup() is not called */
+ case APIC_VIRTUAL_WIRE:
+ pr_info("APIC: Switch to virtual wire mode setup\n");
+ default_setup_apic_routing();
+ break;
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
+ upmode = true; /* no configuration found: force the UP setup path */
+ default_setup_apic_routing();
+ break;
+ case APIC_SYMMETRIC_IO:
+ pr_info("APIC: Switch to symmetric I/O mode setup\n");
+ default_setup_apic_routing();
+ break;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
+ break; /* the only non-PIC mode that skips default_setup_apic_routing() */
+ }
+
+ apic_bsp_setup(upmode);
+}
+
+static void lapic_setup_esr(void)
+{
+ unsigned int oldvalue, value, maxlvt;
+
+ if (!lapic_is_integrated()) { /* 82489DX: nothing is programmed */
+ pr_info("No ESR for 82489DX.\n");
+ return;
+ }
+
+ if (apic->disable_esr) {
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ pr_info("Leaving ESR disabled.\n");
+ return;
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR); /* snapshot taken before LVTERR is programmed */
+
+ /* enables sending errors: LVTERR gets ERROR_APIC_VECTOR without the mask bit */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue) /* only log when the ESR content changed */
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+}
+
+static void apic_pending_intr_clear(void)
+{
+	long long max_loops = cpu_khz ? cpu_khz : 1000000;
+	unsigned long long tsc = 0, ntsc;
+	unsigned int queued;
+	unsigned long value;
+	int i, j, acked = 0;
+
+	if (boot_cpu_has(X86_FEATURE_TSC))
+		tsc = rdtsc();
+	/*
+	 * Drain interrupts left pending by a previous (e.g. crashed) kernel.
+	 *
+	 * The CPU has most probably serviced such an interrupt already but
+	 * skipped ack_APIC_irq() because it took it for an i8259 ExtInt, so
+	 * the LAPIC never saw an EOI and the ISR bit keeps the vector locked
+	 * (noticed for the timer irq, vector 0x31). Issue one extra EOI per
+	 * set ISR bit while IRR reads back non-zero, giving up after more
+	 * than 256 EOIs or once the budget is spent: cpu_khz << 10 TSC ticks,
+	 * widened to 64 bit first so that it cannot wrap for cpu_khz >= 2^22.
+	 */
+	do {
+		queued = 0;
+		for (i = APIC_ISR_NR - 1; i >= 0; i--)
+			queued |= apic_read(APIC_IRR + i*0x10);
+
+		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+			value = apic_read(APIC_ISR + i*0x10);
+			for_each_set_bit(j, &value, 32) {
+				ack_APIC_irq();
+				acked++;
+			}
+		}
+		if (acked > 256) {
+			pr_err("LAPIC pending interrupts after %d EOI\n", acked);
+			break;
+		}
+		if (queued) {
+			if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
+				ntsc = rdtsc();
+				max_loops = ((long long)cpu_khz << 10) - (ntsc - tsc);
+			} else {
+				max_loops--;
+			}
+		}
+	} while (queued && max_loops > 0);
+	WARN_ON(max_loops <= 0);
+}
+
+/**
+ * setup_local_APIC - setup the local APIC
+ *
+ * Used to setup local APIC while initializing BSP or bringing up APs.
+ * Always called with preemption disabled.
+ */
+static void setup_local_APIC(void)
+{
+ int cpu = smp_processor_id();
+ unsigned int value;
+#ifdef CONFIG_X86_32
+ int logical_apicid, ldr_apicid;
+#endif
+
+
+ if (disable_apic) {
+ disable_ioapic_support();
+ return;
+ }
+
+#ifdef CONFIG_X86_32
+ /* Pound the ESR really hard over the head with a big hammer - mbligh */
+ if (lapic_is_integrated() && apic->disable_esr) {
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ }
+#endif
+ perf_events_lapic_init();
+
+ /*
+ * Double-check whether this APIC is really registered.
+ * NOTE(review): the text used to say the check is skipped in clustered
+ * apic mode, but the BUG_ON() below is unconditional - confirm.
+ */
+ BUG_ON(!apic->apic_id_registered());
+
+ /*
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+ apic->init_apic_ldr();
+
+#ifdef CONFIG_X86_32
+ /*
+ * APIC LDR is initialized. If logical_apicid mapping was
+ * initialized during get_smp_config(), make sure it matches the
+ * actual value.
+ */
+ logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+ WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid);
+ /* always use the value from LDR */
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+#endif
+
+ /*
+ * Set Task Priority to 'accept all'. We never change this
+ * later on.
+ */
+ value = apic_read(APIC_TASKPRI);
+ value &= ~APIC_TPRI_MASK;
+ apic_write(APIC_TASKPRI, value);
+
+ apic_pending_intr_clear();
+
+ /*
+ * Now that we are all set up, enable the APIC
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ /*
+ * Enable APIC
+ */
+ value |= APIC_SPIV_APIC_ENABLED;
+
+#ifdef CONFIG_X86_32
+ /*
+ * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+ * certain networking cards. If high frequency interrupts are
+ * happening on a particular IOAPIC pin, plus the IOAPIC routing
+ * entry is masked/unmasked at a high rate as well then sooner or
+ * later IOAPIC line gets 'stuck', no more interrupts are received
+ * from the device. If focus CPU is disabled then the hang goes
+ * away, oh well :-(
+ *
+ * [ This bug can be reproduced easily with a level-triggered
+ * PCI Ne2000 networking cards and PII/PIII processors, dual
+ * BX chipset. ]
+ */
+ /*
+ * Actually disabling the focus CPU check just makes the hang less
+ * frequent as it makes the interrupt distribution model be more
+ * like LRU than MRU (the short-term load is more even across CPUs).
+ */
+
+ /*
+ * - enable focus processor (bit==0)
+ * - 64bit mode always use processor focus
+ * so no need to set it
+ */
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
+
+ /*
+ * Set spurious IRQ vector
+ */
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV, value);
+
+ /*
+ * Set up LVT0, LVT1:
+ *
+ * set up through-local-APIC on the boot CPU's LINT0. This is not
+ * strictly necessary in pure symmetric-IO mode, but sometimes
+ * we delegate interrupts to the 8259A.
+ */
+ /*
+ * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+ */
+ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; /* non-zero if LVT0 is masked now */
+ if (!cpu && (pic_mode || !value || skip_ioapic_setup)) {
+ value = APIC_DM_EXTINT;
+ apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
+ } else {
+ value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+ apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
+ }
+ apic_write(APIC_LVT0, value);
+
+ /*
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
+ */
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
+ value = APIC_DM_NMI;
+ else
+ value = APIC_DM_NMI | APIC_LVT_MASKED;
+
+ /* Is 82489DX ? */
+ if (!lapic_is_integrated())
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write(APIC_LVT1, value);
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (!cpu)
+ cmci_recheck();
+#endif
+}
+
+static void end_local_APIC_setup(void)
+{
+ lapic_setup_esr();
+
+#ifdef CONFIG_X86_32
+ {
+ unsigned int value;
+ /* Mask the local APIC timer LVT entry and put LOCAL_TIMER_VECTOR in it */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, value);
+ }
+#endif
+
+ apic_pm_activate();
+}
+
+/*
+ * APIC setup function for application processors. Called from smpboot.c
+ */
+void apic_ap_setup(void)
+{
+ setup_local_APIC(); /* LDR, task priority, SPIV, LVT0/LVT1 */
+ end_local_APIC_setup(); /* ESR setup, then apic_pm_activate() */
+}
+
+#ifdef CONFIG_X86_X2APIC
+int x2apic_mode; /* 1 while x2apic is in use, 0 otherwise (see the setters below) */
+
+enum {
+ X2APIC_OFF, /* value 0, i.e. the initial state; x2apic_enable() acts only on this */
+ X2APIC_ON, /* set by check_x2apic() and x2apic_enable() */
+ X2APIC_DISABLED, /* set by "nox2apic", check_x2apic() and x2apic_disable() */
+};
+static int x2apic_state; /* holds one of the X2APIC_* values above */
+
+static void __x2apic_disable(void)
+{
+ u64 msr;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC)) /* no APIC feature: leave the MSR alone */
+ return;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (!(msr & X2APIC_ENABLE)) /* x2apic bit already clear: nothing to do */
+ return;
+ /* Disable xapic and x2apic first and then reenable xapic mode */
+ wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic disabled\n");
+}
+
+static void __x2apic_enable(void)
+{
+ u64 msr;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE) /* already enabled: skip the MSR write and the message */
+ return;
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic enabled\n");
+}
+
+static int __init setup_nox2apic(char *str)
+{
+	if (x2apic_enabled()) {
+		/* Unsigned on purpose: an id with bit 31 set must not look < 255 */
+		u32 apicid = native_apic_msr_read(APIC_ID);
+
+		if (apicid >= 255) {
+			pr_warning("Apicid: %08x, cannot enforce nox2apic\n", apicid);
+			return 0;
+		}
+		pr_warning("x2apic already enabled.\n");
+		__x2apic_disable();
+	}
+	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+	x2apic_state = X2APIC_DISABLED;
+	x2apic_mode = 0;
+	return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+
+/* Per-CPU x2apic hook, called from cpu_init() on (secondary) cpus */
+void x2apic_setup(void)
+{
+	if (x2apic_state == X2APIC_ON) {
+		__x2apic_enable();
+		return;
+	}
+	/*
+	 * x2apic is not in ON state: switch it off again in case the
+	 * BIOS left it enabled.
+	 */
+	__x2apic_disable();
+}
+
+static __init void x2apic_disable(void)
+{
+ u32 x2apic_id, state = x2apic_state; /* remember the state before it is overwritten */
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
+ if (state != X2APIC_ON) /* was not on: only the bookkeeping above was needed */
+ return;
+
+ x2apic_id = read_apic_id();
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ __x2apic_disable();
+ register_lapic_address(mp_lapic_addr); /* x2apic_mode is 0 now, so this maps the APIC */
+}
+
+static __init void x2apic_enable(void)
+{
+	/* Act only on the OFF state; ON and DISABLED are left as they are */
+	if (x2apic_state == X2APIC_OFF) {
+		x2apic_mode = 1;
+		x2apic_state = X2APIC_ON;
+		__x2apic_enable();
+	}
+}
+
+static __init void try_to_enable_x2apic(int remap_mode)
+{
+ if (x2apic_state == X2APIC_DISABLED) /* already ruled out: do nothing */
+ return;
+
+ if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */
+ if (max_physical_apicid > 255 ||
+ !x86_init.hyper.x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+ }
+
+ /*
+ * without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode
+ */
+ x2apic_phys = 1;
+ }
+ x2apic_enable(); /* no-op unless x2apic_state is still X2APIC_OFF */
+}
+
+void __init check_x2apic(void)
+{
+ if (x2apic_enabled()) { /* already on: record ON state and x2apic mode */
+ pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { /* CPU lacks the feature bit */
+ x2apic_state = X2APIC_DISABLED;
+ }
+}
+#else /* CONFIG_X86_X2APIC */
+static int __init validate_x2apic(void)
+{
+ if (!apic_is_x2apic_enabled()) /* x2apic is off: nothing to validate */
+ return 0;
+ /*
+ * Checkme: Can we simply turn off x2apic here instead of panic?
+ */
+ panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+}
+early_initcall(validate_x2apic);
+
+static inline void try_to_enable_x2apic(int remap_mode) { } /* !CONFIG_X86_X2APIC stub */
+static inline void __x2apic_enable(void) { } /* !CONFIG_X86_X2APIC stub */
+#endif /* !CONFIG_X86_X2APIC */
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ int ret, ir_stat;
+
+ if (skip_ioapic_setup) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
+ return;
+ }
+
+ ir_stat = irq_remapping_prepare();
+ if (ir_stat < 0 && !x2apic_supported()) /* neither remapping nor x2apic possible */
+ return;
+
+ ret = save_ioapic_entries();
+ if (ret) {
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
+ return;
+ }
+
+ local_irq_save(flags);
+ legacy_pic->mask_all(); /* PIC and IO-APIC stay masked for the switch below */
+ mask_ioapic_entries();
+
+ /* If irq_remapping_prepare() succeeded, try to enable it */
+ if (ir_stat >= 0)
+ ir_stat = irq_remapping_enable();
+ /* ir_stat contains the remap mode or an error code */
+ try_to_enable_x2apic(ir_stat);
+
+ if (ir_stat < 0) /* remapping not enabled: put the saved IO-APIC entries back */
+ restore_ioapic_entries();
+ legacy_pic->restore_mask();
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Local APIC detection for non-SMP boards (original code by Keir Fraser).
+ * On AMD64 the BIOS is trusted: if it reports no APIC, the APIC is likely
+ * not set up correctly (usually the APIC timer won't work etc.), so just
+ * fail with -1. Otherwise record the default physical base and return 0.
+ */
+static int __init detect_init_APIC(void)
+{
+	if (boot_cpu_has(X86_FEATURE_APIC)) {
+		mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+		return 0;
+	}
+
+	pr_info("No local APIC present\n");
+	return -1;
+}
+#else
+
+static int __init apic_verify(void)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid' (EDX of leaf 1 is what gets tested below)
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warning("Could not enable APIC!\n");
+ return -1;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* default, possibly overridden below */
+
+ /* The BIOS may have set up the APIC at some other address */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ }
+
+ pr_info("Found and enabled local APIC!\n");
+ return 0;
+}
+
+int __init apic_force_enable(unsigned long addr)
+{
+ u32 h, l;
+
+ if (disable_apic) /* APIC use is switched off: refuse with -1 */
+ return -1;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr; /* new base @addr plus enable bit */
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1; /* read back by the shutdown/disable paths */
+ }
+ }
+ return apic_verify(); /* 0 on success, -1 if the APIC feature bit is still absent */
+}
+
+/*
+ * Detect and initialize APIC. Returns 0 on success and -1 otherwise.
+ */
+static int __init detect_init_APIC(void)
+{
+ /* Disabled by kernel option? */
+ if (disable_apic)
+ return -1;
+
+ switch (boot_cpu_data.x86_vendor) { /* vendor/family allow-list, else no_apic */
+ case X86_VENDOR_AMD:
+ if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
+ (boot_cpu_data.x86 >= 15))
+ break;
+ goto no_apic;
+ case X86_VENDOR_INTEL:
+ if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
+ (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)))
+ break;
+ goto no_apic;
+ default:
+ goto no_apic;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ /*
+ * Over-ride BIOS and try to enable the local APIC only if
+ * "lapic" specified.
+ */
+ if (!force_enable_local_apic) {
+ pr_info("Local APIC disabled by BIOS -- "
+ "you can enable it with \"lapic\"\n");
+ return -1;
+ }
+ if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return -1;
+ } else {
+ if (apic_verify())
+ return -1;
+ }
+
+ apic_pm_activate();
+
+ return 0;
+
+no_apic:
+ pr_info("No local APIC present or hardware disabled\n");
+ return -1;
+}
+#endif
+
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
+void __init init_apic_mappings(void)
+{
+ unsigned int new_apicid;
+
+ apic_check_deadline_errata();
+
+ if (x2apic_mode) { /* x2apic: only refresh the boot APIC ID, nothing is mapped */
+ boot_cpu_physical_apicid = read_apic_id();
+ return;
+ }
+
+ /* Without MP config and without a detectable local APIC, disable the APIC */
+ if (!smp_found_config && detect_init_APIC()) {
+ /* lets NOP'ify apic operations */
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ } else {
+ apic_phys = mp_lapic_addr;
+
+ /*
+ * If the system has ACPI MADT tables or MP info, the LAPIC
+ * address is already registered.
+ */
+ if (!acpi_lapic && !smp_found_config)
+ register_lapic_address(apic_phys);
+ }
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ new_apicid = read_apic_id();
+ if (boot_cpu_physical_apicid != new_apicid) {
+ boot_cpu_physical_apicid = new_apicid;
+ /*
+ * yeah -- we lie about apic_version
+ * in case if apic was disabled via boot option
+ * but it's not a problem for SMP compiled kernel
+ * since apic_intr_mode_select is prepared for such
+ * a case and disable smp mode
+ */
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
+void __init register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address; /* always recorded, even in x2apic mode */
+
+ if (!x2apic_mode) { /* the fixmap mapping is only set up outside x2apic mode */
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, address);
+ }
+ if (boot_cpu_physical_apicid == -1U) { /* not known yet: read ID and version now */
+ boot_cpu_physical_apicid = read_apic_id();
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
+/*
+ * Local APIC interrupts
+ */
+
+/*
+ * Spurious-vector handler; should _never_ run with our APIC/SMP architecture
+ */
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+{
+	u8 vector = ~regs->orig_ax;
+	u32 v;
+
+	entering_irq();
+	trace_spurious_apic_entry(vector);
+
+	/*
+	 * A genuinely spurious interrupt must not be ACKed. Only send an EOI
+	 * if the vector's bit is set in the ISR, i.e. it was a real vectored
+	 * interrupt. 1U keeps the shift well defined for bit position 31.
+	 */
+	v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+	if (v & (1U << (vector & 0x1f)))
+		ack_APIC_irq();
+
+	inc_irq_stat(irq_spurious_count);
+
+	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
+	pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
+		"should never happen.\n", vector, smp_processor_id());
+
+	trace_spurious_apic_exit(vector);
+	exiting_irq();
+}
+
+/*
+ * APIC error handler; should never run with our APIC/SMP architecture
+ */
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
+{
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
+ u32 v, i = 0;
+
+ entering_irq();
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
+
+ /* First tickle the hardware, only then report what went on. -- REW */
+ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
+ ack_APIC_irq();
+ atomic_inc(&irq_err_count);
+
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+ smp_processor_id(), v);
+
+ v &= 0xff; /* bits 0-7 only: bounds i to the eight table entries above */
+ while (v) {
+ if (v & 0x1)
+ apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ i++;
+ v >>= 1;
+ }
+
+ apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
+ trace_error_apic_exit(ERROR_APIC_VECTOR);
+ exiting_irq();
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+static void __init connect_bsp_APIC(void)
+{
+#ifdef CONFIG_X86_32
+ if (pic_mode) { /* the body is empty unless CONFIG_X86_32 and pic_mode are set */
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
+ * local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+ "enabling APIC mode.\n");
+ imcr_pic_to_apic();
+ }
+#endif
+}
+
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup: indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+ unsigned int value;
+
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect only on
+ * certain older boards). Note that APIC interrupts, including
+ * IPIs, won't work beyond this point! The only exception are
+ * INIT IPIs.
+ */
+ apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+ "entering PIC mode.\n");
+ imcr_apic_to_pic();
+ return; /* PIC mode: the SPIV/LVT programming below is skipped */
+ }
+#endif
+
+ /* Go back to Virtual Wire compatibility mode */
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /*
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
+ */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /*
+ * For LVT1 make it edge triggered, active high,
+ * nmi and enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
+}
+
+/*
+ * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
+ * contiguously, it equals the highest allocated logical CPU ID plus 1.
+ * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range,
+ * so the maximum of nr_logical_cpuids is nr_cpu_ids.
+ *
+ * NOTE: Reserve 0 for BSP.
+ */
+static int nr_logical_cpuids = 1;
+
+/*
+ * Maps logical CPU IDs (index) to APIC IDs (value); -1 marks an unused slot.
+ */
+static int cpuid_to_apicid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+#ifdef CONFIG_SMP
+/**
+ * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
+ * @apicid: APIC ID to check
+ */
+bool apic_id_is_primary_thread(unsigned int apicid)
+{
+	u32 smt_bits;
+
+	if (smp_num_siblings == 1)
+		return true;
+	/* Primary thread <=> all SMT bit(s) at the bottom of the APIC ID are 0 */
+	smt_bits = (1U << (fls(smp_num_siblings) - 1)) - 1;
+	return (apicid & smt_bits) == 0;
+}
+#endif
+
+/*
+ * Single allocation point for logical CPU IDs; going through it keeps
+ * nr_logical_cpuids and cpuid_to_apicid[] synchronized.
+ */
+static int allocate_logical_cpuid(int apicid)
+{
+	int cpu;
+
+	/*
+	 * The cpuid <-> apicid mapping is persistent: if this APIC ID was
+	 * given a cpuid before, hand the same one back.
+	 */
+	for (cpu = 0; cpu < nr_logical_cpuids; cpu++)
+		if (cpuid_to_apicid[cpu] == apicid)
+			return cpu;
+
+	/* Not known yet: a new cpuid is needed, if any are left. */
+	if (nr_logical_cpuids >= nr_cpu_ids) {
+		WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. "
+			     "Processor %d/0x%x and the rest are ignored.\n",
+			     nr_cpu_ids, nr_logical_cpuids, apicid);
+		return -EINVAL;
+	}
+
+	cpuid_to_apicid[cpu] = apicid;	/* cpu == nr_logical_cpuids here */
+	nr_logical_cpuids++;
+	return cpu;
+}
+
+int generic_processor_info(int apicid, int version)
+{
+ int cpu, max = nr_cpu_ids;
+ bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
+ phys_cpu_present_map);
+
+ /*
+ * boot_cpu_physical_apicid is designed to have the apicid
+ * returned by read_apic_id(), i.e, the apicid of the
+ * currently booting-up processor. However, on some platforms,
+ * it is temporarily modified by the apicid reported as BSP
+ * through MP table. Concretely:
+ *
+ * - arch/x86/kernel/mpparse.c: MP_processor_info()
+ * - arch/x86/mm/amdtopology.c: amd_numa_init()
+ *
+ * This function is executed with the modified
+ * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+ * parameter doesn't work to disable APs on kdump 2nd kernel.
+ *
+ * Since fixing handling of boot_cpu_physical_apicid requires
+ * another discussion and tests on each platform, we leave it
+ * for now and here we use read_apic_id() directly in this
+ * function, generic_processor_info().
+ */
+ if (disabled_cpu_apicid != BAD_APICID &&
+ disabled_cpu_apicid != read_apic_id() &&
+ disabled_cpu_apicid == apicid) {
+ int thiscpu = num_processors + disabled_cpus;
+
+ pr_warning("APIC: Disabling requested cpu."
+ " Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+
+ disabled_cpus++;
+ return -ENODEV;
+ }
+
+ /*
+ * If boot cpu has not been detected yet, then only allow up to
+ * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
+ */
+ if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
+ apicid != boot_cpu_physical_apicid) {
+ int thiscpu = max + disabled_cpus - 1;
+
+ pr_warning(
+ "APIC: NR_CPUS/possible_cpus limit of %i almost"
+ " reached. Keeping one slot for boot cpu."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return -ENODEV;
+ }
+
+ if (num_processors >= nr_cpu_ids) { /* every cpu slot is taken */
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+ "reached. Processor %d/0x%x ignored.\n",
+ max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return -EINVAL;
+ }
+
+ if (apicid == boot_cpu_physical_apicid) {
+ /*
+ * x86_bios_cpu_apicid is required to have processors listed
+ * in same order as logical cpu numbers. Hence the first
+ * entry is BSP, and so on.
+ * boot_cpu_init() already hold bit 0 in cpu_present_mask
+ * for BSP.
+ */
+ cpu = 0;
+
+ /* Logical cpuid 0 is reserved for BSP. */
+ cpuid_to_apicid[0] = apicid;
+ } else {
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) { /* no logical cpuid left: count the CPU as disabled */
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Validate version
+ */
+ if (version == 0x0) {
+ pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+ cpu, apicid);
+ version = 0x10;
+ }
+
+ if (version != boot_cpu_apic_version) { /* only warned about, not treated as an error */
+ pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+ boot_cpu_apic_version, cpu, version);
+ }
+
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
+#endif
+#ifdef CONFIG_X86_32
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ apic->x86_32_early_logical_apicid(cpu);
+#endif
+ set_cpu_possible(cpu, true);
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ num_processors++;
+
+ return cpu; /* the logical cpu number used for this APIC ID */
+}
+
+int hard_smp_processor_id(void)
+{
+ return read_apic_id(); /* the value read_apic_id() reports, returned unchanged */
+}
+
+/*
+ * Override the generic EOI implementation with an optimized version.
+ * Only called during early boot when only one CPU is active and with
+ * interrupts disabled, so we know this does not race with actual APIC driver
+ * use. The replaced hook of each driver is kept in ->native_eoi_write.
+ */
+void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ /* Should happen once for each apic */
+ WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->native_eoi_write = (*drv)->eoi_write;
+ (*drv)->eoi_write = eoi_write;
+ }
+}
+
+static void __init apic_bsp_up_setup(void)
+{
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
+#else
+ /*
+ * Hack: In case of kdump, after a crash, kernel might be booting
+ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
+ */
+# ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = read_apic_id();
+# endif
+#endif
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+}
+
+/**
+ * apic_bsp_setup - Setup function for local apic and io-apic
+ * @upmode: Force UP mode: also run apic_bsp_up_setup() before the local APIC setup
+ *
+ * Connects the BSP APIC, sets up the local APIC, then the IO-APIC.
+ * Returns nothing (the function is void).
+ */
+void __init apic_bsp_setup(bool upmode)
+{
+ connect_bsp_APIC();
+ if (upmode)
+ apic_bsp_up_setup();
+ setup_local_APIC();
+
+ enable_IO_APIC();
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
+}
+
+#ifdef CONFIG_UP_LATE_INIT
+void __init up_late_init(void)
+{
+	/*
+	 * Set up the local timer unless we stayed in PIC mode.
+	 */
+	if (apic_intr_mode != APIC_PIC)
+		x86_init.timers.setup_percpu_clockev();
+}
+#endif
+
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+ /*
+ * 'active' is true if the local APIC was enabled by us and
+ * not the BIOS; this signifies that we are also responsible
+ * for disabling it before entering apm/acpi suspend
+ */
+ int active;
+ /* r/w apic fields, saved by lapic_suspend() */
+ unsigned int apic_id;
+ unsigned int apic_taskpri;
+ unsigned int apic_ldr;
+ unsigned int apic_dfr;
+ unsigned int apic_spiv;
+ unsigned int apic_lvtt;
+ unsigned int apic_lvtpc; /* saved only if maxlvt >= 4 */
+ unsigned int apic_lvt0;
+ unsigned int apic_lvt1;
+ unsigned int apic_lvterr;
+ unsigned int apic_tmict;
+ unsigned int apic_tdcr;
+ unsigned int apic_thmr; /* saved only if maxlvt >= 5 (CONFIG_X86_THERMAL_VECTOR) */
+ unsigned int apic_cmci; /* saved only if maxlvt >= 6 (CONFIG_X86_MCE_INTEL) */
+} apic_pm_state;
+
+static int lapic_suspend(void)
+{
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active) /* not marked active: nothing to save or disable */
+ return 0;
+
+ maxlvt = lapic_get_maxlvt(); /* gates the optional LVT registers below */
+
+ apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+ apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+ apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+ if (maxlvt >= 4)
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5)
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
+
+ local_irq_save(flags);
+ disable_local_APIC(); /* registers were saved above, before the APIC is cleared */
+
+ irq_remapping_disable();
+
+ local_irq_restore(flags);
+ return 0; /* always reports success */
+}
+
+/*
+ * Syscore resume hook: with local interrupts off, mask the IO-APIC entries
+ * and the legacy PIC, re-enable the local APIC (x2apic, or through the
+ * APICBASE MSR) and write back the registers saved by lapic_suspend().
+ * Does nothing unless apic_pm_state.active is set.
+ */
+static void lapic_resume(void)
+{
+ unsigned int l, h;
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
+
+ if (x2apic_mode) {
+ __x2apic_enable();
+ } else {
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ /* The error LVT is written masked first; its saved value goes in last */
+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+ apic_write(APIC_ID, apic_pm_state.apic_id);
+ apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+ apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5)
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+ /* ESR is written with 0 and read back before and after restoring LVTERR */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+
+ irq_remapping_reenable(x2apic_mode);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+/* Syscore callbacks that save and restore the local APIC registers */
+static struct syscore_ops lapic_syscore_ops = {
+	.suspend	= lapic_suspend,
+	.resume		= lapic_resume,
+};
+
+/* Set apic_pm_state.active so that lapic_suspend()/lapic_resume() do work */
+static void apic_pm_activate(void)
+{
+ apic_pm_state.active = 1;
+}
+
+/* Register the lapic syscore ops when the boot CPU has an APIC; returns 0. */
+static int __init init_lapic_sysfs(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_APIC))
+		return 0;
+
+	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+	register_syscore_ops(&lapic_syscore_ops);
+
+	return 0;
+}
+
+/* local apic needs to resume before other devices access its registers. */
+core_initcall(init_lapic_sysfs);
+
+#else /* CONFIG_PM */
+
+/* Without CONFIG_PM there is no suspend state to activate */
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
+
+#ifdef CONFIG_X86_64
+
+static int multi_checked;
+static int multi;
+
+/* DMI callback: flag and log the first multi-chassis match; returns 0. */
+static int set_multi(const struct dmi_system_id *d)
+{
+	if (!multi) {
+		pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+		multi = 1;
+	}
+
+	return 0;
+}
+
+/* DMI matches for which set_multi() is invoked; ends with an empty entry */
+static const struct dmi_system_id multi_dmi_table[] = {
+ {
+ .callback = set_multi,
+ .ident = "IBM System Summit2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+ },
+ },
+ {}
+};
+
+/* Scan multi_dmi_table on the first call only; later calls do nothing. */
+static void dmi_check_multi(void)
+{
+	if (!multi_checked) {
+		dmi_check_system(multi_dmi_table);
+		multi_checked = 1;
+	}
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them.
+ *
+ * Returns the 'multi' flag, which set_multi() sets to 1 when the DMI
+ * table matches.
+ */
+int apic_is_clustered_box(void)
+{
+ dmi_check_multi();
+ return multi;
+}
+#endif
+
+/*
+ * APIC command line parameters
+ */
+/* "disableapic": set disable_apic and clear the APIC CPU capability */
+static int __init setup_disableapic(char *arg)
+{
+ disable_apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ return 0;
+}
+early_param("disableapic", setup_disableapic);
+
+/* "nolapic": same as "disableapic", kept for compatibility */
+static int __init setup_nolapic(char *arg)
+{
+ return setup_disableapic(arg);
+}
+early_param("nolapic", setup_nolapic);
+
+/* "lapic_timer_c2_ok": set local_apic_timer_c2_ok; the argument is unused */
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+ local_apic_timer_c2_ok = 1;
+ return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+/* "noapictimer": set disable_apic_timer; the argument is unused */
+static int __init parse_disable_apic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+/* "nolapic_timer": alias of "noapictimer" */
+static int __init parse_nolapic_timer(char *arg)
+{
+	return parse_disable_apic_timer(arg);
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
+/*
+ * Handle the "apic=" early parameter.
+ *
+ * "debug" and "verbose" select the matching apic_verbosity level.
+ * On 64-bit a bare "apic" clears skip_ioapic_setup and any other value is
+ * rejected with a warning. On 32-bit a bare "apic" is an error and other
+ * values are accepted without changing the verbosity.
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+static int __init apic_set_verbosity(char *arg)
+{
+	if (!arg) {
+#ifdef CONFIG_X86_64
+		skip_ioapic_setup = 0;
+		return 0;
+#else
+		return -EINVAL;
+#endif
+	}
+
+	if (strcmp("debug", arg) == 0)
+		apic_verbosity = APIC_DEBUG;
+	else if (strcmp("verbose", arg) == 0)
+		apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
+	else {
+		/* One literal so the message can be grepped; text is unchanged */
+		pr_warn("APIC Verbosity level %s not recognised use apic=verbose or apic=debug\n",
+			arg);
+		return -EINVAL;
+	}
+#endif
+
+	return 0;
+}
+early_param("apic", apic_set_verbosity);
+
+/*
+ * Insert one page starting at apic_phys into the iomem resource tree.
+ * Returns -1 when apic_phys is zero, 0 otherwise.
+ */
+static int __init lapic_insert_resource(void)
+{
+	if (apic_phys) {
+		/* Put local APIC into the resource map. */
+		lapic_resource.start = apic_phys;
+		lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+		insert_resource(&iomem_resource, &lapic_resource);
+
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * This insert must run after e820__reserve_resources(),
+ * which uses request_resource().
+ */
+late_initcall(lapic_insert_resource);
+
+/* "disable_cpu_apicid=": parse the value into disabled_cpu_apicid */
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (!get_option(&arg, &disabled_cpu_apicid))
+		return -EINVAL;
+
+	return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
+
+/*
+ * "apic_extnmi=": select apic_extnmi from a value starting with "all",
+ * "none" or "bsp". Anything else is warned about and rejected with -EINVAL.
+ */
+static int __init apic_set_extnmi(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strncmp("all", arg, 3) == 0) {
+		apic_extnmi = APIC_EXTNMI_ALL;
+		return 0;
+	}
+
+	if (strncmp("none", arg, 4) == 0) {
+		apic_extnmi = APIC_EXTNMI_NONE;
+		return 0;
+	}
+
+	if (strncmp("bsp", arg, 3) == 0) {
+		apic_extnmi = APIC_EXTNMI_BSP;
+		return 0;
+	}
+
+	pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+	return -EINVAL;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
new file mode 100644
index 0000000..02b4839
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -0,0 +1,46 @@
+/*
+ * Common functions shared between the various APIC flavours
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/irq.h>
+#include <asm/apic.h>
+
+/* Return the per-CPU x86_cpu_to_apicid value of @cpu */
+u32 apic_default_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+/*
+ * Return a value with only bit @cpu set.
+ * NOTE(review): the shift is only defined for cpu < 32; presumably callers
+ * never pass more -- confirm.
+ */
+u32 apic_flat_calc_apicid(unsigned int cpu)
+{
+ return 1U << cpu;
+}
+
+/* Return whether @apicid is set in the mask @map points to */
+bool default_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+/* Copy *@phys_map to *@retmap unchanged */
+void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ *retmap = *phys_map;
+}
+
+/*
+ * Return the x86_bios_cpu_apicid of @mps_cpu, or BAD_APICID when @mps_cpu
+ * is not below nr_cpu_ids or the CPU is not present.
+ */
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+	if (mps_cpu >= nr_cpu_ids || !cpu_present(mps_cpu))
+		return BAD_APICID;
+
+	return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+}
+EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+
+/* Return whether @phys_apicid is set in phys_cpu_present_map */
+int default_check_phys_apicid_present(int phys_apicid)
+{
+ return physid_isset(phys_apicid, phys_cpu_present_map);
+}
+
+/* An APIC id is accepted when it is below 255 */
+int default_apic_id_valid(u32 apicid)
+{
+ return (apicid < 255);
+}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
new file mode 100644
index 0000000..e84c9eb
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Flat APIC subarch code.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/jailhouse_para.h>
+
+#include <linux/acpi.h>
+
+static struct apic apic_physflat;
+static struct apic apic_flat;
+
+/* Active APIC driver; initially points at the flat driver */
+struct apic *apic __ro_after_init = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
+/* The flat driver accepts any OEM id / table id: always returns 1 */
+static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 1;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ *
+ * The logical id written to LDR is the single bit 1UL << <current CPU>.
+ */
+void flat_init_apic_ldr(void)
+{
+	unsigned long id = 1UL << smp_processor_id();
+	unsigned long ldr;
+
+	apic_write(APIC_DFR, APIC_DFR_FLAT);
+
+	ldr = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+	ldr |= SET_APIC_LOGICAL_ID(id);
+	apic_write(APIC_LDR, ldr);
+}
+
+/*
+ * Send @vector to the destination field @mask using the driver's
+ * dest_logical mode, with local interrupts disabled around the send.
+ */
+static void _flat_send_IPI_mask(unsigned long mask, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+/* Send @vector using the first word of @cpumask as the destination mask */
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+	_flat_send_IPI_mask(cpumask_bits(cpumask)[0], vector);
+}
+
+/*
+ * Send @vector to the CPUs in the first word of @cpumask, excluding the
+ * calling CPU.
+ */
+static void
+flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+{
+	unsigned long mask = cpumask_bits(cpumask)[0];
+	int cpu = smp_processor_id();
+
+	/* 'mask' is a private stack copy, so the non-atomic bitop suffices */
+	if (cpu < BITS_PER_LONG)
+		__clear_bit(cpu, &mask);
+
+	_flat_send_IPI_mask(mask, vector);
+}
+
+/*
+ * Send @vector to every online CPU except the caller.
+ *
+ * With CONFIG_HOTPLUG_CPU, or for NMI_VECTOR, an explicit mask built from
+ * cpu_online_mask is used; otherwise the APIC_DEST_ALLBUT shortcut is used
+ * when more than one CPU is online.
+ */
+static void flat_send_IPI_allbutself(int vector)
+{
+	int cpu = smp_processor_id();
+
+	if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {
+		if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+			unsigned long mask = cpumask_bits(cpu_online_mask)[0];
+
+			/* Private stack copy: no atomic bitop needed */
+			if (cpu < BITS_PER_LONG)
+				__clear_bit(cpu, &mask);
+
+			_flat_send_IPI_mask(mask, vector);
+		}
+	} else if (num_online_cpus() > 1) {
+		__default_send_IPI_shortcut(APIC_DEST_ALLBUT,
+					    vector, apic->dest_logical);
+	}
+}
+
+/*
+ * Send @vector to all CPUs: through the APIC_DEST_ALLINC shortcut, except
+ * for NMI_VECTOR, which goes to cpu_online_mask explicitly.
+ */
+static void flat_send_IPI_all(int vector)
+{
+	if (vector != NMI_VECTOR) {
+		__default_send_IPI_shortcut(APIC_DEST_ALLINC,
+					    vector, apic->dest_logical);
+		return;
+	}
+
+	flat_send_IPI_mask(cpu_online_mask, vector);
+}
+
+/* Extract bits 24-31 of @x */
+static unsigned int flat_get_apic_id(unsigned long x)
+{
+	unsigned long id_field = x >> 24;
+
+	return id_field & 0xFF;
+}
+
+/* Place the low 8 bits of @id into bits 24-31 */
+static u32 set_apic_id(unsigned int id)
+{
+ return (id & 0xFF) << 24;
+}
+
+/* Read APIC_ID and decode it with flat_get_apic_id() */
+static unsigned int read_xapic_id(void)
+{
+ return flat_get_apic_id(apic_read(APIC_ID));
+}
+
+/* Return whether the id read from APIC_ID is set in phys_cpu_present_map */
+static int flat_apic_id_registered(void)
+{
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+/* Return @initial_apic_id shifted right by @index_msb bits */
+static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+/* The flat driver's probe always succeeds (returns 1) */
+static int flat_probe(void)
+{
+ return 1;
+}
+
+/*
+ * "flat" APIC driver: logical destination mode (.irq_dest_mode = 1,
+ * .dest_logical = APIC_DEST_LOGICAL) with the flat_* IPI senders and
+ * the native memory-mapped register accessors.
+ */
+static struct apic apic_flat __ro_after_init = {
+ .name = "flat",
+ .probe = flat_probe,
+ .acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 1, /* logical */
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = flat_phys_pkg_id,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ .send_IPI = default_send_IPI_single,
+ .send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = flat_send_IPI_allbutself,
+ .send_IPI_all = flat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+/*
+ * Physflat mode is used when there are more than 8 CPUs on a system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ *
+ * This MADT OEM check returns 1 when the FADT (revision >=
+ * FADT2_REVISION_ID) sets ACPI_FADT_APIC_PHYSICAL, or when @oem_id starts
+ * with "IBM" and @oem_table_id starts with "EXA". It returns 0 otherwise,
+ * and always 0 without CONFIG_ACPI.
+ */
+static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifdef CONFIG_ACPI
+	/*
+	 * Quirk: some x86_64 machines can only use physical APIC mode
+	 * regardless of how many processors are present (x86_64 ES7000
+	 * is an example).
+	 */
+	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+	    (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+		/* Terminate the record so it is not left open for continuation */
+		printk(KERN_DEBUG "system APIC only can use physical flat\n");
+		return 1;
+	}
+
+	if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
+		printk(KERN_DEBUG "IBM Summit detected, will use apic physical\n");
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+/* Intentionally empty: see the comment in the body */
+static void physflat_init_apic_ldr(void)
+{
+ /*
+ * LDR and DFR are not involved in physflat mode, rather:
+ * "In physical destination mode, the destination processor is
+ * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1)
+ */
+}
+
+/* Send @vector over cpu_online_mask via the physical "allbutself" helper */
+static void physflat_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+/* Send @vector over cpu_online_mask via the physical sequence helper */
+static void physflat_send_IPI_all(int vector)
+{
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
+}
+
+/*
+ * Probe succeeds (1) when physflat is already selected, when more than 8
+ * CPUs are possible, or when jailhouse_paravirt() is true; otherwise 0.
+ */
+static int physflat_probe(void)
+{
+	return apic == &apic_physflat ||
+	       num_possible_cpus() > 8 ||
+	       jailhouse_paravirt();
+}
+
+/*
+ * "physical flat" APIC driver: physical destination mode
+ * (.irq_dest_mode = 0, .dest_logical = 0) with the default_*_phys IPI
+ * senders and the native memory-mapped register accessors.
+ */
+static struct apic apic_physflat __ro_after_init = {
+
+ .name = "physical flat",
+ .probe = physflat_probe,
+ .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = physflat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = flat_phys_pkg_id,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+/*
+ * We need to check for physflat first, so this order is important.
+ */
+apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 0000000..5078b5c
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NOOP APIC driver.
+ *
+ * Does almost nothing and should be substituted by a real apic driver via
+ * probe routine.
+ *
+ * However, if the APIC is disabled (for some reason), we try not to
+ * uglify the callers' code and still allow (some) APIC routines,
+ * such as self-IPI, to be called.
+ */
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820/api.h>
+
+/* Empty callbacks: each of these does nothing */
+static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI(int cpu, int vector) { }
+static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_allbutself(int vector) { }
+static void noop_send_IPI_all(int vector) { }
+static void noop_send_IPI_self(int vector) { }
+static void noop_apic_wait_icr_idle(void) { }
+static void noop_apic_icr_write(u32 low, u32 id) { }
+
+/* Always returns -1 */
+static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+ return -1;
+}
+
+/* Always returns 0 */
+static u32 noop_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+/* Always returns 0 */
+static u64 noop_apic_icr_read(void)
+{
+ return 0;
+}
+
+/* Always returns 0, regardless of the arguments */
+static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return 0;
+}
+
+/* Always returns 0, regardless of @x */
+static unsigned int noop_get_apic_id(unsigned long x)
+{
+ return 0;
+}
+
+/* Probe callback of the noop driver: always returns 0 */
+static int noop_probe(void)
+{
+ /*
+ * The NOOP apic should never be
+ * enabled via the probe routine
+ */
+ return 0;
+}
+
+/* Return whether id 0 is set in phys_cpu_present_map */
+static int noop_apic_id_registered(void)
+{
+ /*
+ * To be really "pedantic" we should pass
+ * read_apic_id() here, but since NOOP
+ * assumes APIC ID = 0, let's save a few
+ * cycles.
+ */
+ return physid_isset(0, phys_cpu_present_map);
+}
+
+/*
+ * Register read stub: returns 0. Warns once if the boot CPU has an APIC
+ * and disable_apic is not set.
+ */
+static u32 noop_apic_read(u32 reg)
+{
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
+ return 0;
+}
+
+/*
+ * Register write stub: discards the write. Warns once if the boot CPU has
+ * an APIC and disable_apic is not set.
+ */
+static void noop_apic_write(u32 reg, u32 v)
+{
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
+}
+
+#ifdef CONFIG_X86_32
+/* 32-bit only: always returns BAD_APICID, regardless of @cpu */
+static int noop_x86_32_early_logical_apicid(int cpu)
+{
+ return BAD_APICID;
+}
+#endif
+
+/*
+ * "noop" APIC driver: the IPI, ICR and register accessors are the noop_*
+ * stubs defined in this file; the remaining callbacks are the default
+ * helpers or NULL.
+ */
+struct apic apic_noop __ro_after_init = {
+ .name = "noop",
+ .probe = noop_probe,
+ .acpi_madt_oem_check = NULL,
+
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = noop_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+
+ .init_apic_ldr = noop_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = NULL,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+
+ .phys_pkg_id = noop_phys_pkg_id,
+
+ .get_apic_id = noop_get_apic_id,
+ .set_apic_id = NULL,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ .send_IPI = noop_send_IPI,
+ .send_IPI_mask = noop_send_IPI_mask,
+ .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = noop_send_IPI_allbutself,
+ .send_IPI_all = noop_send_IPI_all,
+ .send_IPI_self = noop_send_IPI_self,
+
+ .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
+
+ .inquire_remote_apic = NULL,
+
+ .read = noop_apic_read,
+ .write = noop_apic_write,
+ .eoi_write = noop_apic_write,
+ .icr_read = noop_apic_icr_read,
+ .icr_write = noop_apic_icr_write,
+ .wait_icr_idle = noop_apic_wait_icr_idle,
+ .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+ .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
+#endif
+};
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 0000000..78778b5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,338 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#include <linux/init.h>
+
+#include <asm/numachip/numachip.h>
+#include <asm/numachip/numachip_csr.h>
+#include <asm/ipi.h>
+#include <asm/apic_flat_64.h>
+#include <asm/pgtable.h>
+#include <asm/pci_x86.h>
+
+u8 numachip_system __read_mostly;
+static const struct apic apic_numachip1;
+static const struct apic apic_numachip2;
+static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
+
+/*
+ * Build the APIC id from bits 24-31 of @x; when the CPU has
+ * X86_FEATURE_NODEID_MSR, bits taken from MSR_FAM10H_NODE_ID are OR-ed
+ * into bits 8-15.
+ */
+static unsigned int numachip1_get_apic_id(unsigned long x)
+{
+	unsigned int id = (x >> 24) & 0xff;
+	unsigned long node_id;
+
+	if (!static_cpu_has(X86_FEATURE_NODEID_MSR))
+		return id;
+
+	rdmsrl(MSR_FAM10H_NODE_ID, node_id);
+
+	return id | ((node_id << 2) & 0xff00);
+}
+
+/* Place the low 8 bits of @id into bits 24-31 */
+static u32 numachip1_set_apic_id(unsigned int id)
+{
+ return (id & 0xff) << 24;
+}
+
+/*
+ * Combine bits derived from MSR_FAM10H_MMIO_CONF_BASE (masked with
+ * 0xfff00) with @x shifted right by 24.
+ */
+static unsigned int numachip2_get_apic_id(unsigned long x)
+{
+	u64 mcfg, upper;
+
+	rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+	upper = (mcfg >> (28 - 8)) & 0xfff00;
+
+	return upper | (x >> 24);
+}
+
+/* Shift @id left by 24; unlike the numachip1 variant, @id is not masked */
+static u32 numachip2_set_apic_id(unsigned int id)
+{
+ return id << 24;
+}
+
+/* Every @apicid is accepted: always returns 1 */
+static int numachip_apic_id_valid(u32 apicid)
+{
+ /* Trust what bootloader passes in MADT */
+ return 1;
+}
+
+/* Always returns 1 */
+static int numachip_apic_id_registered(void)
+{
+ return 1;
+}
+
+/* Return @initial_apic_id shifted right by @index_msb bits */
+static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+/* Write (@apicid << 16) | @val to the CSR_G3_EXT_IRQ_GEN local CSR */
+static void numachip1_apic_icr_write(int apicid, unsigned int val)
+{
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
+}
+
+/* Write (@apicid << 12) | @val to the NUMACHIP2_APIC_ICR local CSR */
+static void numachip2_apic_icr_write(int apicid, unsigned int val)
+{
+ numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
+}
+
+/*
+ * Wake @phys_apicid by writing APIC_DM_INIT followed by APIC_DM_STARTUP
+ * (carrying @start_rip >> 12) through numachip_apic_icr_write.
+ * Always returns 0.
+ */
+static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
+ numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
+ (start_rip >> 12));
+
+ return 0;
+}
+
+/*
+ * Send @vector to @cpu.
+ *
+ * When the target's APIC id matches the sender's above the low
+ * NUMACHIP_LAPIC_BITS bits, the IPI goes through the local APIC;
+ * otherwise it is written through numachip_apic_icr_write.
+ */
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+	int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu);
+	unsigned int dmode;
+	bool via_lapic;
+
+	preempt_disable();
+	local_apicid = __this_cpu_read(x86_cpu_to_apicid);
+
+	/* Send via local APIC where non-local part matches */
+	via_lapic = !((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS);
+	if (via_lapic) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__default_send_IPI_dest_field(apicid, vector,
+					      APIC_DEST_PHYSICAL);
+		local_irq_restore(flags);
+	}
+	preempt_enable();
+
+	if (via_lapic)
+		return;
+
+	dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED;
+	numachip_apic_icr_write(apicid, dmode | vector);
+}
+
+/* Send @vector to every CPU in @mask, one at a time */
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+/* Send @vector to every CPU in @mask except the calling CPU */
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+					      int vector)
+{
+	unsigned int self = smp_processor_id();
+	unsigned int cpu;
+
+	for_each_cpu(cpu, mask) {
+		if (cpu == self)
+			continue;
+
+		numachip_send_IPI_one(cpu, vector);
+	}
+}
+
+/* Send @vector to every online CPU except the calling CPU */
+static void numachip_send_IPI_allbutself(int vector)
+{
+	unsigned int self = smp_processor_id();
+	unsigned int cpu;
+
+	for_each_online_cpu(cpu) {
+		if (cpu == self)
+			continue;
+
+		numachip_send_IPI_one(cpu, vector);
+	}
+}
+
+/* Send @vector to every CPU in cpu_online_mask */
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+/* Send @vector to the calling CPU by writing it to APIC_SELF_IPI */
+static void numachip_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+/* Probe succeeds only if the global 'apic' already points at this driver */
+static int __init numachip1_probe(void)
+{
+ return apic == &apic_numachip1;
+}
+
+/* Probe succeeds only if the global 'apic' already points at this driver */
+static int __init numachip2_probe(void)
+{
+ return apic == &apic_numachip2;
+}
+
+/*
+ * Set this CPU's cpu_llc_id to @node and @c->phys_proc_id to @node divided
+ * by the number of nodes per socket (1 unless the CPU has
+ * X86_FEATURE_NODEID_MSR, in which case it is read from
+ * MSR_FAM10H_NODE_ID).
+ */
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ u64 val;
+ u32 nodes = 1;
+
+ this_cpu_write(cpu_llc_id, node);
+
+ /* Account for nodes per socket in multi-core-module processors */
+ if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrl(MSR_FAM10H_NODE_ID, val);
+ /* bits 3-5 of the MSR value, plus one */
+ nodes = ((val >> 3) & 7) + 1;
+ }
+
+ c->phys_proc_id = node / nodes;
+}
+
+/*
+ * Early initcall: for a detected NumaChip system (numachip_system 1 or 2),
+ * map the LCSR area, select the matching ICR write routine and install the
+ * fixup_cpu_id and PCI arch_init hooks. Does nothing for any other value.
+ * Always returns 0.
+ */
+static int __init numachip_system_init(void)
+{
+	/* Map the LCSR area and set up the apic_icr_write function */
+	if (numachip_system == 1) {
+		init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+		numachip_apic_icr_write = numachip1_apic_icr_write;
+	} else if (numachip_system == 2) {
+		init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
+		numachip_apic_icr_write = numachip2_apic_icr_write;
+	} else {
+		return 0;
+	}
+
+	x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+	x86_init.pci.arch_init = pci_numachip_init;
+
+	return 0;
+}
+early_initcall(numachip_system_init);
+
+/*
+ * Match OEM id "NUMASC" with table id "NCONNECT"; on a match set
+ * numachip_system to 1 and return 1, otherwise return 0.
+ */
+static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (strncmp(oem_id, "NUMASC", 6) == 0 &&
+	    strncmp(oem_table_id, "NCONNECT", 8) == 0) {
+		numachip_system = 1;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Match OEM id "NUMASC" with table id "NCONECT2"; on a match set
+ * numachip_system to 2 and return 1, otherwise return 0.
+ */
+static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (strncmp(oem_id, "NUMASC", 6) == 0 &&
+	    strncmp(oem_table_id, "NCONECT2", 8) == 0) {
+		numachip_system = 2;
+		return 1;
+	}
+
+	return 0;
+}
+
+/* APIC IPIs are queued, so this wait is an empty function */
+static void numachip_apic_wait_icr_idle(void)
+{
+}
+
+/* APIC NMI IPIs are queued, so this wait returns 0 immediately */
+static u32 numachip_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+/*
+ * "NumaConnect system" APIC driver: physical destination mode with the
+ * numachip_* IPI senders and the numachip1 APIC id accessors.
+ */
+static const struct apic apic_numachip1 __refconst = {
+ .name = "NumaConnect system",
+ .probe = numachip1_probe,
+ .acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = numachip_phys_pkg_id,
+
+ .get_apic_id = numachip1_get_apic_id,
+ .set_apic_id = numachip1_set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = numachip_apic_wait_icr_idle,
+ .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
+};
+
+apic_driver(apic_numachip1);
+
+/*
+ * "NumaConnect2 system" APIC driver: same callbacks as apic_numachip1
+ * except for the probe, MADT OEM check and APIC id accessors.
+ */
+static const struct apic apic_numachip2 __refconst = {
+ .name = "NumaConnect2 system",
+ .probe = numachip2_probe,
+ .acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = numachip_phys_pkg_id,
+
+ .get_apic_id = numachip2_get_apic_id,
+ .set_apic_id = numachip2_set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = numachip_apic_wait_icr_idle,
+ .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
+};
+
+apic_driver(apic_numachip2);
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
new file mode 100644
index 0000000..afee386
--- /dev/null
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
+ *
+ * Drives the local APIC in "clustered mode".
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+/* Extract bits 24-31 of @x */
+static unsigned bigsmp_get_apic_id(unsigned long x)
+{
+	unsigned long id_field = x >> 24;
+
+	return id_field & 0xFF;
+}
+
+/* Always returns 1 */
+static int bigsmp_apic_id_registered(void)
+{
+ return 1;
+}
+
+/* Always returns false, regardless of @map and @apicid */
+static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return false;
+}
+
+/* Return the early per-CPU x86_cpu_to_apicid value of @cpu */
+static int bigsmp_early_logical_apicid(int cpu)
+{
+ /* on bigsmp, logical apicid is the same as physical */
+ return early_per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+/*
+ * Return the current APIC_LDR value with the bits in APIC_LDR_MASK replaced
+ * by the logical id derived from @cpu's x86_bios_cpu_apicid.
+ */
+static inline unsigned long calculate_ldr(int cpu)
+{
+	unsigned long ldr = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+	unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
+
+	return ldr | SET_APIC_LOGICAL_ID(id);
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void bigsmp_init_apic_ldr(void)
+{
+	int cpu = smp_processor_id();
+
+	apic_write(APIC_DFR, APIC_DFR_FLAT);
+	/* calculate_ldr() reads LDR, so it must run after the DFR write */
+	apic_write(APIC_LDR, calculate_ldr(cpu));
+}
+
+/* Log the selected APIC mode and the number of I/O APICs */
+static void bigsmp_setup_apic_routing(void)
+{
+	pr_info("Enabling APIC mode: Physflat. Using %d I/O APICs\n",
+		nr_ioapics);
+}
+
+/*
+ * Return the x86_bios_cpu_apicid of @mps_cpu, or BAD_APICID when @mps_cpu
+ * is not below nr_cpu_ids.
+ */
+static int bigsmp_cpu_present_to_apicid(int mps_cpu)
+{
+	if (mps_cpu >= nr_cpu_ids)
+		return BAD_APICID;
+
+	return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+}
+
+/* Fill *@retmap from the constant 0xFF; @phys_map is not used */
+static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ physids_promote(0xFFL, retmap);
+}
+
+/* Always returns 1, regardless of @phys_apicid */
+static int bigsmp_check_phys_apicid_present(int phys_apicid)
+{
+ return 1;
+}
+
+/* Return @cpuid_apic shifted right by @index_msb bits */
+static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+/* Send @vector over cpu_online_mask via the physical "allbutself" helper */
+static void bigsmp_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+/* Send @vector over cpu_online_mask via the physical sequence helper */
+static void bigsmp_send_IPI_all(int vector)
+{
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
+}
+
+static int dmi_bigsmp; /* can be set by dmi scanners */
+
+/* DMI callback: log the match and set dmi_bigsmp; always returns 0. */
+static int hp_ht_bigsmp(const struct dmi_system_id *d)
+{
+	dmi_bigsmp = 1;
+	pr_notice("%s detected: force use of apic=bigsmp\n", d->ident);
+
+	return 0;
+}
+
+
+/* DMI matches for which hp_ht_bigsmp() is invoked */
+static const struct dmi_system_id bigsmp_dmi_table[] = {
+	{
+		.callback = hp_ht_bigsmp,
+		.ident = "HP ProLiant DL760 G2",
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+			DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
+		},
+	},
+	{
+		.callback = hp_ht_bigsmp,
+		.ident = "HP ProLiant DL740",
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+			DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
+		},
+	},
+	{ } /* NULL entry stops DMI scanning */
+};
+
+/*
+ * Return dmi_bigsmp after forcing it to 1 when def_to_bigsmp is set, or
+ * after scanning bigsmp_dmi_table otherwise.
+ */
+static int probe_bigsmp(void)
+{
+	if (!def_to_bigsmp)
+		dmi_check_system(bigsmp_dmi_table);
+	else
+		dmi_bigsmp = 1;
+
+	return dmi_bigsmp;
+}
+
+/*
+ * "bigsmp" APIC driver: physical destination mode (.irq_dest_mode = 0,
+ * .dest_logical = 0) with .disable_esr set, the default_*_phys IPI senders
+ * and the native memory-mapped register accessors.
+ */
+static struct apic apic_bigsmp __ro_after_init = {
+
+ .name = "bigsmp",
+ .probe = probe_bigsmp,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = bigsmp_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPU: */
+ .irq_dest_mode = 0,
+
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = bigsmp_check_apicid_used,
+
+ .init_apic_ldr = bigsmp_init_apic_ldr,
+
+ .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
+ .setup_apic_routing = bigsmp_setup_apic_routing,
+ .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+ .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
+ .phys_pkg_id = bigsmp_phys_pkg_id,
+
+ .get_apic_id = bigsmp_get_apic_id,
+ .set_apic_id = NULL,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
+ .send_IPI_all = bigsmp_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
+};
+
+/*
+ * Switch the global 'apic' to the bigsmp driver when probe_bigsmp()
+ * succeeds, and recompute the logical APIC id of every possible CPU whose
+ * current logical id is not BAD_APICID.
+ */
+void __init generic_bigsmp_probe(void)
+{
+	unsigned int cpu;
+
+	if (!probe_bigsmp())
+		return;
+
+	apic = &apic_bigsmp;
+
+	for_each_possible_cpu(cpu) {
+		if (early_per_cpu(x86_cpu_to_logical_apicid, cpu) != BAD_APICID)
+			early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+				bigsmp_early_logical_apicid(cpu);
+	}
+
+	pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
+}
+
+apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 0000000..d1fc62a
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+/*
+ * Sample period for the hard-lockup detector: cpu_khz * 1000 multiplied by
+ * @watchdog_thresh, computed in 64 bits.
+ */
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
+{
+	u64 per_thresh_unit = (u64)cpu_khz * 1000;
+
+	return per_thresh_unit * watchdog_thresh;
+}
+#endif
+
+#ifdef arch_trigger_cpumask_backtrace
+/* Send an NMI_VECTOR IPI to the CPUs in @mask through the active APIC driver. */
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
+{
+ apic->send_IPI_mask(mask, NMI_VECTOR);
+}
+
+/*
+ * Arch hook for CPU backtraces: forwards @mask and @exclude_self to the
+ * generic nmi_trigger_cpumask_backtrace(), with nmi_raise_cpu_backtrace()
+ * as the callback that raises the NMI.
+ */
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+ nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_raise_cpu_backtrace);
+}
+
+/*
+ * NMI handler for backtrace requests: NMI_HANDLED when nmi_cpu_backtrace()
+ * reports that it consumed the NMI, NMI_DONE otherwise.  @cmd is unused.
+ */
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+{
+	return nmi_cpu_backtrace(regs) ? NMI_HANDLED : NMI_DONE;
+}
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
+
+/*
+ * Early initcall: register nmi_cpu_backtrace_handler() as an NMI_LOCAL
+ * handler named "arch_bt".  The result of register_nmi_handler() is not
+ * checked; this function always returns 0.
+ */
+static int __init register_nmi_cpu_backtrace_handler(void)
+{
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
+ 0, "arch_bt");
+ return 0;
+}
+early_initcall(register_nmi_cpu_backtrace_handler);
+#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
new file mode 100644
index 0000000..ff0d14c
--- /dev/null
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -0,0 +1,3012 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ *
+ * Historical information which is worth to be preserved:
+ *
+ * - SiS APIC rmw bug:
+ *
+ * We used to have a workaround for a bug in SiS chips which
+ * required to rewrite the index register for a read-modify-write
+ * operation as the chip lost the index information which was
+ * setup for the read already. We cache the data now, so that
+ * workaround has been removed.
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+
+#include <asm/irqdomain.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/setup.h>
+#include <asm/irq_remapping.h>
+#include <asm/hw_irq.h>
+
+#include <asm/apic.h>
+
+/*
+ * Iteration helpers over the ioapics[] array and its pins.
+ *
+ * for_each_ioapic_reverse() terminates on "(idx) >= 0", so @idx must be a
+ * signed variable.  for_each_irq_pin() walks the irq_pin_list entries
+ * chained on the list head @head through their 'list' member; @head is an
+ * lvalue, not a pointer.
+ */
+#define for_each_ioapic(idx)		\
+	for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define for_each_ioapic_reverse(idx)	\
+	for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define for_each_pin(idx, pin)		\
+	for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define for_each_ioapic_pin(idx, pin)	\
+	for_each_ioapic((idx))		\
+		for_each_pin((idx), (pin))
+#define for_each_irq_pin(entry, head)	\
+	list_for_each_entry(entry, &(head), list)
+
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
+
+/*
+ * One (IO-APIC index, pin) pair routed to an IRQ.  Entries are chained on
+ * mp_chip_data.irq_2_pin through @list.
+ */
+struct irq_pin_list {
+ struct list_head list;
+ int apic, pin;
+};
+
+/*
+ * Per-IRQ IO-APIC state, reached through the irq_data chip_data pointer.
+ *
+ * @irq_2_pin: list of irq_pin_list entries wired to this IRQ
+ * @entry:     cached routing entry; io_apic_modify_irq() edits it and
+ *             writes its low word to every pin
+ * @trigger:   trigger mode compared against new users in mp_check_pin_attr()
+ * @polarity:  polarity compared against new users in mp_check_pin_attr()
+ * @count:     bumped by mp_map_pin_to_irq() on allocation, dropped by
+ *             mp_unmap_irq()
+ * @isa_irq:   set for IRQs allocated by alloc_isa_irq_from_domain();
+ *             mp_unmap_irq() never frees those
+ */
+struct mp_chip_data {
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ int trigger;
+ int polarity;
+ u32 count;
+ bool isa_irq;
+};
+
+/*
+ * GSI range served by one IO-APIC.  @gsi_end is inclusive:
+ * mp_ioapic_pin_count() computes gsi_end - gsi_base + 1.
+ */
+struct mp_ioapic_gsi {
+ u32 gsi_base;
+ u32 gsi_end;
+};
+
+/* Per-IO-APIC bookkeeping; the first nr_ioapics slots of ioapics[] are iterated. */
+static struct ioapic {
+ /*
+ * # of IRQ routing registers
+ */
+ int nr_registers;
+ /*
+ * Saved state during suspend/resume, or while enabling intr-remap.
+ */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ /* irqdomain_cfg.type selects the IRQ numbering policy in alloc_irq_from_domain() */
+ struct ioapic_domain_cfg irqdomain_cfg;
+ /* irqdomain used to map this IO-APIC's pins to Linux IRQs */
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
+} ioapics[MAX_IO_APICS];
+
+/* Version field (mp_config.apicver) of the IO-APIC at index @ioapic_idx. */
+#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
+
+/* APIC ID (mp_config.apicid) of the IO-APIC at index @ioapic_idx; no bounds check. */
+int mpc_ioapic_id(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicid;
+}
+
+/* Address (mp_config.apicaddr) recorded for the IO-APIC at index @ioapic_idx. */
+unsigned int mpc_ioapic_addr(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicaddr;
+}
+
+/* Pointer to the GSI range descriptor of the IO-APIC at index @ioapic_idx. */
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+{
+ return &ioapics[ioapic_idx].gsi_config;
+}
+
+/* Number of pins on @ioapic, derived from its inclusive GSI range. */
+static inline int mp_ioapic_pin_count(int ioapic)
+{
+	struct mp_ioapic_gsi *range = &ioapics[ioapic].gsi_config;
+
+	return range->gsi_end - range->gsi_base + 1;
+}
+
+/* GSI of @pin on @ioapic: the IO-APIC's gsi_base plus the pin number. */
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+ return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
+}
+
+/* True when @irq lies in [0, nr_legacy_irqs()). */
+static inline bool mp_is_legacy_irq(int irq)
+{
+	if (irq < 0)
+		return false;
+
+	return irq < nr_legacy_irqs();
+}
+
+/*
+ * With a legacy interrupt controller present, every legacy IRQ and every
+ * pin of the first IOAPIC is initialized at boot; the "pirq=" kernel boot
+ * option may rely on non-legacy pins of the first IOAPIC.  Without legacy
+ * IRQs the answer is always 0.
+ */
+static inline int mp_init_irq_at_boot(int ioapic, int irq)
+{
+	if (nr_legacy_irqs() == 0)
+		return 0;
+	if (ioapic == 0)
+		return 1;
+
+	return mp_is_legacy_irq(irq);
+}
+
+/* irqdomain stored for IO-APIC index @ioapic (mp_map_pin_to_irq() treats NULL as -ENOSYS). */
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
+{
+ return ioapics[ioapic].irqdomain;
+}
+
+int nr_ioapics;
+
+/* One past the highest GSI number in use */
+u32 gsi_top;
+
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+#ifdef CONFIG_EISA
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+int skip_ioapic_setup;
+
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ *
+ * Sets skip_ioapic_setup and, when CONFIG_PCI is enabled, also sets the
+ * noioapicquirk and noioapicreroute variables (declared elsewhere).
+ */
+void disable_ioapic_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
+/* Early handler for the "noapic" boot parameter; @str is ignored. Returns 0. */
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ disable_ioapic_support();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+/*
+ * Called by the mpparse/acpi/sfi code to record an interrupt source entry.
+ *
+ * An entry that is byte-for-byte identical to one already stored is
+ * dropped.  Otherwise *@m is appended to mp_irqs[]; the kernel panics as
+ * soon as the entry count reaches MAX_IRQ_SOURCES, i.e. when the last
+ * slot has just been filled.
+ */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
+
+ /* Skip exact duplicates */
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
+ return;
+ }
+
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+/*
+ * Allocate the zeroed buffer that holds a copy of every routing entry of
+ * IO-APIC @idx (one element per register counted in nr_registers).  An
+ * existing buffer is kept.  Allocation failure is only logged; the pointer
+ * then stays NULL, which save_ioapic_entries(), mask_ioapic_entries() and
+ * restore_ioapic_entries() check for.
+ */
+static void alloc_ioapic_saved_registers(int idx)
+{
+	struct ioapic *ioa = &ioapics[idx];
+
+	if (ioa->saved_registers)
+		return;
+
+	/* kcalloc() fails cleanly if count * element size would overflow. */
+	ioa->saved_registers = kcalloc(ioa->nr_registers,
+				       sizeof(*ioa->saved_registers),
+				       GFP_KERNEL);
+	if (!ioa->saved_registers)
+		pr_err("IOAPIC %d: suspend/resume impossible!\n", idx);
+}
+
+/*
+ * Free the saved-register buffer of IO-APIC @idx and clear the pointer so
+ * that alloc_ioapic_saved_registers() can allocate it again.
+ */
+static void free_ioapic_saved_registers(int idx)
+{
+ kfree(ioapics[idx].saved_registers);
+ ioapics[idx].saved_registers = NULL;
+}
+
+/*
+ * Early IO-APIC setup.  When there are no legacy IRQs, io_apic_irqs is set
+ * to all ones.  A saved-register buffer is then allocated for every
+ * registered IO-APIC.  Always returns 0.
+ */
+int __init arch_early_ioapic_init(void)
+{
+ int i;
+
+ if (!nr_legacy_irqs())
+ io_apic_irqs = ~0UL;
+
+ for_each_ioapic(i)
+ alloc_ioapic_saved_registers(i);
+
+ return 0;
+}
+
+/*
+ * Memory-mapped register window of an IO-APIC as accessed by this file.
+ * With 4-byte unsigned int the padding arrays place @index at offset 0x00,
+ * @data at 0x10 and @eoi at 0x40.
+ */
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
+};
+
+/*
+ * Virtual address of the register window of IO-APIC @idx: the fixmap slot
+ * FIX_IO_APIC_BASE_0 + @idx plus the bits of the recorded address selected
+ * by ~PAGE_MASK.
+ */
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
+}
+
+/* Write @vector to the EOI register of IO-APIC @apic. */
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
+}
+
+/*
+ * Indirect register read: select @reg through the index register, then
+ * read the data register.  The two accesses are not protected here.
+ */
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+}
+
+/*
+ * Indirect register write: select @reg through the index register, then
+ * store @value in the data register.  The two accesses are not protected
+ * here.
+ */
+static void io_apic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+/*
+ * View of a routing entry as the two 32-bit words that are transferred
+ * through the IO-APIC data register: @w1 goes to register 0x10 + 2 * pin,
+ * @w2 to register 0x11 + 2 * pin.
+ */
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+/*
+ * Read both words of the routing entry of @pin on @apic.  No locking is
+ * done here; ioapic_read_entry() is the variant that takes ioapic_lock.
+ */
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
+ return eu.entry;
+}
+
+/* Read the routing entry of @pin on @apic with ioapic_lock held and IRQs off. */
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+	struct IO_APIC_route_entry rte;
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&ioapic_lock, irqflags);
+	rte = __ioapic_read_entry(apic, pin);
+	raw_spin_unlock_irqrestore(&ioapic_lock, irqflags);
+
+	return rte;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ *
+ * No locking is done here; ioapic_write_entry() is the variant that takes
+ * ioapic_lock.
+ */
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu = {{0, 0}};
+
+ eu.entry = e;
+ /* high word (register 0x11 + 2 * pin) before low word (0x10 + 2 * pin) */
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+/* Write routing entry @e to @pin on @apic with ioapic_lock held and IRQs off. */
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ *
+ * The entry written has only the mask field set; every other field is
+ * zero.
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = IOAPIC_MASKED };
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * IRQs usually map 1:1 to a pin, but shared ISA-space IRQs can have
+ * several pins, so each IRQ carries a list of (apic, pin) pairs.
+ *
+ * Append (@apic, @pin) to @data's pin list unless it is already there.
+ * The list node is allocated on @node with GFP_ATOMIC.
+ *
+ * Returns 0 on success or when the pair already exists, -ENOMEM when the
+ * allocation fails.
+ */
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+				 int node, int apic, int pin)
+{
+	struct irq_pin_list *cur, *fresh;
+
+	/* A pair that is already listed is not added twice. */
+	for_each_irq_pin(cur, data->irq_2_pin) {
+		if (cur->apic == apic && cur->pin == pin)
+			return 0;
+	}
+
+	fresh = kzalloc_node(sizeof(*fresh), GFP_ATOMIC, node);
+	if (!fresh) {
+		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
+		       node, apic, pin);
+		return -ENOMEM;
+	}
+
+	fresh->apic = apic;
+	fresh->pin = pin;
+	list_add_tail(&fresh->list, &data->irq_2_pin);
+
+	return 0;
+}
+
+/*
+ * Unlink and free the first pin-list entry of @data matching (@apic, @pin).
+ * Nothing happens when no entry matches.
+ */
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
+{
+ struct irq_pin_list *tmp, *entry;
+
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
+ if (entry->apic == apic && entry->pin == pin) {
+ list_del(&entry->list);
+ kfree(entry);
+ return;
+ }
+}
+
+/* Like __add_pin_to_irq_node(), but any failure is fatal (panic). */
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
+{
+ if (__add_pin_to_irq_node(data, node, apic, pin))
+ panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
+}
+
+/*
+ * Reroute an IRQ to a different pin.
+ *
+ * The first list entry matching (@oldapic, @oldpin) is rewritten in place
+ * to (@newapic, @newpin).  If there is no such entry, the new pair is
+ * added instead, which panics on allocation failure.
+ */
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ /* every one is different, right? */
+ return;
+ }
+ }
+
+ /* old apic/pin didn't exist, so just add new ones */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
+
+/*
+ * Update the low word of the cached routing entry of @data
+ * (w1 = (w1 & @mask_and) | @mask_or), store it back in the cache and write
+ * it to every pin on the IRQ's pin list.  @final, when non-NULL, is called
+ * for each pin right after its register write.
+ *
+ * No locking is done here; mask_ioapic_irq() and unmask_ioapic_irq() call
+ * this with ioapic_lock held.
+ */
+static void io_apic_modify_irq(struct mp_chip_data *data,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ union entry_union eu;
+ struct irq_pin_list *entry;
+
+ eu.entry = data->entry;
+ eu.w1 &= mask_and;
+ eu.w1 |= mask_or;
+ data->entry = eu.entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ if (final)
+ final(entry);
+ }
+}
+
+/* Per-pin 'final' callback used by mask_ioapic_irq(); the value read is discarded. */
+static void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
+
+/*
+ * Set IO_APIC_REDIR_MASKED in the routing entry of every pin of the IRQ
+ * and read back from each IO-APIC (io_apic_sync), under ioapic_lock.
+ */
+static void mask_ioapic_irq(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * Clear IO_APIC_REDIR_MASKED on every pin of @data.  No read-back and no
+ * locking; unmask_ioapic_irq() wraps this with ioapic_lock.
+ */
+static void __unmask_ioapic(struct mp_chip_data *data)
+{
+ io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
+
+/* Unmask every pin of the IRQ described by @irq_data, under ioapic_lock. */
+static void unmask_ioapic_irq(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_ioapic(data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * IO-APIC versions below 0x20 don't support EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
+ * version as 0x2. This is an error with documentation and these ICH chips
+ * use io-apic's of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ *
+ * Uses the unlocked register accessors; eoi_ioapic_pin() and
+ * clear_IO_APIC_pin() call this with ioapic_lock held.
+ */
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
+{
+ if (mpc_ioapic_ver(apic) >= 0x20) {
+ io_apic_eoi(apic, vector);
+ } else {
+ struct IO_APIC_route_entry entry, entry1;
+
+ /* entry keeps the original contents, entry1 is the temporary edit */
+ entry = entry1 = __ioapic_read_entry(apic, pin);
+
+ /*
+ * Mask the entry and change the trigger mode to edge.
+ */
+ entry1.mask = IOAPIC_MASKED;
+ entry1.trigger = IOAPIC_EDGE;
+
+ __ioapic_write_entry(apic, pin, entry1);
+
+ /*
+ * Restore the previous level triggered entry.
+ */
+ __ioapic_write_entry(apic, pin, entry);
+ }
+}
+
+/* Issue an EOI for @vector on every pin on @data's pin list, under ioapic_lock. */
+static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
+{
+ unsigned long flags;
+ struct irq_pin_list *entry;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, vector);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * Bring one pin into a quiescent, masked state:
+ *  1. leave SMI pins alone,
+ *  2. mask the entry if it is unmasked and re-read it,
+ *  3. if remote-IRR is set, force level trigger and issue an EOI,
+ *  4. overwrite the entry with a mask-only value,
+ *  5. complain if remote-IRR is still set afterwards.
+ */
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+
+ /*
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remote-IRR is set.
+ */
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
+ ioapic_write_entry(apic, pin, entry);
+ entry = ioapic_read_entry(apic, pin);
+ }
+
+ if (entry.irr) {
+ unsigned long flags;
+
+ /*
+ * Make sure the trigger mode is set to level. Explicit EOI
+ * doesn't clear the remote-IRR if the trigger mode is not
+ * set to level.
+ */
+ if (entry.trigger == IOAPIC_EDGE) {
+ entry.trigger = IOAPIC_LEVEL;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_pin(apic, pin, entry.vector);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ /*
+ * Clear the rest of the bits in the IO-APIC RTE except for the mask
+ * bit.
+ */
+ ioapic_mask_entry(apic, pin);
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.irr)
+ pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
+ mpc_ioapic_id(apic), pin);
+}
+
+/* Run clear_IO_APIC_pin() on every pin of every registered IO-APIC. */
+void clear_IO_APIC(void)
+{
+	int ioapic_idx, pin_nr;
+
+	for_each_ioapic(ioapic_idx) {
+		for_each_pin(ioapic_idx, pin_nr)
+			clear_IO_APIC_pin(ioapic_idx, pin_nr);
+	}
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+/* -1 means "no override"; pin_2_irq() treats 0 as "PIRQ disabled". */
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
+
+/*
+ * Parse the "pirq=" boot option: a list of up to MAX_PIRQS integers.
+ * ints[0] holds the number of values parsed by get_options(); value i is
+ * stored in pirq_entries[MAX_PIRQS - 1 - i].  Returns 1.
+ */
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ /* use at most MAX_PIRQS of the parsed values */
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Snapshot the routing entries of every IO-APIC into its saved_registers
+ * buffer.
+ *
+ * IO-APICs without a buffer are skipped; in that case the remaining ones
+ * are still saved and -ENOMEM is returned.  Returns 0 when all were saved.
+ */
+int save_ioapic_entries(void)
+{
+	int apic, pin;
+	int err = 0;
+
+	for_each_ioapic(apic) {
+		struct IO_APIC_route_entry *saved = ioapics[apic].saved_registers;
+
+		if (saved) {
+			for_each_pin(apic, pin)
+				saved[pin] = ioapic_read_entry(apic, pin);
+		} else {
+			err = -ENOMEM;
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Mask all IO APIC entries.
+ *
+ * The decision is based on the saved copy: every pin whose saved entry is
+ * unmasked gets that entry written back with the mask bit set.  IO-APICs
+ * without a saved_registers buffer are skipped.
+ */
+void mask_ioapic_entries(void)
+{
+	int apic, pin;
+
+	for_each_ioapic(apic) {
+		struct IO_APIC_route_entry *saved = ioapics[apic].saved_registers;
+
+		if (!saved)
+			continue;
+
+		for_each_pin(apic, pin) {
+			struct IO_APIC_route_entry rte = saved[pin];
+
+			if (rte.mask != IOAPIC_UNMASKED)
+				continue;
+
+			rte.mask = IOAPIC_MASKED;
+			ioapic_write_entry(apic, pin, rte);
+		}
+	}
+}
+
+/*
+ * Write the routing entries kept in each IO-APIC's saved_registers buffer
+ * back to the hardware.  IO-APICs without a buffer are skipped.
+ * Always returns 0.
+ */
+int restore_ioapic_entries(void)
+{
+	int apic, pin;
+
+	for_each_ioapic(apic) {
+		struct IO_APIC_route_entry *saved = ioapics[apic].saved_registers;
+
+		if (saved) {
+			for_each_pin(apic, pin)
+				ioapic_write_entry(apic, pin, saved[pin]);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ *
+ * Returns the index of the first mp_irqs[] entry of interrupt type @type
+ * whose destination pin is @pin and whose destination APIC is either the
+ * ID of IO-APIC @ioapic_idx or MP_APIC_ALL; -1 when there is none.
+ */
+static int find_irq_entry(int ioapic_idx, int pin, int type)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		struct mpc_intsrc *src = &mp_irqs[i];
+
+		if (src->irqtype != type)
+			continue;
+		if (src->dstapic != mpc_ioapic_id(ioapic_idx) &&
+		    src->dstapic != MP_APIC_ALL)
+			continue;
+		if (src->dstirq == pin)
+			return i;
+	}
+
+	return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ *
+ * Only entries on a bus flagged in mp_bus_not_pci and with interrupt type
+ * @type are considered.  Returns the destination pin of the first match,
+ * or -1.
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+
+ return mp_irqs[i].dstirq;
+ }
+ return -1;
+}
+
+/*
+ * Find the IO-APIC to which ISA IRQ @irq of interrupt type @type is
+ * connected.  The source entry is located with the same criteria as in
+ * find_isa_irq_pin(); the result is the index of the IO-APIC whose APIC ID
+ * equals that entry's destination APIC, or -1 when no entry or no IO-APIC
+ * matches.
+ */
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+ break;
+ }
+
+ /* i == mp_irq_entries means the loop above found nothing */
+ if (i < mp_irq_entries) {
+ int ioapic_idx;
+
+ for_each_ioapic(ioapic_idx)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+ return ioapic_idx;
+ }
+
+ return -1;
+}
+
+#ifdef CONFIG_EISA
+/*
+ * EISA Edge/Level control register, ELCR
+ *
+ * Returns the ELCR bit of legacy @irq, read from I/O port 0x4d0 + irq / 8
+ * (bit irq % 8).  An @irq outside the legacy range is reported at
+ * APIC_VERBOSE level and yields 0.
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+	if (irq < nr_legacy_irqs()) {
+		unsigned int port = 0x4d0 + (irq >> 3);
+		return (inb(port) >> (irq & 7)) & 1;
+	}
+	/* @irq is unsigned, so it is printed with %u */
+	apic_printk(APIC_VERBOSE, KERN_INFO
+			"Broken MPtable reports ISA irq %u\n", irq);
+	return 0;
+}
+
+#endif
+
+/* ISA interrupts are always active high edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (IOAPIC_EDGE)
+#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
+/* PCI interrupts are always active low level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
+#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
+
+/*
+ * Polarity (IOAPIC_POL_HIGH or IOAPIC_POL_LOW) of mp_irqs[@idx], decoded
+ * from the polarity bits of its irqflag.  "Default" resolves by bus type:
+ * buses flagged in mp_bus_not_pci get the ISA default, all others the PCI
+ * default.  The reserved encoding is reported and treated as active low.
+ */
+static int irq_polarity(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
+ case MP_IRQPOL_DEFAULT:
+ /* conforms to spec, ie. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ return default_ISA_polarity(idx);
+ else
+ return default_PCI_polarity(idx);
+ case MP_IRQPOL_ACTIVE_HIGH:
+ return IOAPIC_POL_HIGH;
+ case MP_IRQPOL_RESERVED:
+ pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ /* fall through */
+ case MP_IRQPOL_ACTIVE_LOW:
+ default: /* Pointless default required due to gcc stupidity */
+ return IOAPIC_POL_LOW;
+ }
+}
+
+#ifdef CONFIG_EISA
+/*
+ * Adjust the bus-default @trigger for EISA: PCI and ISA buses keep
+ * @trigger, EISA buses take the mode from the ELCR.  Any other bus type is
+ * reported and treated as level triggered.
+ */
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_PCI:
+ case MP_BUS_ISA:
+ return trigger;
+ case MP_BUS_EISA:
+ return default_EISA_trigger(idx);
+ }
+ pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+ return IOAPIC_LEVEL;
+}
+#else
+/* Without CONFIG_EISA the bus-default @trigger is used unchanged. */
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ return trigger;
+}
+#endif
+
+/*
+ * Trigger mode (IOAPIC_EDGE or IOAPIC_LEVEL) of mp_irqs[@idx], decoded
+ * from the trigger bits of its irqflag.  "Default" resolves by bus type
+ * and is then passed through eisa_irq_trigger().  The reserved encoding is
+ * reported and treated as level.
+ */
+static int irq_trigger(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
+ case MP_IRQTRIG_DEFAULT:
+ /* conforms to spec, ie. bus-type dependent trigger mode */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+ /* Take EISA into account */
+ return eisa_irq_trigger(idx, bus, trigger);
+ case MP_IRQTRIG_EDGE:
+ return IOAPIC_EDGE;
+ case MP_IRQTRIG_RESERVED:
+ pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ /* fall through */
+ case MP_IRQTRIG_LEVEL:
+ default: /* Pointless default required due to gcc stupidity */
+ return IOAPIC_LEVEL;
+ }
+}
+
+/*
+ * Initialize @info as an IOAPIC allocation request carrying @node,
+ * @trigger and @polarity, and mark those attributes valid.
+ */
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+ int trigger, int polarity)
+{
+ init_irq_alloc_info(info, NULL);
+ info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info->ioapic_node = node;
+ info->ioapic_trigger = trigger;
+ info->ioapic_polarity = polarity;
+ info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
+#endif
+
+/*
+ * Build the allocation info @dst for @pin of IO-APIC @ioapic_idx.
+ *
+ * When @src is non-NULL and carries valid IOAPIC attributes, its node,
+ * trigger and polarity are taken over.  Otherwise the node is
+ * NUMA_NO_NODE and trigger/polarity come from acpi_get_override_irq() for
+ * @gsi, falling back to level/active-low when that returns a negative
+ * value.
+ */
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src,
+ u32 gsi, int ioapic_idx, int pin)
+{
+ int trigger, polarity;
+
+ copy_irq_alloc_info(dst, src);
+ dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic_pin = pin;
+ dst->ioapic_valid = 1;
+ if (src && src->ioapic_valid) {
+ dst->ioapic_node = src->ioapic_node;
+ dst->ioapic_trigger = src->ioapic_trigger;
+ dst->ioapic_polarity = src->ioapic_polarity;
+ } else {
+ dst->ioapic_node = NUMA_NO_NODE;
+ if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+ dst->ioapic_trigger = trigger;
+ dst->ioapic_polarity = polarity;
+ } else {
+ /*
+ * PCI interrupts are always active low level
+ * triggered.
+ */
+ dst->ioapic_trigger = IOAPIC_LEVEL;
+ dst->ioapic_polarity = IOAPIC_POL_LOW;
+ }
+ }
+}
+
+/* NUMA node requested by @info, or NUMA_NO_NODE when @info is NULL or not valid. */
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+	if (!info || !info->ioapic_valid)
+		return NUMA_NO_NODE;
+
+	return info->ioapic_node;
+}
+
+/*
+ * Install the flow handler matching @trigger for @irq: a non-zero trigger
+ * sets IRQ_LEVEL and selects handle_fasteoi_irq ("fasteoi"), zero clears
+ * IRQ_LEVEL and selects handle_edge_irq ("edge").
+ */
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+	if (trigger) {
+		irq_set_status_flags(irq, IRQ_LEVEL);
+		__irq_set_handler(irq, handle_fasteoi_irq, 0, "fasteoi");
+	} else {
+		irq_clear_status_flags(irq, IRQ_LEVEL);
+		__irq_set_handler(irq, handle_edge_irq, 0, "edge");
+	}
+}
+
+/*
+ * Check whether the trigger/polarity requested in @info match what @irq
+ * is already set up with.  Returns true on a match.
+ */
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
+{
+ struct mp_chip_data *data = irq_get_chip_data(irq);
+
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+ * and polarity attributes. So allow the first user to reprogram the
+ * pin with real trigger and polarity attributes.
+ */
+ if (irq < nr_legacy_irqs() && data->count == 1) {
+ if (info->ioapic_trigger != data->trigger)
+ mp_register_handler(irq, info->ioapic_trigger);
+ data->entry.trigger = data->trigger = info->ioapic_trigger;
+ data->entry.polarity = data->polarity = info->ioapic_polarity;
+ }
+
+ return data->trigger == info->ioapic_trigger &&
+ data->polarity == info->ioapic_polarity;
+}
+
+/*
+ * Allocate one IRQ from @domain for @gsi on IO-APIC @ioapic.  The
+ * requested IRQ number depends on the IO-APIC's irqdomain type:
+ *  - LEGACY:  @gsi, except for GSIs below nr_legacy_irqs() once
+ *             ioapic_initialized is set, where irq stays -1
+ *  - STRICT:  always @gsi
+ *  - DYNAMIC: irq stays -1
+ * Returns what __irq_domain_alloc_irqs() returns, or -1 for an unknown
+ * domain type.
+ */
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+ struct irq_alloc_info *info)
+{
+ bool legacy = false;
+ int irq = -1;
+ int type = ioapics[ioapic].irqdomain_cfg.type;
+
+ switch (type) {
+ case IOAPIC_DOMAIN_LEGACY:
+ /*
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first
+ * 16 GSIs on some weird platforms.
+ */
+ if (!ioapic_initialized || gsi >= nr_legacy_irqs())
+ irq = gsi;
+ legacy = mp_is_legacy_irq(irq);
+ break;
+ case IOAPIC_DOMAIN_STRICT:
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_DYNAMIC:
+ break;
+ default:
+ WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+ return -1;
+ }
+
+ return __irq_domain_alloc_irqs(domain, irq, 1,
+ ioapic_alloc_attr_node(info),
+ info, legacy, NULL);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ *
+ * Returns the IRQ number on success, -EBUSY when the IRQ is already set up
+ * with different trigger/polarity attributes, -ENOMEM when the pin cannot
+ * be added to the pin list, or the negative result of
+ * __irq_domain_alloc_irqs().
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+ int irq, int ioapic, int pin,
+ struct irq_alloc_info *info)
+{
+ struct mp_chip_data *data;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ int node = ioapic_alloc_attr_node(info);
+
+ /*
+ * Legacy ISA IRQ has already been allocated, just add pin to
+ * the pin list associated with this IRQ. The attributes must
+ * match those the IRQ was already set up with.
+ */
+ if (irq_data && irq_data->parent_data) {
+ if (!mp_check_pin_attr(irq, info))
+ return -EBUSY;
+ if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+ info->ioapic_pin))
+ return -ENOMEM;
+ } else {
+ /* First user: allocate the IRQ at its fixed legacy number */
+ info->flags |= X86_IRQ_ALLOC_LEGACY;
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
+ NULL);
+ if (irq >= 0) {
+ irq_data = irq_domain_get_irq_data(domain, irq);
+ data = irq_data->chip_data;
+ data->isa_irq = true;
+ }
+ }
+
+ return irq;
+}
+
+/*
+ * Map @pin of IO-APIC @ioapic (global number @gsi) to a Linux IRQ.
+ *
+ * @idx is the index of the matching mp_irqs[] entry, or negative when
+ * there is none.  Without IOAPIC_MAP_ALLOC in @flags only an existing
+ * mapping is looked up; with it, a missing mapping is allocated and the
+ * IRQ's usage count is incremented.  Runs under ioapic_mutex.
+ *
+ * Returns the IRQ number, or -ENOSYS (no irqdomain), -ENOENT (lookup only,
+ * nothing mapped), -EBUSY (attribute conflict) or an allocation error.
+ */
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+ unsigned int flags, struct irq_alloc_info *info)
+{
+ int irq;
+ bool legacy = false;
+ struct irq_alloc_info tmp;
+ struct mp_chip_data *data;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+ if (!domain)
+ return -ENOSYS;
+
+ /* Entries on a non-PCI bus use the source bus IRQ as IRQ number */
+ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].srcbusirq;
+ legacy = mp_is_legacy_irq(irq);
+ }
+
+ mutex_lock(&ioapic_mutex);
+ if (!(flags & IOAPIC_MAP_ALLOC)) {
+ /* Lookup only; a legacy IRQ number is returned as is */
+ if (!legacy) {
+ irq = irq_find_mapping(domain, pin);
+ if (irq == 0)
+ irq = -ENOENT;
+ }
+ } else {
+ ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+ if (legacy)
+ irq = alloc_isa_irq_from_domain(domain, irq,
+ ioapic, pin, &tmp);
+ else if ((irq = irq_find_mapping(domain, pin)) == 0)
+ irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+ else if (!mp_check_pin_attr(irq, &tmp))
+ irq = -EBUSY;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ data->count++;
+ }
+ }
+ mutex_unlock(&ioapic_mutex);
+
+ return irq;
+}
+
+/*
+ * Translate @pin of IO-APIC @ioapic, described by mp_irqs[@idx], into an
+ * IRQ number.  @idx must be a valid index: it is dereferenced without a
+ * check.  On 32-bit, a non-zero "pirq=" override for pins 16-23 is
+ * returned directly; everything else goes through mp_map_pin_to_irq().
+ */
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
+{
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ /* only logged; the normal mapping below still runs */
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ int irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ return irq;
+ }
+ }
+ }
+#endif
+
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
+}
+
+/*
+ * Map global system interrupt @gsi to a Linux IRQ.
+ *
+ * Returns -ENODEV when no IO-APIC serves @gsi, or when IOAPIC_MAP_CHECK is
+ * set in @flags and no mp_INT source entry exists for the pin; otherwise
+ * the result of mp_map_pin_to_irq().
+ */
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
+{
+ int ioapic, pin, idx;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -ENODEV;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+ return -ENODEV;
+
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
+}
+
+/*
+ * Drop one reference on @irq and free it from its irqdomain when the
+ * usage count reaches zero.  IRQs without irq_data, domain or chip data,
+ * and IRQs marked isa_irq, are left alone.
+ */
+void mp_unmap_irq(int irq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ struct mp_chip_data *data;
+
+ if (!irq_data || !irq_data->domain)
+ return;
+
+ data = irq_data->chip_data;
+ if (!data || data->isa_irq)
+ return;
+
+ mutex_lock(&ioapic_mutex);
+ if (--data->count == 0)
+ irq_domain_free_irqs(irq, 1);
+ mutex_unlock(&ioapic_mutex);
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ *
+ * Searches mp_irqs[] for an mp_INT entry on PCI bus @bus whose srcbusirq
+ * encodes @slot in bits 2-6 and @pin in bits 0-1.  An exact pin match
+ * wins; otherwise the first entry matching bus and slot is used.  The
+ * chosen entry is then mapped with IOAPIC_MAP_ALLOC.
+ *
+ * Returns the IRQ number from pin_2_irq(), or -1 when @bus is not a PCI
+ * bus or no entry matches.
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+ int irq, i, best_ioapic = -1, best_idx = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+ int ioapic_idx, found = 0;
+
+ if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+ slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+ continue;
+
+ /* Locate the IO-APIC this entry is routed to */
+ for_each_ioapic(ioapic_idx)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) {
+ found = 1;
+ break;
+ }
+ if (!found)
+ continue;
+
+ /* Skip ISA IRQs */
+ irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+ if (irq > 0 && !IO_APIC_IRQ(irq))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ goto out;
+ }
+
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_idx < 0) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ }
+ }
+ if (best_idx < 0)
+ return -1;
+
+out:
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+ IOAPIC_MAP_ALLOC);
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
+
+/*
+ * Walk every pin of every IO-APIC and map those that have an mp_INT
+ * entry.  Pins without an entry are only reported at APIC_VERBOSE level.
+ * IOAPIC_MAP_ALLOC is passed for pins of IO-APIC 0 only.
+ */
+static void __init setup_IO_APIC_irqs(void)
+{
+	unsigned int ioapic, pin;
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+	for_each_ioapic_pin(ioapic, pin) {
+		int idx = find_irq_entry(ioapic, pin, mp_INT);
+
+		if (idx < 0) {
+			apic_printk(APIC_VERBOSE,
+				    KERN_DEBUG " apic %d pin %d not connected\n",
+				    mpc_ioapic_id(ioapic), pin);
+			continue;
+		}
+
+		if (ioapic)
+			pin_2_irq(idx, ioapic, pin, 0);
+		else
+			pin_2_irq(idx, ioapic, pin, IOAPIC_MAP_ALLOC);
+	}
+}
+
+void ioapic_zap_locks(void)
+{ /*
+ * Re-initialise ioapic_lock unconditionally, whatever state it is in.
+ * NOTE(review): presumably meant for crash/panic paths where the lock
+ * may be held by a stopped CPU - confirm against the callers.
+ */
+ raw_spin_lock_init(&ioapic_lock);
+}
+
+/*
+ * Print redirection table entries 0..@nr_entries (inclusive) of IO-APIC
+ * @apic at KERN_DEBUG level, one line per pin.  Entries whose format bit
+ * is set are decoded through the IR_IO_APIC_route_entry layout, all
+ * others as plain route entries.
+ */
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+{
+	struct IO_APIC_route_entry entry;
+	/* Second view of the same storage, used for remapped entries */
+	struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
+	char buf[256];
+	int pin;
+
+	printk(KERN_DEBUG "IOAPIC %d:\n", apic);
+	for (pin = 0; pin <= nr_entries; pin++) {
+		const char *state, *trigger, *polarity;
+
+		entry = ioapic_read_entry(apic, pin);
+
+		state = (entry.mask == IOAPIC_MASKED) ? "disabled" : "enabled ";
+		trigger = (entry.trigger == IOAPIC_LEVEL) ? "level" : "edge ";
+		polarity = (entry.polarity == IOAPIC_POL_LOW) ? "low " : "high";
+
+		/* Part of the line that is common to both entry formats */
+		snprintf(buf, sizeof(buf),
+			 " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+			 pin, state, trigger, polarity,
+			 entry.vector, entry.irr, entry.delivery_status);
+
+		if (ir_entry->format) {
+			printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
+			       buf, (ir_entry->index2 << 15) | ir_entry->index,
+			       ir_entry->zero);
+			continue;
+		}
+
+		printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+		       buf,
+		       entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+		       "logical " : "physical",
+		       entry.dest, entry.delivery_mode);
+	}
+}
+
+/*
+ * Dump the registers of one IO-APIC at KERN_DEBUG level: register #00
+ * (ID), #01 (version / max redirection entry), #02 and #03 where the
+ * reported version says they exist, followed by the redirection table.
+ *
+ * @ioapic_idx: index of the IO-APIC to dump
+ */
+static void __init print_IO_APIC(int ioapic_idx)
+{
+	union IO_APIC_reg_00 reg_00;
+	union IO_APIC_reg_01 reg_01;
+	union IO_APIC_reg_02 reg_02;
+	union IO_APIC_reg_03 reg_03;
+	unsigned long flags;
+
+	/* Take one consistent snapshot of the registers under the lock */
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(ioapic_idx, 0);
+	reg_01.raw = io_apic_read(ioapic_idx, 1);
+	if (reg_01.bits.version >= 0x10)
+		reg_02.raw = io_apic_read(ioapic_idx, 2);
+	if (reg_01.bits.version >= 0x20)
+		reg_03.raw = io_apic_read(ioapic_idx, 3);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+	printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+	printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+	printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
+
+	/* Print the raw value through the union, as for register #00 */
+	printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
+	printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+		reg_01.bits.entries);
+
+	printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+	printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+		reg_01.bits.version);
+
+	/*
+	 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+	 * but the value of reg_02 is read as the previous read register
+	 * value, so ignore it if reg_02 == reg_01.
+	 */
+	if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+		printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+		printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+	}
+
+	/*
+	 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+	 * or reg_03, but the value of reg_0[23] is read as the previous read
+	 * register value, so ignore it if reg_03 == reg_0[12].
+	 */
+	if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+	    reg_03.raw != reg_01.raw) {
+		printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+		printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+	}
+
+	printk(KERN_DEBUG ".... IRQ redirection table:\n");
+	/* bits.entries is the highest entry index; the callee loops inclusively */
+	io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
+}
+
+/*
+ * Dump the state of all IO-APICs to the kernel log: the number of MP IRQ
+ * sources, the register count and register contents of each IO-APIC, and
+ * the IRQ -> (apic, pin) list of every active IRQ that is driven by one
+ * of the IO-APIC irq chips.
+ */
+void __init print_IO_APICs(void)
+{
+	unsigned int irq;
+	int idx;
+
+	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+	for_each_ioapic(idx) {
+		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+		       mpc_ioapic_id(idx), ioapics[idx].nr_registers);
+	}
+
+	/*
+	 * We are a bit conservative about what we expect. We have to
+	 * know about every hardware change ASAP.
+	 */
+	printk(KERN_INFO "testing the IO APIC.......................\n");
+
+	for_each_ioapic(idx) {
+		print_IO_APIC(idx);
+	}
+
+	printk(KERN_DEBUG "IRQ to pin mappings:\n");
+	for_each_active_irq(irq) {
+		struct irq_chip *chip = irq_get_chip(irq);
+		struct mp_chip_data *mpd;
+		struct irq_pin_list *pin_entry;
+
+		/* Only interrupts handled by one of the IO-APIC chips */
+		if (!(chip == &ioapic_chip || chip == &ioapic_ir_chip))
+			continue;
+
+		mpd = irq_get_chip_data(irq);
+		if (mpd == NULL || list_empty(&mpd->irq_2_pin))
+			continue;
+
+		printk(KERN_DEBUG "IRQ%d ", irq);
+		for_each_irq_pin(pin_entry, mpd->irq_2_pin)
+			pr_cont("-> %d:%d", pin_entry->apic, pin_entry->pin);
+		pr_cont("\n");
+	}
+
+	printk(KERN_INFO ".................................... done.\n");
+}
+
+/* Where, if anywhere, the i8259 is connected in external int (ExtINT) mode; both fields stay -1 if no such pin is found */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+/*
+ * Find out where the i8259 is wired to an IO-APIC in ExtINT mode, record
+ * that in ioapic_i8259, and then clear all IO-APIC entries.
+ *
+ * The hardware redirection entries are consulted first; the MP table is
+ * used as a fallback and a warning is printed if the two disagree.
+ * Forces nr_ioapics to 0 if skip_ioapic_setup is set, and does nothing
+ * further when there are no legacy IRQs or no IO-APICs.
+ */
+void __init enable_IO_APIC(void)
+{
+	int mpt_apic, mpt_pin;
+	int apic, pin;
+
+	if (skip_ioapic_setup)
+		nr_ioapics = 0;
+
+	if (nr_legacy_irqs() == 0 || nr_ioapics == 0)
+		return;
+
+	/* See if any of the pins is in ExtINT mode */
+	for_each_ioapic_pin(apic, pin) {
+		struct IO_APIC_route_entry rte = ioapic_read_entry(apic, pin);
+
+		/*
+		 * An unmasked line in ExtINT mode is the pin where the
+		 * i8259 is connected.
+		 */
+		if (rte.mask != 0)
+			continue;
+		if (rte.delivery_mode != dest_ExtINT)
+			continue;
+
+		ioapic_i8259.apic = apic;
+		ioapic_i8259.pin = pin;
+		goto found_i8259;
+	}
+found_i8259:
+	/*
+	 * Look to see if the MP table has reported the ExtINT.  If the
+	 * pin could not be found by looking at the ioapic, the i8259 is
+	 * probably not connected to the ioapic, but give the mptable a
+	 * chance anyway.
+	 */
+	mpt_pin = find_isa_irq_pin(0, mp_ExtINT);
+	mpt_apic = find_isa_irq_apic(0, mp_ExtINT);
+
+	/* Trust the MP table if nothing is set up in the hardware */
+	if (ioapic_i8259.pin == -1 && mpt_pin >= 0) {
+		printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+		ioapic_i8259.pin = mpt_pin;
+		ioapic_i8259.apic = mpt_apic;
+	}
+
+	/* Complain if the MP table and the hardware disagree */
+	if (mpt_pin >= 0 && ioapic_i8259.pin >= 0 &&
+	    (ioapic_i8259.apic != mpt_apic || ioapic_i8259.pin != mpt_pin))
+		printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+
+	/*
+	 * Do not trust the IO-APIC being empty at bootup
+	 */
+	clear_IO_APIC();
+}
+
+/*
+ * Put the interrupt hardware back into the mode used at boot.
+ *
+ * If the i8259 is routed through an IO-APIC, that pin is programmed as an
+ * unmasked, edge-triggered, active-high ExtINT entry targeting this CPU's
+ * APIC ID (virtual wire mode) so legacy interrupts can be delivered.
+ * Afterwards the BSP local APIC is disconnected when an APIC is present.
+ */
+void native_restore_boot_irq_mode(void)
+{
+	const bool i8259_on_ioapic = (ioapic_i8259.pin != -1);
+
+	if (i8259_on_ioapic) {
+		struct IO_APIC_route_entry rte;
+
+		memset(&rte, 0, sizeof(rte));
+		rte.delivery_mode = dest_ExtINT;
+		rte.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+		rte.polarity = IOAPIC_POL_HIGH;
+		rte.trigger = IOAPIC_EDGE;
+		rte.mask = IOAPIC_UNMASKED;
+		rte.dest = read_apic_id();
+
+		/* Add it to the IO-APIC irq-routing table */
+		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, rte);
+	}
+
+	if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config())
+		disconnect_bsp_APIC(i8259_on_ioapic);
+}
+
+/*
+ * Invoke the x86_apic_ops.restore() hook, but only on systems that have
+ * legacy IRQs.
+ */
+void restore_boot_irq_mode(void)
+{
+	if (nr_legacy_irqs())
+		x86_apic_ops.restore();
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * For every IO-APIC this function:
+ *  - replaces an MPC table ID that is >= get_physical_broadcast() with
+ *    the ID currently held in register 0 of the chip,
+ *  - picks the first ID not yet set in the local present map when the
+ *    ID is reported as already used (and panics if there is none),
+ *  - rewrites the dstapic field of the mp_irqs[] entries that referred
+ *    to the old ID when the ID changed, and
+ *  - writes the final ID to register 0 if it differs from what the chip
+ *    holds, then reads it back to verify the write.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int ioapic_idx;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for_each_ioapic(ioapic_idx) {
+ /* Read the register 0 value */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mpc_ioapic_id(ioapic_idx); /* remembered so the routing table can be fixed up below */
+
+ if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&phys_id_present_map,
+ mpc_ioapic_id(ioapic_idx))) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < get_physical_broadcast(); i++) /* search for the lowest ID not yet in the map */
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ ioapics[ioapic_idx].mp_config.apicid = i;
+ } else {
+ physid_mask_t tmp;
+ apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
+ &tmp);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mpc_ioapic_id(ioapic_idx))
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mpc_ioapic_id(ioapic_idx);
+
+ /*
+ * Update the ID register according to the right value
+ * from the MPC table if they are different.
+ */
+ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+ continue;
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
+
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check: read register 0 back and compare it with
+ * the ID that was just written.
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+ pr_cont("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+
+/*
+ * Checked front end for setup_ioapic_ids_from_mpc_nocheck(): the IDs are
+ * only rewritten when the IO-APICs were not enumerated through ACPI, the
+ * boot CPU is an Intel part, and its APIC is not an xAPIC.
+ */
+void __init setup_ioapic_ids_from_mpc(void)
+{
+	if (acpi_ioapic)
+		return;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	/*
+	 * Don't check I/O APIC IDs for xAPIC systems. They have
+	 * no meaning without the serial APIC bus.
+	 */
+	if (APIC_XAPIC(boot_cpu_apic_version))
+		return;
+
+	setup_ioapic_ids_from_mpc_nocheck();
+}
+#endif
+
+int no_timer_check __initdata; /* when non-zero, timer_irq_works() reports success without testing */
+
+static int __init notimercheck(char *s) /* handler for the "no_timer_check" boot option; @s is not used */
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * Busy-wait using the TSC as time base.  The wait ends once
+ * 40000000000/HZ TSC cycles have elapsed or jiffies has advanced more
+ * than four ticks past its value at entry, whichever happens first.
+ */
+static void __init delay_with_tsc(void)
+{
+	unsigned long deadline = jiffies + 4;
+	unsigned long long tsc_start = rdtsc();
+
+	/*
+	 * We don't know the TSC frequency yet, but waiting for
+	 * 40000000000/HZ TSC cycles is safe:
+	 * 4 GHz == 10 jiffies
+	 * 1 GHz == 40 jiffies
+	 */
+	for (;;) {
+		rep_nop();
+		if (rdtsc() - tsc_start >= 40000000000ULL / HZ)
+			break;
+		if (!time_before_eq(jiffies, deadline))
+			break;
+	}
+}
+
+/*
+ * Busy-wait without a TSC, using __delay() with a doubling argument.
+ * At most eleven rounds are run (shift 1..11); the loop stops early once
+ * jiffies has advanced more than four ticks past its value at entry.
+ */
+static void __init delay_without_tsc(void)
+{
+	unsigned long deadline = jiffies + 4;
+	int shift = 1;
+
+	/*
+	 * We don't know any frequency yet, but waiting for
+	 * 40940000000/HZ cycles is safe:
+	 * 4 GHz == 10 jiffies
+	 * 1 GHz == 40 jiffies
+	 * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094
+	 */
+	for (;;) {
+		__delay(((1U << shift) * 10000000UL) / HZ);
+		if (++shift >= 12)
+			break;
+		if (!time_before_eq(jiffies, deadline))
+			break;
+	}
+}
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ *   back to ISA timer IRQs
+ *
+ * Returns 1 if jiffies advanced by more than four ticks while interrupts
+ * were enabled for the duration of the delay (or unconditionally when
+ * no_timer_check is set), 0 otherwise.
+ */
+static int __init timer_irq_works(void)
+{
+	unsigned long start = jiffies;
+	unsigned long flags;
+
+	if (no_timer_check)
+		return 1;
+
+	/* Let timer interrupts in while we wait, then restore the IRQ state */
+	local_save_flags(flags);
+	local_irq_enable();
+
+	if (!boot_cpu_has(X86_FEATURE_TSC))
+		delay_without_tsc();
+	else
+		delay_with_tsc();
+
+	local_irq_restore(flags);
+
+	/*
+	 * Expect a few ticks at least, to be sure some possible
+	 * glue logic does not lock up after one or two first
+	 * ticks in a non-ExtINT mode. Also the local APIC
+	 * might have cached one ExtINT interrupt. Finally, at
+	 * least one tick may be lost due to delays.
+	 */
+
+	/* jiffies wrap? */
+	return time_after(jiffies, start + 4) ? 1 : 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ *
+ * For a legacy IRQ the line is first masked at the legacy PIC and its
+ * pending state sampled; the IO-APIC pin is then unmasked.  All of this
+ * happens under ioapic_lock.
+ */
+static unsigned int startup_ioapic_irq(struct irq_data *data)
+{
+	unsigned int pending = 0;
+	unsigned long flags;
+	int irq = data->irq;
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (irq < nr_legacy_irqs()) {
+		legacy_pic->mask(irq);
+		pending = legacy_pic->irq_pending(irq) ? 1 : 0;
+	}
+	__unmask_ioapic(data->chip_data);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return pending;
+}
+
+atomic_t irq_mis_count; /* incremented by ioapic_ack_level() whenever the TMR bit of an acked vector is found clear */
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+/*
+ * Check, under ioapic_lock, whether any IO-APIC pin attached to @data
+ * still has the Remote IRR bit set in the low word of its redirection
+ * entry.  Returns true as soon as one such pin is found.
+ */
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
+{
+	struct irq_pin_list *entry;
+	unsigned long flags;
+	bool pending = false;
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, data->irq_2_pin) {
+		unsigned int rte_low;
+
+		rte_low = io_apic_read(entry->apic, 0x10 + entry->pin * 2);
+		/* Is the remote IRR bit set? */
+		if (rte_low & IO_APIC_REDIR_REMOTE_IRR) {
+			pending = true;
+			break;
+		}
+	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return pending;
+}
+
+/*
+ * Mask the IO-APIC interrupt if an affinity change is pending for it.
+ * Returns true when the interrupt was masked here, so the caller can
+ * pass that on to ioapic_irqd_unmask().
+ */
+static inline bool ioapic_irqd_mask(struct irq_data *data)
+{
+	if (likely(!irqd_is_setaffinity_pending(data)))
+		return false;
+
+	/* If we are moving the irq we need to mask it */
+	mask_ioapic_irq(data);
+	return true;
+}
+
+/*
+ * Counterpart of ioapic_irqd_mask(): when @masked is true, move the
+ * interrupt if no level ack is still pending and then unmask it again.
+ * Does nothing when @masked is false.
+ */
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+{
+	if (likely(!masked))
+		return;
+
+	/*
+	 * Only migrate the irq if the ack has been received.
+	 *
+	 * On rare occasions the broadcast level triggered ack gets
+	 * delayed going to ioapics, and if we reprogram the
+	 * vector while Remote IRR is still set the irq will never
+	 * fire again.
+	 *
+	 * To prevent this scenario we read the Remote IRR bit
+	 * of the ioapic. This has two effects.
+	 * - On any sane system the read of the ioapic will
+	 *   flush writes (and acks) going to the ioapic from
+	 *   this cpu.
+	 * - We get to see if the ACK has actually been delivered.
+	 *
+	 * Based on failed experiments of reprogramming the
+	 * ioapic entry from outside of irq context starting
+	 * with masking the ioapic entry and then polling until
+	 * Remote IRR was clear before reprogramming the
+	 * ioapic I don't trust the Remote IRR bit to be
+	 * completely accurate.
+	 *
+	 * However there appears to be no other way to plug
+	 * this race, so if the Remote IRR bit is not
+	 * accurate and is causing problems then it is a hardware bug
+	 * and you can go talk to the chipset vendor about it.
+	 */
+	if (!io_apic_level_ack_pending(data->chip_data))
+		irq_move_masked_irq(data);
+
+	unmask_ioapic_irq(data);
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data)
+{ /* stub used when CONFIG_GENERIC_PENDING_IRQ is not defined: never masks */
+ return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+{ /* stub used when CONFIG_GENERIC_PENDING_IRQ is not defined: nothing to undo */
+}
+#endif
+
+/*
+ * irq_eoi callback of ioapic_chip.
+ *
+ * Completes a pending vector move, masks the pin if an affinity change
+ * is pending, samples the local APIC TMR bit of the vector, acks the
+ * local APIC, issues an explicit IO-APIC EOI when the TMR bit was clear,
+ * and finally undoes the masking.
+ */
+static void ioapic_ack_level(struct irq_data *irq_data)
+{
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
+	unsigned long tmr;
+	bool masked;
+	int vector;
+
+	irq_complete_move(cfg);
+	masked = ioapic_irqd_mask(irq_data);
+
+	/*
+	 * It appears there is an erratum which affects at least version 0x11
+	 * of I/O APIC (that's the 82093AA and cores integrated into various
+	 * chipsets).  Under certain conditions a level-triggered interrupt is
+	 * erroneously delivered as edge-triggered one but the respective IRR
+	 * bit gets set nevertheless.  As a result the I/O unit expects an EOI
+	 * message but it will never arrive and further interrupts are blocked
+	 * from the source.  The exact reason is so far unknown, but the
+	 * phenomenon was observed when two consecutive interrupt requests
+	 * from a given source get delivered to the same CPU and the source is
+	 * temporarily disabled in between.
+	 *
+	 * A workaround is to simulate an EOI message manually.  We achieve it
+	 * by setting the trigger mode to edge and then to level when the edge
+	 * trigger mode gets detected in the TMR of a local APIC for a
+	 * level-triggered interrupt.  We mask the source for the time of the
+	 * operation to prevent an edge-triggered interrupt escaping meanwhile.
+	 * The idea is from Manfred Spraul.  --macro
+	 *
+	 * Also in the case when cpu goes offline, fixup_irqs() will forward
+	 * any unhandled interrupt on the offlined cpu to the new cpu
+	 * destination that is handling the corresponding interrupt.  This
+	 * interrupt forwarding is done via IPI's.  Hence, in this case also
+	 * level-triggered io-apic interrupt will be seen as an edge
+	 * interrupt in the IRR.  And we can't rely on the cpu's EOI
+	 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
+	 * corresponding to the level-triggered interrupt.  Hence on IO-APIC's
+	 * supporting EOI register, we do an explicit EOI to clear the
+	 * remote IRR and on IO-APIC's which don't have an EOI register,
+	 * we use the above logic (mask+edge followed by unmask+level) from
+	 * Manfred Spraul to clear the remote IRR.
+	 */
+	vector = cfg->vector;
+	tmr = apic_read(APIC_TMR + ((vector & ~0x1f) >> 1));
+
+	/*
+	 * We must acknowledge the irq before we move it or the acknowledge
+	 * will not propagate properly.
+	 */
+	ack_APIC_irq();
+
+	/*
+	 * Tail end of clearing remote IRR bit (either by delivering the EOI
+	 * message via io-apic EOI register write or simulating it using
+	 * mask+edge followed by unmask+level logic) manually when the
+	 * level triggered interrupt is seen as the edge triggered interrupt
+	 * at the cpu.
+	 */
+	if (!(tmr & (1 << (vector & 0x1f)))) {
+		atomic_inc(&irq_mis_count);
+		eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+	}
+
+	ioapic_irqd_unmask(irq_data, masked);
+}
+
+/*
+ * irq_eoi callback of ioapic_ir_chip.
+ *
+ * Intr-remapping uses pin number as the virtual vector in the RTE.
+ * Actual vector is programmed in intr-remapping table entry.  Hence
+ * for the io-apic EOI we use the vector stored in the cached entry
+ * rather than the vector of the irq_cfg.
+ */
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+	struct mp_chip_data *mpd = irq_data->chip_data;
+
+	apic_ack_irq(irq_data);
+	eoi_ioapic_pin(mpd->entry.vector, mpd);
+}
+
+/*
+ * Write the cached route entry of @irqd to every IO-APIC pin attached to
+ * it.  Destination and vector of the cached entry are refreshed from the
+ * irq_cfg first, but only when the installed chip is ioapic_chip.
+ */
+static void ioapic_configure_entry(struct irq_data *irqd)
+{
+	struct mp_chip_data *chip_data = irqd->chip_data;
+	struct irq_cfg *cfg = irqd_cfg(irqd);
+	struct irq_pin_list *pin_entry;
+
+	/*
+	 * Only update when the parent is the vector domain, don't touch it
+	 * if the parent is the remapping domain. Check the installed
+	 * ioapic chip to verify that.
+	 */
+	if (irqd->chip == &ioapic_chip) {
+		chip_data->entry.vector = cfg->vector;
+		chip_data->entry.dest = cfg->dest_apicid;
+	}
+
+	for_each_irq_pin(pin_entry, chip_data->irq_2_pin) {
+		__ioapic_write_entry(pin_entry->apic, pin_entry->pin,
+				     chip_data->entry);
+	}
+}
+
+/*
+ * irq_set_affinity callback shared by both IO-APIC chips.
+ *
+ * The request is handed to the parent chip first.  If the parent returns
+ * a non-negative value other than IRQ_SET_MASK_OK_DONE, the IO-APIC
+ * entries are rewritten under ioapic_lock.  The parent's return value is
+ * passed back unchanged.
+ */
+static int ioapic_set_affinity(struct irq_data *irq_data,
+			       const struct cpumask *mask, bool force)
+{
+	struct irq_data *parent = irq_data->parent_data;
+	unsigned long flags;
+	bool reprogram;
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	reprogram = (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE);
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (reprogram)
+		ioapic_configure_entry(irq_data);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return ret;
+}
+
+/*
+ * irq_chip for IO-APIC interrupts; EOI goes through ioapic_ack_level().
+ */
+static struct irq_chip ioapic_chip __read_mostly = {
+	.name			= "IO-APIC",
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.irq_startup		= startup_ioapic_irq,
+	.irq_mask		= mask_ioapic_irq,
+	.irq_unmask		= unmask_ioapic_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ack_level,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_set_affinity	= ioapic_set_affinity,
+};
+
+/*
+ * irq_chip for IO-APIC interrupts whose EOI goes through
+ * ioapic_ir_ack_level(); otherwise identical to ioapic_chip.
+ */
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+	.name			= "IR-IO-APIC",
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+	.irq_startup		= startup_ioapic_irq,
+	.irq_mask		= mask_ioapic_irq,
+	.irq_unmask		= unmask_ioapic_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ir_ack_level,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_set_affinity	= ioapic_set_affinity,
+};
+
+/*
+ * Give every active IO-APIC IRQ that has an irq_cfg but no vector a
+ * fallback: legacy IRQs are handed to the legacy PIC, all others get
+ * no_irq_chip.
+ */
+static inline void init_IO_APIC_traps(void)
+{
+	unsigned int irq;
+
+	for_each_active_irq(irq) {
+		struct irq_cfg *cfg = irq_cfg(irq);
+
+		if (!IO_APIC_IRQ(irq) || !cfg || cfg->vector)
+			continue;
+
+		/*
+		 * Hmm.. We don't have an entry for this,
+		 * so default to an old-fashioned 8259
+		 * interrupt if we can..
+		 */
+		if (irq < nr_legacy_irqs()) {
+			legacy_pic->make_irq(irq);
+		} else {
+			/* Strange. Oh, well.. */
+			irq_set_chip(irq, &no_irq_chip);
+		}
+	}
+}
+
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+/*
+ * irq_mask callback of lapic_chip: set the mask bit in the local APIC
+ * LVT0 register.  @data is not used.
+ */
+static void mask_lapic_irq(struct irq_data *data)
+{
+	apic_write(APIC_LVT0, apic_read(APIC_LVT0) | APIC_LVT_MASKED);
+}
+
+/*
+ * irq_unmask callback of lapic_chip: clear the mask bit in the local
+ * APIC LVT0 register.  @data is not used.
+ */
+static void unmask_lapic_irq(struct irq_data *data)
+{
+	apic_write(APIC_LVT0, apic_read(APIC_LVT0) & ~APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq(struct irq_data *data)
+{ /* irq_ack callback of lapic_chip: acknowledge at the local APIC; @data is not used */
+ ack_APIC_irq();
+}
+
+/*
+ * irq_chip backed by the local APIC LVT0 register; installed by
+ * lapic_register_intr().
+ */
+static struct irq_chip lapic_chip __read_mostly = {
+	.name		= "local-APIC",
+	.irq_ack	= ack_lapic_irq,
+	.irq_mask	= mask_lapic_irq,
+	.irq_unmask	= unmask_lapic_irq,
+};
+
+static void lapic_register_intr(int irq)
+{ /* Clear IRQ_LEVEL on @irq and install lapic_chip with the edge flow handler. */
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+}
+
+/*
+ * This looks a bit hackish but it's about the only way of sending
+ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
+ * not support the ExtINT mode, unfortunately.  We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA.  --macro
+ *
+ * The IO-APIC pin of ISA IRQ 8 is temporarily reprogrammed as an ExtINT
+ * entry while the RTC periodic interrupt is enabled, then the original
+ * RTC registers and the original route entry are restored.
+ */
+static inline void __init unlock_ExtINT_logic(void)
+{
+	struct IO_APIC_route_entry saved_rte, extint_rte;
+	unsigned char rtc_control, rtc_freq_select;
+	int apic, pin, budget;
+
+	pin = find_isa_irq_pin(8, mp_INT);
+	if (WARN_ON_ONCE(pin == -1))
+		return;
+
+	apic = find_isa_irq_apic(8, mp_INT);
+	if (WARN_ON_ONCE(apic == -1))
+		return;
+
+	/* Remember the current entry so it can be put back at the end */
+	saved_rte = ioapic_read_entry(apic, pin);
+	clear_IO_APIC_pin(apic, pin);
+
+	memset(&extint_rte, 0, sizeof(extint_rte));
+
+	extint_rte.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+	extint_rte.mask = IOAPIC_UNMASKED;
+	extint_rte.dest = hard_smp_processor_id();
+	extint_rte.delivery_mode = dest_ExtINT;
+	extint_rte.polarity = saved_rte.polarity;
+	extint_rte.trigger = IOAPIC_EDGE;
+	extint_rte.vector = 0;
+
+	ioapic_write_entry(apic, pin, extint_rte);
+
+	/* Turn on the RTC periodic interrupt with rate select 0x6 */
+	rtc_control = CMOS_READ(RTC_CONTROL);
+	rtc_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+	CMOS_WRITE((rtc_freq_select & ~RTC_RATE_SELECT) | 0x6,
+		   RTC_FREQ_SELECT);
+	CMOS_WRITE(rtc_control | RTC_PIE, RTC_CONTROL);
+
+	/*
+	 * Poll in 10ms steps; every step in which the periodic flag is
+	 * seen consumes ten extra units of the budget.
+	 */
+	for (budget = 100; budget-- > 0; ) {
+		mdelay(10);
+		if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+			budget -= 10;
+	}
+
+	CMOS_WRITE(rtc_control, RTC_CONTROL);
+	CMOS_WRITE(rtc_freq_select, RTC_FREQ_SELECT);
+	clear_IO_APIC_pin(apic, pin);
+
+	ioapic_write_entry(apic, pin, saved_rte);
+}
+
+static int disable_timer_pin_1 __initdata; /* when > 0, check_timer() clears the timer pin once the IO-APIC route works */
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{ /* handler for the "disable_timer_pin_1" early parameter; @arg is not used */
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+/*
+ * Allocate ISA IRQ 0 from the irqdomain of IO-APIC @ioapic for pin @pin.
+ * Returns the result of alloc_isa_irq_from_domain(), or -1 if that
+ * IO-APIC has no irqdomain.
+ */
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+	struct irq_alloc_info info;
+	int irq;
+
+	if (!domain)
+		return -1;
+
+	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+	info.ioapic_id = mpc_ioapic_id(ioapic);
+	info.ioapic_pin = pin;
+
+	mutex_lock(&ioapic_mutex);
+	irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+	mutex_unlock(&ioapic_mutex);
+
+	return irq;
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ *
+ * The following routes for IRQ0 are tried in order, each one validated
+ * with timer_irq_works():
+ *  1. through the IO-APIC pin reported for ISA IRQ 0 (pin1); if there is
+ *     none, the ExtINT pin recorded in ioapic_i8259 is used instead,
+ *  2. through the 8259A on the ExtINT pin (pin2),
+ *  3. as a "Virtual Wire" IRQ via local APIC LVT0 in fixed mode,
+ *  4. as an ExtINT IRQ via local APIC LVT0.
+ * If none of them works the function panics.  Runs with local interrupts
+ * saved/disabled and restores them on the way out.
+ *
+ * FIXME: really need to revamp this for all platforms.
+ */
+static inline void __init check_timer(void)
+{
+ struct irq_data *irq_data = irq_get_irq_data(0);
+ struct mp_chip_data *data = irq_data->chip_data; /* NOTE(review): irq_data is dereferenced without a NULL check - assumes IRQ0 is set up by now, confirm */
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ int node = cpu_to_node(0);
+ int apic1, pin1, apic2, pin2;
+ unsigned long flags;
+ int no_pin1 = 0; /* set when no IO-APIC pin was found for the timer */
+
+ local_irq_save(flags);
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ legacy_pic->mask(0);
+
+ /*
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
+ */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ legacy_pic->init(1);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ apic1 = find_isa_irq_apic(0, mp_INT);
+ pin2 = ioapic_i8259.pin;
+ apic2 = ioapic_i8259.apic;
+
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
+ if (pin1 != -1) {
+ /* Ok, does IRQ0 through the IOAPIC work? */
+ if (no_pin1) {
+ mp_alloc_timer_irq(apic1, pin1);
+ } else {
+ /*
+ * for edge trigger, it's already unmasked,
+ * so only need to unmask if it is level-trigger
+ * do we really have level trigger timer?
+ */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_ioapic_irq(irq_get_irq_data(0));
+ }
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
+ if (timer_irq_works()) {
+ if (disable_timer_pin_1 > 0)
+ clear_IO_APIC_pin(0, pin1); /* NOTE(review): uses IO-APIC 0 rather than apic1 - confirm this is intended */
+ goto out;
+ }
+ panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
+ local_irq_disable();
+ clear_IO_APIC_pin(apic1, pin1);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
+ legacy_pic->unmask(0);
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ goto out;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ local_irq_disable();
+ legacy_pic->mask(0);
+ clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ }
+
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
+
+ lapic_register_intr(0);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
+ legacy_pic->unmask(0);
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ goto out;
+ }
+ local_irq_disable();
+ legacy_pic->mask(0);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
+
+ legacy_pic->init(0);
+ legacy_pic->make_irq(0);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ goto out;
+ }
+ local_irq_disable();
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ if (apic_is_x2apic_enabled())
+ apic_printk(APIC_QUIET, KERN_INFO
+ "Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option.\n");
+out:
+ local_irq_restore(flags);
+}
+
+/*
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
+ */
+#define PIC_IRQS (1UL << PIC_CASCADE_IR) /* mask with only the cascade IRQ bit set; setup_IO_APIC() removes it from io_apic_irqs when legacy IRQs exist */
+
+/*
+ * Create the linear irqdomain for IO-APIC @ioapic and hook it up below
+ * either the interrupt remapping domain or, if there is none, the x86
+ * vector domain.
+ *
+ * Returns 0 on success or when the domain type is IOAPIC_DOMAIN_INVALID
+ * (nothing is created in that case), -ENOMEM if the fwnode or the domain
+ * cannot be allocated.  For LEGACY and STRICT domains ioapic_dynirq_base
+ * is raised to at least gsi_end + 1 of this IO-APIC.
+ */
+static int mp_irqdomain_create(int ioapic)
+{
+	struct irq_alloc_info info;
+	struct irq_domain *parent;
+	int hwirqs = mp_ioapic_pin_count(ioapic);
+	struct ioapic *ip = &ioapics[ioapic];
+	struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+	struct fwnode_handle *fn;
+	char *name = "IO-APIC";
+
+	if (cfg->type == IOAPIC_DOMAIN_INVALID)
+		return 0;
+
+	/* Ask the remapping code whether it provides a parent domain */
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info.ioapic_id = mpc_ioapic_id(ioapic);
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (parent)
+		name = "IO-APIC-IR";
+	else
+		parent = x86_vector_domain;
+
+	/* Handle device tree enumerated APICs proper */
+	if (cfg->dev) {
+		fn = of_node_to_fwnode(cfg->dev);
+	} else {
+		fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+		if (!fn)
+			return -ENOMEM;
+	}
+
+	ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
+						 (void *)(long)ioapic);
+
+	/* Release fw handle if it was allocated above */
+	if (!cfg->dev)
+		irq_domain_free_fwnode(fn);
+
+	if (!ip->irqdomain)
+		return -ENOMEM;
+
+	ip->irqdomain->parent = parent;
+
+	switch (cfg->type) {
+	case IOAPIC_DOMAIN_LEGACY:
+	case IOAPIC_DOMAIN_STRICT:
+		ioapic_dynirq_base = max(ioapic_dynirq_base,
+					 gsi_cfg->gsi_end + 1);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the irqdomain of IO-APIC @idx, if it has one, and clear the
+ * stored pointer.
+ */
+static void ioapic_destroy_irqdomain(int idx)
+{
+	if (!ioapics[idx].irqdomain)
+		return;
+
+	irq_domain_remove(ioapics[idx].irqdomain);
+	ioapics[idx].irqdomain = NULL;
+}
+
+/*
+ * Boot-time IO-APIC bring-up. Returns early when IO-APIC setup is skipped or
+ * no IO-APIC has been registered. Otherwise an irqdomain is created for every
+ * IO-APIC (a failure is fatal through BUG_ON), the IDs and the IRQ routing
+ * are set up and ioapic_initialized is set.
+ */
+void __init setup_IO_APIC(void)
+{
+ int ioapic;
+
+ if (skip_ioapic_setup || !nr_ioapics)
+ return;
+
+ /* With legacy IRQs present the PIC cascade IRQ is left out of the mask */
+ io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
+
+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ for_each_ioapic(ioapic)
+ BUG_ON(mp_irqdomain_create(ioapic));
+
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+ x86_init.mpparse.setup_ioapic_ids();
+
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ /* The timer check is only run when legacy IRQs exist */
+ if (nr_legacy_irqs())
+ check_timer();
+
+ ioapic_initialized = 1;
+}
+
+/*
+ * Make sure the ID register (register 0) of IOAPIC @ioapic_idx holds the ID
+ * recorded for it; the register is rewritten only when the two differ.
+ */
+static void resume_ioapic_id(int ioapic_idx)
+{
+	union IO_APIC_reg_00 id_reg;
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&ioapic_lock, irqflags);
+
+	id_reg.raw = io_apic_read(ioapic_idx, 0);
+	if (id_reg.bits.ID == mpc_ioapic_id(ioapic_idx))
+		goto unlock;
+
+	id_reg.bits.ID = mpc_ioapic_id(ioapic_idx);
+	io_apic_write(ioapic_idx, 0, id_reg.raw);
+unlock:
+	raw_spin_unlock_irqrestore(&ioapic_lock, irqflags);
+}
+
+/*
+ * Resume callback of ioapic_syscore_ops: fix up the ID of every IOAPIC,
+ * walking them in reverse order, then call restore_ioapic_entries().
+ */
+static void ioapic_resume(void)
+{
+	int idx;
+
+	for_each_ioapic_reverse(idx) {
+		resume_ioapic_id(idx);
+	}
+
+	restore_ioapic_entries();
+}
+
+/* Syscore hooks: save the IOAPIC entries on suspend, restore them on resume. */
+static struct syscore_ops ioapic_syscore_ops = {
+ .suspend = save_ioapic_entries,
+ .resume = ioapic_resume,
+};
+
+/* Register the IOAPIC syscore operations; run as a device initcall. */
+static int __init ioapic_init_ops(void)
+{
+ register_syscore_ops(&ioapic_syscore_ops);
+
+ return 0;
+}
+
+device_initcall(ioapic_init_ops);
+
+/*
+ * Return the number of redirection entries of IOAPIC @ioapic, read from
+ * register 1 under ioapic_lock.
+ */
+static int io_apic_get_redir_entries(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* The register returns the maximum redirection entry index
+ * supported, which is one less than the total number of redir
+ * entries.
+ */
+ return reg_01.bits.entries + 1;
+}
+
+/*
+ * Lower bound for dynamically allocated IRQ numbers. @from is not used.
+ *
+ * dmar_alloc_hwirq() may be called before setup_IO_APIC(); as long as
+ * ioapic_initialized is not set, ioapic_dynirq_base has not been set up yet
+ * and gsi_top is returned instead.
+ */
+unsigned int arch_dynirq_lower_bound(unsigned int from)
+{
+	if (!ioapic_initialized)
+		return gsi_top;
+
+	return ioapic_dynirq_base;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Pick an APIC ID for IOAPIC @ioapic that is not marked used in the static
+ * apic_id_map and program it into the IOAPIC.
+ *
+ * @apic_id is the requested ID. If it is not below get_physical_broadcast(),
+ * the ID currently held in the IOAPIC's ID register is tried instead. If the
+ * candidate is already in use, the lowest unused ID is taken (panic when
+ * there is none). The chosen ID is recorded in apic_id_map and written to the
+ * register when it differs from the register's content.
+ *
+ * Returns the assigned ID, or -1 if the register did not accept the write.
+ */
+static int io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where its predecessors only
+ * support up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ /* Seed the map on first use (it is static and starts out empty) */
+ if (physids_empty(apic_id_map))
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!apic->check_apicid_used(&apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ /* Mark the chosen ID as used */
+ apic->apicid_to_cpu_present(apic_id, &tmp);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
+ ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+/*
+ * CONFIG_X86_32 variant: an ID is assigned through io_apic_get_unique_id()
+ * only on Intel CPUs for which APIC_XAPIC(boot_cpu_apic_version) is false;
+ * in every other case @id is returned unchanged.
+ */
+static u8 io_apic_unique_id(int idx, u8 id)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return id;
+	if (APIC_XAPIC(boot_cpu_apic_version))
+		return id;
+
+	return io_apic_get_unique_id(idx, id);
+}
+#else
+/*
+ * Variant used without CONFIG_X86_32: return an ID for IOAPIC @idx that none
+ * of the registered IOAPICs uses. Tried in this order: the requested @id, the
+ * ID currently held in the IOAPIC's ID register, the first unused ID (which
+ * is then written to the register and verified with BUG_ON).
+ */
+static u8 io_apic_unique_id(int idx, u8 id)
+{
+ union IO_APIC_reg_00 reg_00;
+ DECLARE_BITMAP(used, 256);
+ unsigned long flags;
+ u8 new_id;
+ int i;
+
+ /* Collect the IDs of all IOAPICs registered so far */
+ bitmap_zero(used, 256);
+ for_each_ioapic(i)
+ __set_bit(mpc_ioapic_id(i), used);
+
+ /* Hand out the requested id if available */
+ if (!test_bit(id, used))
+ return id;
+
+ /*
+ * Read the current id from the ioapic and keep it if
+ * available.
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ new_id = reg_00.bits.ID;
+ if (!test_bit(new_id, used)) {
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
+ return new_id;
+ }
+
+ /*
+ * Get the next free id and write it to the ioapic.
+ */
+ /* NOTE(review): assumes a free bit exists; 256 would not fit in u8 */
+ new_id = find_first_zero_bit(used, 256);
+ reg_00.bits.ID = new_id;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ /* Sanity check */
+ BUG_ON(reg_00.bits.ID != new_id);
+
+ return new_id;
+}
+#endif
+
+/*
+ * Read register 1 of IOAPIC @ioapic under ioapic_lock and return its
+ * version field.
+ */
+static int io_apic_get_version(int ioapic)
+{
+	union IO_APIC_reg_01 ver_reg;
+	unsigned long irqflags;
+	int version;
+
+	raw_spin_lock_irqsave(&ioapic_lock, irqflags);
+	ver_reg.raw = io_apic_read(ioapic, 1);
+	raw_spin_unlock_irqrestore(&ioapic_lock, irqflags);
+
+	version = ver_reg.bits.version;
+	return version;
+}
+
+/*
+ * Look up trigger mode and polarity of @gsi in the interrupt source entries.
+ *
+ * Returns 0 and fills *@trigger and *@polarity on success. Returns -1 (and
+ * leaves both untouched) when IO-APIC setup is skipped or when no IOAPIC, pin
+ * or mp_INT entry is found for @gsi.
+ */
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_trigger(idx);
+ *polarity = irq_polarity(idx);
+ return 0;
+}
+
+/*
+ * Size of one IOAPIC resource name buffer: "IOAPIC " followed by up to
+ * three decimal digits and the terminating NUL.
+ */
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+
+static struct resource *ioapic_resources;
+
+/*
+ * Allocate, in a single bootmem block, one struct resource and one name
+ * buffer per IOAPIC. Each resource is named "IOAPIC <index>", flagged as busy
+ * memory and hooked up to ioapics[i].iomem_res; start/end are not set here.
+ *
+ * Returns the resource array (also stored in ioapic_resources), or NULL when
+ * there are no IOAPICs.
+ */
+static struct resource * __init ioapic_setup_resources(void)
+{
+ unsigned long n;
+ struct resource *res;
+ char *mem;
+ int i;
+
+ if (nr_ioapics == 0)
+ return NULL;
+
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
+
+ mem = alloc_bootmem(n);
+ res = (void *)mem;
+
+ /* The name buffers follow the array of resources */
+ mem += sizeof(struct resource) * nr_ioapics;
+
+ for_each_ioapic(i) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ ioapics[i].iomem_res = &res[i];
+ }
+
+ ioapic_resources = res;
+
+ return res;
+}
+
+/*
+ * Map the register page of every IOAPIC into consecutive fixmap slots,
+ * starting at FIX_IO_APIC_BASE_0, and fill in start/end of the matching
+ * resource.
+ *
+ * With an MP configuration the address comes from mpc_ioapic_addr(). Without
+ * one, or on CONFIG_X86_32 when that address is zero (which also disables
+ * IO-APIC support), a freshly allocated bootmem page is mapped instead.
+ */
+void __init io_apic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ struct resource *ioapic_res;
+ int i;
+
+ ioapic_res = ioapic_setup_resources();
+ for_each_ioapic(i) {
+ if (smp_found_config) {
+ ioapic_phys = mpc_ioapic_addr(i);
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+#endif
+ } else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
+ /* No usable address: back the fixmap slot with a bootmem page */
+ ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
+ ioapic_phys);
+ idx++;
+
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
+ ioapic_res++;
+ }
+}
+
+/*
+ * Insert the resources recorded in ioapic_resources, one per IOAPIC, into
+ * iomem_resource. If there are IOAPICs but no resources, an error is logged.
+ */
+void __init ioapic_insert_resources(void)
+{
+	struct resource *res = ioapic_resources;
+	int idx;
+
+	if (res) {
+		for_each_ioapic(idx) {
+			insert_resource(&iomem_resource, res);
+			res++;
+		}
+		return;
+	}
+
+	if (nr_ioapics > 0)
+		printk(KERN_ERR
+			"IO APIC resources couldn't be allocated.\n");
+}
+
+/*
+ * Find the IOAPIC whose GSI range [gsi_base, gsi_end] contains @gsi.
+ *
+ * Returns the IOAPIC index, or -1 if there are no IOAPICs or none of them
+ * manages @gsi (the latter case is logged).
+ */
+int mp_find_ioapic(u32 gsi)
+{
+	int i;
+
+	if (nr_ioapics == 0)
+		return -1;
+
+	/* Find the IOAPIC that manages this GSI. */
+	for_each_ioapic(i) {
+		struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+
+		if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
+			return i;
+	}
+
+	/* gsi is a u32, print it unsigned */
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %u\n", gsi);
+	return -1;
+}
+
+/*
+ * Translate @gsi into the pin number on IOAPIC @ioapic.
+ *
+ * Returns the pin (the offset of @gsi from the IOAPIC's gsi_base), or -1
+ * with a warning when @ioapic is negative or @gsi lies outside the
+ * IOAPIC's [gsi_base, gsi_end] range.
+ */
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
+{
+	struct mp_ioapic_gsi *gsi_cfg;
+
+	if (WARN_ON(ioapic < 0))
+		return -1;
+
+	gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+	/*
+	 * Check both bounds: a GSI below gsi_base would otherwise make the
+	 * unsigned subtraction below wrap around.
+	 */
+	if (WARN_ON(gsi < gsi_cfg->gsi_base || gsi > gsi_cfg->gsi_end))
+		return -1;
+
+	return gsi - gsi_cfg->gsi_base;
+}
+
+/*
+ * Sanity check for IOAPIC @idx: returns 1, after a warning, when registers
+ * 0, 1 and 2 all read back as all ones, and 0 otherwise.
+ */
+static int bad_ioapic_register(int idx)
+{
+	union IO_APIC_reg_00 r0;
+	union IO_APIC_reg_01 r1;
+	union IO_APIC_reg_02 r2;
+
+	r0.raw = io_apic_read(idx, 0);
+	r1.raw = io_apic_read(idx, 1);
+	r2.raw = io_apic_read(idx, 2);
+
+	if (r0.raw != -1 || r1.raw != -1 || r2.raw != -1)
+		return 0;
+
+	pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+		mpc_ioapic_addr(idx));
+	return 1;
+}
+
+/*
+ * Return the index of the first ioapics[] slot whose nr_registers is 0, or
+ * MAX_IO_APICS when every slot is taken.
+ */
+static int find_free_ioapic_entry(void)
+{
+	int slot = 0;
+
+	while (slot < MAX_IO_APICS && ioapics[slot].nr_registers != 0)
+		slot++;
+
+	return slot;
+}
+
+/**
+ * mp_register_ioapic - Register an IOAPIC device
+ * @id:		hardware IOAPIC ID
+ * @address:	physical address of IOAPIC register area
+ * @gsi_base:	base of GSI associated with the IOAPIC
+ * @cfg:	configuration information for the IOAPIC
+ *
+ * Return: 0 on success, -EINVAL for a zero @address, -EEXIST if @address is
+ * already registered, -ENOSPC if no free slot is left or the GSI range
+ * overlaps the range of another IOAPIC, -ENODEV if the registers read back
+ * as all ones, -ENOMEM if the irqdomain cannot be created on hotplug.
+ */
+int mp_register_ioapic(int id, u32 address, u32 gsi_base,
+		       struct ioapic_domain_cfg *cfg)
+{
+	bool hotplug = !!ioapic_initialized;
+	struct mp_ioapic_gsi *gsi_cfg;
+	int idx, ioapic, entries;
+	u32 gsi_end;
+
+	if (!address) {
+		pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
+		return -EINVAL;
+	}
+	for_each_ioapic(ioapic)
+		if (ioapics[ioapic].mp_config.apicaddr == address) {
+			pr_warn("address 0x%x conflicts with IOAPIC%d\n",
+				address, ioapic);
+			return -EEXIST;
+		}
+
+	idx = find_free_ioapic_entry();
+	if (idx >= MAX_IO_APICS) {
+		pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+			MAX_IO_APICS, idx);
+		return -ENOSPC;
+	}
+
+	ioapics[idx].mp_config.type = MP_IOAPIC;
+	ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+	ioapics[idx].mp_config.apicaddr = address;
+
+	/* The registers must be mapped before they can be probed */
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	if (bad_ioapic_register(idx)) {
+		clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+		return -ENODEV;
+	}
+
+	ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id);
+	ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
+
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+	 */
+	entries = io_apic_get_redir_entries(idx);
+	gsi_end = gsi_base + entries - 1;
+	for_each_ioapic(ioapic) {
+		gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+		/*
+		 * Two closed ranges intersect iff each one starts before the
+		 * other one ends. This also catches a new range which fully
+		 * encloses an existing one.
+		 */
+		if (gsi_base <= gsi_cfg->gsi_end &&
+		    gsi_end >= gsi_cfg->gsi_base) {
+			pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
+				gsi_base, gsi_end,
+				gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+			clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+			return -ENOSPC;
+		}
+	}
+	gsi_cfg = mp_ioapic_gsi_routing(idx);
+	gsi_cfg->gsi_base = gsi_base;
+	gsi_cfg->gsi_end = gsi_end;
+
+	ioapics[idx].irqdomain = NULL;
+	ioapics[idx].irqdomain_cfg = *cfg;
+
+	/*
+	 * If mp_register_ioapic() is called during early boot stage when
+	 * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+	 * we are still using bootmem allocator. So delay it to setup_IO_APIC().
+	 */
+	if (hotplug) {
+		if (mp_irqdomain_create(idx)) {
+			clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+			return -ENOMEM;
+		}
+		alloc_ioapic_saved_registers(idx);
+	}
+
+	if (gsi_cfg->gsi_end >= gsi_top)
+		gsi_top = gsi_cfg->gsi_end + 1;
+	if (nr_ioapics <= idx)
+		nr_ioapics = idx + 1;
+
+	/* Set nr_registers to mark entry present */
+	ioapics[idx].nr_registers = entries;
+
+	/* The GSI bounds are u32, print them unsigned */
+	pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %u-%u\n",
+		idx, mpc_ioapic_id(idx),
+		mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+		gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+
+	return 0;
+}
+
+/*
+ * Unregister the IOAPIC whose GSI range starts at @gsi_base.
+ *
+ * Returns -ENODEV if no such IOAPIC is registered, -EBUSY if one of its pins
+ * maps to an IRQ whose chip data still has a non-zero count, and 0 after the
+ * IOAPIC has been torn down and its ioapics[] slot cleared.
+ */
+int mp_unregister_ioapic(u32 gsi_base)
+{
+ int ioapic, pin;
+ int found = 0;
+
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
+ found = 1;
+ break;
+ }
+ if (!found) {
+ pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
+ return -ENODEV;
+ }
+
+ /* Refuse to unregister while any pin is still in use */
+ for_each_pin(ioapic, pin) {
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+ int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ struct mp_chip_data *data;
+
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ if (data && data->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n",
+ pin, ioapic);
+ return -EBUSY;
+ }
+ }
+ }
+
+ /* Mark entry not present */
+ ioapics[ioapic].nr_registers = 0;
+ ioapic_destroy_irqdomain(ioapic);
+ free_ioapic_saved_registers(ioapic);
+ if (ioapics[ioapic].iomem_res)
+ release_resource(ioapics[ioapic].iomem_res);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic);
+ memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic]));
+
+ return 0;
+}
+
+/* Return 1 if an IOAPIC whose GSI range starts at @gsi_base exists, else 0. */
+int mp_ioapic_registered(u32 gsi_base)
+{
+	int idx;
+
+	for_each_ioapic(idx) {
+		if (ioapics[idx].gsi_config.gsi_base != gsi_base)
+			continue;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill in trigger mode and polarity of @data for @gsi. They are taken from
+ * @info when it carries valid IOAPIC attributes, otherwise from
+ * acpi_get_override_irq(); if that fails as well, level-triggered/active-low
+ * is used.
+ */
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+ struct irq_alloc_info *info)
+{
+ if (info && info->ioapic_valid) {
+ data->trigger = info->ioapic_trigger;
+ data->polarity = info->ioapic_polarity;
+ } else if (acpi_get_override_irq(gsi, &data->trigger,
+ &data->polarity) < 0) {
+ /* PCI interrupts are always active low level triggered. */
+ data->trigger = IOAPIC_LEVEL;
+ data->polarity = IOAPIC_POL_LOW;
+ }
+}
+
+/*
+ * Build the routing entry @entry from scratch: delivery and destination mode
+ * come from the APIC driver, destination and vector from @cfg, trigger and
+ * polarity from @data.
+ */
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+			   struct IO_APIC_route_entry *entry)
+{
+	memset(entry, 0, sizeof(*entry));
+
+	entry->vector = cfg->vector;
+	entry->dest = cfg->dest_apicid;
+	entry->dest_mode = apic->irq_dest_mode;
+	entry->delivery_mode = apic->irq_delivery_mode;
+	entry->polarity = data->polarity;
+	entry->trigger = data->trigger;
+
+	/*
+	 * Level triggered irqs start out masked. Edge triggered irqs are
+	 * masked by the irq core code in case they fire.
+	 */
+	entry->mask = (data->trigger == IOAPIC_LEVEL) ?
+			IOAPIC_MASKED : IOAPIC_UNMASKED;
+}
+
+/*
+ * .alloc callback of mp_ioapic_irqdomain_ops: set up the single IRQ @virq for
+ * the IOAPIC pin described by @arg (a struct irq_alloc_info).
+ *
+ * Returns 0 on success, -EINVAL when @arg is NULL, more than one IRQ is
+ * requested or @virq has no irq_data in @domain, -EEXIST when the pin is
+ * already mapped, -ENOMEM when the chip data cannot be allocated, or the
+ * error returned by the parent domain.
+ */
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret, ioapic, pin;
+ struct irq_cfg *cfg;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
+ struct irq_alloc_info *info = arg;
+ unsigned long flags;
+
+ if (!info || nr_irqs > 1)
+ return -EINVAL;
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data)
+ return -EINVAL;
+
+ ioapic = mp_irqdomain_ioapic_idx(domain);
+ pin = info->ioapic_pin;
+ if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ return -EEXIST;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* Hand the entry embedded in the chip data to the parent allocation */
+ info->ioapic_entry = &data->entry;
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0) {
+ kfree(data);
+ return ret;
+ }
+
+ INIT_LIST_HEAD(&data->irq_2_pin);
+ irq_data->hwirq = info->ioapic_pin;
+ /* The remapping variant of the chip is used unless the parent is the vector domain */
+ irq_data->chip = (domain->parent == x86_vector_domain) ?
+ &ioapic_chip : &ioapic_ir_chip;
+ irq_data->chip_data = data;
+ mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+ cfg = irqd_cfg(irq_data);
+ add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+
+ local_irq_save(flags);
+ /* NOTE(review): presumably the parent may clear ioapic_entry - confirm */
+ if (info->ioapic_entry)
+ mp_setup_entry(cfg, data, info->ioapic_entry);
+ mp_register_handler(virq, data->trigger);
+ if (virq < nr_legacy_irqs())
+ legacy_pic->mask(virq);
+ local_irq_restore(flags);
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+ virq, data->trigger, data->polarity, cfg->dest_apicid);
+
+ return 0;
+}
+
+/*
+ * .free callback of mp_ioapic_irqdomain_ops. Exactly one IRQ may be freed at
+ * a time. If @virq has chip data, its pin is detached and the chip data is
+ * released, then the IRQ is freed in the domain hierarchy.
+ */
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs)
+{
+	struct irq_data *irqd;
+	struct mp_chip_data *chip;
+
+	BUG_ON(nr_irqs != 1);
+
+	irqd = irq_domain_get_irq_data(domain, virq);
+	chip = irqd ? irqd->chip_data : NULL;
+	if (chip) {
+		__remove_pin_from_irq(chip, mp_irqdomain_ioapic_idx(domain),
+				      (int)irqd->hwirq);
+		/* No pin may be left attached once this one is gone */
+		WARN_ON(!list_empty(&chip->irq_2_pin));
+		kfree(chip);
+	}
+
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+/*
+ * .activate callback: configure the entry of @irq_data under ioapic_lock.
+ * @domain and @reserve are not used. Always returns 0.
+ */
+int mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ ioapic_configure_entry(irq_data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return 0;
+}
+
+/*
+ * .deactivate callback: mask the entry of the pin (the hwirq of @irq_data)
+ * on the IOAPIC which @domain belongs to.
+ */
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ /* It won't be called for IRQ with multiple IOAPIC pins associated */
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
+}
+
+/*
+ * Return the IOAPIC index of @domain. mp_irqdomain_create() stores it in
+ * host_data, cast to a pointer.
+ */
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+ return (int)(long)domain->host_data;
+}
+
+/* irqdomain callbacks for IOAPIC domains, built from the functions above */
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .alloc = mp_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
new file mode 100644
index 0000000..82f9244
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/cpu.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+#include <asm/ipi.h>
+
+/*
+ * Send an IPI using a destination shorthand: wait until the ICR is idle,
+ * build the ICR value from @shortcut, @vector and @dest and write it to
+ * APIC_ICR. ICR2 is not written.
+ */
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ __xapic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ *
+ * @mask is turned into the ICR2 value by __prepare_ICR2() and written first;
+ * the write of the ICR value built from @vector and @dest then sends the IPI.
+ * For NMI_VECTOR the idle wait uses safe_apic_wait_icr_idle() instead of
+ * __xapic_wait_icr_idle().
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ __xapic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ native_apic_mem_write(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * Send @vector to @cpu, addressed by its APIC ID in physical destination
+ * mode. Local interrupts are disabled around the ICR writes.
+ */
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+}
+
+/*
+ * Send @vector to every CPU in @mask, one unicast IPI per CPU in physical
+ * destination mode. Local interrupts are disabled for the whole sequence.
+ */
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
+{
+	/* unsigned int like the iterators of the other mask senders */
+	unsigned int query_cpu;
+	unsigned long flags;
+
+	/*
+	 * Hack. The clustered APIC addressing mode doesn't allow us to send
+	 * to an arbitrary mask, so I do a unicast to each CPU instead.
+	 * - mbligh
+	 */
+	local_irq_save(flags);
+	for_each_cpu(query_cpu, mask) {
+		__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+				query_cpu), vector, APIC_DEST_PHYSICAL);
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Send @vector to every CPU in @mask except the calling one, one unicast IPI
+ * per CPU in physical destination mode (see the hack comment in
+ * default_send_IPI_mask_sequence_phys()).
+ */
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+					   int vector)
+{
+	unsigned int self = smp_processor_id();
+	unsigned long irqflags;
+	unsigned int cpu;
+
+	local_irq_save(irqflags);
+	for_each_cpu(cpu, mask) {
+		if (cpu != self)
+			__default_send_IPI_dest_field(
+				per_cpu(x86_cpu_to_apicid, cpu),
+				vector, APIC_DEST_PHYSICAL);
+	}
+	local_irq_restore(irqflags);
+}
+
+/*
+ * Helper function for APICs which insist on cpumasks: sends @vector to @cpu
+ * through the APIC driver's send_IPI_mask() with a single-CPU mask.
+ */
+void default_send_IPI_single(int cpu, int vector)
+{
+ apic->send_IPI_mask(cpumask_of(cpu), vector);
+}
+
+#ifdef CONFIG_X86_32
+
+/*
+ * Send @vector to every CPU in @mask, one unicast IPI per CPU, addressed by
+ * the CPU's logical APIC ID in the APIC driver's logical destination mode.
+ */
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ __default_send_IPI_dest_field(
+ early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+/*
+ * Send @vector to every CPU in @mask except the calling one, one unicast IPI
+ * per CPU, addressed by logical APIC ID (see the hack comment in
+ * default_send_IPI_mask_sequence_logical()).
+ */
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+					      int vector)
+{
+	unsigned int self = smp_processor_id();
+	unsigned long irqflags;
+	unsigned int cpu;
+
+	local_irq_save(irqflags);
+	for_each_cpu(cpu, mask) {
+		if (cpu != self)
+			__default_send_IPI_dest_field(
+				early_per_cpu(x86_cpu_to_logical_apicid, cpu),
+				vector, apic->dest_logical);
+	}
+	local_irq_restore(irqflags);
+}
+
+/*
+ * This is only used on smaller machines.
+ *
+ * Only the first word of @cpumask is looked at; it is passed on unchanged as
+ * the destination of a single logical-mode IPI. Nothing is sent when that
+ * word is zero, and a warning is raised if it contains CPUs which are not
+ * online.
+ */
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned long flags;
+
+ if (!mask)
+ return;
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+/* Send @vector to all CPUs but the calling one, if there are any others. */
+void default_send_IPI_allbutself(int vector)
+{
+	/*
+	 * A broadcast without any other CPU in the system results in an
+	 * APIC send error, so only send when more than one CPU is online.
+	 */
+	if (num_online_cpus() > 1)
+		__default_local_send_IPI_allbutself(vector);
+}
+
+/* Send @vector to all CPUs; thin wrapper around __default_local_send_IPI_all(). */
+void default_send_IPI_all(int vector)
+{
+ __default_local_send_IPI_all(vector);
+}
+
+/* Send @vector to the calling CPU using the APIC_DEST_SELF shorthand. */
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
+}
+
+/*
+ * Return the possible CPU whose x86_cpu_to_apicid equals @apic_id, or -1 if
+ * there is none.
+ *
+ * must come after the send_IPI functions above for inlining
+ */
+static int convert_apicid_to_cpu(int apic_id)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(x86_cpu_to_apicid, cpu) != apic_id)
+			continue;
+		return cpu;
+	}
+
+	return -1;
+}
+
+/*
+ * Determine the current CPU number from the hardware APIC ID. Returns 0 when
+ * the CPU has no APIC, the APIC ID is BAD_APICID or no possible CPU matches
+ * the APIC ID.
+ */
+int safe_smp_processor_id(void)
+{
+	int apicid, cpu;
+
+	if (!boot_cpu_has(X86_FEATURE_APIC))
+		return 0;
+
+	apicid = hard_smp_processor_id();
+	if (apicid == BAD_APICID)
+		return 0;
+
+	cpu = convert_apicid_to_cpu(apicid);
+	if (cpu < 0)
+		return 0;
+
+	return cpu;
+}
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
new file mode 100644
index 0000000..72a9440
--- /dev/null
+++ b/arch/x86/kernel/apic/msi.c
@@ -0,0 +1,389 @@
+/*
+ * Support of MSI, HPET and DMAR interrupts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Convert to hierarchical irqdomain
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
+#include <linux/msi.h>
+#include <asm/irqdomain.h>
+#include <asm/msidef.h>
+#include <asm/hpet.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+
+static struct irq_domain *msi_default_domain;
+
+/*
+ * Compose the MSI message for @data from the vector and destination APIC ID
+ * in its irq_cfg. The destination mode follows apic->irq_dest_mode. With
+ * x2apic enabled, address_hi additionally carries
+ * MSI_ADDR_EXT_DEST_ID(dest_apicid). The data word always selects edge
+ * trigger, level assert and fixed delivery.
+ */
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
+ if (x2apic_enabled())
+ msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
+
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL :
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ MSI_ADDR_REDIRECTION_CPU |
+ MSI_ADDR_DEST_ID(cfg->dest_apicid);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ MSI_DATA_DELIVERY_FIXED |
+ MSI_DATA_VECTOR(cfg->vector);
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip pci_msi_controller = {
+ .name = "PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+/*
+ * Allocate @nvec MSI interrupts for @dev. The irqdomain offered by interrupt
+ * remapping is preferred, msi_default_domain is the fallback. @type is not
+ * used. Returns -ENOSYS when neither domain exists, otherwise the result of
+ * msi_domain_alloc_irqs().
+ */
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	struct irq_alloc_info info;
+	struct irq_domain *domain;
+
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_MSI;
+	info.msi_dev = dev;
+
+	domain = irq_remapping_get_irq_domain(&info);
+	if (!domain)
+		domain = msi_default_domain;
+	if (!domain)
+		return -ENOSYS;
+
+	return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
+}
+
+/* Free the single MSI interrupt @irq. */
+void native_teardown_msi_irq(unsigned int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+
+/* .get_hwirq callback: return the hwirq stored in @arg by pci_msi_set_desc(). */
+static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->msi_hwirq;
+}
+
+/*
+ * .msi_prepare callback: initialize @arg for the PCI device behind @dev,
+ * based on the device's first MSI descriptor. MSI-X selects
+ * X86_IRQ_ALLOC_TYPE_MSIX; plain MSI selects X86_IRQ_ALLOC_TYPE_MSI and
+ * additionally requests contiguous vectors. @domain and @nvec are not used.
+ * Always returns 0.
+ */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+ init_irq_alloc_info(arg, NULL);
+ arg->msi_dev = pdev;
+ if (desc->msi_attrib.is_msix) {
+ arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+ } else {
+ arg->type = X86_IRQ_ALLOC_TYPE_MSI;
+ arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
+
+/*
+ * .set_desc callback: store in @arg the hwirq that
+ * pci_msi_domain_calc_hwirq() computes for @desc on arg->msi_dev.
+ */
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+ arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
+}
+EXPORT_SYMBOL_GPL(pci_msi_set_desc);
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+ .get_hwirq = pci_msi_get_hwirq,
+ .msi_prepare = pci_msi_prepare,
+ .set_desc = pci_msi_set_desc,
+};
+
+static struct msi_domain_info pci_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &pci_msi_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+/*
+ * Create msi_default_domain, the PCI MSI irqdomain on top of @parent. Does
+ * nothing when the APIC is disabled. A failure is only reported with a
+ * warning; msi_default_domain then stays NULL.
+ */
+void __init arch_init_msi_domain(struct irq_domain *parent)
+{
+ struct fwnode_handle *fn;
+
+ if (disable_apic)
+ return;
+
+ fn = irq_domain_alloc_named_fwnode("PCI-MSI");
+ if (fn) {
+ msi_default_domain =
+ pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
+ parent);
+ irq_domain_free_fwnode(fn);
+ }
+ if (!msi_default_domain)
+ pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static struct irq_chip pci_msi_ir_controller = {
+ .name = "IR-PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_info pci_msi_ir_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &pci_msi_ir_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+/*
+ * Create a PCI MSI irqdomain which uses pci_msi_ir_domain_info on top of
+ * @parent. The fwnode is named after @name and @id. Returns the new domain,
+ * or NULL if the fwnode or the domain could not be created.
+ */
+struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
+ const char *name, int id)
+{
+ struct fwnode_handle *fn;
+ struct irq_domain *d;
+
+ fn = irq_domain_alloc_named_id_fwnode(name, id);
+ if (!fn)
+ return NULL;
+ d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
+ irq_domain_free_fwnode(fn);
+ return d;
+}
+#endif
+
+#ifdef CONFIG_DMAR_TABLE
+/* .irq_write_msi_msg callback: hand @msg for the Linux irq of @data to dmar_msi_write(). */
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ dmar_msi_write(data->irq, msg);
+}
+
+static struct irq_chip dmar_msi_controller = {
+ .name = "DMAR-MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_write_msi_msg = dmar_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+/* .get_hwirq callback: the hwirq of a DMAR interrupt is the DMAR id in @arg. */
+static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->dmar_id;
+}
+
+/*
+ * .msi_init callback: set up @virq as an edge interrupt on the chip of @info,
+ * with arg->dmar_id as hwirq and arg->dmar_data as handler data. The @hwirq
+ * parameter itself is not used. Always returns 0.
+ */
+static int dmar_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
+ handle_edge_irq, arg->dmar_data, "edge");
+
+ return 0;
+}
+
+static struct msi_domain_ops dmar_msi_domain_ops = {
+ .get_hwirq = dmar_msi_get_hwirq,
+ .msi_init = dmar_msi_init,
+};
+
+static struct msi_domain_info dmar_msi_domain_info = {
+ .ops = &dmar_msi_domain_ops,
+ .chip = &dmar_msi_controller,
+};
+
+/*
+ * Return the DMAR MSI irqdomain. It is created on first use, under dmar_lock,
+ * with x86_vector_domain as parent. Returns NULL when the creation failed; as
+ * dmar_domain stays NULL in that case, the next call tries again.
+ */
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+ static struct irq_domain *dmar_domain;
+ static DEFINE_MUTEX(dmar_lock);
+ struct fwnode_handle *fn;
+
+ mutex_lock(&dmar_lock);
+ if (dmar_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("DMAR-MSI");
+ if (fn) {
+ dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
+ x86_vector_domain);
+ irq_domain_free_fwnode(fn);
+ }
+out:
+ mutex_unlock(&dmar_lock);
+ return dmar_domain;
+}
+
+/*
+ * Allocate one interrupt from the DMAR MSI irqdomain for DMAR unit @id on
+ * NUMA node @node; @arg becomes the interrupt's handler data (see
+ * dmar_msi_init()). Returns -1 when the domain is not available, otherwise
+ * the result of irq_domain_alloc_irqs().
+ */
+int dmar_alloc_hwirq(int id, int node, void *arg)
+{
+ struct irq_domain *domain = dmar_get_irq_domain();
+ struct irq_alloc_info info;
+
+ if (!domain)
+ return -1;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+ info.dmar_id = id;
+ info.dmar_data = arg;
+
+ return irq_domain_alloc_irqs(domain, 1, node, &info);
+}
+
+/* Free the single interrupt @irq obtained from dmar_alloc_hwirq(). */
+void dmar_free_hwirq(int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+#endif
+
+/*
+ * HPET MSI interrupt chip and irqdomain
+ */
+#ifdef CONFIG_HPET_TIMER
+/*
+ * Return the HPET id of @domain. hpet_create_irq_domain() stores it in the
+ * data member of the domain's msi_domain_info, cast to a pointer.
+ */
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+ struct msi_domain_info *info = msi_get_domain_info(domain);
+
+ return (int)(long)info->data;
+}
+
+/* .irq_write_msi_msg callback: pass @msg and the handler data of @data to hpet_msi_write(). */
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
+}
+
+/*
+ * IRQ chip for HPET MSI interrupts. It is not const: its name is changed to
+ * "IR-HPET-MSI" by hpet_create_irq_domain() when an interrupt remapping
+ * parent domain is used.
+ */
+static struct irq_chip hpet_msi_controller __ro_after_init = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+/* .get_hwirq callback: the hwirq of an HPET interrupt is the hpet_index in @arg. */
+static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hpet_index;
+}
+
+/*
+ * .msi_init callback: flag @virq with IRQ_MOVE_PCNTXT and set it up as an
+ * edge interrupt on the chip of @info, with arg->hpet_index as hwirq and
+ * arg->hpet_data as handler data. The @hwirq parameter itself is not used.
+ * Always returns 0.
+ */
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+ irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
+ handle_edge_irq, arg->hpet_data, "edge");
+
+ return 0;
+}
+
+/* .msi_free callback: clear the IRQ_MOVE_PCNTXT flag set by hpet_msi_init(). */
+static void hpet_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+}
+
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .get_hwirq = hpet_msi_get_hwirq,
+ .msi_init = hpet_msi_init,
+ .msi_free = hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
+};
+
+/*
+ * Create the MSI irqdomain for the HPET with id @hpet_id.
+ *
+ * The domain gets its own copy of hpet_msi_domain_info with @hpet_id stored
+ * in the data member (read back by hpet_dev_id()). The parent is the
+ * interrupt remapping domain if there is one, x86_vector_domain otherwise.
+ *
+ * Returns the new domain, or NULL if x86_vector_domain is not set up or an
+ * allocation failed.
+ */
+struct irq_domain *hpet_create_irq_domain(int hpet_id)
+{
+	struct msi_domain_info *domain_info;
+	struct irq_domain *parent, *d;
+	struct irq_alloc_info info;
+	struct fwnode_handle *fn;
+
+	if (x86_vector_domain == NULL)
+		return NULL;
+
+	domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+	if (!domain_info)
+		return NULL;
+
+	*domain_info = hpet_msi_domain_info;
+	domain_info->data = (void *)(long)hpet_id;
+
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_id = hpet_id;
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (parent == NULL)
+		parent = x86_vector_domain;
+	else
+		hpet_msi_controller.name = "IR-HPET-MSI";
+
+	fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+					      hpet_id);
+	if (!fn) {
+		kfree(domain_info);
+		return NULL;
+	}
+
+	d = msi_create_irq_domain(fn, domain_info, parent);
+	irq_domain_free_fwnode(fn);
+	/* Without a domain nothing refers to domain_info, don't leak it */
+	if (!d)
+		kfree(domain_info);
+	return d;
+}
+
+/*
+ * Allocate one interrupt from the HPET MSI @domain for timer @dev_num of the
+ * HPET; @dev becomes the interrupt's handler data (see hpet_msi_init()).
+ * Returns the result of irq_domain_alloc_irqs().
+ */
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+ int dev_num)
+{
+ struct irq_alloc_info info;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.hpet_data = dev;
+ info.hpet_id = hpet_dev_id(domain);
+ info.hpet_index = dev_num;
+
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+}
+#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
new file mode 100644
index 0000000..02e8acb
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -0,0 +1,249 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820/api.h>
+
+/*
+ * Built-in default of no_broadcast: 1 when CPU hotplug is configured,
+ * 0 otherwise.
+ */
+#if defined(CONFIG_HOTPLUG_CPU)
+# define DEFAULT_SEND_IPI	(1)
+#else
+# define DEFAULT_SEND_IPI	(0)
+#endif
+
+int no_broadcast = DEFAULT_SEND_IPI;
+
+/*
+ * Handler of the "no_ipi_broadcast=" boot parameter: parses an integer
+ * into no_broadcast and logs the resulting mode.
+ */
+static __init int no_ipi_broadcast(char *str)
+{
+	const char *mode;
+
+	get_option(&str, &no_broadcast);
+	mode = no_broadcast ? "No IPI Broadcast" : "IPI Broadcast";
+	pr_info("Using %s mode\n", mode);
+
+	return 1;
+}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
+
+/* Late initcall which logs the IPI mode implied by no_broadcast. */
+static int __init print_ipi_mode(void)
+{
+	const char *mode = "Shortcut";
+
+	if (no_broadcast)
+		mode = "No-Shortcut";
+
+	pr_info("Using IPI %s mode\n", mode);
+	return 0;
+}
+late_initcall(print_ipi_mode);
+
+/* Early logical APIC ID of @cpu: a value with only bit @cpu set. */
+static int default_x86_32_early_logical_apicid(int cpu)
+{
+	int logical_id = 1 << cpu;
+
+	return logical_id;
+}
+
+/*
+ * ->setup_apic_routing() of the default driver. It only announces flat
+ * mode together with the number of I/O APICs, and only when I/O APIC
+ * support is built in.
+ */
+static void setup_apic_flat_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+	printk(KERN_INFO "Enabling APIC mode: Flat. Using %d I/O APICs\n",
+	       nr_ioapics);
+#endif
+}
+
+/* Tests whether the APIC ID read from this CPU is in phys_cpu_present_map. */
+static int default_apic_id_registered(void)
+{
+	unsigned int apic_id = read_apic_id();
+
+	return physid_isset(apic_id, phys_cpu_present_map);
+}
+
+/*
+ * Program the logical destination of the local APIC: write the DFR first,
+ * then put this CPU's bit into the logical ID field of the LDR. Intel
+ * recommends to set DFR, LDR and TPR before enabling an APIC. See e.g.
+ * "AP-388 82489DX User's Manual" (Intel document number 292116).
+ */
+static void default_init_apic_ldr(void)
+{
+	unsigned long ldr;
+
+	apic_write(APIC_DFR, APIC_DFR_VALUE);
+
+	ldr = apic_read(APIC_LDR);
+	ldr &= ~APIC_LDR_MASK;
+	ldr |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+	apic_write(APIC_LDR, ldr);
+}
+
+/* Package ID: @cpuid_apic with its low @index_msb bits shifted out. */
+static int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+	int pkg_id = cpuid_apic >> index_msb;
+
+	return pkg_id;
+}
+
+/*
+ * ->probe() of the default driver. It matches unconditionally, which is
+ * why it should be called last.
+ */
+static int probe_default(void)
+{
+	const int matched = 1;
+
+	return matched;
+}
+
+/*
+ * Descriptor of the "default" APIC driver: fixed delivery mode with
+ * logical destinations, register access through the native_apic_mem_*()
+ * helpers and a ->probe() which always succeeds (probe_default()).
+ */
+static struct apic apic_default __ro_after_init = {
+
+ .name = "default",
+ .probe = probe_default,
+ /* No MADT OEM hook: default_acpi_madt_oem_check() skips this driver */
+ .acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = default_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+
+ .init_apic_ldr = default_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = setup_apic_flat_routing,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = default_phys_pkg_id,
+
+ .get_apic_id = default_get_apic_id,
+ .set_apic_id = NULL,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ /* IPI primitives; the mask variants use the logical destination helpers */
+ .send_IPI = default_send_IPI_single,
+ .send_IPI_mask = default_send_IPI_mask_logical,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ /* Register accessors; EOI is written with the regular write helper */
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
+};
+
+/* NOTE(review): presumably adds the driver to the __apicdrivers table - confirm */
+apic_driver(apic_default);
+
+/*
+ * The APIC driver in use. Starts out as apic_default and is replaced by
+ * parse_apic(), generic_apic_probe() and default_acpi_madt_oem_check().
+ */
+struct apic *apic __ro_after_init = &apic_default;
+EXPORT_SYMBOL_GPL(apic);
+
+/* Set once an "apic=" boot argument matched a registered driver name. */
+static int cmdline_apic __initdata;
+
+/*
+ * Early "apic=" handler: select the driver whose name equals @arg.
+ * An unknown name is not an error, since the option is parsed again by
+ * __setup for debug/verbose. Only a missing argument yields -EINVAL.
+ */
+static int __init parse_apic(char *arg)
+{
+	struct apic **drv;
+
+	if (!arg)
+		return -EINVAL;
+
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if (strcmp((*drv)->name, arg) != 0)
+			continue;
+
+		apic = *drv;
+		cmdline_apic = 1;
+		break;
+	}
+
+	return 0;
+}
+early_param("apic", parse_apic);
+
+/*
+ * Finish the APIC routing setup: decide whether bigsmp is the default for
+ * systems with more than 8 possible CPUs, give the bigsmp probe a chance
+ * to take over, then run the driver's and the platform's setup hooks.
+ */
+void __init default_setup_apic_routing(void)
+{
+	int version = boot_cpu_apic_version;
+
+	if (num_possible_cpus() > 8) {
+		int vendor = boot_cpu_data.x86_vendor;
+
+		if (vendor == X86_VENDOR_INTEL && !APIC_XAPIC(version))
+			def_to_bigsmp = 0;
+		else if (vendor == X86_VENDOR_INTEL ||
+			 vendor == X86_VENDOR_AMD)
+			def_to_bigsmp = 1;	/* P4 and above, or AMD */
+	}
+
+#ifdef CONFIG_X86_BIGSMP
+	/*
+	 * Switch to bigsmp mode when
+	 * - the user did not specify an apic= option
+	 * - generic_apic_probe() has chosen apic_default as the sub_arch
+	 * - more than 8 CPUs with xAPIC support are in the acpi LAPIC listing
+	 */
+	if (apic == &apic_default && !cmdline_apic)
+		generic_bigsmp_probe();
+#endif
+
+	if (apic->setup_apic_routing)
+		apic->setup_apic_routing();
+
+	if (x86_platform.apic_post_init)
+		x86_platform.apic_post_init();
+}
+
+/*
+ * Pick the APIC driver unless "apic=" already did: the first registered
+ * driver whose ->probe() succeeds wins, and it is fatal if none does.
+ * The driver in use is logged in either case.
+ */
+void __init generic_apic_probe(void)
+{
+	struct apic **drv;
+
+	if (cmdline_apic)
+		goto report;
+
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if (!(*drv)->probe())
+			continue;
+
+		apic = *drv;
+		break;
+	}
+
+	/* Not visible without early console */
+	if (drv == __apicdrivers_end)
+		panic("Didn't find an APIC driver");
+
+report:
+	printk(KERN_INFO "Using APIC driver %s\n", apic->name);
+}
+
+/*
+ * Offer the MADT OEM strings to every driver which has an
+ * ->acpi_madt_oem_check() hook. The first driver that accepts them becomes
+ * the active one, unless a driver was picked with "apic=". This can switch
+ * the APIC even after the initial ->probe().
+ *
+ * Returns 1 when a driver accepted the IDs, 0 otherwise.
+ */
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	struct apic **drv;
+
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		struct apic *cand = *drv;
+
+		if (!cand->acpi_madt_oem_check ||
+		    !cand->acpi_madt_oem_check(oem_id, oem_table_id))
+			continue;
+
+		if (cmdline_apic)
+			return 1;
+
+		apic = cand;
+		printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+		       apic->name);
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
new file mode 100644
index 0000000..c303054
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/hardirq.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/setup.h>
+
+/*
+ * Choose the APIC mode: after enable_IR_x2apic(), switch to the first
+ * registered driver whose ->probe() succeeds, then run the platform's
+ * apic_post_init() hook if it has one.
+ */
+void __init default_setup_apic_routing(void)
+{
+	struct apic **drv;
+
+	enable_IR_x2apic();
+
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if (!(*drv)->probe || !(*drv)->probe())
+			continue;
+
+		/* Only switch (and log) when the driver really changes */
+		if (apic != *drv) {
+			apic = *drv;
+			pr_info("Switched APIC routing to %s.\n", apic->name);
+		}
+		break;
+	}
+
+	if (x86_platform.apic_post_init)
+		x86_platform.apic_post_init();
+}
+
+/*
+ * Send @vector to the current CPU through the self shortcut. Same for
+ * both flat and physical.
+ */
+void apic_send_IPI_self(int vector)
+{
+	__default_send_IPI_shortcut(APIC_DEST_SELF, vector,
+				    APIC_DEST_PHYSICAL);
+}
+
+/*
+ * Offer the MADT OEM strings to every registered driver. The first driver
+ * whose ->acpi_madt_oem_check() accepts them becomes the active driver.
+ *
+ * Drivers without the hook are skipped, the same way ->probe is treated
+ * as optional in default_setup_apic_routing().
+ *
+ * Returns 1 when a driver accepted the IDs, 0 otherwise.
+ */
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	struct apic **drv;
+
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		/* The hook is optional: never call through a NULL pointer */
+		if (!(*drv)->acpi_madt_oem_check)
+			continue;
+		if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
+			continue;
+
+		if (apic != *drv) {
+			apic = *drv;
+			pr_info("Setting APIC routing to %s.\n",
+				apic->name);
+		}
+		return 1;
+	}
+	return 0;
+}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
new file mode 100644
index 0000000..7654feb
--- /dev/null
+++ b/arch/x86/kernel/apic/vector.c
@@ -0,0 +1,1247 @@
+/*
+ * Local APIC related interfaces to support IOAPIC, MSI, etc.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Enable support of hierarchical irqdomains
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/irq_remapping.h>
+
+#include <asm/trace/irq_vectors.h>
+
+/*
+ * Per interrupt state of the vector domain, stored as chip_data of the
+ * outermost parent irq_data (see apic_chip_data()).
+ *
+ * @hw_irq_cfg:       vector and destination APIC ID as written by
+ *                    apic_update_irq_cfg(); handed out by irqd_cfg()
+ * @vector:           vector currently assigned (0: none)
+ * @prev_vector:      vector still awaiting cleanup after a move (0: none)
+ * @cpu:              CPU the current vector is assigned on
+ * @prev_cpu:         CPU @prev_vector is assigned on
+ * @irq:              Linux interrupt number, set at allocation time
+ * @clist:            node for queueing on a per CPU cleanup_list
+ * @move_in_progress: set when the old vector was kept for later cleanup
+ * @is_managed:       set by reserve_managed_vector()
+ * @can_reserve:      interrupt may use reservation mode
+ * @has_reserved:     interrupt currently holds a global reservation
+ */
+struct apic_chip_data {
+ struct irq_cfg hw_irq_cfg;
+ unsigned int vector;
+ unsigned int prev_vector;
+ unsigned int cpu;
+ unsigned int prev_cpu;
+ unsigned int irq;
+ struct hlist_node clist;
+ unsigned int move_in_progress : 1,
+ is_managed : 1,
+ can_reserve : 1,
+ has_reserved : 1;
+};
+
+/* The vector irqdomain; created in arch_early_irq_init() */
+struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
+/* Held around all updates of the vector assignment state in this file */
+static DEFINE_RAW_SPINLOCK(vector_lock);
+/* Scratch cpumask for vector searches; allocated in arch_early_irq_init() */
+static cpumask_var_t vector_searchmask;
+/* Forward declaration; the definition follows further down */
+static struct irq_chip lapic_controller;
+/* Matrix allocator from which all vectors are allocated and reserved */
+static struct irq_matrix *vector_matrix;
+#ifdef CONFIG_SMP
+/* Per CPU list of apic_chip_data whose previous vector awaits cleanup */
+static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
+#endif
+
+/*
+ * Take vector_lock. Used to ensure that the online set of cpus does not
+ * change during assign_irq_vector.
+ */
+void lock_vector_lock(void)
+{
+	raw_spin_lock(&vector_lock);
+}
+
+/* Release vector_lock; counterpart of lock_vector_lock(). */
+void unlock_vector_lock(void)
+{
+	raw_spin_unlock(&vector_lock);
+}
+
+/* Zero all of @info, then record @mask as its CPU mask. */
+void
+init_irq_alloc_info(struct irq_alloc_info *info, const struct cpumask *mask)
+{
+	memset(info, 0, sizeof(*info));
+
+	info->mask = mask;
+}
+
+/* Copy @src to @dst; a NULL @src leaves @dst zeroed. */
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+	if (!src) {
+		memset(dst, 0, sizeof(*dst));
+		return;
+	}
+
+	*dst = *src;
+}
+
+/*
+ * Return the chip data of the outermost parent of @irqd, or NULL when
+ * @irqd is NULL.
+ */
+static struct apic_chip_data *apic_chip_data(struct irq_data *irqd)
+{
+	struct irq_data *root;
+
+	if (!irqd)
+		return NULL;
+
+	/* Walk up the hierarchy until there is no parent left */
+	for (root = irqd; root->parent_data; root = root->parent_data)
+		;
+
+	return root->chip_data;
+}
+
+/* Hardware config of @irqd, or NULL when it has no vector chip data. */
+struct irq_cfg *irqd_cfg(struct irq_data *irqd)
+{
+	struct apic_chip_data *chip = apic_chip_data(irqd);
+
+	if (!chip)
+		return NULL;
+
+	return &chip->hw_irq_cfg;
+}
+EXPORT_SYMBOL_GPL(irqd_cfg);
+
+/* Same as irqd_cfg(), looked up by interrupt number. */
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	struct irq_data *irqd = irq_get_irq_data(irq);
+
+	return irqd_cfg(irqd);
+}
+
+/*
+ * Allocate zeroed chip data on @node with an initialized cleanup list
+ * node. Returns NULL when the allocation fails.
+ */
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+	struct apic_chip_data *chip;
+
+	chip = kzalloc_node(sizeof(*chip), GFP_KERNEL, node);
+	if (!chip)
+		return NULL;
+
+	INIT_HLIST_NODE(&chip->clist);
+	return chip;
+}
+
+/* Counterpart of alloc_apic_chip_data(). */
+static void
+free_apic_chip_data(struct apic_chip_data *apicd)
+{
+	kfree(apicd);
+}
+
+/*
+ * Publish @vector on @cpu as the hardware config of @irqd and make @cpu
+ * its effective affinity. vector_lock must be held.
+ */
+static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
+				unsigned int cpu)
+{
+	struct apic_chip_data *chip = apic_chip_data(irqd);
+	struct irq_cfg *cfg = &chip->hw_irq_cfg;
+
+	lockdep_assert_held(&vector_lock);
+
+	cfg->vector = vector;
+	cfg->dest_apicid = apic->calc_dest_apicid(cpu);
+	irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
+	trace_vector_config(irqd->irq, vector, cpu, cfg->dest_apicid);
+}
+
+/*
+ * Switch @irqd to @newvec on @newcpu and take care of the vector it used
+ * so far. vector_lock must be held.
+ */
+static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
+			       unsigned int newcpu)
+{
+	struct apic_chip_data *apicd = apic_chip_data(irqd);
+	struct irq_desc *desc = irq_data_to_desc(irqd);
+	bool managed = irqd_affinity_is_managed(irqd);
+	bool has_old;
+
+	lockdep_assert_held(&vector_lock);
+
+	trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
+			    apicd->cpu);
+
+	/*
+	 * Without an associated vector, or with only the shutdown vector
+	 * (which is associated to make PCI/MSI shutdown mode work), there
+	 * is nothing to release. prev_vector is cleared for that case and
+	 * for the offlined target case.
+	 */
+	apicd->prev_vector = 0;
+	has_old = apicd->vector &&
+		  apicd->vector != MANAGED_IRQ_SHUTDOWN_VECTOR;
+
+	if (has_old && cpu_online(apicd->cpu)) {
+		/*
+		 * The old target is online: mark the move as in progress
+		 * and keep the old vector for cleanup when the first
+		 * interrupt on the new vector arrives.
+		 */
+		apicd->move_in_progress = true;
+		apicd->prev_vector = apicd->vector;
+		apicd->prev_cpu = apicd->cpu;
+	} else if (has_old) {
+		/*
+		 * The old target is offline: the cleanup vector cannot be
+		 * used, so free the old vector in the matrix right away.
+		 */
+		irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
+				managed);
+	}
+
+	apicd->vector = newvec;
+	apicd->cpu = newcpu;
+	BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
+	per_cpu(vector_irq, newcpu)[newvec] = desc;
+}
+
+/* Point @irqd at MANAGED_IRQ_SHUTDOWN_VECTOR on the first online CPU. */
+static void vector_assign_managed_shutdown(struct irq_data *irqd)
+{
+	unsigned int first_online = cpumask_first(cpu_online_mask);
+
+	apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, first_online);
+}
+
+/*
+ * Mark @irqd as managed and reserve a managed vector in the matrix for
+ * the CPUs of its affinity mask. Returns the result of
+ * irq_matrix_reserve_managed().
+ */
+static int reserve_managed_vector(struct irq_data *irqd)
+{
+	struct apic_chip_data *chip = apic_chip_data(irqd);
+	const struct cpumask *affinity = irq_data_get_affinity_mask(irqd);
+	unsigned long irqflags;
+	int rc;
+
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	chip->is_managed = true;
+	rc = irq_matrix_reserve_managed(vector_matrix, affinity);
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	trace_vector_reserve_managed(irqd->irq, rc);
+	return rc;
+}
+
+/*
+ * Take a global reservation for @irqd and park it on the shutdown vector.
+ * Callers in this file hold vector_lock.
+ */
+static void reserve_irq_vector_locked(struct irq_data *irqd)
+{
+	struct apic_chip_data *chip = apic_chip_data(irqd);
+
+	irq_matrix_reserve(vector_matrix);
+
+	/* Record the reservation in the chip data and in the irq data */
+	chip->has_reserved = true;
+	chip->can_reserve = true;
+	irqd_set_can_reserve(irqd);
+
+	trace_vector_reserve(irqd->irq, 0);
+	vector_assign_managed_shutdown(irqd);
+}
+
+/* Locked wrapper around reserve_irq_vector_locked(); always returns 0. */
+static int reserve_irq_vector(struct irq_data *irqd)
+{
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	reserve_irq_vector_locked(irqd);
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	return 0;
+}
+
+/*
+ * Allocate a vector for @irqd on one of the CPUs in @dest. The caller must
+ * hold vector_lock.
+ *
+ * Returns 0 on success and when the current vector already sits on an
+ * online CPU in @dest, -EBUSY while an earlier move has not been cleaned
+ * up yet, or the negative value returned by irq_matrix_alloc().
+ */
+static int
+assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool resvd = apicd->has_reserved;
+ unsigned int cpu = apicd->cpu;
+ int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /*
+ * If the current target CPU is online and in the new requested
+ * affinity mask, there is no point in moving the interrupt from
+ * one CPU to another.
+ */
+ if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
+ return 0;
+
+ /*
+ * Careful here. @apicd might either have move_in_progress set or
+ * be enqueued for cleanup. Assigning a new vector would either
+ * leave a stale vector on some CPU around or in case of a pending
+ * cleanup corrupt the hlist.
+ */
+ if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist))
+ return -EBUSY;
+
+ vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
+ /* Traced before the error check, so failed allocations show up too */
+ trace_vector_alloc(irqd->irq, vector, resvd, vector);
+ if (vector < 0)
+ return vector;
+ /* Update the bookkeeping first, then publish the hardware config */
+ apic_update_vector(irqd, vector, cpu);
+ apic_update_irq_cfg(irqd, vector, cpu);
+
+ return 0;
+}
+
+/*
+ * Assign a vector to @irqd on an online CPU of @dest. Takes vector_lock
+ * and returns the result of assign_vector_locked().
+ */
+static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+	unsigned long irqflags;
+	int rc;
+
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	/* Only the online CPUs of @dest are candidates */
+	cpumask_and(vector_searchmask, dest, cpu_online_mask);
+	rc = assign_vector_locked(irqd, vector_searchmask);
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	return rc;
+}
+
+/*
+ * Assign a vector to @irqd, widening the candidate CPU set step by step
+ * until an allocation succeeds. Returns 0 on success or the result of the
+ * last attempt.
+ */
+static int assign_irq_vector_any_locked(struct irq_data *irqd)
+{
+	/* The affinity mask is either irq_default_affinity or (user) set */
+	const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+	int node = irq_data_get_node(irqd);
+
+	if (node != NUMA_NO_NODE) {
+		/* First choice: CPUs of the node which are in @affmsk */
+		cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+		if (!assign_vector_locked(irqd, vector_searchmask))
+			return 0;
+
+		/* Second choice: any CPU of the node */
+		if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+			return 0;
+	}
+
+	/* Next: the online part of the full affinity mask */
+	cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
+	if (!assign_vector_locked(irqd, vector_searchmask))
+		return 0;
+
+	/* Last resort: the full online mask */
+	return assign_vector_locked(irqd, cpu_online_mask);
+}
+
+/*
+ * Allocation time policy: managed interrupts get a managed reservation,
+ * interrupts with an explicit @info->mask get a vector right away, and
+ * everything else gets a global reservation.
+ */
+static int
+assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info)
+{
+	if (irqd_affinity_is_managed(irqd))
+		return reserve_managed_vector(irqd);
+
+	/*
+	 * Without a mask only a global reservation with no guarantee is
+	 * made. A real vector is associated at activation time.
+	 */
+	if (!info->mask)
+		return reserve_irq_vector(irqd);
+
+	return assign_irq_vector(irqd, info->mask);
+}
+
+/*
+ * Assign a managed vector to @irqd on the first CPU which is both in @dest
+ * and in the affinity mask of the interrupt. Uses vector_searchmask as
+ * scratch space, so vector_lock must be held.
+ *
+ * The candidate set is computed from @dest instead of relying on the
+ * caller having filled vector_searchmask beforehand. The callers visible
+ * here (activate_managed() and apic_set_affinity()) pass
+ * vector_searchmask itself, for which the result is unchanged.
+ *
+ * Returns 0 on success and when the current target CPU already is in the
+ * candidate set, -EINVAL when the candidate set is empty, or the negative
+ * value returned by irq_matrix_alloc_managed().
+ */
+static int
+assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+	const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+	struct apic_chip_data *apicd = apic_chip_data(irqd);
+	int vector, cpu;
+
+	cpumask_and(vector_searchmask, dest, affmsk);
+	cpu = cpumask_first(vector_searchmask);
+	if (cpu >= nr_cpu_ids)
+		return -EINVAL;
+	/* set_affinity might call here for nothing */
+	if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
+		return 0;
+	vector = irq_matrix_alloc_managed(vector_matrix, cpu);
+	/* Traced before the error check, so failures are visible as well */
+	trace_vector_alloc_managed(irqd->irq, vector, vector);
+	if (vector < 0)
+		return vector;
+	apic_update_vector(irqd, vector, cpu);
+	apic_update_irq_cfg(irqd, vector, cpu);
+	return 0;
+}
+
+/*
+ * Release the vector assigned to @irqd and, if one is still recorded, the
+ * previous vector of an unfinished move. Does nothing when no vector is
+ * assigned. The caller must hold vector_lock.
+ */
+static void clear_irq_vector(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+ unsigned int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ if (!vector)
+ return;
+
+ trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
+ apicd->prev_cpu);
+
+ /* Drop the current vector: per CPU table slot first, then the matrix */
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED;
+ irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
+ apicd->vector = 0;
+
+ /* Clean up move in progress */
+ vector = apicd->prev_vector;
+ if (!vector)
+ return;
+
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
+ irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+ /* Take it off a cleanup list it may be queued on */
+ hlist_del_init(&apicd->clist);
+}
+
+/*
+ * ->deactivate() of the vector domain: give the vector of a managed or
+ * reservation mode interrupt back and park the interrupt again.
+ */
+static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd)
+{
+	struct apic_chip_data *apicd = apic_chip_data(irqd);
+	unsigned long irqflags;
+
+	trace_vector_deactivate(irqd->irq, apicd->is_managed,
+				apicd->can_reserve, false);
+
+	/*
+	 * Nothing to do for regular fixed assigned interrupts and for
+	 * interrupts which have a global reservation.
+	 */
+	if (!(apicd->is_managed || apicd->can_reserve) || apicd->has_reserved)
+		return;
+
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	clear_irq_vector(irqd);
+	if (!apicd->can_reserve)
+		vector_assign_managed_shutdown(irqd);
+	else
+		reserve_irq_vector_locked(irqd);
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+}
+
+/*
+ * Turn the global reservation of @irqd into a real vector. Returns 0 on
+ * success or the error of assign_irq_vector_any_locked().
+ */
+static int activate_reserved(struct irq_data *irqd)
+{
+	struct apic_chip_data *apicd = apic_chip_data(irqd);
+	int rc = assign_irq_vector_any_locked(irqd);
+
+	if (rc)
+		return rc;
+
+	apicd->has_reserved = false;
+	/*
+	 * Core might have disabled reservation mode after allocating the
+	 * irq descriptor. Ideally this should happen before allocation
+	 * time, but that would require completely convoluted ways of
+	 * transporting that information.
+	 */
+	if (!irqd_can_reserve(irqd))
+		apicd->can_reserve = false;
+
+	return 0;
+}
+
+/*
+ * Assign a managed vector to @irqd on an online CPU of its affinity mask.
+ * Returns -EINVAL when no such CPU is online, otherwise the result of
+ * assign_managed_vector().
+ */
+static int activate_managed(struct irq_data *irqd)
+{
+	const struct cpumask *affinity = irq_data_get_affinity_mask(irqd);
+	int rc;
+
+	cpumask_and(vector_searchmask, affinity, cpu_online_mask);
+	if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) {
+		/* Something in the core code broke! Survive gracefully */
+		pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq);
+		return -EINVAL;
+	}
+
+	rc = assign_managed_vector(irqd, vector_searchmask);
+	/*
+	 * A failure here should not happen: the vector reservation got
+	 * buggered. Handle it gracefully.
+	 */
+	if (WARN_ON_ONCE(rc < 0))
+		pr_err("Managed startup irq %u, no vector available\n",
+		       irqd->irq);
+
+	return rc;
+}
+
+/*
+ * ->activate() of the vector domain. Managed and reservation mode
+ * interrupts get their real vector here, unless only a reservation is
+ * requested (@reserve) or the interrupt is managed and shut down.
+ */
+static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
+			       bool reserve)
+{
+	struct apic_chip_data *apicd = apic_chip_data(irqd);
+	unsigned long irqflags;
+	int rc = 0;
+
+	trace_vector_activate(irqd->irq, apicd->is_managed,
+			      apicd->can_reserve, reserve);
+
+	/* Nothing to do for fixed assigned vectors */
+	if (!(apicd->can_reserve || apicd->is_managed))
+		return 0;
+
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	if (reserve || irqd_is_managed_and_shutdown(irqd)) {
+		vector_assign_managed_shutdown(irqd);
+	} else if (apicd->is_managed) {
+		rc = activate_managed(irqd);
+	} else if (apicd->has_reserved) {
+		rc = activate_reserved(irqd);
+	}
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	return rc;
+}
+
+/*
+ * Return the global reservation and/or the managed reservation of @irqd
+ * to the matrix allocator.
+ */
+static void vector_free_reserved_and_managed(struct irq_data *irqd)
+{
+	struct apic_chip_data *chip = apic_chip_data(irqd);
+	const struct cpumask *affinity = irq_data_get_affinity_mask(irqd);
+
+	trace_vector_teardown(irqd->irq, chip->is_managed,
+			      chip->has_reserved);
+
+	if (chip->has_reserved)
+		irq_matrix_remove_reserved(vector_matrix);
+
+	if (chip->is_managed)
+		irq_matrix_remove_managed(vector_matrix, affinity);
+}
+
+/*
+ * ->free() of the vector domain: release vectors, reservations and chip
+ * data of @nr_irqs interrupts starting at @virq. Interrupts without chip
+ * data are skipped.
+ */
+static void x86_vector_free_irqs(struct irq_domain *domain,
+				 unsigned int virq, unsigned int nr_irqs)
+{
+	struct apic_chip_data *chip;
+	struct irq_data *irqd;
+	unsigned long irqflags;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+		if (!irqd || !irqd->chip_data)
+			continue;
+
+		raw_spin_lock_irqsave(&vector_lock, irqflags);
+		clear_irq_vector(irqd);
+		vector_free_reserved_and_managed(irqd);
+		/* Grab the pointer before the irq data is reset */
+		chip = irqd->chip_data;
+		irq_domain_reset_irq_data(irqd);
+		raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+		free_apic_chip_data(chip);
+	}
+}
+
+/*
+ * Set up the chip data of legacy interrupt @virq with its ISA vector on
+ * CPU 0. Returns true when the interrupt is not activated, in which case
+ * the vector was released and the caller has to assign a new one.
+ */
+static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
+				    struct apic_chip_data *apicd)
+{
+	unsigned long irqflags;
+	bool realloc;
+
+	apicd->vector = ISA_IRQ_VECTOR(virq);
+	apicd->cpu = 0;
+
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	realloc = !irqd_is_activated(irqd);
+	if (realloc) {
+		/* Release the vector */
+		apicd->can_reserve = true;
+		irqd_set_can_reserve(irqd);
+		clear_irq_vector(irqd);
+	} else {
+		/*
+		 * An activated interrupt must stay at this vector
+		 * position. That's usually the timer interrupt (0).
+		 */
+		trace_vector_setup(virq, true, 0);
+		apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+	}
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	return realloc;
+}
+
+/*
+ * ->alloc() of the vector domain: attach chip data to each of the
+ * @nr_irqs interrupts starting at @virq and assign or reserve a vector
+ * according to assign_irq_vector_policy(). @arg is a struct
+ * irq_alloc_info.
+ *
+ * Returns 0 on success, -ENXIO when the APIC is disabled, -ENOSYS for a
+ * request of more than one contiguous vector, -ENOMEM, or the error of the
+ * vector policy. On failure the interrupts which were set up so far are
+ * released again.
+ */
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ int i, err, node;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ /* Currently vector allocator can't guarantee contiguous allocations */
+ if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+ return -ENOSYS;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irqd);
+ node = irq_data_get_node(irqd);
+ WARN_ON_ONCE(irqd->chip_data);
+ apicd = alloc_apic_chip_data(node);
+ if (!apicd) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ /* The hardware irq number of this domain is the Linux irq number */
+ apicd->irq = virq + i;
+ irqd->chip = &lapic_controller;
+ irqd->chip_data = apicd;
+ irqd->hwirq = virq + i;
+ irqd_set_single_target(irqd);
+ /*
+ * Legacy vectors are already assigned when the IOAPIC
+ * takes them over. They stay on the same vector. This is
+ * required for check_timer() to work correctly as it might
+ * switch back to legacy mode. Only update the hardware
+ * config.
+ */
+ if (info->flags & X86_IRQ_ALLOC_LEGACY) {
+ if (!vector_configure_legacy(virq + i, irqd, apicd))
+ continue;
+ }
+
+ err = assign_irq_vector_policy(irqd, info);
+ trace_vector_setup(virq + i, false, err);
+ if (err) {
+ /* Undo interrupt i here; the error path only covers 0..i-1 */
+ irqd->chip_data = NULL;
+ free_apic_chip_data(apicd);
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ /* Release the i interrupts which were completely set up */
+ x86_vector_free_irqs(domain, virq, i);
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/*
+ * ->debug_show() of the vector domain. Without @irqd the state of the
+ * matrix allocator is printed, otherwise the vector state of @irqd.
+ * @ind is the indentation width of the output.
+ */
+static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
+				  struct irq_data *irqd, int ind)
+{
+	struct apic_chip_data snap;
+	unsigned long irqflags;
+	int irq;
+
+	if (!irqd) {
+		irq_matrix_debug_show(m, vector_matrix, ind);
+		return;
+	}
+
+	irq = irqd->irq;
+	/* Legacy interrupt whose bit is not set in io_apic_irqs */
+	if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) {
+		seq_printf(m, "%*sVector: %5d\n", ind, "",
+			   ISA_IRQ_VECTOR(irq));
+		seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, "");
+		return;
+	}
+
+	if (!irqd->chip_data) {
+		seq_printf(m, "%*sVector: Not assigned\n", ind, "");
+		return;
+	}
+
+	/* Print from a snapshot which was taken under the lock */
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	memcpy(&snap, irqd->chip_data, sizeof(snap));
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	seq_printf(m, "%*sVector: %5u\n", ind, "", snap.vector);
+	seq_printf(m, "%*sTarget: %5u\n", ind, "", snap.cpu);
+	if (snap.prev_vector) {
+		seq_printf(m, "%*sPrevious vector: %5u\n", ind, "",
+			   snap.prev_vector);
+		seq_printf(m, "%*sPrevious target: %5u\n", ind, "",
+			   snap.prev_cpu);
+	}
+	seq_printf(m, "%*smove_in_progress: %u\n", ind, "",
+		   snap.move_in_progress ? 1 : 0);
+	seq_printf(m, "%*sis_managed: %u\n", ind, "",
+		   snap.is_managed ? 1 : 0);
+	seq_printf(m, "%*scan_reserve: %u\n", ind, "",
+		   snap.can_reserve ? 1 : 0);
+	seq_printf(m, "%*shas_reserved: %u\n", ind, "",
+		   snap.has_reserved ? 1 : 0);
+	seq_printf(m, "%*scleanup_pending: %u\n", ind, "",
+		   !hlist_unhashed(&snap.clist));
+}
+#endif
+
+/*
+ * Callbacks of the vector irqdomain; the debugfs hook exists only with
+ * CONFIG_GENERIC_IRQ_DEBUGFS.
+ */
+static const struct irq_domain_ops x86_vector_domain_ops = {
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+ .activate = x86_vector_activate,
+ .deactivate = x86_vector_deactivate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = x86_vector_debug_show,
+#endif
+};
+
+/*
+ * Limit nr_irqs to what the vector space and the number of GSIs can make
+ * use of, then probe the legacy PIC. Returns the result of
+ * legacy_pic->probe().
+ */
+int __init arch_probe_nr_irqs(void)
+{
+	int nr;
+
+	/* Upper bound: NR_VECTORS for each possible CPU id */
+	if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+		nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+	nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
+#ifdef CONFIG_PCI_MSI
+	/*
+	 * for MSI and HT dyn irq
+	 */
+	if (gsi_top > NR_IRQS_LEGACY)
+		nr += gsi_top * 16;
+	else
+		nr += 8 * nr_cpu_ids;
+#endif
+	if (nr < nr_irqs)
+		nr_irqs = nr;
+
+	/*
+	 * It is not known at this point whether a PIC is present, so
+	 * probe() is needed to get the right number of legacy IRQs.
+	 */
+	return legacy_pic->probe();
+}
+
+/*
+ * Mark the ISA vector of legacy interrupt @irq as a system vector in the
+ * matrix. This way it is not accounted as allocated and moveable in the
+ * cpu hotplug check, and managed irq reservation does not touch it.
+ */
+void lapic_assign_legacy_vector(unsigned int irq, bool replace)
+{
+	unsigned int vector = ISA_IRQ_VECTOR(irq);
+
+	irq_matrix_assign_system(vector_matrix, vector, replace);
+}
+
+/*
+ * Seed the vector matrix: register all vectors set in system_vectors,
+ * treat the PIC cascade like a system vector, online the matrix and mark
+ * the remaining legacy vectors as assigned.
+ */
+void __init lapic_assign_system_vectors(void)
+{
+	unsigned int irq, bit = 0;
+
+	for_each_set_bit_from(bit, system_vectors, NR_VECTORS)
+		irq_matrix_assign_system(vector_matrix, bit, false);
+
+	if (nr_legacy_irqs() > 1)
+		lapic_assign_legacy_vector(PIC_CASCADE_IR, false);
+
+	/* System vectors are reserved, online it */
+	irq_matrix_online(vector_matrix);
+
+	/* Mark the preallocated legacy interrupts */
+	for (irq = 0; irq < nr_legacy_irqs(); irq++) {
+		if (irq == PIC_CASCADE_IR)
+			continue;
+		irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(irq));
+	}
+}
+
+/*
+ * Create the vector irqdomain and make it the default host, hand it to
+ * arch_init_msi_domain(), then allocate the search cpumask and the vector
+ * matrix. Every failure in here is fatal (BUG_ON). Returns the result of
+ * arch_early_ioapic_init().
+ */
+int __init arch_early_irq_init(void)
+{
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("VECTOR");
+ BUG_ON(!fn);
+ x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
+ NULL);
+ BUG_ON(x86_vector_domain == NULL);
+ irq_domain_free_fwnode(fn);
+ irq_set_default_host(x86_vector_domain);
+
+ arch_init_msi_domain(x86_vector_domain);
+
+ /* The allocation is done inside the BUG_ON() argument */
+ BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+
+ /*
+ * Allocate the vector matrix allocator data structure and limit the
+ * search area.
+ */
+ vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR,
+ FIRST_SYSTEM_VECTOR);
+ BUG_ON(!vector_matrix);
+
+ return arch_early_ioapic_init();
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Initial vector_irq entry for @vector: the descriptor of the matching
+ * legacy interrupt if it is not handled by the IOAPIC, VECTOR_UNUSED in
+ * all other cases.
+ */
+static struct irq_desc *__setup_vector_irq(int vector)
+{
+	int isairq = vector - ISA_IRQ_VECTOR(0);
+	bool legacy = isairq >= 0 && isairq < nr_legacy_irqs();
+
+	/* Outside of the legacy space, or handled by the IOAPIC */
+	if (!legacy || test_bit(isairq, &io_apic_irqs))
+		return VECTOR_UNUSED;
+
+	return irq_to_desc(isairq);
+}
+
+/*
+ * Online the local APIC infrastructure of this CPU and initialize its
+ * vector table. vector_lock must be held.
+ */
+void lapic_online(void)
+{
+	unsigned int vec;
+
+	lockdep_assert_held(&vector_lock);
+
+	/* Online the vector matrix array for this CPU */
+	irq_matrix_online(vector_matrix);
+
+	/*
+	 * The interrupt affinity logic never targets interrupts to offline
+	 * CPUs, with the legacy PIC interrupts as the exception. In general
+	 * those are only targeted to CPU0, but depending on the platform
+	 * the hardware can distribute them to any online CPU, and the
+	 * kernel has no influence on that. So all active legacy vectors
+	 * must be installed on all CPUs, while all non legacy interrupts
+	 * can be cleared.
+	 */
+	for (vec = 0; vec < NR_VECTORS; vec++) {
+		struct irq_desc *desc = __setup_vector_irq(vec);
+
+		this_cpu_write(vector_irq[vec], desc);
+	}
+}
+
+/* Take this CPU out of the vector matrix, with vector_lock held. */
+void lapic_offline(void)
+{
+	lock_vector_lock();
+
+	irq_matrix_offline(vector_matrix);
+
+	unlock_vector_lock();
+}
+
+/*
+ * ->irq_set_affinity() of the local APIC chip: move @irqd to an online CPU
+ * in @dest. Returns IRQ_SET_MASK_OK or the error of the vector assignment.
+ */
+static int apic_set_affinity(struct irq_data *irqd,
+			     const struct cpumask *dest, bool force)
+{
+	struct apic_chip_data *apicd = apic_chip_data(irqd);
+	bool deferred = apicd->is_managed || apicd->can_reserve;
+	int err;
+
+	/*
+	 * Core code can call here for inactive interrupts. For inactive
+	 * interrupts in managed or reservation mode a vector assignment is
+	 * pointless right now, because the activation assigns a vector
+	 * which fits the destination cpumask. Let the core code store the
+	 * destination mask and be done with it.
+	 */
+	if (deferred && !irqd_is_activated(irqd))
+		return IRQ_SET_MASK_OK;
+
+	raw_spin_lock(&vector_lock);
+	cpumask_and(vector_searchmask, dest, cpu_online_mask);
+	if (!irqd_affinity_is_managed(irqd))
+		err = assign_vector_locked(irqd, vector_searchmask);
+	else
+		err = assign_managed_vector(irqd, vector_searchmask);
+	raw_spin_unlock(&vector_lock);
+
+	if (err)
+		return err;
+	return IRQ_SET_MASK_OK;
+}
+
+#else
+# define apic_set_affinity NULL
+#endif
+
+/*
+ * ->irq_retrigger() of the local APIC chip: resend the interrupt as an IPI
+ * to its current target. Always returns 1.
+ */
+static int apic_retrigger_irq(struct irq_data *irqd)
+{
+	struct apic_chip_data *chip = apic_chip_data(irqd);
+	unsigned long irqflags;
+
+	/* Target CPU and vector are read with vector_lock held */
+	raw_spin_lock_irqsave(&vector_lock, irqflags);
+	apic->send_IPI(chip->cpu, chip->vector);
+	raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+	return 1;
+}
+
+/* Run irq_move_irq() for @irqd, then acknowledge at the local APIC. */
+void
+apic_ack_irq(struct irq_data *irqd)
+{
+	irq_move_irq(irqd);
+
+	ack_APIC_irq();
+}
+
+/* Complete a pending vector move of @irqd, then do the regular ack. */
+void apic_ack_edge(struct irq_data *irqd)
+{
+	struct irq_cfg *cfg = irqd_cfg(irqd);
+
+	irq_complete_move(cfg);
+	apic_ack_irq(irqd);
+}
+
+/*
+ * irq_chip of the vector domain; x86_vector_alloc_irqs() installs it for
+ * every interrupt. Without CONFIG_SMP apic_set_affinity is defined as
+ * NULL.
+ */
+static struct irq_chip lapic_controller = {
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * Release the previous vector of @apicd after a move and take @apicd off
+ * the cleanup list.
+ */
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+	unsigned int old_vector = apicd->prev_vector;
+	unsigned int old_cpu = apicd->prev_cpu;
+	bool managed = apicd->is_managed;
+
+	/*
+	 * Managed interrupts are not migrated except on CPU down, which
+	 * does not involve the cleanup vector, so this should never
+	 * happen. Keep the accounting correct nevertheless.
+	 */
+	WARN_ON_ONCE(managed);
+
+	trace_vector_free_moved(apicd->irq, old_cpu, old_vector, managed);
+	irq_matrix_free(vector_matrix, old_cpu, old_vector, managed);
+	per_cpu(vector_irq, old_cpu)[old_vector] = VECTOR_UNUSED;
+	hlist_del_init(&apicd->clist);
+	apicd->prev_vector = 0;
+	apicd->move_in_progress = 0;
+}
+
+/*
+ * Interrupt entry point which processes the cleanup_list of this CPU.
+ * Entries are queued by __send_cleanup_vector(), which then sends
+ * IRQ_MOVE_CLEANUP_VECTOR to this CPU. Every previous vector which is not
+ * pending in the local APIC IRR is freed; pending ones stay queued and
+ * the cleanup is retried.
+ */
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
+{
+ struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
+ struct apic_chip_data *apicd;
+ struct hlist_node *tmp;
+
+ entering_ack_irq();
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock(&vector_lock);
+
+ /* _safe variant: free_moved_vector() unlinks the current entry */
+ hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
+ unsigned int irr, vector = apicd->prev_vector;
+
+ /*
+ * Paranoia: Check if the vector that needs to be cleaned
+ * up is registered at the APICs IRR. If so, then this is
+ * not the best time to clean it up. Clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
+ * priority external vector, so on return from this
+ * interrupt the device interrupt will happen first.
+ */
+ /* Each IRR register covers 32 vectors; registers are 0x10 apart */
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (irr & (1U << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ continue;
+ }
+ free_moved_vector(apicd);
+ }
+
+ raw_spin_unlock(&vector_lock);
+ exiting_irq();
+}
+
+/*
+ * Kick off the cleanup of @apicd's previous vector. Under vector_lock the
+ * move_in_progress flag is cleared; if the previous CPU is still online,
+ * @apicd is queued on that CPU's cleanup_list and the CPU is sent
+ * IRQ_MOVE_CLEANUP_VECTOR. Otherwise the previous vector is simply
+ * forgotten.
+ */
+static void __send_cleanup_vector(struct apic_chip_data *apicd)
+{
+	unsigned int prev;
+
+	raw_spin_lock(&vector_lock);
+	apicd->move_in_progress = 0;
+	prev = apicd->prev_cpu;
+	if (!cpu_online(prev)) {
+		apicd->prev_vector = 0;
+	} else {
+		hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, prev));
+		apic->send_IPI(prev, IRQ_MOVE_CLEANUP_VECTOR);
+	}
+	raw_spin_unlock(&vector_lock);
+}
+
+/*
+ * Start the cleanup of the previous vector for the interrupt described by
+ * @cfg, but only if a move is currently flagged as in progress.
+ */
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	struct apic_chip_data *chip =
+		container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+
+	if (!chip->move_in_progress)
+		return;
+
+	__send_cleanup_vector(chip);
+}
+
+/*
+ * Complete a pending vector move for @cfg. The cleanup is only started
+ * when @vector is the newly assigned vector and the current CPU is the
+ * newly assigned CPU.
+ */
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+{
+	struct apic_chip_data *chip =
+		container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+
+	if (likely(!chip->move_in_progress))
+		return;
+
+	if (vector != chip->vector || chip->cpu != smp_processor_id())
+		return;
+
+	__send_cleanup_vector(chip);
+}
+
+/*
+ * Complete a pending move for @cfg, taking the vector from the inverted
+ * orig_ax field of the current interrupt register frame.
+ */
+void irq_complete_move(struct irq_cfg *cfg)
+{
+	unsigned int vec = ~get_irq_regs()->orig_ax;
+
+	__irq_complete_move(cfg, vec);
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ *
+ * Forcibly releases the previous vector of the interrupt, if it has one,
+ * without waiting for the cleanup IPI. Descriptors which have no irq_data
+ * in x86_vector_domain, or no apic_chip_data, are left untouched.
+ */
+void irq_force_complete_move(struct irq_desc *desc)
+{
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ unsigned int vector;
+
+ /*
+ * The function is called for all descriptors regardless of which
+ * irqdomain they belong to. For example if an IRQ is provided by
+ * an irq_chip as part of a GPIO driver, the chip data for that
+ * descriptor is specific to the irq_chip in question.
+ *
+ * Check first that the chip_data is what we expect
+ * (apic_chip_data) before touching it any further.
+ */
+ irqd = irq_domain_get_irq_data(x86_vector_domain,
+ irq_desc_get_irq(desc));
+ if (!irqd)
+ return;
+
+ raw_spin_lock(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ goto unlock;
+
+ /*
+ * If prev_vector is empty, no action required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector)
+ goto unlock;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendezvous in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theoretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+unlock:
+ raw_spin_unlock(&vector_lock);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Check, under vector_lock, whether enough vectors are available in the
+ * matrix to take over the ones currently allocated. Returns -ENOSPC when
+ * that is not the case and 0 otherwise. A shortfall against the reserved
+ * vectors only produces a warning.
+ *
+ * Note, this is not accurate accounting, but at least good enough to
+ * prevent that the actual interrupt move will run out of vectors.
+ */
+int lapic_can_unplug_cpu(void)
+{
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int in_use, avail, reserved;
+	int ret = 0;
+
+	raw_spin_lock(&vector_lock);
+	in_use = irq_matrix_allocated(vector_matrix);
+	avail = irq_matrix_available(vector_matrix, true);
+	if (avail < in_use) {
+		pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n",
+			this_cpu, in_use, avail);
+		ret = -ENOSPC;
+	} else {
+		reserved = irq_matrix_reserved(vector_matrix);
+		if (avail < reserved)
+			pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n",
+				reserved, avail);
+	}
+	raw_spin_unlock(&vector_lock);
+
+	return ret;
+}
+#endif /* HOTPLUG_CPU */
+#endif /* SMP */
+
+/*
+ * Print eight consecutive 32-bit APIC registers, starting at @base and
+ * spaced 0x10 apart, as one line of hex digits at KERN_DEBUG level.
+ */
+static void __init print_APIC_field(int base)
+{
+	int off;
+
+	printk(KERN_DEBUG);
+
+	for (off = 0; off < 8 * 0x10; off += 0x10)
+		pr_cont("%08x", apic_read(base + off));
+
+	pr_cont("\n");
+}
+/*
+ * Dump the local APIC registers of the CPU this function runs on to the
+ * kernel log. @dummy is unused; print_local_APICs() passes this function
+ * to smp_call_function_single(). Note that APIC_ESR is written with 0
+ * before it is read when maxlvt > 3.
+ */
+static void __init print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ pr_debug("printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ pr_info("... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ pr_debug("... APIC ARBPRI: %08x (%02x)\n",
+ v, v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ pr_debug("... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ pr_debug("... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ pr_debug("... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ pr_debug("... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ pr_debug("... APIC SPIV: %08x\n", v);
+
+ pr_debug("... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ pr_debug("... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ pr_debug("... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ pr_debug("... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ pr_debug("... APIC ICR: %08x\n", (u32)icr);
+ pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ pr_debug("... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) {
+ /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ pr_debug("... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ pr_debug("... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ pr_debug("... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) {
+ /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ pr_debug("... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ pr_debug("... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ pr_debug("... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ pr_debug("... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ /* From here on maxlvt holds bits 23:16 of EFEAT, used as the EILVT count */
+ maxlvt = (v >> 16) & 0xff;
+ pr_debug("... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ pr_debug("... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ pr_debug("... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ pr_cont("\n");
+}
+
+/*
+ * Run print_local_APIC() on every online CPU whose number is below
+ * @maxcpu, waiting for each call to finish. Does nothing for @maxcpu == 0.
+ */
+static void __init print_local_APICs(int maxcpu)
+{
+	int cpu;
+
+	if (maxcpu == 0)
+		return;
+
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		if (cpu < maxcpu)
+			smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+		else
+			break;
+	}
+	preempt_enable();
+}
+/*
+ * Dump the legacy PIC state (IMR, IRR, ISR and ELCR as labelled in the
+ * output) to the kernel log. Does nothing when nr_legacy_irqs() is 0.
+ * Ports 0x20/0x21 and 0xa0/0xa1 are accessed with i8259A_lock held; the
+ * 0x4d0/0x4d1 ports are read after the lock has been dropped.
+ */
+static void __init print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!nr_legacy_irqs())
+ return;
+
+ pr_debug("\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ pr_debug("... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ pr_debug("... PIC IRR: %04x\n", v);
+
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ pr_debug("... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ pr_debug("... PIC ELCR: %04x\n", v);
+}
+
+/* Upper CPU bound handed to print_local_APICs() by print_ICs() */
+static int show_lapic __initdata = 1;
+
+/*
+ * Parse the "show_lapic=" boot parameter: "all" selects CONFIG_NR_CPUS,
+ * otherwise a non-negative number replaces the default. Anything else
+ * leaves show_lapic untouched. Always returns 1.
+ */
+static __init int setup_show_lapic(char *arg)
+{
+	int count = -1;
+
+	if (!strcmp(arg, "all")) {
+		show_lapic = CONFIG_NR_CPUS;
+		return 1;
+	}
+
+	get_option(&arg, &count);
+	if (count >= 0)
+		show_lapic = count;
+
+	return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+/*
+ * Late initcall which dumps the interrupt controller state unless the
+ * APIC verbosity is APIC_QUIET. The PIC is always dumped; local APICs and
+ * IO-APICs only when an APIC is present. Always returns 0.
+ */
+static int __init print_ICs(void)
+{
+	if (apic_verbosity != APIC_QUIET) {
+		print_PIC();
+
+		/* don't print out if apic is not there */
+		if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config()) {
+			print_local_APICs(show_lapic);
+			print_IO_APICs();
+		}
+	}
+
+	return 0;
+}
+
+late_initcall(print_ICs);
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
new file mode 100644
index 0000000..a49b360
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -0,0 +1,9 @@
+/* Common bits for X2APIC cluster/physical modes. */
+
+int x2apic_apic_id_valid(u32 apicid);
+int x2apic_apic_id_registered(void);
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+unsigned int x2apic_get_apic_id(unsigned long id);
+u32 x2apic_set_apic_id(unsigned int id);
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+void x2apic_send_IPI_self(int vector);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
new file mode 100644
index 0000000..7685444
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/dmar.h>
+#include <linux/irq.h>
+#include <linux/cpu.h>
+
+#include <asm/smp.h>
+#include "x2apic.h"
+
+struct cluster_mask {
+ unsigned int clusterid; /* LDR value >> 16, set in init_x2apic_ldr() */
+ int node; /* node the mask was allocated for, see alloc_clustermask() */
+ struct cpumask mask; /* CPUs which attached themselves to this cluster */
+};
+/*
+ * x86_cpu_to_logical_apicid: APIC_LDR value cached by init_x2apic_ldr().
+ * ipi_mask: scratch cpumask used by __x2apic_send_IPI_mask().
+ * cluster_masks: per-CPU pointer to the cluster_mask the CPU belongs to.
+ * cluster_hotplug_mask: spare mask allocated by alloc_clustermask() and
+ * consumed by init_x2apic_ldr() when a new cluster shows up.
+ */
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
+static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static struct cluster_mask *cluster_hotplug_mask;
+/* MADT OEM check: the OEM strings are ignored, only x2apic_enabled() decides. */
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled();
+}
+/* Send @vector to @cpu, addressed by its cached logical APIC ID. */
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+}
+/*
+ * Send @vector to the CPUs in @mask, using one logical-destination IPI per
+ * cluster. The current CPU is removed from the set unless @apic_dest is
+ * APIC_DEST_ALLINC. Interrupts are disabled while the per-CPU ipi_mask
+ * scratch cpumask is in use.
+ */
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
+{
+ unsigned int cpu, clustercpu;
+ struct cpumask *tmpmsk;
+ unsigned long flags;
+ u32 dest;
+
+ x2apic_wrmsr_fence();
+ local_irq_save(flags);
+
+ tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
+ cpumask_copy(tmpmsk, mask);
+ /* If IPI should not be sent to self, clear current CPU */
+ if (apic_dest != APIC_DEST_ALLINC)
+ cpumask_clear_cpu(smp_processor_id(), tmpmsk);
+
+ /* Collapse cpus in a cluster so a single IPI per cluster is sent */
+ for_each_cpu(cpu, tmpmsk) {
+ struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
+
+ dest = 0;
+ for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
+ dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+
+ if (!dest)
+ continue;
+
+ __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ /* Remove cluster CPUs from tmpmask */
+ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
+ }
+
+ local_irq_restore(flags);
+}
+/* Send @vector to every CPU in @mask, the current CPU included if set. */
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+/* Send @vector to every CPU in @mask except the current CPU. */
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
+}
+/* Send @vector to every online CPU except the current CPU. */
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+}
+/* Send @vector to every online CPU, the current CPU included. */
+static void x2apic_send_IPI_all(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+}
+/* Destination APIC ID for @cpu: its cached logical APIC ID. */
+static u32 x2apic_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+}
+/*
+ * Cache this CPU's APIC_LDR value as its logical APIC ID and attach the
+ * CPU to a cluster mask. A CPU without a mask reuses the mask of an online
+ * CPU with the same cluster id (LDR >> 16); if there is none, it takes
+ * over the spare cluster_hotplug_mask, which is dereferenced without a
+ * NULL check.
+ */
+static void init_x2apic_ldr(void)
+{
+ struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
+ u32 cluster, apicid = apic_read(APIC_LDR);
+ unsigned int cpu;
+
+ this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+
+ if (cmsk)
+ goto update;
+
+ cluster = apicid >> 16;
+ for_each_online_cpu(cpu) {
+ cmsk = per_cpu(cluster_masks, cpu);
+ /* Matching cluster found. Link and update it. */
+ if (cmsk && cmsk->clusterid == cluster)
+ goto update;
+ }
+ cmsk = cluster_hotplug_mask;
+ cmsk->clusterid = cluster;
+ cluster_hotplug_mask = NULL;
+update:
+ this_cpu_write(cluster_masks, cmsk);
+ cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
+}
+
+/*
+ * Make sure a cluster mask is available for @cpu: either the CPU already
+ * has one, or a spare cluster_hotplug_mask allocated for @node exists
+ * afterwards. Returns 0 on success and -ENOMEM if the allocation fails.
+ */
+static int alloc_clustermask(unsigned int cpu, int node)
+{
+	struct cluster_mask *spare;
+
+	if (per_cpu(cluster_masks, cpu))
+		return 0;
+
+	/*
+	 * An existing hotplug spare is kept when it sits on the right
+	 * node. Otherwise it is freed and replaced below.
+	 */
+	if (cluster_hotplug_mask) {
+		if (cluster_hotplug_mask->node == node)
+			return 0;
+		kfree(cluster_hotplug_mask);
+	}
+
+	spare = kzalloc_node(sizeof(*cluster_hotplug_mask), GFP_KERNEL, node);
+	cluster_hotplug_mask = spare;
+	if (!spare)
+		return -ENOMEM;
+
+	spare->node = node;
+	return 0;
+}
+
+/*
+ * CPU hotplug callback registered for CPUHP_X2APIC_PREPARE: make a cluster
+ * mask available for @cpu and allocate its zeroed ipi_mask. Returns 0 on
+ * success and -ENOMEM if either allocation fails.
+ */
+static int x2apic_prepare_cpu(unsigned int cpu)
+{
+	if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0 ||
+	    !zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback paired with x2apic_prepare_cpu(): remove
+ * @dead_cpu from its cluster mask and free its ipi_mask. Always returns 0.
+ */
+static int x2apic_dead_cpu(unsigned int dead_cpu)
+{
+	struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
+
+	/*
+	 * The per-CPU cluster_masks pointer is only set by init_x2apic_ldr(),
+	 * which runs on the CPU itself. x2apic_prepare_cpu() does not set
+	 * it, so a CPU which failed to come up after the prepare step still
+	 * has a NULL pointer here and must not be dereferenced.
+	 */
+	if (cmsk)
+		cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+	free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
+	return 0;
+}
+
+/*
+ * Probe callback of the cluster x2apic driver. Returns 1 when x2apic mode
+ * is active and the hotplug callbacks could be registered, in which case
+ * the LDR state of the current CPU is initialized. Returns 0 otherwise.
+ */
+static int x2apic_cluster_probe(void)
+{
+	int err;
+
+	if (!x2apic_mode)
+		return 0;
+
+	err = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
+				x2apic_prepare_cpu, x2apic_dead_cpu);
+	if (err < 0) {
+		pr_err("Failed to register X2APIC_PREPARE\n");
+		return 0;
+	}
+
+	init_x2apic_ldr();
+	return 1;
+}
+/*
+ * APIC driver "cluster x2apic": logical destination mode, IPIs sent via
+ * the cluster-aware helpers above, register access through the MSR based
+ * native_apic_msr_* / native_x2apic_* accessors. Registered with
+ * apic_driver() below.
+ */
+static struct apic apic_x2apic_cluster __ro_after_init = {
+
+ .name = "cluster x2apic",
+ .probe = x2apic_cluster_probe,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 1, /* logical */
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
+
+ .calc_dest_apicid = x2apic_calc_apicid,
+
+ .send_IPI = x2apic_send_IPI,
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
new file mode 100644
index 0000000..b5cf9e7
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include "x2apic.h"
+
+int x2apic_phys;
+
+static struct apic apic_x2apic_phys;
+/* "x2apic_phys" early parameter: set x2apic_phys; @arg is ignored. */
+static int __init set_x2apic_phys_mode(char *arg)
+{
+ x2apic_phys = 1;
+ return 0;
+}
+early_param("x2apic_phys", set_x2apic_phys_mode);
+
+/*
+ * Returns true when the ACPI FADT (revision >= FADT2_REVISION_ID) has the
+ * ACPI_FADT_APIC_PHYSICAL flag set, logging a debug message in that case.
+ * Always false without CONFIG_ACPI.
+ */
+static bool x2apic_fadt_phys(void)
+{
+#ifdef CONFIG_ACPI
+	if (acpi_gbl_FADT.header.revision < FADT2_REVISION_ID)
+		return false;
+	if (!(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
+		return false;
+
+	printk(KERN_DEBUG "System requires x2apic physical mode\n");
+	return true;
+#else
+	return false;
+#endif
+}
+/* MADT OEM check: x2apic enabled and physical mode requested by parameter or FADT. */
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
+}
+/* Send @vector to @cpu, addressed by its x86_cpu_to_apicid entry in physical mode. */
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ x2apic_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
+}
+
+/*
+ * Send @vector to each CPU in @mask with an individual physical-mode IPI,
+ * with interrupts disabled for the duration. The current CPU is skipped
+ * when @apic_dest is APIC_DEST_ALLBUT.
+ */
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
+{
+	unsigned long target, self, irqflags;
+
+	x2apic_wrmsr_fence();
+
+	local_irq_save(irqflags);
+
+	self = smp_processor_id();
+	for_each_cpu(target, mask) {
+		if (target != self || apic_dest != APIC_DEST_ALLBUT)
+			__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, target),
+					       vector, APIC_DEST_PHYSICAL);
+	}
+	local_irq_restore(irqflags);
+}
+/* Send @vector to every CPU in @mask, the current CPU included if set. */
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+/* Send @vector to every CPU in @mask except the current CPU. */
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
+}
+/* Send @vector to every online CPU except the current CPU. */
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+}
+/* Send @vector to every online CPU, the current CPU included. */
+static void x2apic_send_IPI_all(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+}
+/* Intentionally empty: the physical x2apic driver does no LDR setup. */
+static void init_x2apic_ldr(void)
+{
+}
+
+/*
+ * Probe callback of the physical x2apic driver. Returns 1 when x2apic mode
+ * is active and physical mode was requested by parameter or FADT;
+ * otherwise reports whether this driver is already the selected one.
+ */
+static int x2apic_phys_probe(void)
+{
+	bool requested = x2apic_mode && (x2apic_phys || x2apic_fadt_phys());
+
+	if (requested)
+		return 1;
+
+	return apic == &apic_x2apic_phys;
+}
+
+/*
+ * Common x2apic functions, also used by x2apic_cluster.
+ *
+ * x2apic_apic_id_valid() accepts every @apicid and always returns 1.
+ */
+int x2apic_apic_id_valid(u32 apicid)
+{
+ return 1;
+}
+/* Always reports the APIC ID as registered (returns 1). */
+int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+/*
+ * Build an ICR value for @vector and destination mode @dest with
+ * __prepare_ICR() and write it, together with @apicid, to the x2apic ICR.
+ */
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
+}
+/* Identity mapping: @id is returned as the APIC ID (converted to unsigned int). */
+unsigned int x2apic_get_apic_id(unsigned long id)
+{
+ return id;
+}
+/* Identity mapping: @id is returned unchanged. */
+u32 x2apic_set_apic_id(unsigned int id)
+{
+ return id;
+}
+/* Package ID: @initial_apicid shifted right by @index_msb bits. */
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return initial_apicid >> index_msb;
+}
+/* Send @vector to the current CPU by writing it to APIC_SELF_IPI. */
+void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+/*
+ * APIC driver "physical x2apic": physical destination mode, one IPI per
+ * target CPU, register access through the MSR based native_apic_msr_* /
+ * native_x2apic_* accessors. Registered with apic_driver() below.
+ */
+static struct apic apic_x2apic_phys __ro_after_init = {
+
+ .name = "physical x2apic",
+ .probe = x2apic_phys_probe,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = x2apic_send_IPI,
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
new file mode 100644
index 0000000..391f358
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -0,0 +1,1609 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV APIC functions (note: not an Intel compatible APIC)
+ *
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
+ */
+#include <linux/cpumask.h>
+#include <linux/hardirq.h>
+#include <linux/proc_fs.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/kdebug.h>
+#include <linux/delay.h>
+#include <linux/crash_dump.h>
+#include <linux/reboot.h>
+#include <linux/memory.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/current.h>
+#include <asm/pgtable.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/e820/api.h>
+#include <asm/ipi.h>
+#include <asm/smp.h>
+#include <asm/x86_init.h>
+#include <asm/nmi.h>
+
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+static enum uv_system_type uv_system_type;
+static bool uv_hubless_system;
+static u64 gru_start_paddr, gru_end_paddr;
+static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
+static u64 gru_dist_lmask, gru_dist_umask;
+static union uvh_apicid uvh_apicid;
+
+/* Information derived from CPUID: */
+static struct {
+ unsigned int apicid_shift;
+ unsigned int apicid_mask;
+ unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */
+ unsigned int pnode_mask;
+ unsigned int gpa_shift;
+ unsigned int gnode_shift;
+} uv_cpuid;
+
+int uv_min_hub_revision_id;
+EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+
+unsigned int uv_apicid_hibits;
+EXPORT_SYMBOL_GPL(uv_apicid_hibits);
+
+static struct apic apic_x2apic_uv_x;
+static struct uv_hub_info_s uv_hub_info_node0;
+
+/*
+ * Set this to use hardware error handler instead of kernel panic.
+ * It is initialized to 1, so uv_undefined() logs instead of panicking.
+ */
+static int disable_uv_undefined_panic = 1;
+/*
+ * Report an access to an undefined MMR named @str: panic when
+ * disable_uv_undefined_panic is 0, otherwise log at critical level.
+ * Returns ~0ul in the non-panic case.
+ */
+unsigned long uv_undefined(char *str)
+{
+ if (likely(!disable_uv_undefined_panic))
+ panic("UV: error: undefined MMR: %s\n", str);
+ else
+ pr_crit("UV: error: undefined MMR: %s\n", str);
+
+ /* Cause a machine fault: */
+ return ~0ul;
+}
+EXPORT_SYMBOL(uv_undefined);
+/*
+ * Read one unsigned long from the local MMR at offset @addr through a
+ * temporary early_ioremap() mapping of UV_LOCAL_MMR_BASE | @addr.
+ * The result of early_ioremap() is not checked before it is dereferenced.
+ */
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+ unsigned long val, *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+
+ return val;
+}
+
+/*
+ * Returns true when [@start, @end] lies within GRU space. Without a
+ * gru_dist_base this is a plain bounds check against gru_start_paddr and
+ * gru_end_paddr. With one, both addresses must carry the base offset and
+ * share the same upper bits, which in turn must lie between the first and
+ * last node address.
+ */
+static inline bool is_GRU_range(u64 start, u64 end)
+{
+	u64 start_hi, start_lo, end_hi, end_lo;
+
+	if (!gru_dist_base)
+		return start >= gru_start_paddr && end <= gru_end_paddr;
+
+	start_hi = start & gru_dist_umask;	/* Upper (incl pnode) bits */
+	start_lo = start & gru_dist_lmask;	/* Base offset bits */
+	end_hi = end & gru_dist_umask;
+	end_lo = end & gru_dist_lmask;
+
+	/* Must reside completely within a single GRU range: */
+	if (start_lo != gru_dist_base || end_lo != gru_dist_base)
+		return false;
+	if (start_hi < gru_first_node_paddr || start_hi > gru_last_node_paddr)
+		return false;
+
+	return end_hi == start_hi;
+}
+/* True when [@start, @end] is an ISA range or a GRU range. */
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
+{
+ return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
+/*
+ * Read UVH_NODE_ID and UVH_RH_GAM_CONFIG_MMR early during boot, derive
+ * uv_min_hub_revision_id from the hub part number and revision, record it
+ * in uv_hub_info, set up uv_cpuid.pnode_mask/gpa_shift (and gnode_shift on
+ * UV4) and return the pnode of this node.
+ */
+static int __init early_get_pnodeid(void)
+{
+ union uvh_node_id_u node_id;
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ int pnode;
+
+ /* Currently, all blades have same revision number */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
+ uv_min_hub_revision_id = node_id.s.revision;
+
+ switch (node_id.s.part_number) {
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+ break;
+ case UV3_HUB_PART_NUMBER:
+ case UV3_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
+ break;
+
+ /* Update: UV4A has only a modified revision to indicate HUB fixes */
+ case UV4_HUB_PART_NUMBER:
+ uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
+ uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
+ break;
+ }
+
+ uv_hub_info->hub_revision = uv_min_hub_revision_id;
+ uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1;
+ pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask;
+ uv_cpuid.gpa_shift = 46; /* Default unless changed */
+
+ pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n",
+ node_id.s.revision, node_id.s.part_number, node_id.s.node_id,
+ m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode);
+ return pnode;
+}
+
+/*
+ * Evaluate the TSC sync state found in UVH_TSC_SYNC_MMR, log it and mark
+ * the TSC accordingly. Any state other than "invalid" is treated as valid.
+ */
+static void __init uv_tsc_check_sync(void)
+{
+	char *desc = "unknown: assuming valid";
+	bool tsc_ok = true;
+	int sync_state = 0;
+	int shift;
+	u64 mmr;
+
+	/* Accommodate different UV arch BIOSes */
+	mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
+	if (is_uv1_hub())
+		shift = 0;
+	else if (is_uv2_hub())
+		shift = UVH_TSC_SYNC_SHIFT_UV2K;
+	else
+		shift = UVH_TSC_SYNC_SHIFT;
+
+	/* With a zero shift the state stays at 0 */
+	if (shift)
+		sync_state = (mmr >> shift) & UVH_TSC_SYNC_MASK;
+
+	switch (sync_state) {
+	case UVH_TSC_SYNC_VALID:
+		desc = "in sync";
+		break;
+	case UVH_TSC_SYNC_INVALID:
+		desc = "unstable";
+		tsc_ok = false;
+		break;
+	default:
+		break;
+	}
+	pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, desc);
+
+	/* Mark flag that says TSC != 0 is valid for socket 0 */
+	if (tsc_ok)
+		mark_tsc_async_resets("UV BIOS");
+	else
+		mark_tsc_unstable("UV BIOS");
+}
+
+/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
+
+#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
+#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */
+#define SMT_TYPE 1
+#define CORE_TYPE 2
+#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
+
+/*
+ * Derive the APIC ID layout from CPUID leaf 0xb and store it in uv_cpuid:
+ * apicid_shift is set to 0, socketid_shift to the shift of the core level
+ * (or of the SMT level if no core level is enumerated) and apicid_mask to
+ * the bits below that shift. uv_cpuid is left untouched when leaf 0xb is
+ * not available or not implemented.
+ */
+static void set_x2apic_bits(void)
+{
+	unsigned int eax, ebx, ecx, edx, sub_index;
+	unsigned int sid_shift;
+
+	cpuid(0, &eax, &ebx, &ecx, &edx);
+	if (eax < 0xb) {
+		pr_info("UV: CPU does not have CPUID.11\n");
+		return;
+	}
+
+	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) {
+		pr_info("UV: CPUID.11 not implemented\n");
+		return;
+	}
+
+	sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
+	sub_index = 1;
+	do {
+		cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+		if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+			sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
+			break;
+		}
+		sub_index++;
+	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+	uv_cpuid.apicid_shift	= 0;
+	/*
+	 * sid_shift comes from a 5-bit field and is therefore at most 31.
+	 * Build the mask from an unsigned all-ones value: left-shifting
+	 * the signed constant -1 is undefined behaviour in C.
+	 */
+	uv_cpuid.apicid_mask	= ~(~0U << sid_shift);
+	uv_cpuid.socketid_shift	= sid_shift;
+}
+/*
+ * On UV2/UV3 hubs read UVH_APICID into uvh_apicid, then derive the APIC ID
+ * layout via set_x2apic_bits() and log the resulting uv_cpuid fields.
+ */
+static void __init early_get_apic_socketid_shift(void)
+{
+ if (is_uv2_hub() || is_uv3_hub())
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
+
+ set_x2apic_bits();
+
+ pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+ pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+}
+
+/*
+ * Add an extra bit as dictated by bios to the destination apicid of
+ * interrupts potentially passing through the UV HUB. This prevents
+ * a deadlock between interrupts and IO port operations.
+ *
+ * Only done on UV1 hubs; uv_apicid_hibits is left alone otherwise.
+ */
+static void __init uv_set_apicid_hibit(void)
+{
+	union uv1h_lb_target_physical_apic_id_mask_u id_mask;
+
+	if (!is_uv1_hub())
+		return;
+
+	id_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+	uv_apicid_hibits = id_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+}
+/*
+ * MADT OEM check for SGI UV. Returns 0 for non-"SGI" OEM IDs (flagging
+ * "NSGI" as hubless) and when NUMA is off. Otherwise sets up early hub
+ * info, installs the UV platform hooks, selects uv_system_type from
+ * @oem_table_id and returns 1 only for "UVH". Unknown OEM ID or table ID
+ * values end in BUG().
+ */
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ int pnodeid;
+ int uv_apic;
+
+ if (strncmp(oem_id, "SGI", 3) != 0) {
+ if (strncmp(oem_id, "NSGI", 4) == 0) {
+ uv_hubless_system = true;
+ pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
+ oem_id, oem_table_id);
+ }
+ return 0;
+ }
+
+ if (numa_off) {
+ pr_err("UV: NUMA is off, disabling UV support\n");
+ return 0;
+ }
+
+ /* Set up early hub type field in uv_hub_info for Node 0 */
+ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
+
+ /*
+ * Determine UV arch type.
+ * SGI: UV100/1000
+ * SGI2: UV2000/3000
+ * SGI3: UV300 (truncated to 4 chars because of different varieties)
+ * SGI4: UV400 (truncated to 4 chars because of different varieties)
+ */
+ uv_hub_info->hub_revision =
+ !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
+ !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+
+ if (uv_hub_info->hub_revision == 0)
+ goto badbios;
+
+ pnodeid = early_get_pnodeid();
+ early_get_apic_socketid_shift();
+
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+
+ if (!strcmp(oem_table_id, "UVX")) {
+ /* This is the most common hardware variant: */
+ uv_system_type = UV_X2APIC;
+ uv_apic = 0;
+
+ } else if (!strcmp(oem_table_id, "UVH")) {
+ /* Only UV1 systems: */
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ x86_platform.legacy.warm_reset = 0;
+ __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
+ uv_set_apicid_hibit();
+ uv_apic = 1;
+
+ } else if (!strcmp(oem_table_id, "UVL")) {
+ /* Only used for very small systems: */
+ uv_system_type = UV_LEGACY_APIC;
+ uv_apic = 0;
+
+ } else {
+ goto badbios;
+ }
+
+ pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
+ uv_tsc_check_sync();
+
+ return uv_apic;
+
+badbios:
+ pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
+ pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+ BUG();
+}
+/* Return the UV system type selected by uv_acpi_madt_oem_check(). */
+enum uv_system_type get_uv_system_type(void)
+{
+ return uv_system_type;
+}
+/* Non-zero when a UV system type other than UV_NONE has been detected. */
+int is_uv_system(void)
+{
+ return uv_system_type != UV_NONE;
+}
+EXPORT_SYMBOL_GPL(is_uv_system);
+/* Non-zero when the MADT OEM ID started with "NSGI" (hubless UV system). */
+int is_uv_hubless(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(is_uv_hubless);
+
+void **__uv_hub_info_list; /* per-node hub info pointers; allocated and filled in uv_system_init_hub() */
+EXPORT_SYMBOL_GPL(__uv_hub_info_list);
+
+DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info); /* one struct uv_cpu_info_s per CPU */
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info);
+
+short uv_possible_blades; /* set in uv_system_init_hub(); may be recounted by boot_init_possible_blades() */
+EXPORT_SYMBOL_GPL(uv_possible_blades);
+
+unsigned long sn_rtc_cycles_per_second; /* set by uv_rtc_init() */
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
+/* The following values are used for the per node hub info struct */
+static __initdata unsigned short *_node_to_pnode; /* node -> pnode, built in build_socket_tables() */
+static __initdata unsigned short _min_socket, _max_socket; /* socket id range found by decode_gam_rng_tbl() */
+static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len; /* pnode range and number of GAM range entries */
+static __initdata struct uv_gam_range_entry *uv_gre_table; /* BIOS GAM range table, set by decode_gam_rng_tbl() */
+static __initdata struct uv_gam_parameters *uv_gp_table; /* BIOS GAM parameters, set by decode_gam_params() */
+static __initdata unsigned short *_socket_to_node; /* indexed by (socket id - _min_socket) */
+static __initdata unsigned short *_socket_to_pnode; /* indexed by (socket id - _min_socket) */
+static __initdata unsigned short *_pnode_to_socket; /* indexed by (pnode - _min_pnode) */
+
+static __initdata struct uv_gam_range_s *_gr_table; /* condensed range table built by build_uv_gr_table() */
+
+#define SOCK_EMPTY ((unsigned short)~0) /* marks an unused slot in the socket/pnode tables */
+
+/* Returns UV_HUB_INFO_VERSION as seen when this file was compiled. */
+extern int uv_hub_info_version(void)
+{
+	int version = UV_HUB_INFO_VERSION;
+
+	return version;
+}
+EXPORT_SYMBOL(uv_hub_info_version);
+
+/* UV memory block size; 2GB unless overridden by "uv_memblksize=" */
+static unsigned long mem_block_size __initdata = (2UL << 30);
+
+/*
+ * Early parameter handler for "uv_memblksize=".
+ *
+ * The value is parsed with memparse() and stored as given;
+ * set_block_size() later derives the block size order from it.
+ * Always returns 0.
+ */
+static int __init parse_mem_block_size(char *ptr)
+{
+	mem_block_size = memparse(ptr, NULL);
+
+	return 0;
+}
+early_param("uv_memblksize", parse_mem_block_size);
+
+/*
+ * Reduce mem_block_size, if needed, so that the range start described by
+ * @lgre (in units of 1 << UV_GAM_RANGE_SHFT bytes) is aligned to it.
+ * Candidate sizes start at the current mem_block_size and are halved
+ * until one is aligned or MIN_MEMORY_BLOCK_SIZE is reached.
+ *
+ * Returns 1 when mem_block_size was lowered, 0 when it was left unchanged.
+ */
+static __init int adj_blksize(u32 lgre)
+{
+	unsigned long start = (unsigned long)lgre << UV_GAM_RANGE_SHFT;
+	unsigned long blksz = mem_block_size;
+
+	while (blksz > MIN_MEMORY_BLOCK_SIZE && !IS_ALIGNED(start, blksz))
+		blksz >>= 1;
+
+	if (blksz >= mem_block_size)
+		return 0;
+
+	mem_block_size = blksz;
+	return 1;
+}
+
+/*
+ * Hand the memory block size order derived from mem_block_size to
+ * set_memory_block_size_order().
+ *
+ * The order is the index of the lowest set bit of mem_block_size.  A zero
+ * mem_block_size is reported as an error and order 31 (2GB) is used.
+ */
+static __init void set_block_size(void)
+{
+	if (!mem_block_size) {
+		/* bad or zero value, default to 1UL << 31 (2GB) */
+		pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size);
+		set_memory_block_size_order(31);
+		return;
+	}
+
+	/*
+	 * mem_block_size is an unsigned long.  ffs() takes an int and would
+	 * silently drop bits 32..63 (so e.g. a 4GB size looked like zero);
+	 * __ffs() works on the full word and already returns a zero-based
+	 * bit index.
+	 */
+	set_memory_block_size_order(__ffs(mem_block_size));
+	pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size);
+}
+
+/*
+ * Build GAM range lookup table:
+ *
+ * Walks the BIOS GAM range entries in uv_gre_table (terminated by an entry
+ * of type UV_GAM_RANGE_TYPE_UNUSED) and condenses them into _gr_table, an
+ * array of {base, nasid, limit} records.  "base" is the index of the
+ * record whose limit is the start address of this range, or negative when
+ * the range starts at 0 (see the display loop at the end).  Does nothing
+ * when uv_gre_table is NULL.
+ */
+static __init void build_uv_gr_table(void)
+{
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ struct uv_gam_range_s *grt;
+ unsigned long last_limit = 0, ram_limit = 0;
+ int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
+
+ if (!gre)
+ return;
+
+ bytes = _gr_table_len * sizeof(struct uv_gam_range_s); /* room for one record per GAM range entry */
+ grt = kzalloc(bytes, GFP_KERNEL);
+ BUG_ON(!grt);
+ _gr_table = grt;
+
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE) {
+ if (!ram_limit) {
+ /* Mark hole between RAM/non-RAM: */
+ ram_limit = last_limit;
+ last_limit = gre->limit;
+ lsid++;
+ continue;
+ }
+ last_limit = gre->limit;
+ pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table));
+ continue;
+ }
+ if (_max_socket < gre->sockid) { /* entry is reported and skipped */
+ pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table));
+ continue;
+ }
+ sid = gre->sockid - _min_socket;
+ if (lsid < sid) {
+ /* New range: */
+ grt = &_gr_table[indx];
+ grt->base = lindx; /* index of the previous "new range" record, -1 for the first */
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid = sid;
+ lindx = indx++;
+ continue;
+ }
+ /* Update range: */
+ if (lsid == sid && !ram_limit) {
+ /* .. if contiguous: */
+ if (grt->limit == last_limit) {
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ }
+ /* Non-contiguous RAM range: */
+ if (!ram_limit) {
+ grt++;
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ /* Non-contiguous/non-RAM: */
+ grt++;
+ /* base is this entry */
+ grt->base = grt - _gr_table;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid++;
+ }
+
+ /* Shorten table if possible */
+ grt++; /* step past the record grt currently points at */
+ i = grt - _gr_table; /* number of records counted as used */
+ if (i < _gr_table_len) {
+ void *ret;
+
+ bytes = i * sizeof(struct uv_gam_range_s);
+ ret = krealloc(_gr_table, bytes, GFP_KERNEL);
+ if (ret) { /* on failure the original table and length are kept */
+ _gr_table = ret;
+ _gr_table_len = i;
+ }
+ }
+
+ /* Display resultant GAM range table: */
+ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) {
+ unsigned long start, end;
+ int gb = grt->base;
+
+ start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
+ end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
+
+ pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb);
+ }
+}
+
+/*
+ * Start a secondary CPU: write an INIT request and then a STARTUP request
+ * to the UVH_IPI_INT MMR of the pnode that owns @phys_apicid.  The vector
+ * field is derived from @start_rip.  Always returns 0.
+ */
+static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+	int pnode = uv_apicid_to_pnode(phys_apicid);
+	unsigned long ipi;
+
+	phys_apicid |= uv_apicid_hibits;
+
+	/* Fields common to both requests; only the delivery mode differs */
+	ipi = (1UL << UVH_IPI_INT_SEND_SHFT) |
+	      (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+	      ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12);
+
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, ipi | APIC_DM_INIT);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, ipi | APIC_DM_STARTUP);
+
+	return 0;
+}
+
+/* Send @vector to @cpu via uv_hub_send_ipi(), addressed by its APIC ID. */
+static void uv_send_IPI_one(int cpu, int vector)
+{
+	unsigned long dest = per_cpu(x86_cpu_to_apicid, cpu);
+	int dest_pnode = uv_apicid_to_pnode(dest);
+
+	uv_hub_send_ipi(dest_pnode, dest, vector);
+}
+
+/* Send @vector to every CPU set in @mask, one IPI at a time. */
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	unsigned int target;
+
+	for_each_cpu(target, mask) {
+		uv_send_IPI_one(target, vector);
+	}
+}
+
+/* Send @vector to every CPU set in @mask except the calling CPU. */
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+	unsigned int self = smp_processor_id();
+	unsigned int target;
+
+	for_each_cpu(target, mask) {
+		if (target == self)
+			continue;
+		uv_send_IPI_one(target, vector);
+	}
+}
+
+/* Send @vector to every online CPU except the calling CPU. */
+static void uv_send_IPI_allbutself(int vector)
+{
+	unsigned int self = smp_processor_id();
+	unsigned int target;
+
+	for_each_online_cpu(target) {
+		if (target == self)
+			continue;
+		uv_send_IPI_one(target, vector);
+	}
+}
+
+/* Send @vector to every online CPU, the calling CPU included. */
+static void uv_send_IPI_all(int vector)
+{
+	const struct cpumask *online = cpu_online_mask;
+
+	uv_send_IPI_mask(online, vector);
+}
+
+/* .apic_id_valid callback: accepts every @apicid (always returns 1). */
+static int uv_apic_id_valid(u32 apicid)
+{
+	return 1;
+}
+
+/* .apic_id_registered callback: unconditionally reports success (1). */
+static int uv_apic_id_registered(void)
+{
+	return 1;
+}
+
+/* .init_apic_ldr callback: intentionally empty, nothing is done here. */
+static void uv_init_apic_ldr(void)
+{
+}
+
+/* Default destination APIC ID for @cpu with uv_apicid_hibits OR-ed in. */
+static u32 apic_uv_calc_apicid(unsigned int cpu)
+{
+	u32 apicid = apic_default_calc_apicid(cpu);
+
+	return apicid | uv_apicid_hibits;
+}
+
+/*
+ * Combine the raw APIC ID value @x with this CPU's x2apic_extra_bits.
+ * Warns when called preemptible while more than one CPU is online, since
+ * the per-CPU value is read with __this_cpu_read().
+ */
+static unsigned int x2apic_get_apic_id(unsigned long x)
+{
+	WARN_ON(preemptible() && num_online_cpus() > 1);
+
+	return x | __this_cpu_read(x2apic_extra_bits);
+}
+
+/* .set_apic_id callback: @id is passed through unchanged. */
+static u32 set_apic_id(unsigned int id)
+{
+	u32 apicid = id;
+
+	/* CHECKME: Do we need to mask out the xapic extra bits? */
+	return apicid;
+}
+
+/* Read the APIC_ID register and convert it with x2apic_get_apic_id(). */
+static unsigned int uv_read_apic_id(void)
+{
+	unsigned long raw = apic_read(APIC_ID);
+
+	return x2apic_get_apic_id(raw);
+}
+
+/*
+ * .phys_pkg_id callback: this CPU's APIC ID shifted right by @index_msb.
+ * @initial_apicid is not used.
+ */
+static int uv_phys_pkg_id(int initial_apicid, int index_msb)
+{
+	unsigned int apicid = uv_read_apic_id();
+
+	return apicid >> index_msb;
+}
+
+/* Send @vector to the calling CPU by writing the APIC_SELF_IPI register. */
+static void uv_send_IPI_self(int vector)
+{
+	apic_write(APIC_SELF_IPI, vector);
+}
+
+/* .probe callback: 1 only if 'apic' already points at this driver. */
+static int uv_probe(void)
+{
+	if (apic != &apic_x2apic_uv_x)
+		return 0;
+
+	return 1;
+}
+
+static struct apic apic_x2apic_uv_x __ro_after_init = {
+ /* struct apic settings and callbacks for UV systems; callbacks left NULL are not provided */
+ .name = "UV large system",
+ .probe = uv_probe,
+ .acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_valid = uv_apic_id_valid,
+ .apic_id_registered = uv_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* Physical */
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = uv_init_apic_ldr, /* empty function */
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = uv_phys_pkg_id,
+
+ .get_apic_id = x2apic_get_apic_id, /* ORs in the per-CPU x2apic_extra_bits */
+ .set_apic_id = set_apic_id,
+
+ .calc_dest_apicid = apic_uv_calc_apicid, /* ORs in uv_apicid_hibits */
+
+ .send_IPI = uv_send_IPI_one, /* all mask/all variants below are built on uv_send_IPI_one() */
+ .send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = uv_send_IPI_allbutself,
+ .send_IPI_all = uv_send_IPI_all,
+ .send_IPI_self = uv_send_IPI_self, /* writes APIC_SELF_IPI directly */
+
+ .wakeup_secondary_cpu = uv_wakeup_secondary, /* INIT then STARTUP via the UVH_IPI_INT MMR */
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+/*
+ * Store @pnode, shifted left by uvh_apicid.s.pnode_shift, in this CPU's
+ * x2apic_extra_bits.
+ */
+static void set_x2apic_extra_bits(int pnode)
+{
+	int extra = pnode << uvh_apicid.s.pnode_shift;
+
+	__this_cpu_write(x2apic_extra_bits, extra);
+}
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+/*
+ * Look up the low-memory redirect.
+ *
+ * The three ALIAS210 overlay config MMRs are scanned in order for the
+ * first one that is enabled with a base of 0.  Its size (1 << m_alias) is
+ * stored in *@size and the destination base from the matching redirect
+ * config MMR, shifted by DEST_SHIFT, in *@base.  If no overlay matches,
+ * both outputs are set to 0.
+ */
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+	union uvh_rh_gam_alias210_overlay_config_2_mmr_u overlay;
+	union uvh_rh_gam_alias210_redirect_config_2_mmr_u redir;
+	unsigned long redir_mmr;
+	unsigned long overlay_mmr;
+	int idx;
+
+	for (idx = 0; idx < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; idx++) {
+		/* Select the MMR pair for this alias index */
+		if (idx == 0) {
+			redir_mmr = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR;
+			overlay_mmr = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
+		} else if (idx == 1) {
+			redir_mmr = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR;
+			overlay_mmr = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
+		} else {
+			redir_mmr = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR;
+			overlay_mmr = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
+		}
+
+		overlay.v = uv_read_local_mmr(overlay_mmr);
+		if (!overlay.s.enable || overlay.s.base != 0)
+			continue;
+
+		*size = (1UL << overlay.s.m_alias);
+		redir.v = uv_read_local_mmr(redir_mmr);
+		*base = (unsigned long)redir.s.dest_base << DEST_SHIFT;
+		return;
+	}
+	*base = *size = 0;
+}
+
+enum map_type {map_wb, map_uc};
+
+/*
+ * Create an extra mapping for a high address range.
+ *
+ * The range starts at @base << @pshift and spans
+ * (1 << @bshift) * (@max_pnode + 1) bytes.  @map_type selects between
+ * init_extra_mapping_uc() and init_extra_mapping_wb(); @id is only used
+ * in the log messages.  Nothing is mapped when the start address is 0.
+ */
+static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
+{
+	unsigned long paddr = base << pshift;
+	unsigned long bytes = (1UL << bshift) * (max_pnode + 1);
+
+	if (!paddr) {
+		pr_info("UV: Map %s_HI base address NULL\n", id);
+		return;
+	}
+	pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
+
+	switch (map_type) {
+	case map_uc:
+		init_extra_mapping_uc(paddr, bytes);
+		break;
+	default:
+		init_extra_mapping_wb(paddr, bytes);
+		break;
+	}
+}
+
+/*
+ * Map the GRU in distributed mode from the overlay config value @c.
+ *
+ * One mapping of (1 << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT) bytes
+ * is created per online node at (pnode << m_val) | gru_dist_base.  The
+ * lowest and highest node addresses, reduced to their bits above M, are
+ * kept in gru_first_node_paddr / gru_last_node_paddr.
+ */
+static __init void map_gru_distributed(unsigned long c)
+{
+	union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+	unsigned long m_mask;
+	unsigned long bytes;
+	u64 paddr;
+	int nid;
+
+	gru.v = c;
+
+	/* Only base bits 42:28 relevant in dist mode */
+	gru_dist_base = gru.v & 0x000007fff0000000UL;
+	if (!gru_dist_base) {
+		pr_info("UV: Map GRU_DIST base address NULL\n");
+		return;
+	}
+
+	bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+	m_mask = (1UL << uv_hub_info->m_val) - 1;
+	gru_dist_lmask = m_mask & ~(bytes - 1);
+	gru_dist_umask = ~m_mask;
+	gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
+
+	for_each_online_node(nid) {
+		paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
+			gru_dist_base;
+		init_extra_mapping_wb(paddr, bytes);
+
+		if (paddr < gru_first_node_paddr)
+			gru_first_node_paddr = paddr;
+		if (paddr > gru_last_node_paddr)
+			gru_last_node_paddr = paddr;
+	}
+
+	/* Save upper (63:M) bits of address only for is_GRU_range */
+	gru_first_node_paddr &= gru_dist_umask;
+	gru_last_node_paddr &= gru_dist_umask;
+
+	pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
+}
+
+/*
+ * Map the GRU range described by the GRU overlay config MMR.
+ *
+ * Nothing is mapped when the overlay is disabled.  A UV3 hub with the
+ * mode bit set is handled by map_gru_distributed(); otherwise the range
+ * is mapped write-back for pnodes 0..@max_pnode and recorded in
+ * gru_start_paddr / gru_end_paddr.
+ */
+static __init void map_gru_high(int max_pnode)
+{
+	const int base_shft = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+	const unsigned long base_mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK;
+	union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+	unsigned long base;
+
+	gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
+	if (!gru.s.enable) {
+		pr_info("UV: GRU disabled\n");
+		return;
+	}
+
+	/* Only UV3 has distributed GRU mode */
+	if (is_uv3_hub() && gru.s3.mode) {
+		map_gru_distributed(gru.v);
+		return;
+	}
+
+	base = (gru.v & base_mask) >> base_shft;
+	map_high("GRU", base, base_shft, base_shft, max_pnode, map_wb);
+
+	gru_start_paddr = ((u64)base << base_shft);
+	gru_end_paddr = gru_start_paddr + (1UL << base_shft) * (max_pnode + 1);
+}
+
+/*
+ * Map the MMR range uncached for pnodes 0..@max_pnode when the MMR
+ * overlay config MMR has its enable bit set; otherwise just log it.
+ */
+static __init void map_mmr_high(int max_pnode)
+{
+	const int base_shft = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+	union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+
+	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+	if (!mmr.s.enable) {
+		pr_info("UV: MMR disabled\n");
+		return;
+	}
+
+	map_high("MMR", mmr.s.base, base_shft, base_shft, max_pnode, map_uc);
+}
+
+/*
+ * Map one of the two MMIOH regions (@index 0 selects MMIOH0, anything
+ * else MMIOH1).
+ *
+ * The overlay config MMR of the region is read and logged.  If enabled,
+ * its redirect table is scanned: runs of consecutive entries pointing at
+ * the same NASID within [2 * @min_pnode, 2 * @max_pnode] are logged, and
+ * the region is finally mapped uncached up to the highest entry index
+ * that was reported.
+ *
+ * UV3/4 have identical MMIOH overlay configs, UV4A is slightly different
+ */
+static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
+{
+	unsigned long overlay;
+	unsigned long mmr;
+	unsigned long base;
+	unsigned long nasid_mask;
+	unsigned long m_overlay;
+	int i, n, shift, m_io, max_io;
+	int nasid, lnasid, fi, li;
+	char *id;
+
+	if (index == 0) {
+		id = "MMIOH0";
+		m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR;
+		overlay = uv_read_local_mmr(m_overlay);
+		base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK;
+		mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR;
+		m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK)
+			>> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
+		shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
+		n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
+		nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK;
+	} else {
+		id = "MMIOH1";
+		m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR;
+		overlay = uv_read_local_mmr(m_overlay);
+		base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK;
+		mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR;
+		m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK)
+			>> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
+		shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
+		n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH;
+		nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK;
+	}
+	pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io);
+	if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) {
+		pr_info("UV: %s disabled\n", id);
+		return;
+	}
+
+	/* Convert to NASID: */
+	min_pnode *= 2;
+	max_pnode *= 2;
+	max_io = lnasid = fi = li = -1;
+
+	for (i = 0; i < n; i++) {
+		unsigned long m_redirect = mmr + i * 8;
+		unsigned long redirect = uv_read_local_mmr(m_redirect);
+
+		nasid = redirect & nasid_mask;
+		if (i == 0)
+			pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
+				id, redirect, m_redirect, nasid);
+
+		/* Invalid NASID: */
+		if (nasid < min_pnode || max_pnode < nasid)
+			nasid = -1;
+
+		if (nasid == lnasid) {
+			/* Same NASID as the cached run: extend it */
+			li = i;
+			/* Last entry check: */
+			if (i != n-1)
+				continue;
+		}
+
+		/*
+		 * Check if we have a cached (or last) redirect to print:
+		 *
+		 * NOTE(review): when the last entry starts a new valid run
+		 * while an older run is still cached, only the cached run is
+		 * reported here; the last entry is neither logged nor counted
+		 * in max_io.  Left as is.
+		 */
+		if (lnasid != -1 || (i == n-1 && nasid != -1)) {
+			unsigned long addr1, addr2;
+			int f, l;
+
+			if (lnasid == -1) {
+				/* Nothing cached: report the last entry itself */
+				f = l = i;
+				lnasid = nasid;
+			} else {
+				f = fi;
+				l = li;
+			}
+			addr1 = (base << shift) + f * (1ULL << m_io);
+			addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
+			/*
+			 * Log the same first/last indices (f, l) the addresses
+			 * were computed from; fi/li are stale when the last
+			 * entry is reported on its own.
+			 */
+			pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, f, l, lnasid, addr1, addr2);
+			if (max_io < l)
+				max_io = l;
+		}
+		/* Start caching a new run at this entry */
+		fi = li = i;
+		lnasid = nasid;
+	}
+
+	pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io);
+
+	if (max_io >= 0)
+		map_high(id, base, shift, m_io, max_io, map_uc);
+}
+
+/*
+ * Map the MMIOH range(s) uncached.
+ *
+ * UV3 and UV4 hubs have two regions, each handled by
+ * map_mmioh_high_uv34().  UV1 and UV2 hubs have a single overlay config
+ * MMR; when it is enabled, @max_pnode is masked to n_io bits and the
+ * range is mapped through map_high().  Other hubs are ignored.
+ */
+static __init void map_mmioh_high(int min_pnode, int max_pnode)
+{
+	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
+	unsigned long mmr, base;
+	int shift, enable, m_io, n_io;
+
+	if (is_uv3_hub() || is_uv4_hub()) {
+		/* Map both MMIOH regions: */
+		map_mmioh_high_uv34(0, min_pnode, max_pnode);
+		map_mmioh_high_uv34(1, min_pnode, max_pnode);
+		return;
+	}
+
+	if (is_uv1_hub()) {
+		mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+		shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		mmioh.v = uv_read_local_mmr(mmr);
+		enable = !!mmioh.s1.enable;
+		base = mmioh.s1.base;
+		m_io = mmioh.s1.m_io;
+		n_io = mmioh.s1.n_io;
+	} else if (is_uv2_hub()) {
+		mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+		shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		mmioh.v = uv_read_local_mmr(mmr);
+		enable = !!mmioh.s2.enable;
+		base = mmioh.s2.base;
+		m_io = mmioh.s2.m_io;
+		n_io = mmioh.s2.n_io;
+	} else {
+		return;
+	}
+
+	if (enable) {
+		max_pnode &= (1 << n_io) - 1;
+		/* Arguments follow the label order: N_IO first, then M_IO */
+		pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, n_io, m_io, max_pnode);
+		map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
+	} else {
+		pr_info("UV: MMIOH disabled\n");
+	}
+}
+
+/* Create uncached mappings for the 32-bit global MMRs and the local MMRs. */
+static __init void map_low_mmrs(void)
+{
+	/* Global MMR32 window */
+	init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+
+	/* Local MMR window */
+	init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+}
+
+/*
+ * Set sn_rtc_cycles_per_second from the BIOS realtime clock frequency.
+ * If the BIOS call fails or reports fewer than 100000 ticks per second,
+ * a warning is printed and a guessed value is used instead.
+ */
+static __init void uv_rtc_init(void)
+{
+	u64 ticks_per_sec;
+	long status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec);
+
+	if (status == BIOS_STATUS_SUCCESS && ticks_per_sec >= 100000) {
+		sn_rtc_cycles_per_second = ticks_per_sec;
+		return;
+	}
+
+	pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n");
+
+	/* BIOS gives wrong value for clock frequency, so guess: */
+	sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+}
+
+/*
+ * percpu heartbeat timer
+ *
+ * Toggles the heartbeat bit, sets the activity bit according to whether
+ * the current CPU is idle, writes the result with uv_set_scir_bits() and
+ * re-arms @timer SCIR_CPU_HB_INTERVAL jiffies from now.
+ */
+static void uv_heartbeat(struct timer_list *timer)
+{
+	/* Start from the saved state with the heartbeat bit flipped */
+	unsigned char state = uv_scir_info->state ^ SCIR_CPU_HEARTBEAT;
+
+	/* Activity bit is set only while this CPU is not idle */
+	if (!idle_cpu(raw_smp_processor_id()))
+		state |= SCIR_CPU_ACTIVITY;
+	else
+		state &= ~SCIR_CPU_ACTIVITY;
+
+	/* Update system controller interface reg: */
+	uv_set_scir_bits(state);
+
+	/* Enable next timer period: */
+	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+}
+
+/*
+ * Start the heartbeat timer on @cpu unless it is already enabled, then do
+ * the same for CPU 0.  The loop ends as soon as it meets a CPU whose
+ * heartbeat is already enabled.  Always returns 0.
+ */
+static int uv_heartbeat_enable(unsigned int cpu)
+{
+	while (!uv_cpu_scir_info(cpu)->enabled) {
+		struct timer_list *hb_timer = &uv_cpu_scir_info(cpu)->timer;
+
+		uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+
+		/* Arm a pinned timer on the target CPU */
+		timer_setup(hb_timer, uv_heartbeat, TIMER_PINNED);
+		hb_timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+		add_timer_on(hb_timer, cpu);
+
+		uv_cpu_scir_info(cpu)->enabled = 1;
+
+		/* Also ensure that boot CPU is enabled: */
+		cpu = 0;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Stop the heartbeat of @cpu: clear its enabled flag and delete its timer
+ * if it was enabled, then set all of its SCIR bits (0xff).
+ * Always returns 0.
+ */
+static int uv_heartbeat_disable(unsigned int cpu)
+{
+	if (uv_cpu_scir_info(cpu)->enabled) {
+		/* Flag first, then remove the timer */
+		uv_cpu_scir_info(cpu)->enabled = 0;
+		del_timer(&uv_cpu_scir_info(cpu)->timer);
+	}
+
+	uv_set_cpu_scir_bits(cpu, 0xff);
+
+	return 0;
+}
+
+/*
+ * Register uv_heartbeat_enable()/uv_heartbeat_disable() as CPU hotplug
+ * callbacks in the CPUHP_AP_ONLINE_DYN state.
+ */
+static __init void uv_scir_register_cpu_notifier(void)
+{
+	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+				  "x86/x2apic-uvx:online",
+				  uv_heartbeat_enable,
+				  uv_heartbeat_disable);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+/* Without CONFIG_HOTPLUG_CPU there is nothing to register. */
+static __init void uv_scir_register_cpu_notifier(void)
+{
+}
+
+/*
+ * late_initcall: on a UV system, enable the heartbeat on every online
+ * CPU.  Always returns 0.
+ */
+static __init int uv_init_heartbeat(void)
+{
+	int cpu;
+
+	if (!is_uv_system())
+		return 0;
+
+	for_each_online_cpu(cpu) {
+		uv_heartbeat_enable(cpu);
+	}
+
+	return 0;
+}
+
+late_initcall(uv_init_heartbeat);
+
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+/*
+ * Direct Legacy VGA I/O traffic to designated IOH
+ *
+ * Only acts when @flags contains PCI_VGA_STATE_CHANGE_BRIDGE and
+ * @command_bits contains PCI_COMMAND_IO; otherwise returns 0 without
+ * doing anything.  In the active case the result of
+ * uv_bios_set_legacy_vga_target() for the device's domain and bus is
+ * returned.
+ */
+int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
+{
+	if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE) ||
+	    !(command_bits & PCI_COMMAND_IO))
+		return 0;
+
+	return uv_bios_set_legacy_vga_target(decode,
+					     pci_domain_nr(pdev->bus),
+					     pdev->bus->number);
+}
+
+/*
+ * Called on each CPU to initialize the per_cpu UV data area.
+ * FIXME: hotplug not supported yet
+ *
+ * For every CPU other than CPU 0 this bumps the hub's nr_online_cpus and,
+ * on UV_NON_UNIQUE_APIC systems, sets the CPU's x2apic extra bits from
+ * the hub's pnode.
+ */
+void uv_cpu_init(void)
+{
+	/* CPU 0 initialization will be done via uv_system_init. */
+	if (smp_processor_id() != 0) {
+		uv_hub_info->nr_online_cpus++;
+
+		if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+			set_x2apic_extra_bits(uv_hub_info->pnode);
+	}
+}
+
+struct mn { /* M/N values gathered by get_mn() */
+ unsigned char m_val; /* m_skt field of the GAM config MMR; 0 on UV4 */
+ unsigned char n_val; /* n_skt field of the GAM config MMR */
+ unsigned char m_shift; /* 64 - m_val, or 0 when m_val is 0 */
+ unsigned char n_lshift; /* hub-type dependent, see get_mn() */
+};
+
+/*
+ * Fill @mnp from the hub's GAM config MMR(s).
+ *
+ * n_val always comes from the n_skt field.  m_val and n_lshift depend on
+ * the hub type (both 0 on UV4), and m_shift is 64 - m_val unless m_val
+ * is 0.  If no hub type matches, only n_val is non-zero.
+ */
+static void get_mn(struct mn *mnp)
+{
+	union uvh_rh_gam_config_mmr_u gam_cfg;
+	union uv3h_gr0_gam_gr_config_u gr_cfg;
+
+	/* Make sure the whole structure is well initialized: */
+	memset(mnp, 0, sizeof(*mnp));
+
+	gam_cfg.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
+	mnp->n_val = gam_cfg.s.n_skt;
+
+	if (is_uv4_hub()) {
+		mnp->m_val = 0;
+		mnp->n_lshift = 0;
+	} else if (is_uv3_hub()) {
+		mnp->m_val = gam_cfg.s3.m_skt;
+		gr_cfg.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+		mnp->n_lshift = gr_cfg.s3.m_skt;
+	} else if (is_uv2_hub()) {
+		mnp->m_val = gam_cfg.s2.m_skt;
+		mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
+	} else if (is_uv1_hub()) {
+		mnp->m_val = gam_cfg.s1.m_skt;
+		mnp->n_lshift = mnp->m_val;
+	}
+
+	if (mnp->m_val)
+		mnp->m_shift = 64 - mnp->m_val;
+	else
+		mnp->m_shift = 0;
+}
+
+/*
+ * Fill the hub info structure @hi with the values common to all hubs:
+ * M/N values from get_mn(), the socket/pnode conversion tables and GAM
+ * range table built earlier, gnode bits from the UVH_NODE_ID MMR, the
+ * global MMR/GRU addressing (from the BIOS GAM parameters if present,
+ * otherwise from the MMR overlay config MMR), and the low-memory
+ * redirect.  The resulting values are logged.
+ */
+void __init uv_init_hub_info(struct uv_hub_info_s *hi)
+{
+	union uvh_node_id_u node_id;
+	struct mn mn;
+
+	get_mn(&mn);
+
+	/* With no M value the mask comes from the CPUID derived gpa_shift */
+	if (mn.m_val)
+		hi->gpa_mask = (1UL << (mn.m_val + mn.n_val)) - 1;
+	else
+		hi->gpa_mask = (1UL << uv_cpuid.gpa_shift) - 1;
+
+	hi->m_val = mn.m_val;
+	hi->n_val = mn.n_val;
+	hi->m_shift = mn.m_shift;
+	hi->n_lshift = mn.n_lshift;
+	hi->hub_revision = uv_hub_info->hub_revision;
+	hi->pnode_mask = uv_cpuid.pnode_mask;
+	hi->min_pnode = _min_pnode;
+	hi->min_socket = _min_socket;
+	hi->pnode_to_socket = _pnode_to_socket;
+	hi->socket_to_node = _socket_to_node;
+	hi->socket_to_pnode = _socket_to_pnode;
+	hi->gr_table_len = _gr_table_len;
+	hi->gr_table = _gr_table;
+
+	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+	uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
+	hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+	if (mn.m_val)
+		hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val;
+
+	if (!uv_gp_table) {
+		hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE;
+		hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
+	} else {
+		/* BIOS supplied GAM parameters take precedence */
+		hi->global_mmr_base = uv_gp_table->mmr_base;
+		hi->global_mmr_shift = uv_gp_table->mmr_shift;
+		hi->global_gru_base = uv_gp_table->gru_base;
+		hi->global_gru_shift = uv_gp_table->gru_shift;
+		hi->gpa_shift = uv_gp_table->gpa_shift;
+		hi->gpa_mask = (1UL << hi->gpa_shift) - 1;
+	}
+
+	get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top);
+
+	hi->apic_pnode_shift = uv_cpuid.socketid_shift;
+
+	/* Show system specific info: */
+	pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
+	pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
+	pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift);
+	pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
+}
+
+/*
+ * Record @ptr as the BIOS GAM parameter table (uv_gp_table) and log its
+ * contents.
+ */
+static void __init decode_gam_params(unsigned long ptr)
+{
+	struct uv_gam_parameters *gp = (struct uv_gam_parameters *)ptr;
+
+	uv_gp_table = gp;
+
+	pr_info("UV: GAM Params...\n");
+	pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n",
+		gp->mmr_base, gp->mmr_shift,
+		gp->gru_base, gp->gru_shift,
+		gp->gpa_shift);
+}
+
+/*
+ * Record @ptr as the BIOS GAM range table (uv_gre_table), log every entry
+ * up to the terminating UV_GAM_RANGE_TYPE_UNUSED one, and derive from it:
+ *   - the socket id and pnode ranges (_min/_max_socket, _min/_max_pnode),
+ *   - the number of entries (_gr_table_len),
+ *   - a possibly reduced mem_block_size, via adj_blksize() on the start
+ *     of each type 1 or type 2 range (flagged with '*' in the log).
+ */
+static void __init decode_gam_rng_tbl(unsigned long ptr)
+{
+	struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
+	unsigned long lgre = 0;
+	int index = 0;
+	int sock_min = 999999, pnode_min = 99999;
+	int sock_max = -1, pnode_max = -1;
+
+	uv_gre_table = gre;
+	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+		unsigned long size = ((unsigned long)(gre->limit - lgre)
+					<< UV_GAM_RANGE_SHFT);
+		int order = 0;
+		char suffix[] = " KMGTPE";
+		/*
+		 * Index of the last real suffix character ('E').  Bounding
+		 * the loop by sizeof(suffix) would let suffix[order] reach
+		 * the terminating NUL and one element past the array.
+		 */
+		const int max_order = sizeof(suffix) - 2;
+		int flag = ' ';
+
+		/* Scale the size down until it fits the %5lu column */
+		while (size > 9999 && order < max_order) {
+			size /= 1024;
+			order++;
+		}
+
+		/* adjust max block size to current range start */
+		if (gre->type == 1 || gre->type == 2)
+			if (adj_blksize(lgre))
+				flag = '*';
+
+		if (!index) {
+			pr_info("UV: GAM Range Table...\n");
+			pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
+		}
+		pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n",
+			index++,
+			(unsigned long)lgre << UV_GAM_RANGE_SHFT,
+			(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
+			flag, size, suffix[order],
+			gre->type, gre->nasid, gre->sockid, gre->pnode);
+
+		/* update to next range start */
+		lgre = gre->limit;
+		if (sock_min > gre->sockid)
+			sock_min = gre->sockid;
+		if (sock_max < gre->sockid)
+			sock_max = gre->sockid;
+		if (pnode_min > gre->pnode)
+			pnode_min = gre->pnode;
+		if (pnode_max < gre->pnode)
+			pnode_max = gre->pnode;
+	}
+	_min_socket = sock_min;
+	_max_socket = sock_max;
+	_min_pnode = pnode_min;
+	_max_pnode = pnode_max;
+	_gr_table_len = index;
+
+	pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+}
+
+/*
+ * Decode the BIOS UVsystab.
+ *
+ * Hubs older than UV4 need no extended table and return 0 immediately.
+ * Otherwise a missing table or one older than
+ * UV_SYSTAB_VERSION_UV4_LATEST resets uv_system_type to UV_NONE and
+ * returns -EINVAL.  For a valid table, each entry with a non-zero offset
+ * is dispatched by type to decode_gam_params() or decode_gam_rng_tbl();
+ * other types are ignored.  Returns 0 on success.
+ */
+static int __init decode_uv_systab(void)
+{
+	struct uv_systab *st;
+	int i;
+
+	/* No extended UVsystab required */
+	if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
+		return 0;
+
+	st = uv_systab;
+	if (!st || st->revision < UV_SYSTAB_VERSION_UV4_LATEST) {
+		int rev = 0;
+
+		if (st)
+			rev = st->revision;
+
+		pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST);
+		pr_err("UV: Cannot support UV operations, switching to generic PC\n");
+		uv_system_type = UV_NONE;
+
+		return -EINVAL;
+	}
+
+	for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+		unsigned long offset = st->entry[i].offset;
+		unsigned long ptr;
+
+		if (!offset)
+			continue;
+
+		/* Offsets are relative to the start of the table */
+		ptr = offset + (unsigned long)st;
+
+		if (st->entry[i].type == UV_SYSTAB_TYPE_GAM_PARAMS)
+			decode_gam_params(ptr);
+		else if (st->entry[i].type == UV_SYSTAB_TYPE_GAM_RNG_TBL)
+			decode_gam_rng_tbl(ptr);
+	}
+	return 0;
+}
+
+/*
+ * Set up physical blade translations from UVH_NODE_PRESENT_TABLE
+ * .. NB: UVH_NODE_PRESENT_TABLE is going away,
+ * .. being replaced by GAM Range Table
+ *
+ * Counts the bits set across all UVH_NODE_PRESENT_TABLE entries and
+ * stores the total in uv_possible_blades.  @hub_info is not used.
+ */
+static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
+{
+	int idx, blades = 0;
+
+	pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH);
+	for (idx = 0; idx < UVH_NODE_PRESENT_TABLE_DEPTH; idx++) {
+		unsigned long present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + idx * 8);
+
+		if (present)
+			pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", idx, present);
+
+		blades += hweight64(present);
+	}
+
+	uv_possible_blades = blades;
+}
+/*
+ * Build the socket/pnode/node conversion tables from the GAM range table
+ * (uv_gre_table):
+ *   - _socket_to_pnode and _pnode_to_socket from the table entries,
+ *   - _socket_to_node from the APIC IDs of the present CPUs,
+ *   - _node_to_pnode by combining the two.
+ * Without a GAM range table this returns early on UV1/UV2/UV3 hubs and
+ * BUG()s otherwise.  socket_to_node/socket_to_pnode tables that turn out
+ * to be identity maps are freed again at the end.
+ */
+static void __init build_socket_tables(void)
+{
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ int num, nump;
+ int cpu, i, lnid;
+ int minsock = _min_socket;
+ int maxsock = _max_socket;
+ int minpnode = _min_pnode;
+ int maxpnode = _max_pnode;
+ size_t bytes;
+
+ if (!gre) {
+ if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) {
+ pr_info("UV: No UVsystab socket table, ignoring\n");
+ return;
+ }
+ pr_crit("UV: Error: UVsystab address translations not available!\n");
+ BUG();
+ }
+
+ /* Build socket id -> node id, pnode */
+ num = maxsock - minsock + 1; /* number of socket slots */
+ bytes = num * sizeof(_socket_to_node[0]);
+ _socket_to_node = kmalloc(bytes, GFP_KERNEL);
+ _socket_to_pnode = kmalloc(bytes, GFP_KERNEL);
+
+ nump = maxpnode - minpnode + 1; /* number of pnode slots */
+ bytes = nump * sizeof(_pnode_to_socket[0]);
+ _pnode_to_socket = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket);
+
+ for (i = 0; i < num; i++)
+ _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY;
+
+ for (i = 0; i < nump; i++)
+ _pnode_to_socket[i] = SOCK_EMPTY;
+
+ /* Fill in pnode/node/addr conversion list values: */
+ pr_info("UV: GAM Building socket/pnode conversion tables\n");
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ continue;
+ i = gre->sockid - minsock;
+ /* Duplicate: */
+ if (_socket_to_pnode[i] != SOCK_EMPTY)
+ continue;
+ _socket_to_pnode[i] = gre->pnode;
+
+ i = gre->pnode - minpnode;
+ _pnode_to_socket[i] = gre->sockid;
+
+ pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
+ gre->sockid, gre->type, gre->nasid,
+ _socket_to_pnode[gre->sockid - minsock],
+ _pnode_to_socket[gre->pnode - minpnode]);
+ }
+
+ /* Set socket -> node values: */
+ lnid = -1;
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ int apicid, sockid;
+
+ if (lnid == nid) /* same node as the previous CPU: already handled */
+ continue;
+ lnid = nid;
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ sockid = apicid >> uv_cpuid.socketid_shift;
+ _socket_to_node[sockid - minsock] = nid; /* NOTE(review): sockid is not range-checked against minsock..maxsock here */
+ pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
+ sockid, apicid, nid);
+ }
+
+ /* Set up physical blade to pnode translation from GAM Range Table: */
+ bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]);
+ _node_to_pnode = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!_node_to_pnode);
+
+ for (lnid = 0; lnid < num_possible_nodes(); lnid++) {
+ unsigned short sockid;
+
+ for (sockid = minsock; sockid <= maxsock; sockid++) {
+ if (lnid == _socket_to_node[sockid - minsock]) {
+ _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock];
+ break;
+ }
+ }
+ if (sockid > maxsock) { /* inner loop finished without finding a socket */
+ pr_err("UV: socket for node %d not found!\n", lnid);
+ BUG();
+ }
+ }
+
+ /*
+ * If socket id == pnode or socket id == node for all nodes,
+ * system runs faster by removing corresponding conversion table.
+ */
+ pr_info("UV: Checking socket->node/pnode for identity maps\n");
+ if (minsock == 0) {
+ for (i = 0; i < num; i++)
+ if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i])
+ break;
+ if (i >= num) { /* every slot is filled and maps to itself */
+ kfree(_socket_to_node);
+ _socket_to_node = NULL;
+ pr_info("UV: 1:1 socket_to_node table removed\n");
+ }
+ }
+ if (minsock == minpnode) {
+ for (i = 0; i < num; i++)
+ if (_socket_to_pnode[i] != SOCK_EMPTY &&
+ _socket_to_pnode[i] != i + minpnode)
+ break;
+ if (i >= num) { /* every filled slot maps to the pnode with the same offset */
+ kfree(_socket_to_pnode);
+ _socket_to_pnode = NULL;
+ pr_info("UV: 1:1 socket_to_pnode table removed\n");
+ }
+ }
+}
+/*
+ * Initialise a UV system that has a hub: map the low MMRs, decode the
+ * BIOS UVsystab, build the conversion tables, allocate and fill the
+ * per-node hub info and the per-CPU info, map the high GRU/MMR/MMIOH
+ * ranges and register the remaining UV hooks.  Returns early when the
+ * hub type is unknown or decode_uv_systab() fails.
+ */
+static void __init uv_system_init_hub(void)
+{
+ struct uv_hub_info_s hub_info = {0};
+ int bytes, cpu, nodeid;
+ unsigned short min_pnode = 9999, max_pnode = 0; /* running min/max over all nodes, computed below */
+ char *hub = is_uv4_hub() ? "UV400" :
+ is_uv3_hub() ? "UV300" :
+ is_uv2_hub() ? "UV2000/3000" :
+ is_uv1_hub() ? "UV100/1000" : NULL;
+
+ if (!hub) {
+ pr_err("UV: Unknown/unsupported UV hub\n");
+ return;
+ }
+ pr_info("UV: Found %s hub\n", hub);
+
+ map_low_mmrs();
+
+ /* Get uv_systab for decoding: */
+ uv_bios_init();
+
+ /* If there's an UVsystab problem then abort UV init: */
+ if (decode_uv_systab() < 0)
+ return;
+
+ build_socket_tables();
+ build_uv_gr_table();
+ set_block_size();
+ uv_init_hub_info(&hub_info); /* hub_info becomes the template copied to every node below */
+ uv_possible_blades = num_possible_nodes();
+ if (!_node_to_pnode) /* no GAM-derived table: count blades from the node-present MMRs */
+ boot_init_possible_blades(&hub_info);
+
+ /* uv_num_possible_blades() is really the hub count: */
+ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
+
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number);
+ hub_info.coherency_domain_number = sn_coherency_id;
+ uv_rtc_init();
+
+ bytes = sizeof(void *) * uv_num_possible_blades();
+ __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
+ BUG_ON(!__uv_hub_info_list);
+
+ bytes = sizeof(struct uv_hub_info_s);
+ for_each_node(nodeid) {
+ struct uv_hub_info_s *new_hub;
+
+ if (__uv_hub_info_list[nodeid]) {
+ pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid);
+ BUG();
+ }
+
+ /* Allocate new per hub info list */
+ new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid); /* node 0 uses the existing uv_hub_info_node0 object */
+ BUG_ON(!new_hub);
+ __uv_hub_info_list[nodeid] = new_hub;
+ new_hub = uv_hub_info_list(nodeid);
+ BUG_ON(!new_hub);
+ *new_hub = hub_info;
+
+ /* Use information from GAM table if available: */
+ if (_node_to_pnode)
+ new_hub->pnode = _node_to_pnode[nodeid];
+ else /* Or fill in during CPU loop: */
+ new_hub->pnode = 0xffff; /* 0xffff marks "pnode not known yet" */
+
+ new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
+ new_hub->memory_nid = -1; /* -1 until the first CPU of this hub is seen below */
+ new_hub->nr_possible_cpus = 0;
+ new_hub->nr_online_cpus = 0;
+ }
+
+ /* Initialize per CPU info: */
+ for_each_possible_cpu(cpu) {
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ int numa_node_id;
+ unsigned short pnode;
+
+ nodeid = cpu_to_node(cpu);
+ numa_node_id = numa_cpu_node(cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+
+ uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
+ uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; /* CPUs are numbered per hub in iteration order */
+ if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
+
+ /* Init memoryless node: */
+ if (nodeid != numa_node_id &&
+ uv_hub_info_list(numa_node_id)->pnode == 0xffff)
+ uv_hub_info_list(numa_node_id)->pnode = pnode;
+ else if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
+ uv_cpu_hub_info(cpu)->pnode = pnode;
+
+ uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid);
+ }
+
+ for_each_node(nodeid) {
+ unsigned short pnode = uv_hub_info_list(nodeid)->pnode;
+
+ /* Add pnode info for pre-GAM list nodes without CPUs: */
+ if (pnode == 0xffff) {
+ unsigned long paddr;
+
+ paddr = node_start_pfn(nodeid) << PAGE_SHIFT; /* derive the pnode from the node's first physical address */
+ pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr));
+ uv_hub_info_list(nodeid)->pnode = pnode;
+ }
+ min_pnode = min(pnode, min_pnode);
+ max_pnode = max(pnode, max_pnode);
+ pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n",
+ nodeid,
+ uv_hub_info_list(nodeid)->pnode,
+ uv_hub_info_list(nodeid)->nr_possible_cpus);
+ }
+
+ pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode);
+ map_gru_high(max_pnode);
+ map_mmr_high(max_pnode);
+ map_mmioh_high(min_pnode, max_pnode);
+
+ uv_nmi_setup();
+ uv_cpu_init(); /* returns immediately when running on CPU 0 */
+ uv_scir_register_cpu_notifier();
+ proc_mkdir("sgi_uv", NULL);
+
+ /* Register Legacy VGA I/O redirection handler: */
+ pci_register_set_vga_state(uv_set_vga_state);
+
+ /*
+ * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
+ * EFI is not enabled in the kdump kernel:
+ */
+ if (is_kdump_kernel())
+ reboot_type = BOOT_ACPI;
+}
+
+/*
+ * There is a small amount of UV specific code needed to initialize a
+ * UV system that does not have a "UV HUB" (referred to as "hubless").
+ *
+ * Systems with a hub go through uv_system_init_hub(), hubless ones
+ * through uv_nmi_setup_hubless(); on anything else this does nothing.
+ */
+void __init uv_system_init(void)
+{
+	/* The common case: neither kind of UV system */
+	if (likely(!is_uv_system() && !is_uv_hubless()))
+		return;
+
+	if (!is_uv_system()) {
+		uv_nmi_setup_hubless();
+		return;
+	}
+
+	uv_system_init_hub();
+}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 0000000..f7151cd
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2447 @@
+/* -*- linux-c -*-
+ * APM BIOS driver for Linux
+ * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
+ *
+ * Initial development of this driver was funded by NEC Australia P/L
+ * and NEC Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * October 1995, Rik Faith (faith@cs.unc.edu):
+ * Minor enhancements and updates (to the patch set) for 1.3.x
+ * Documentation
+ * January 1996, Rik Faith (faith@cs.unc.edu):
+ * Make /proc/apm easy to format (bump driver version)
+ * March 1996, Rik Faith (faith@cs.unc.edu):
+ * Prohibit APM BIOS calls unless apm_enabled.
+ * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
+ * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
+ * Version 1.0 and 1.1
+ * May 1996, Version 1.2
+ * Feb 1998, Version 1.3
+ * Feb 1998, Version 1.4
+ * Aug 1998, Version 1.5
+ * Sep 1998, Version 1.6
+ * Nov 1998, Version 1.7
+ * Jan 1999, Version 1.8
+ * Jan 1999, Version 1.9
+ * Oct 1999, Version 1.10
+ * Nov 1999, Version 1.11
+ * Jan 2000, Version 1.12
+ * Feb 2000, Version 1.13
+ * Nov 2000, Version 1.14
+ * Oct 2001, Version 1.15
+ * Jan 2002, Version 1.16
+ * Oct 2002, Version 1.16ac
+ *
+ * History:
+ * 0.6b: first version in official kernel, Linux 1.3.46
+ * 0.7: changed /proc/apm format, Linux 1.3.58
+ * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
+ * 0.9: only call bios if bios is present, Linux 1.3.72
+ * 1.0: use fixed device number, consolidate /proc/apm into this file,
+ * Linux 1.3.85
+ * 1.1: support user-space standby and suspend, power off after system
+ * halted, Linux 1.3.98
+ * 1.2: When resetting RTC after resume, take care so that the time
+ * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
+ * <jtoth@princeton.edu>); improve interaction between
+ * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
+ * 1.2a:Simple change to stop mysterious bug reports with SMP also added
+ * levels to the printk calls. APM is not defined for SMP machines.
+ * The new replacement for it is, but Linux doesn't yet support this.
+ * Alan Cox Linux 2.1.55
+ * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
+ * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
+ * Dean Gaudet <dgaudet@arctic.org>.
+ * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
+ * 1.5: Fix segment register reloading (in case of bad segments saved
+ * across BIOS call).
+ * Stephen Rothwell
+ * 1.6: Cope with compiler/assembler differences.
+ * Only try to turn off the first display device.
+ * Fix OOPS at power off with no APM BIOS by Jan Echternach
+ * <echter@informatik.uni-rostock.de>
+ * Stephen Rothwell
+ * 1.7: Modify driver's cached copy of the disabled/disengaged flags
+ * to reflect current state of APM BIOS.
+ * Chris Rankin <rankinc@bellsouth.net>
+ * Reset interrupt 0 timer to 100Hz after suspend
+ * Chad Miller <cmiller@surfsouth.com>
+ * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * Allow boot time disabling of APM
+ * Make boot messages far less verbose by default
+ * Make asm safer
+ * Stephen Rothwell
+ * 1.8: Add CONFIG_APM_RTC_IS_GMT
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
+ * remove dependency on CONFIG_PROC_FS
+ * Stephen Rothwell
+ * 1.9: Fix small typo. <laslo@wodip.opole.pl>
+ * Try to cope with BIOS's that need to have all display
+ * devices blanked and not just the first one.
+ * Ross Paterson <ross@soi.city.ac.uk>
+ * Fix segment limit setting it has always been wrong as
+ * the segments needed to have byte granularity.
+ * Mark a few things __init.
+ * Add hack to allow power off of SMP systems by popular request.
+ * Use CONFIG_SMP instead of __SMP__
+ * Ignore BOUNCES for three seconds.
+ * Stephen Rothwell
+ * 1.10: Fix for Thinkpad return code.
+ * Merge 2.2 and 2.3 drivers.
+ * Remove APM dependencies in arch/i386/kernel/process.c
+ * Remove APM dependencies in drivers/char/sysrq.c
+ * Reset time across standby.
+ * Allow more initialisation on SMP.
+ * Remove CONFIG_APM_POWER_OFF and make it boot time
+ * configurable (default on).
+ * Make debug only a boot time parameter (remove APM_DEBUG).
+ * Try to blank all devices on any error.
+ * 1.11: Remove APM dependencies in drivers/char/console.c
+ * Check nr_running to detect if we are idle (from
+ * Borislav Deianov <borislav@lix.polytechnique.fr>)
+ * Fix for bioses that don't zero the top part of the
+ * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
+ * (reported by Panos Katsaloulis <teras@writeme.com>).
+ * Real mode power off patch (Walter Hofmann
+ * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
+ * 1.12: Remove CONFIG_SMP as the compiler will optimize
+ * the code away anyway (smp_num_cpus == 1 in UP)
+ * noted by Artur Skawina <skawina@geocities.com>.
+ * Make power off under SMP work again.
+ * Fix thinko with initial engaging of BIOS.
+ * Make sure power off only happens on CPU 0
+ * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
+ * Do error notification to user mode if BIOS calls fail.
+ * Move entrypoint offset fix to ...boot/setup.S
+ * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
+ * Remove smp-power-off. SMP users must now specify
+ * "apm=power-off" on the kernel command line. Suggested
+ * by Jim Avera <jima@hal.com>, modified by Alan Cox
+ * <alan@lxorguk.ukuu.org.uk>.
+ * Register the /proc/apm entry even on SMP so that
+ * scripts that check for it before doing power off
+ * work (Jim Avera <jima@hal.com>).
+ * 1.13: Changes for new pm_ interfaces (Andy Henroid
+ * <andy_henroid@yahoo.com>).
+ * Modularize the code.
+ * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
+ * is now the way life works).
+ * Fix thinko in suspend() (wrong return).
+ * Notify drivers on critical suspend.
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
+ * modified by sfr).
+ * Disable interrupts while we are suspended (Andy Henroid
+ * <andy_henroid@yahoo.com> fixed by sfr).
+ * Make power off work on SMP again (Tony Hoyle
+ * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
+ * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
+ * interval is now configurable.
+ * 1.14: Make connection version persist across module unload/load.
+ * Enable and engage power management earlier.
+ * Disengage power management on module unload.
+ * Changed to use the sysrq-register hack for registering the
+ * power off function called by magic sysrq based upon discussions
+ * in irc://irc.openprojects.net/#kernelnewbies
+ * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
+ * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
+ * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
+ * Work around byte swap bug in one of the Vaio's BIOS's
+ * (Marc Boucher <marc@mbsi.ca>).
+ * Exposed the disable flag to dmi so that we can handle known
+ * broken APM (Alan Cox <alan@lxorguk.ukuu.org.uk>).
+ * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
+ * calling it - instead idle. (Alan Cox <alan@lxorguk.ukuu.org.uk>)
+ * If an APM idle fails log it and idle sensibly
+ * 1.15: Don't queue events to clients who open the device O_WRONLY.
+ * Don't expect replies from clients who open the device O_RDONLY.
+ * (Idea from Thomas Hood)
+ * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
+ * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
+ * Notify listeners of standby or suspend events before notifying
+ * drivers. Return EBUSY to ioctl() if suspend is rejected.
+ * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
+ * Ignore first resume after we generate our own resume event
+ * after a suspend (Thomas Hood)
+ * Daemonize now gets rid of our controlling terminal (sfr).
+ * CONFIG_APM_CPU_IDLE now just affects the default value of
+ * idle_threshold (sfr).
+ * Change name of kernel apm daemon (as it no longer idles) (sfr).
+ * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
+ * make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
+ * TODO: determine if it's "boot CPU" or "CPU0" we want to lock to.
+ *
+ * APM 1.1 Reference:
+ *
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
+ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
+ *
+ * [This document is available free from Intel by calling 800.628.8686 (fax
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
+ * available from Microsoft by calling 206.882.8080.]
+ *
+ * APM 1.2 Reference:
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
+ *
+ * [This document is available from Microsoft at:
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx]
+ */
+
+#define pr_fmt(fmt) "apm: " fmt
+
+#include <linux/module.h>
+
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/timer.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/apm_bios.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
+#include <linux/pm.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/freezer.h>
+#include <linux/smp.h>
+#include <linux/dmi.h>
+#include <linux/suspend.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h>
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
+#include <linux/cpuidle.h>
+
+#include <linux/uaccess.h>
+#include <asm/desc.h>
+#include <asm/olpc.h>
+#include <asm/paravirt.h>
+#include <asm/reboot.h>
+#include <asm/nospec-branch.h>
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+extern int (*console_blank_hook)(int);
+#endif
+
+/*
+ * The apm_bios device is one of the misc char devices.
+ * This is its minor number.
+ */
+#define APM_MINOR_DEV 134
+
+/*
+ * Various options can be changed at boot time as follows:
+ * (We allow underscores for compatibility with the modules code)
+ * apm=on/off enable/disable APM
+ * [no-]allow[-_]ints allow interrupts during BIOS calls
+ * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
+ * [no-]realmode[-_]power[-_]off switch to real mode before
+ * powering off
+ * [no-]debug log some debugging messages
+ * [no-]power[-_]off power off on shutdown
+ * [no-]smp Use apm even on an SMP box
+ * bounce[-_]interval=<n> number of ticks to ignore suspend
+ * bounces
+ * idle[-_]threshold=<n> System idle percentage above which to
+ * make APM BIOS idle calls. Set it to
+ * 100 to disable.
+ * idle[-_]period=<n> Period (in 1/100s of a second) over
+ * which the idle percentage is
+ * calculated.
+ */
+
+/* KNOWN PROBLEM MACHINES:
+ *
+ * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
+ * [Confirmed by TI representative]
+ * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
+ * [Confirmed by BIOS disassembly]
+ * [This may work now ...]
+ * P: Toshiba 1950S: battery life information only gets updated after resume
+ * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
+ * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
+ * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
+ * Neale Banks <neale@lowendale.com.au> December 2000
+ *
+ * Legend: U = unusable with APM patches
+ * P = partially usable with APM patches
+ */
+
+/*
+ * Define as 1 to make the driver always call the APM BIOS busy
+ * routine even if the clock was not reported as slowed by the
+ * idle routine. Otherwise, define as 0.
+ */
+#define ALWAYS_CALL_BUSY 1
+
+/*
+ * Define to make the APM BIOS calls zero all data segment registers (so
+ * that an incorrect BIOS implementation will cause a kernel panic if it
+ * tries to write to arbitrary memory).
+ */
+#define APM_ZERO_SEGS
+
+#include <asm/apm.h>
+
+/*
+ * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
+ * This patched by Chad Miller <cmiller@surfsouth.com>, original code by
+ * David Chen <chen@ctpa04.mit.edu>
+ */
+#undef INIT_TIMER_AFTER_SUSPEND
+
+#ifdef INIT_TIMER_AFTER_SUSPEND
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#endif
+
+/*
+ * Need to poll the APM BIOS every second
+ */
+#define APM_CHECK_TIMEOUT (HZ)
+
+/*
+ * Ignore suspend events for this amount of time after a resume
+ */
+#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
+
+/*
+ * Maximum number of events stored
+ */
+#define APM_MAX_EVENTS 20
+
+/*
+ * The per-file APM data
+ *
+ * NOTE(review): the per-field notes below are inferred from the field names
+ * and the constants defined next to this struct; the code that uses the
+ * fields lives further down the file - confirm before relying on them.
+ */
+struct apm_user {
+ int magic; /* presumably holds APM_BIOS_MAGIC - confirm */
+ struct apm_user *next; /* link to another apm_user */
+ unsigned int suser: 1;
+ unsigned int writer: 1;
+ unsigned int reader: 1;
+ unsigned int suspend_wait: 1;
+ int suspend_result;
+ int suspends_pending;
+ int standbys_pending;
+ int suspends_read;
+ int standbys_read;
+ int event_head; /* presumably an index into events[] - confirm */
+ int event_tail; /* presumably an index into events[] - confirm */
+ apm_event_t events[APM_MAX_EVENTS]; /* room for APM_MAX_EVENTS events */
+};
+
+/*
+ * The magic number in apm_user
+ */
+#define APM_BIOS_MAGIC 0x4101
+
+/*
+ * idle percentage above which bios idle calls are done
+ */
+#ifdef CONFIG_APM_CPU_IDLE
+#define DEFAULT_IDLE_THRESHOLD 95
+#else
+#define DEFAULT_IDLE_THRESHOLD 100
+#endif
+#define DEFAULT_IDLE_PERIOD (100 / 3)
+
+/* Forward declaration: apm_idle_driver below refers to it before its definition. */
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
+
+/*
+ * cpuidle driver description with two states: an empty entry 0 (polling)
+ * and entry 1, whose enter callback is apm_cpu_idle().  The latency and
+ * residency figures are marked as guesses ("WAG") by the original author.
+ */
+static struct cpuidle_driver apm_idle_driver = {
+ .name = "apm_idle",
+ .owner = THIS_MODULE,
+ .states = {
+ { /* entry 0 is for polling */ },
+ { /* entry 1 is for APM idle */
+ .name = "APM",
+ .desc = "APM idle",
+ .exit_latency = 250, /* WAG */
+ .target_residency = 500, /* WAG */
+ .enter = &apm_cpu_idle
+ },
+ },
+ .state_count = 2, /* matches the two entries in .states above */
+};
+
+/* NOTE(review): presumably the single device registered with apm_idle_driver - registration is not in this part of the file. */
+static struct cpuidle_device apm_cpuidle_device;
+
+/*
+ * Local variables
+ *
+ * NOTE(review): several of these share their names with the boot options
+ * listed near the top of the file (allow-ints, broken-psr, debug, smp,
+ * power-off, realmode-power-off, bounce-interval, idle-threshold,
+ * idle-period); the code that wires them to those options is not in this
+ * part of the file - confirm.
+ */
+/* APM BIOS entry point as an offset/segment pair. */
+__visible struct {
+ unsigned long offset;
+ unsigned short segment;
+} apm_bios_entry;
+/* Set by apm_do_idle() from APM_IDLE_SLOWS_CLOCK, cleared by apm_do_busy(). */
+static int clock_slowed;
+/* apm_cpu_idle() uses APM idle only when the idle percentage exceeds this. */
+static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
+/* Number of jiffies after which apm_cpu_idle() recomputes the idle percentage. */
+static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int suspends_pending;
+static int standbys_pending;
+static int ignore_sys_suspend;
+static int ignore_normal_resume;
+static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
+
+static bool debug __read_mostly;
+static bool smp __read_mostly;
+static int apm_disabled = -1;
+/* Defaults to off on CONFIG_SMP builds and to on otherwise. */
+#ifdef CONFIG_SMP
+static bool power_off;
+#else
+static bool power_off = 1;
+#endif
+static bool realmode_power_off;
+/* Defaults to on only when CONFIG_APM_ALLOW_INTS is set. */
+#ifdef CONFIG_APM_ALLOW_INTS
+static bool allow_ints = 1;
+#else
+static bool allow_ints;
+#endif
+static bool broken_psr;
+
+static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
+static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
+/* NOTE(review): presumably the head of the apm_user list chained via ->next and guarded by user_list_lock - confirm. */
+static struct apm_user *user_list;
+static DEFINE_SPINLOCK(user_list_lock);
+static DEFINE_MUTEX(apm_mutex);
+
+/*
+ * Set up a segment that references the real mode segment 0x40
+ * that extends up to the end of page zero (that we have reserved).
+ * This is for buggy BIOS's that refer to (real mode) segment 0x40
+ * even though they are called in protected mode.
+ */
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+ (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
+
+static const char driver_version[] = "1.16ac"; /* no spaces */
+
+static struct task_struct *kapmd_task;
+
+/*
+ * APM event names taken from the APM 1.2 specification. These are
+ * the message codes that the BIOS uses to tell us about events
+ */
+static const char * const apm_event_name[] = {
+ "system standby",
+ "system suspend",
+ "normal resume",
+ "critical resume",
+ "low battery",
+ "power status change",
+ "update time",
+ "critical suspend",
+ "user standby",
+ "user suspend",
+ "system standby resume",
+ "capabilities change"
+};
+#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
+
+/* One (numeric key, message text) pair of a lookup table. */
+typedef struct lookup_t {
+ int key;
+ char *msg;
+} lookup_t;
+
+/*
+ * The BIOS returns a set of standard error codes in AX when the
+ * carry flag is set.
+ *
+ * apm_error() searches this table linearly by key to turn a code into
+ * text.  The entries marked N/A are deliberately commented out.
+ */
+
+static const lookup_t error_table[] = {
+/* N/A { APM_SUCCESS, "Operation succeeded" }, */
+ { APM_DISABLED, "Power management disabled" },
+ { APM_CONNECTED, "Real mode interface already connected" },
+ { APM_NOT_CONNECTED, "Interface not connected" },
+ { APM_16_CONNECTED, "16 bit interface already connected" },
+/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
+ { APM_32_CONNECTED, "32 bit interface already connected" },
+ { APM_32_UNSUPPORTED, "32 bit interface not supported" },
+ { APM_BAD_DEVICE, "Unrecognized device ID" },
+ { APM_BAD_PARAM, "Parameter out of range" },
+ { APM_NOT_ENGAGED, "Interface not engaged" },
+ { APM_BAD_FUNCTION, "Function not supported" },
+ { APM_RESUME_DISABLED, "Resume timer disabled" },
+ { APM_BAD_STATE, "Unable to enter requested state" },
+/* N/A { APM_NO_EVENTS, "No events pending" }, */
+ { APM_NO_ERROR, "BIOS did not set a return code" },
+ { APM_NOT_PRESENT, "No APM present" }
+};
+/* Number of entries in error_table. */
+#define ERROR_COUNT ARRAY_SIZE(error_table)
+
+/**
+ *	apm_error	-	report an APM failure in the kernel log
+ *	@str: text identifying the operation that failed
+ *	@err: APM BIOS return code, or a negative kernel error code
+ *
+ *	Codes found in error_table are logged with their message text.
+ *	Anything else is logged numerically: negative values as Linux error
+ *	codes, the rest as unknown codes in hex.
+ */
+static void apm_error(char *str, int err)
+{
+	const lookup_t *entry = error_table;
+	const lookup_t *end = error_table + ERROR_COUNT;
+
+	for (; entry != end; entry++) {
+		if (entry->key != err)
+			continue;
+		pr_notice("%s: %s\n", str, entry->msg);
+		return;
+	}
+
+	if (err < 0)
+		pr_notice("%s: linux error code %i\n", str, err);
+	else
+		pr_notice("%s: unknown error code %#2.2x\n", str, err);
+}
+
+/*
+ * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
+ * apm_info.allow_ints, we are being really paranoid here! Not only
+ * are interrupts disabled, but all the segment registers (except SS)
+ * are saved and zeroed; this means that if the BIOS tries to reference
+ * any data without explicitly loading the segment registers, the kernel
+ * will fault immediately rather than have some unforeseen circumstances
+ * for the rest of the kernel. And it will be very obvious! :-) Doing
+ * this depends on CS referring to the same physical memory as DS so that
+ * DS can be zeroed before the call. Unfortunately, we can't do anything
+ * about the stack segment/pointer. Also, we tell the compiler that
+ * everything could change.
+ *
+ * Also, we KNOW that for the non error case of apm_bios_call, there
+ * is no useful data returned in the low order 8 bits of eax.
+ */
+
+/*
+ * Capture the current interrupt flags and then put interrupts into the
+ * state wanted for a BIOS call: off unless apm_info.allow_ints is set, in
+ * which case they are switched on if they were off.  The captured flags
+ * are returned so that apm_irq_restore() can undo the change.
+ */
+static inline unsigned long __apm_irq_save(void)
+{
+	unsigned long saved;
+
+	local_save_flags(saved);
+	if (!apm_info.allow_ints)
+		local_irq_disable();
+	else if (irqs_disabled_flags(saved))
+		local_irq_enable();
+
+	return saved;
+}
+
+/* Statement form of __apm_irq_save(): stores the saved flags in @flags. */
+#define apm_irq_save(flags) \
+	do { flags = __apm_irq_save(); } while (0)
+
+/*
+ * Put interrupts back into the state recorded in @flags by apm_irq_save():
+ * disable them if they were disabled then, otherwise enable them if they
+ * are disabled now.
+ */
+static inline void apm_irq_restore(unsigned long flags)
+{
+	if (irqs_disabled_flags(flags)) {
+		local_irq_disable();
+		return;
+	}
+
+	if (irqs_disabled())
+		local_irq_enable();
+}
+
+/*
+ * Helpers used around the BIOS call.  With APM_ZERO_SEGS defined they
+ * declare two locals and save/restore the fs and gs segment registers;
+ * otherwise they expand to nothing.  APM_DECL_SEGS carries its own
+ * trailing semicolon, so it is written without one at the point of use.
+ */
+#ifdef APM_ZERO_SEGS
+# define APM_DECL_SEGS \
+ unsigned int saved_fs; unsigned int saved_gs;
+# define APM_DO_SAVE_SEGS \
+ savesegment(fs, saved_fs); savesegment(gs, saved_gs)
+# define APM_DO_RESTORE_SEGS \
+ loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
+#else
+# define APM_DECL_SEGS
+# define APM_DO_SAVE_SEGS
+# define APM_DO_RESTORE_SEGS
+#endif
+
+/*
+ * Register image for one APM BIOS call: the function code and input
+ * registers going in, the output registers coming back, plus the error
+ * value that on_cpu0() derives from the result.
+ */
+struct apm_bios_call {
+ u32 func; /* APM function code to invoke */
+ /* In and out */
+ u32 ebx;
+ u32 ecx;
+ /* Out only */
+ u32 eax;
+ u32 edx;
+ u32 esi;
+
+ /* Error: -ENOMEM, or bits 8-15 of eax */
+ int err;
+};
+
+/**
+ * __apm_bios_call - Make an APM BIOS 32bit call
+ * @_call: pointer to struct apm_bios_call.
+ *
+ * Make an APM call using the 32bit protected mode interface. The
+ * caller is responsible for knowing if APM BIOS is configured and
+ * enabled. This call can disable interrupts for a long period of
+ * time on some laptops. The return value is in AH and the carry
+ * flag is loaded into AL. If there is an error, then the error
+ * code is returned in AH (bits 8-15 of eax) and this function
+ * returns non-zero.
+ *
+ * The value returned is the low 8 bits of the resulting eax. The
+ * output registers are written back into the struct that @_call
+ * points to.
+ *
+ * Note: this makes the call on the current CPU, which must be
+ * CPU 0 (enforced with BUG_ON).
+ */
+static long __apm_bios_call(void *_call)
+{
+ APM_DECL_SEGS
+ unsigned long flags;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+ struct apm_bios_call *call = _call;
+
+ cpu = get_cpu();
+ BUG_ON(cpu != 0);
+ /*
+ * Temporarily replace the GDT entry for selector 0x40 with
+ * bad_bios_desc (for BIOSes that refer to real mode segment 0x40);
+ * the previous entry is put back after the call.
+ */
+ gdt = get_cpu_gdt_rw(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ /* Set up irq state, speculation restriction and segments, then undo in reverse order. */
+ apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ APM_DO_SAVE_SEGS;
+ apm_bios_call_asm(call->func, call->ebx, call->ecx,
+ &call->eax, &call->ebx, &call->ecx, &call->edx,
+ &call->esi);
+ APM_DO_RESTORE_SEGS;
+ firmware_restrict_branch_speculation_end();
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+
+ /* Low byte only; on_cpu0() extracts the error code from bits 8-15. */
+ return call->eax & 0xff;
+}
+
+/*
+ * Run fn (__apm_bios_call or __apm_bios_call_simple) on CPU 0 and record
+ * the outcome in call->err: the negative value itself when one came back,
+ * otherwise bits 8-15 of call->eax.  Returns whatever fn/work_on_cpu
+ * returned.
+ */
+static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call)
+{
+	int ret;
+
+	/*
+	 * When already on CPU 0 (the common case) call fn directly, so
+	 * that work_on_cpu() with its overhead and possible OOM failure
+	 * is not involved.
+	 */
+	if (get_cpu() != 0) {
+		put_cpu();
+		ret = work_on_cpu(0, fn, call);
+	} else {
+		ret = fn(call);
+		put_cpu();
+	}
+
+	if (ret >= 0)
+		call->err = (call->eax >> 8) & 0xff;
+	else
+		call->err = ret;	/* work_on_cpu can fail with -ENOMEM */
+
+	return ret;
+}
+
+/**
+ * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0)
+ * @call: the apm_bios_call registers.
+ *
+ * Runs __apm_bios_call() through on_cpu0(). The output registers
+ * are written back into @call. If there is an error, it is
+ * returned in @call.err and the return value is non-zero.
+ */
+static int apm_bios_call(struct apm_bios_call *call)
+{
+ return on_cpu0(__apm_bios_call, call);
+}
+
+/**
+ * __apm_bios_call_simple - Make a simple APM BIOS 32bit call
+ * @_call: pointer to struct apm_bios_call.
+ *
+ * Make a BIOS call that returns one value only, or just status.
+ * If there is an error, then the error code is returned in AH
+ * (bits 8-15 of eax) and this function returns non-zero. This
+ * function itself returns only the 8-bit status produced by
+ * apm_bios_call_simple_asm(); a negative value such as -ENOMEM
+ * can only come from the on_cpu0() wrapper. This is used for
+ * simpler BIOS operations. This call may hold interrupts off for
+ * a long time on some laptops.
+ *
+ * Note: this makes the call on the current CPU, which must be
+ * CPU 0 (enforced with BUG_ON).
+ */
+static long __apm_bios_call_simple(void *_call)
+{
+ u8 error;
+ APM_DECL_SEGS
+ unsigned long flags;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+ struct apm_bios_call *call = _call;
+
+ cpu = get_cpu();
+ BUG_ON(cpu != 0);
+ /*
+ * Temporarily replace the GDT entry for selector 0x40 with
+ * bad_bios_desc; the previous entry is put back after the call.
+ */
+ gdt = get_cpu_gdt_rw(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ /* Set up irq state, speculation restriction and segments, then undo in reverse order. */
+ apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ APM_DO_SAVE_SEGS;
+ /* Only eax is written back into the call structure here. */
+ error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
+ &call->eax);
+ APM_DO_RESTORE_SEGS;
+ firmware_restrict_branch_speculation_end();
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+ return error;
+}
+
+/**
+ *	apm_bios_call_simple	-	make a simple APM BIOS 32bit call
+ *	@func: APM function to invoke
+ *	@ebx_in: EBX register value for BIOS call
+ *	@ecx_in: ECX register value for BIOS call
+ *	@eax: EAX register on return from the BIOS call
+ *	@err: error value recorded by on_cpu0(): a negative kernel error
+ *	      code, or bits 8-15 of EAX
+ *
+ *	Make a BIOS call that returns one value only, or just status.
+ *	If there is an error, then the error code is returned in @err
+ *	and this function returns non-zero.  This is used for simpler
+ *	BIOS operations.  This call may hold interrupts off for a long
+ *	time on some laptops.
+ */
+static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax,
+				int *err)
+{
+	/*
+	 * Fields not named here start out as zero, so that *eax below is
+	 * well defined even when on_cpu0() fails before the BIOS call has
+	 * run (e.g. work_on_cpu() returning -ENOMEM).
+	 */
+	struct apm_bios_call call = {
+		.func	= func,
+		.ebx	= ebx_in,
+		.ecx	= ecx_in,
+	};
+	int ret;
+
+	ret = on_cpu0(__apm_bios_call_simple, &call);
+	*eax = call.eax;
+	*err = call.err;
+	return ret;
+}
+
+/**
+ *	apm_driver_version	-	exchange APM version numbers with the BIOS
+ *	@val: in: the APM driver version; out: the version the BIOS supports
+ *
+ *	Both directions use the same encoding: major number in the high
+ *	byte, minor number in the low byte, both in BCD.
+ *
+ *	This is only supported for APM 1.1 or higher, so an error
+ *	indicates that APM 1.0 is probably present.  On error the APM
+ *	error code is returned and @val is left untouched; on success
+ *	APM_SUCCESS is returned.
+ */
+static int apm_driver_version(u_short *val)
+{
+	u32 bios_version;
+	int err;
+
+	if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val,
+				 &bios_version, &err) != 0)
+		return err;
+
+	*val = bios_version;
+	return APM_SUCCESS;
+}
+
+/**
+ *	apm_get_event	-	get an APM event from the BIOS
+ *	@event: pointer to the event
+ *	@info: pointer to the event information
+ *
+ *	The APM BIOS provides a polled interface for event reporting.
+ *	The BIOS expects to be polled at least every second when events
+ *	are pending.  When a message is found the caller should poll
+ *	until no more messages are present.  However, this causes
+ *	problems on some laptops where a suspend event notification is
+ *	not cleared until it is acknowledged.
+ *
+ *	Additional information is returned through @info, provided that
+ *	APM 1.2 is in use; with an older connection version @info is set
+ *	to all ones to mark it as not valid.  If no messages are pending
+ *	the value 0x80 is returned (No power management events pending).
+ */
+static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
+{
+	struct apm_bios_call call;
+
+	call.func = APM_FUNC_GET_EVENT;
+	call.ebx = 0;
+	call.ecx = 0;
+
+	if (apm_bios_call(&call) != 0)
+		return call.err;
+
+	*event = call.ebx;
+	if (apm_info.connection_version >= 0x0102)
+		*info = call.ecx;
+	else
+		*info = ~0;	/* indicate info not valid */
+
+	return APM_SUCCESS;
+}
+
+/**
+ *	set_power_state	-	set the power management state
+ *	@what: which items to transition
+ *	@state: state to transition to
+ *
+ *	Ask the APM BIOS to change the state of one or more system
+ *	devices.  The processor state must be transitioned last of all.
+ *	@what carries the device class in its upper byte and the device
+ *	number (0xFF for all) in its lower byte.
+ *
+ *	@state is the state to move to, which may in fact be an
+ *	acceptance of a state change requested by the BIOS.
+ *
+ *	Returns APM_SUCCESS, or the error code of the failed call.
+ */
+static int set_power_state(u_short what, u_short state)
+{
+	u32 unused_eax;
+	int err;
+	int failed;
+
+	failed = apm_bios_call_simple(APM_FUNC_SET_STATE, what, state,
+				      &unused_eax, &err);
+	return failed ? err : APM_SUCCESS;
+}
+
+/**
+ * set_system_power_state - set system wide power state
+ * @state: which state to enter
+ *
+ * Transition the entire system into a new APM power state by
+ * calling set_power_state() for APM_DEVICE_ALL. Returns what
+ * set_power_state() returns.
+ */
+
+static int set_system_power_state(u_short state)
+{
+ return set_power_state(APM_DEVICE_ALL, state);
+}
+
+/**
+ *	apm_do_idle	-	perform power saving
+ *
+ *	This function notifies the BIOS that the processor is (in the view
+ *	of the OS) idle.  It returns -1 in the event that the BIOS refuses
+ *	to handle the idle request.  On a success the function returns 1
+ *	if the BIOS did clock slowing or 0 otherwise.  0 is also returned,
+ *	without calling the BIOS, when a reschedule is already pending.
+ */
+static int apm_do_idle(void)
+{
+	static unsigned long nr_failures;
+	u32 eax;
+	int err = 0;
+
+	if (need_resched())
+		return 0;
+
+	/*
+	 * Test the full int result: narrowing it to 8 bits could turn a
+	 * negative error from the CPU 0 dispatch into "success".
+	 */
+	if (apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err)) {
+		/*
+		 * This always fails on some SMP boards running UP kernels.
+		 * Only report the failure the first 5 times.
+		 */
+		if (nr_failures < 5) {
+			nr_failures++;
+			printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err);
+		}
+		return -1;
+	}
+
+	clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
+	return clock_slowed;
+}
+
+/**
+ *	apm_do_busy	-	inform the BIOS the CPU is busy
+ *
+ *	Request that the BIOS brings the CPU back to full performance.
+ *	The call is made when the clock was reported as slowed, or
+ *	unconditionally when ALWAYS_CALL_BUSY is non-zero; its result
+ *	is ignored.
+ */
+static void apm_do_busy(void)
+{
+	u32 unused_eax;
+	int unused_err;
+
+	if (!clock_slowed && !ALWAYS_CALL_BUSY)
+		return;
+
+	(void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0,
+				   &unused_eax, &unused_err);
+	clock_slowed = 0;
+}
+
+/*
+ * If no process has really been interested in
+ * the CPU for some time, we want to call BIOS
+ * power management - we probably want
+ * to conserve power.
+ */
+#define IDLE_CALC_LIMIT (HZ * 100)
+#define IDLE_LEAKY_MAX 16
+
+/**
+ * apm_cpu_idle - cpu idling for APM capable Linux
+ * @dev: cpuidle device (not used by this function)
+ * @drv: cpuidle driver (not used by this function)
+ * @index: index of the state being entered; returned unchanged
+ *
+ * This is the idling function the kernel executes when APM is available. It
+ * tries to do BIOS powermanagement based on the average system idle time.
+ * Furthermore it calls the system default idle routine.
+ *
+ * The decision whether to use the BIOS idle call is kept in static
+ * variables and therefore carries over from one invocation to the next.
+ */
+
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
+{
+ static int use_apm_idle; /* = 0 */
+ static unsigned int last_jiffies; /* = 0 */
+ static u64 last_stime; /* = 0 */
+ u64 stime, utime; /* utime is fetched but not used */
+
+ int apm_idle_done = 0;
+ unsigned int jiffies_since_last_check = jiffies - last_jiffies;
+ unsigned int bucket;
+
+recalc:
+ task_cputime(current, &utime, &stime);
+ if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
+ /* Too long since the last sample to be meaningful: do not use APM idle. */
+ use_apm_idle = 0;
+ } else if (jiffies_since_last_check > idle_period) {
+ unsigned int idle_percentage;
+
+ /*
+ * System time accumulated by the current task since the last
+ * sample, as a percentage of the jiffies elapsed since then.
+ * NOTE(review): current is presumably the idle task here - confirm.
+ */
+ idle_percentage = nsecs_to_jiffies(stime - last_stime);
+ idle_percentage *= 100;
+ idle_percentage /= jiffies_since_last_check;
+ use_apm_idle = (idle_percentage > idle_threshold);
+ if (apm_info.forbid_idle)
+ use_apm_idle = 0;
+ }
+
+ last_jiffies = jiffies;
+ last_stime = stime;
+
+ bucket = IDLE_LEAKY_MAX;
+
+ while (!need_resched()) {
+ if (use_apm_idle) {
+ unsigned int t;
+
+ t = jiffies;
+ switch (apm_do_idle()) {
+ case 0:
+ /*
+ * apm_do_idle() returned 0. While the bucket is not
+ * empty, go round again without default_idle(): refill
+ * the bucket if jiffies advanced during the call, drain
+ * it by one otherwise. An empty bucket falls through to
+ * default_idle().
+ */
+ apm_idle_done = 1;
+ if (t != jiffies) {
+ if (bucket) {
+ bucket = IDLE_LEAKY_MAX;
+ continue;
+ }
+ } else if (bucket) {
+ bucket--;
+ continue;
+ }
+ break;
+ case 1:
+ apm_idle_done = 1;
+ break;
+ default: /* BIOS refused */
+ break;
+ }
+ }
+ default_idle();
+ /* NOTE(review): presumably default_idle() returns with interrupts enabled - confirm. */
+ local_irq_disable();
+ jiffies_since_last_check = jiffies - last_jiffies;
+ if (jiffies_since_last_check > idle_period)
+ goto recalc;
+ }
+
+ /* An APM idle call was made at some point: tell the BIOS we are busy again. */
+ if (apm_idle_done)
+ apm_do_busy();
+
+ return index;
+}
+
+/**
+ *	apm_power_off	-	ask the BIOS to power off
+ *
+ *	Handle the power off sequence.  This is the one piece of code we
+ *	will execute even on SMP machines.  To cope with BIOS bugs a real
+ *	mode APM power off is supported as well: when
+ *	apm_info.realmode_power_off is set, the task is first bound to
+ *	CPU 0 (some systems only honour the call on their first cpu) and
+ *	the machine is then restarted into real mode with MRR_APM.
+ *	Otherwise the protected mode interface is asked for APM_STATE_OFF.
+ */
+static void apm_power_off(void)
+{
+	if (!apm_info.realmode_power_off) {
+		(void)set_system_power_state(APM_STATE_OFF);
+		return;
+	}
+
+	/* Some bioses don't like being called from CPU != 0 */
+	set_cpus_allowed_ptr(current, cpumask_of(0));
+	machine_real_restart(MRR_APM);
+}
+
+#ifdef CONFIG_APM_DO_ENABLE
+
+/**
+ *	apm_enable_power_management	-	enable BIOS APM power management
+ *	@enable: non-zero to enable, zero to disable
+ *
+ *	Enable or disable the APM BIOS power services and mirror the new
+ *	state in the APM_BIOS_DISABLED bit of apm_info.bios.flags.
+ *	Disabling while the BIOS is disengaged is refused with
+ *	APM_NOT_ENGAGED without calling the BIOS.  Returns APM_SUCCESS or
+ *	the error code of the failed call.
+ */
+static int apm_enable_power_management(int enable)
+{
+	u32 unused_eax;
+	int err;
+
+	if (!enable && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
+		return APM_NOT_ENGAGED;
+
+	if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
+				 enable, &unused_eax, &err))
+		return err;
+
+	if (!enable)
+		apm_info.bios.flags |= APM_BIOS_DISABLED;
+	else
+		apm_info.bios.flags &= ~APM_BIOS_DISABLED;
+
+	return APM_SUCCESS;
+}
+#endif
+
+/**
+ *	apm_get_power_status	-	get current power state
+ *	@status: returned status
+ *	@bat: battery info
+ *	@life: estimated life
+ *
+ *	Obtain the current power status from the APM BIOS.  @status gives
+ *	the rough battery status and the current power source.  @bat gives
+ *	an estimate of the remaining life as a percentage together with a
+ *	status value for the battery.  @life, if reported, is the estimated
+ *	lifetime in seconds/minutes at the current power consumption.
+ *
+ *	When apm_info.get_power_status_broken is set the BIOS is not called
+ *	and APM_32_UNSUPPORTED is returned.  A failed call that left no
+ *	error code behind is reported as APM_NO_ERROR.  When
+ *	apm_info.get_power_status_swabinminutes is set the reported life is
+ *	byte-swapped and bit 15 is forced on.
+ */
+static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
+{
+	struct apm_bios_call call;
+
+	if (apm_info.get_power_status_broken)
+		return APM_32_UNSUPPORTED;
+
+	call.func = APM_FUNC_GET_STATUS;
+	call.ebx = APM_DEVICE_ALL;
+	call.ecx = 0;
+
+	if (apm_bios_call(&call))
+		return call.err ? call.err : APM_NO_ERROR;
+
+	*status = call.ebx;
+	*bat = call.ecx;
+	if (!apm_info.get_power_status_swabinminutes)
+		*life = call.edx;
+	else
+		*life = swab16((u16)call.edx) | 0x8000;
+
+	return APM_SUCCESS;
+}
+
+#if 0
+/*
+ * Compiled out (#if 0). Queries the status of a single battery unit
+ * (@which) and the number of batteries (@nbat). For connection versions
+ * below 0x0102 it pretends there is exactly one battery and defers to
+ * apm_get_power_status().
+ *
+ * NOTE(review): the apm_bios_call() invocation below passes individual
+ * register arguments, whereas apm_get_power_status() calls
+ * apm_bios_call() with a struct apm_bios_call pointer; this code would
+ * need updating before it could be re-enabled.
+ */
+static int apm_get_battery_status(u_short which, u_short *status,
+ u_short *bat, u_short *life, u_short *nbat)
+{
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 esi;
+
+ if (apm_info.connection_version < 0x0102) {
+ /* pretend we only have one battery. */
+ if (which != 1)
+ return APM_BAD_DEVICE;
+ *nbat = 1;
+ return apm_get_power_status(status, bat, life);
+ }
+
+ if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
+ &ebx, &ecx, &edx, &esi))
+ return (eax >> 8) & 0xff;
+ *status = ebx;
+ *bat = ecx;
+ *life = edx;
+ *nbat = esi;
+ return APM_SUCCESS;
+}
+#endif
+
+/**
+ *	apm_engage_power_management - engage or disengage PM on a device
+ *	@device: identity of device, or %APM_DEVICE_ALL for the whole system
+ *	@enable: non-zero to engage, zero to disengage
+ *
+ *	Issues APM_FUNC_ENGAGE_PM for @device. When @device is
+ *	%APM_DEVICE_ALL the APM_BIOS_DISENGAGED bit of apm_info.bios.flags
+ *	is updated to match the new state.
+ *
+ *	Returns APM_SUCCESS, APM_DISABLED when asked to disengage the whole
+ *	system while the BIOS is flagged as disabled, or the error code
+ *	handed back by the BIOS call.
+ */
+
+static int apm_engage_power_management(u_short device, int enable)
+{
+	u32 eax;
+	int err;
+	int whole_system = (device == APM_DEVICE_ALL);
+
+	if (!enable && whole_system &&
+	    (apm_info.bios.flags & APM_BIOS_DISABLED))
+		return APM_DISABLED;
+
+	if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
+				 &eax, &err))
+		return err;
+
+	if (!whole_system)
+		return APM_SUCCESS;
+
+	if (enable)
+		apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
+	else
+		apm_info.bios.flags |= APM_BIOS_DISENGAGED;
+
+	return APM_SUCCESS;
+}
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+
+/**
+ *	apm_console_blank	-	blank or unblank the display via APM
+ *	@blank: non-zero to blank (standby), zero to unblank (ready)
+ *
+ *	Tries each entry of a small list of video device IDs in turn,
+ *	starting with video device zero and then falling back to "all
+ *	video devices" (some BIOSes don't support the former). Typically
+ *	the BIOS will do laptop backlight and monitor powerdown for us.
+ *
+ *	If the BIOS reports that the interface is not engaged, one attempt
+ *	(ever) is made to engage power management for all devices, after
+ *	which the whole operation is retried.
+ *
+ *	Returns 1 on success and 0 on failure.
+ */
+
+static int apm_console_blank(int blank)
+{
+	static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
+	static int tried;
+	int error = APM_NOT_ENGAGED;	/* silence gcc */
+	int eng_error;
+	int idx;
+	u_short state;
+
+	if (blank)
+		state = APM_STATE_STANDBY;
+	else
+		state = APM_STATE_READY;
+
+	for (idx = 0; idx < ARRAY_SIZE(dev); idx++) {
+		error = set_power_state(dev[idx], state);
+
+		if (error == APM_SUCCESS || error == APM_NO_ERROR)
+			return 1;
+
+		if (error == APM_NOT_ENGAGED)
+			break;
+	}
+
+	/*
+	 * Only a "not engaged" failure is worth one engage-and-retry;
+	 * the counter is bumped only in that case.
+	 */
+	if (error != APM_NOT_ENGAGED || tried++ != 0) {
+		apm_error("set display", error);
+		return 0;
+	}
+
+	eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+	if (!eng_error)
+		return apm_console_blank(blank);
+
+	apm_error("set display", error);
+	apm_error("engage interface", eng_error);
+	return 0;
+}
+#endif
+
+/* Non-zero when @as has no queued events (head and tail coincide). */
+static int queue_empty(struct apm_user *as)
+{
+	return as->event_tail == as->event_head;
+}
+
+/*
+ * Advance the tail of @as's event ring (wrapping at APM_MAX_EVENTS) and
+ * return the event stored at the new tail position. The caller is
+ * expected to have checked that the queue is not empty.
+ */
+static apm_event_t get_queued_event(struct apm_user *as)
+{
+	as->event_tail++;
+	if (as->event_tail >= APM_MAX_EVENTS)
+		as->event_tail = 0;
+
+	return as->events[as->event_tail];
+}
+
+/*
+ * Append @event to the event ring of every registered reader except
+ * @sender (which may be NULL), under user_list_lock. For users that are
+ * both privileged and writers, suspend/standby events also bump the
+ * per-user and global pending counters. Sleepers on apm_waitqueue are
+ * woken afterwards; nothing is done when the user list is empty.
+ */
+static void queue_event(apm_event_t event, struct apm_user *sender)
+{
+	struct apm_user *user;
+
+	spin_lock(&user_list_lock);
+	if (user_list != NULL) {
+		for (user = user_list; user != NULL; user = user->next) {
+			if (user == sender || !user->reader)
+				continue;
+
+			/* Advance the head, wrapping at the end of the ring. */
+			user->event_head++;
+			if (user->event_head >= APM_MAX_EVENTS)
+				user->event_head = 0;
+
+			if (user->event_head == user->event_tail) {
+				static int notified;
+
+				/* Complain once, then advance the tail too. */
+				if (notified++ == 0)
+					pr_err("an event queue overflowed\n");
+				user->event_tail++;
+				if (user->event_tail >= APM_MAX_EVENTS)
+					user->event_tail = 0;
+			}
+			user->events[user->event_head] = event;
+
+			if (!(user->suser && user->writer))
+				continue;
+
+			if (event == APM_SYS_SUSPEND ||
+			    event == APM_USER_SUSPEND) {
+				user->suspends_pending++;
+				suspends_pending++;
+			} else if (event == APM_SYS_STANDBY ||
+				   event == APM_USER_STANDBY) {
+				user->standbys_pending++;
+				standbys_pending++;
+			}
+		}
+		wake_up_interruptible(&apm_waitqueue);
+	}
+	spin_unlock(&user_list_lock);
+}
+
+/*
+ * Reprogram PIT channel 0 with the LATCH value after a suspend.
+ * The body is only compiled when INIT_TIMER_AFTER_SUSPEND is defined;
+ * otherwise this function does nothing.
+ */
+static void reinit_timer(void)
+{
+#ifdef INIT_TIMER_AFTER_SUSPEND
+ unsigned long flags;
+
+ /* The mode byte and both count bytes are written under i8253_lock. */
+ raw_spin_lock_irqsave(&i8253_lock, flags);
+ /* set the clock to HZ */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+ udelay(10);
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
+ udelay(10);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
+#endif
+}
+
+/*
+ * Suspend the system through the APM BIOS and resume it again.
+ *
+ * @vetoable is not referenced anywhere in the body.
+ *
+ * Returns 0 when the BIOS call reported APM_SUCCESS or APM_NO_ERROR and
+ * -EIO otherwise. The same result is stored in every registered user's
+ * suspend_result before waiters on apm_suspend_waitqueue are woken.
+ */
+static int suspend(int vetoable)
+{
+ int err;
+ struct apm_user *as;
+
+ /* Device suspend phases, then syscore suspend with interrupts off. */
+ dpm_suspend_start(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
+
+ local_irq_disable();
+ syscore_suspend();
+
+ local_irq_enable();
+
+ save_processor_state();
+ err = set_system_power_state(APM_STATE_SUSPEND);
+ /*
+ * check_events() does not queue an APM_NORMAL_RESUME event while this
+ * flag is set; one is queued explicitly further down instead.
+ */
+ ignore_normal_resume = 1;
+ restore_processor_state();
+
+ local_irq_disable();
+ reinit_timer();
+
+ /* Normalise the BIOS result: report failures, then map to 0 / -EIO. */
+ if (err == APM_NO_ERROR)
+ err = APM_SUCCESS;
+ if (err != APM_SUCCESS)
+ apm_error("suspend", err);
+ err = (err == APM_SUCCESS) ? 0 : -EIO;
+
+ syscore_resume();
+ local_irq_enable();
+
+ dpm_resume_start(PMSG_RESUME);
+ dpm_resume_end(PMSG_RESUME);
+
+ queue_event(APM_NORMAL_RESUME, NULL);
+ /* Release every user blocked in the APM_IOC_SUSPEND ioctl. */
+ spin_lock(&user_list_lock);
+ for (as = user_list; as != NULL; as = as->next) {
+ as->suspend_wait = 0;
+ as->suspend_result = err;
+ }
+ spin_unlock(&user_list_lock);
+ wake_up_interruptible(&apm_suspend_waitqueue);
+ return err;
+}
+
+/*
+ * Put the system into APM standby and bring it back.
+ *
+ * A BIOS result other than APM_SUCCESS or APM_NO_ERROR is reported via
+ * apm_error(); nothing is returned to the caller.
+ *
+ * NOTE(review): unlike suspend(), this calls dpm_suspend_end() and
+ * dpm_resume_start() without the matching dpm_suspend_start() and
+ * dpm_resume_end() calls - confirm this asymmetry is intended.
+ */
+static void standby(void)
+{
+ int err;
+
+ dpm_suspend_end(PMSG_SUSPEND);
+
+ /* syscore_suspend()/syscore_resume() are each run with interrupts off. */
+ local_irq_disable();
+ syscore_suspend();
+ local_irq_enable();
+
+ err = set_system_power_state(APM_STATE_STANDBY);
+ if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
+ apm_error("standby", err);
+
+ local_irq_disable();
+ syscore_resume();
+ local_irq_enable();
+
+ dpm_resume_start(PMSG_RESUME);
+}
+
+/*
+ * Fetch the next pending event from the BIOS.
+ *
+ * Returns the event on success and 0 otherwise. The first failure that
+ * is something other than APM_NO_EVENTS is reported via apm_error();
+ * later ones are silent.
+ */
+static apm_event_t get_event(void)
+{
+	static int notified;
+	apm_event_t event = APM_NO_EVENTS;	/* silence gcc */
+	apm_eventinfo_t info;
+	int error;
+
+	/* we don't use the eventinfo */
+	error = apm_get_event(&event, &info);
+	if (error == APM_SUCCESS)
+		return event;
+
+	if (error != APM_NO_EVENTS) {
+		if (notified++ == 0)
+			apm_error("get_event", error);
+	}
+
+	return 0;
+}
+
+/*
+ * Drain and dispatch all events currently pending in the BIOS.
+ *
+ * Standby/suspend requests are queued to user space and acted upon
+ * immediately when no user has a matching request pending. Resume events
+ * start a "bounce" window of bounce_interval jiffies during which
+ * further suspend requests are rejected (or simply dropped on 1.0
+ * connections).
+ */
+static void check_events(void)
+{
+ apm_event_t event;
+ static unsigned long last_resume;
+ static int ignore_bounce;
+
+ /* get_event() returns 0 when there is nothing (more) to process. */
+ while ((event = get_event()) != 0) {
+ if (debug) {
+ /* event is non-zero here, so event - 1 is a valid index */
+ if (event <= NR_APM_EVENT_NAME)
+ printk(KERN_DEBUG "apm: received %s notify\n",
+ apm_event_name[event - 1]);
+ else
+ printk(KERN_DEBUG "apm: received unknown "
+ "event 0x%02x\n", event);
+ }
+ /* Leave the bounce window once bounce_interval has elapsed. */
+ if (ignore_bounce
+ && (time_after(jiffies, last_resume + bounce_interval)))
+ ignore_bounce = 0;
+
+ switch (event) {
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ queue_event(event, NULL);
+ if (standbys_pending <= 0)
+ standby();
+ break;
+
+ case APM_USER_SUSPEND:
+#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+#endif
+ /*
+ * Without CONFIG_APM_IGNORE_USER_SUSPEND a user suspend
+ * falls through and is handled like a system suspend.
+ */
+ case APM_SYS_SUSPEND:
+ if (ignore_bounce) {
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+ }
+ /*
+ * If we are already processing a SUSPEND,
+ * then further SUSPEND events from the BIOS
+ * will be ignored. We also return here to
+ * cope with the fact that the Thinkpads keep
+ * sending a SUSPEND event until something else
+ * happens!
+ */
+ if (ignore_sys_suspend)
+ return;
+ ignore_sys_suspend = 1;
+ queue_event(event, NULL);
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ break;
+
+ case APM_NORMAL_RESUME:
+ case APM_CRITICAL_RESUME:
+ case APM_STANDBY_RESUME:
+ ignore_sys_suspend = 0;
+ last_resume = jiffies;
+ ignore_bounce = 1;
+ /*
+ * A normal resume that follows our own suspend() is
+ * skipped: suspend() sets ignore_normal_resume and
+ * queues the resume event itself.
+ */
+ if ((event != APM_NORMAL_RESUME)
+ || (ignore_normal_resume == 0)) {
+ dpm_resume_end(PMSG_RESUME);
+ queue_event(event, NULL);
+ }
+ ignore_normal_resume = 0;
+ break;
+
+ case APM_CAPABILITY_CHANGE:
+ case APM_LOW_BATTERY:
+ case APM_POWER_STATUS_CHANGE:
+ queue_event(event, NULL);
+ /* If needed, notify drivers here */
+ break;
+
+ case APM_UPDATE_TIME:
+ break;
+
+ case APM_CRITICAL_SUSPEND:
+ /*
+ * We are not allowed to reject a critical suspend.
+ */
+ (void)suspend(0);
+ break;
+ }
+ }
+}
+
+/*
+ * One iteration of APM event processing.
+ *
+ * While user space still has standby/suspend requests pending, and the
+ * connection is newer than 1.0, the BIOS is told we are busy on every
+ * fifth call (a countdown that restarts at 4). The countdown is reset
+ * whenever nothing is pending. Pending BIOS events are then processed.
+ */
+static void apm_event_handler(void)
+{
+	static int pending_count = 4;
+	int err;
+	int waiting = (standbys_pending > 0) || (suspends_pending > 0);
+
+	if (!waiting) {
+		pending_count = 4;
+	} else if ((apm_info.connection_version > 0x100) &&
+		   (pending_count-- <= 0)) {
+		pending_count = 4;
+		if (debug)
+			printk(KERN_DEBUG "apm: setting state busy\n");
+		err = set_system_power_state(APM_STATE_BUSY);
+		if (err)
+			apm_error("busy", err);
+	}
+
+	check_events();
+}
+
+/*
+ * This is the APM thread main loop.
+ *
+ * The thread registers itself on apm_waitqueue (which queue_event()
+ * wakes) and then alternates between sleeping for up to
+ * APM_CHECK_TIMEOUT and running apm_event_handler(), until
+ * kthread_should_stop() reports true.
+ */
+
+static void apm_mainloop(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&apm_waitqueue, &wait);
+ /* The task state is set before each schedule_timeout() call. */
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (;;) {
+ schedule_timeout(APM_CHECK_TIMEOUT);
+ if (kthread_should_stop())
+ break;
+ /*
+ * Ok, check all events, check for idle (and mark us sleeping
+ * so as not to count towards the load average)..
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ apm_event_handler();
+ }
+ remove_wait_queue(&apm_waitqueue, &wait);
+}
+
+/*
+ * Sanity-check the per-open state attached to a file.
+ *
+ * Returns 0 when @as is non-NULL and carries APM_BIOS_MAGIC; otherwise
+ * logs an error naming @func (the calling operation) and returns 1.
+ */
+static int check_apm_user(struct apm_user *as, const char *func)
+{
+	if (as != NULL && as->magic == APM_BIOS_MAGIC)
+		return 0;
+
+	pr_err("%s passed bad filp\n", func);
+	return 1;
+}
+
+/*
+ * read() handler: copy queued APM events to user space.
+ *
+ * Returns the number of bytes copied (a multiple of sizeof(apm_event_t)),
+ * or:
+ *   -EIO          the file's private data failed check_apm_user()
+ *   -EINVAL       @count is smaller than one event
+ *   -EAGAIN       the queue is empty and the file is O_NONBLOCK
+ *   -EFAULT       the very first copy to user space failed
+ *   -ERESTARTSYS  nothing was copied and a signal is pending
+ *   0             nothing was copied and no signal is pending
+ *
+ * Suspend/standby events handed to the reader are counted in
+ * suspends_read/standbys_read, which do_ioctl() consumes.
+ *
+ * NOTE(review): @count is narrowed to int both in the size check and in
+ * "i = count" - confirm the behaviour for counts above INT_MAX.
+ */
+static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct apm_user *as;
+ int i;
+ apm_event_t event;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "read"))
+ return -EIO;
+ if ((int)count < sizeof(apm_event_t))
+ return -EINVAL;
+ if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+ /* The result is not checked; an interrupted wait is handled below. */
+ wait_event_interruptible(apm_waitqueue, !queue_empty(as));
+ /* i tracks how many bytes of the user buffer are still unused. */
+ i = count;
+ while ((i >= sizeof(event)) && !queue_empty(as)) {
+ event = get_queued_event(as);
+ if (copy_to_user(buf, &event, sizeof(event))) {
+ /* Report a short read if something was already copied. */
+ if (i < count)
+ break;
+ return -EFAULT;
+ }
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_read++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_read++;
+ break;
+ }
+ buf += sizeof(event);
+ i -= sizeof(event);
+ }
+ if (i < count)
+ return count - i;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
+}
+
+/*
+ * poll() handler: the file is readable whenever this user's event queue
+ * is non-empty. A file whose private data fails check_apm_user() never
+ * reports any events.
+ */
+static __poll_t do_poll(struct file *fp, poll_table *wait)
+{
+	struct apm_user *as = fp->private_data;
+	__poll_t mask = 0;
+
+	if (check_apm_user(as, "poll"))
+		return 0;
+
+	poll_wait(fp, &apm_waitqueue, wait);
+	if (!queue_empty(as))
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * ioctl() handler for APM_IOC_STANDBY and APM_IOC_SUSPEND.
+ *
+ * Only users that opened the device for writing with CAP_SYS_ADMIN (as
+ * recorded in do_open()) may issue these; others get -EPERM.
+ *
+ * If the caller has previously read a matching event, the ioctl is
+ * treated as its acknowledgement and the pending counters are dropped;
+ * otherwise a user standby/suspend event is queued to the other users.
+ * The transition itself happens once no acknowledgements are pending.
+ *
+ * Returns 0 for APM_IOC_STANDBY, the suspend result for
+ * APM_IOC_SUSPEND, -EIO for bad private data and -ENOTTY for unknown
+ * commands.
+ */
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
+{
+ struct apm_user *as;
+ int ret;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "ioctl"))
+ return -EIO;
+ if (!as->suser || !as->writer)
+ return -EPERM;
+ switch (cmd) {
+ case APM_IOC_STANDBY:
+ mutex_lock(&apm_mutex);
+ if (as->standbys_read > 0) {
+ as->standbys_read--;
+ as->standbys_pending--;
+ standbys_pending--;
+ } else
+ queue_event(APM_USER_STANDBY, as);
+ if (standbys_pending <= 0)
+ standby();
+ mutex_unlock(&apm_mutex);
+ break;
+ case APM_IOC_SUSPEND:
+ mutex_lock(&apm_mutex);
+ if (as->suspends_read > 0) {
+ as->suspends_read--;
+ as->suspends_pending--;
+ suspends_pending--;
+ } else
+ queue_event(APM_USER_SUSPEND, as);
+ if (suspends_pending <= 0) {
+ ret = suspend(1);
+ mutex_unlock(&apm_mutex);
+ } else {
+ /*
+ * Others still have to acknowledge: sleep until suspend()
+ * clears suspend_wait and fills in suspend_result. The
+ * return value of the wait itself is not checked.
+ */
+ as->suspend_wait = 1;
+ mutex_unlock(&apm_mutex);
+ wait_event_interruptible(apm_suspend_waitqueue,
+ as->suspend_wait == 0);
+ ret = as->suspend_result;
+ }
+ return ret;
+ default:
+ return -ENOTTY;
+ }
+ return 0;
+}
+
+/*
+ * release() handler: tear down the per-open apm_user.
+ *
+ * Any standby/suspend acknowledgements this user still owed are removed
+ * from the global counters; if that leaves none outstanding, the
+ * corresponding transition is performed right away. The user is then
+ * unlinked from user_list and freed. Always returns 0.
+ *
+ * NOTE(review): the pending counters are adjusted and standby()/suspend()
+ * are called here without taking apm_mutex, unlike in do_ioctl() -
+ * confirm this is safe.
+ */
+static int do_release(struct inode *inode, struct file *filp)
+{
+ struct apm_user *as;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "release"))
+ return 0;
+ filp->private_data = NULL;
+ if (as->standbys_pending > 0) {
+ standbys_pending -= as->standbys_pending;
+ if (standbys_pending <= 0)
+ standby();
+ }
+ if (as->suspends_pending > 0) {
+ suspends_pending -= as->suspends_pending;
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ }
+ /* Unlink from the singly linked user list. */
+ spin_lock(&user_list_lock);
+ if (user_list == as)
+ user_list = as->next;
+ else {
+ struct apm_user *as1;
+
+ /* Find the predecessor of @as; the loop body is empty. */
+ for (as1 = user_list;
+ (as1 != NULL) && (as1->next != as);
+ as1 = as1->next)
+ ;
+ if (as1 == NULL)
+ pr_err("filp not in user list\n");
+ else
+ as1->next = as->next;
+ }
+ spin_unlock(&user_list_lock);
+ kfree(as);
+ return 0;
+}
+
+/*
+ * open() handler: allocate a per-open apm_user, initialise its event
+ * ring and counters, record the opener's privileges and access mode,
+ * and put it at the head of user_list.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int do_open(struct inode *inode, struct file *filp)
+{
+	struct apm_user *as = kmalloc(sizeof(*as), GFP_KERNEL);
+
+	if (!as)
+		return -ENOMEM;
+
+	as->magic = APM_BIOS_MAGIC;
+	as->event_head = 0;
+	as->event_tail = 0;
+	as->standbys_pending = 0;
+	as->suspends_pending = 0;
+	as->standbys_read = 0;
+	as->suspends_read = 0;
+
+	/*
+	 * XXX - this is a tiny bit broken, when we consider BSD
+	 * process accounting. If the device is opened by root, we
+	 * instantly flag that we used superuser privs. Who knows,
+	 * we might close the device immediately without doing a
+	 * privileged operation -- cevans
+	 */
+	as->suser = capable(CAP_SYS_ADMIN);
+	as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
+	as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
+
+	/* Link the new user in at the front of the list. */
+	spin_lock(&user_list_lock);
+	as->next = user_list;
+	user_list = as;
+	spin_unlock(&user_list_lock);
+
+	filp->private_data = as;
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * seq_file show routine producing the single status line of the APM
+ * proc file. The BIOS is only queried when exactly one CPU is online;
+ * otherwise, or when the query fails, the "unknown" defaults are shown.
+ */
+static int proc_apm_show(struct seq_file *m, void *v)
+{
+	unsigned short bx;
+	unsigned short cx;
+	unsigned short dx;
+	int error;
+	unsigned short ac_line_status = 0xff;
+	unsigned short battery_status = 0xff;
+	unsigned short battery_flag = 0xff;
+	int percentage = -1;
+	int time_units = -1;
+	char *units = "?";
+
+	if (num_online_cpus() == 1) {
+		error = apm_get_power_status(&bx, &cx, &dx);
+		if (error == 0) {
+			unsigned short charge = cx & 0xff;
+
+			ac_line_status = (bx >> 8) & 0xff;
+			battery_status = bx & 0xff;
+			if (charge != 0xff)
+				percentage = charge;
+
+			/* Flag and time are only decoded above APM 1.0. */
+			if (apm_info.connection_version > 0x100) {
+				battery_flag = (cx >> 8) & 0xff;
+				if (dx != 0xffff) {
+					if (dx & 0x8000)
+						units = "min";
+					else
+						units = "sec";
+					time_units = dx & 0x7fff;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Output fields, with symbols from linux/apm_bios.h. Information
+	 * is from the Get Power Status (0x0a) call unless otherwise noted.
+	 *
+	 * 0) Linux driver version (this will change if format changes)
+	 * 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
+	 * 2) APM flags from APM Installation Check (0x00):
+	 *	bit 0: APM_16_BIT_SUPPORT
+	 *	bit 1: APM_32_BIT_SUPPORT
+	 *	bit 2: APM_IDLE_SLOWS_CLOCK
+	 *	bit 3: APM_BIOS_DISABLED
+	 *	bit 4: APM_BIOS_DISENGAGED
+	 * 3) AC line status
+	 *	0x00: Off-line
+	 *	0x01: On-line
+	 *	0x02: On backup power (BIOS >= 1.1 only)
+	 *	0xff: Unknown
+	 * 4) Battery status
+	 *	0x00: High
+	 *	0x01: Low
+	 *	0x02: Critical
+	 *	0x03: Charging
+	 *	0x04: Selected battery not present (BIOS >= 1.2 only)
+	 *	0xff: Unknown
+	 * 5) Battery flag
+	 *	bit 0: High
+	 *	bit 1: Low
+	 *	bit 2: Critical
+	 *	bit 3: Charging
+	 *	bit 7: No system battery
+	 *	0xff: Unknown
+	 * 6) Remaining battery life (percentage of charge):
+	 *	0-100: valid
+	 *	-1: Unknown
+	 * 7) Remaining battery life (time units):
+	 *	Number of remaining minutes or seconds
+	 *	-1: Unknown
+	 * 8) min = minutes; sec = seconds
+	 */
+	seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+		   driver_version,
+		   (apm_info.bios.version >> 8) & 0xff,
+		   apm_info.bios.version & 0xff,
+		   apm_info.bios.flags,
+		   ac_line_status,
+		   battery_status,
+		   battery_flag,
+		   percentage,
+		   time_units,
+		   units);
+	return 0;
+}
+#endif
+
+/*
+ * Body of the APM kernel thread.
+ *
+ * Pins itself to CPU 0, settles the connection version, optionally
+ * enables and engages BIOS power management, logs the power status when
+ * debugging, installs the power-off handler if requested and finally
+ * runs apm_mainloop() (only when a single CPU is online or "smp" was
+ * requested).
+ *
+ * Returns 0 normally, or -1 if enabling or engaging power management
+ * fails.
+ */
+static int apm(void *unused)
+{
+ unsigned short bx;
+ unsigned short cx;
+ unsigned short dx;
+ int error;
+ char *power_stat;
+ char *bat_stat;
+
+ /* 2002/08/01 - WT
+ * This is to avoid random crashes at boot time during initialization
+ * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
+ * Some bioses don't like being called from CPU != 0.
+ * Method suggested by Ingo Molnar.
+ */
+ set_cpus_allowed_ptr(current, cpumask_of(0));
+ BUG_ON(smp_processor_id() != 0);
+
+ /*
+ * Derive the connection version from the BIOS version unless it has
+ * already been set to something non-zero.
+ */
+ if (apm_info.connection_version == 0) {
+ apm_info.connection_version = apm_info.bios.version;
+ if (apm_info.connection_version > 0x100) {
+ /*
+ * We only support BIOSs up to version 1.2
+ */
+ if (apm_info.connection_version > 0x0102)
+ apm_info.connection_version = 0x0102;
+ error = apm_driver_version(&apm_info.connection_version);
+ if (error != APM_SUCCESS) {
+ apm_error("driver version", error);
+ /* Fall back to an APM 1.0 connection. */
+ apm_info.connection_version = 0x100;
+ }
+ }
+ }
+
+ if (debug)
+ printk(KERN_INFO "apm: Connection version %d.%d\n",
+ (apm_info.connection_version >> 8) & 0xff,
+ apm_info.connection_version & 0xff);
+
+#ifdef CONFIG_APM_DO_ENABLE
+ if (apm_info.bios.flags & APM_BIOS_DISABLED) {
+ /*
+ * This call causes my NEC UltraLite Versa 33/C to hang if it
+ * is booted with PM disabled but not in the docking station.
+ * Unfortunate ...
+ */
+ error = apm_enable_power_management(1);
+ if (error) {
+ apm_error("enable power management", error);
+ return -1;
+ }
+ }
+#endif
+
+ /* Engaging is only attempted on connections newer than 1.0. */
+ if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
+ && (apm_info.connection_version > 0x0100)) {
+ error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+ if (error) {
+ apm_error("engage power management", error);
+ return -1;
+ }
+ }
+
+ /* Debug only: decode and log the current power status. */
+ if (debug && (num_online_cpus() == 1 || smp)) {
+ error = apm_get_power_status(&bx, &cx, &dx);
+ if (error)
+ printk(KERN_INFO "apm: power status not available\n");
+ else {
+ /* High byte of bx: AC line status. */
+ switch ((bx >> 8) & 0xff) {
+ case 0:
+ power_stat = "off line";
+ break;
+ case 1:
+ power_stat = "on line";
+ break;
+ case 2:
+ power_stat = "on backup power";
+ break;
+ default:
+ power_stat = "unknown";
+ break;
+ }
+ /* Low byte of bx: battery status. */
+ switch (bx & 0xff) {
+ case 0:
+ bat_stat = "high";
+ break;
+ case 1:
+ bat_stat = "low";
+ break;
+ case 2:
+ bat_stat = "critical";
+ break;
+ case 3:
+ bat_stat = "charging";
+ break;
+ default:
+ bat_stat = "unknown";
+ break;
+ }
+ /*
+ * The printk() calls without a log level below complete
+ * the line started by the preceding KERN_INFO printk().
+ */
+ printk(KERN_INFO
+ "apm: AC %s, battery status %s, battery life ",
+ power_stat, bat_stat);
+ if ((cx & 0xff) == 0xff)
+ printk("unknown\n");
+ else
+ printk("%d%%\n", cx & 0xff);
+ if (apm_info.connection_version > 0x100) {
+ printk(KERN_INFO
+ "apm: battery flag 0x%02x, battery life ",
+ (cx >> 8) & 0xff);
+ if (dx == 0xffff)
+ printk("unknown\n");
+ else
+ printk("%d %s\n", dx & 0x7fff,
+ (dx & 0x8000) ?
+ "minutes" : "seconds");
+ }
+ }
+ }
+
+ /* Install our power off handler.. */
+ if (power_off)
+ pm_power_off = apm_power_off;
+
+ /* The console blank hook is only installed while the main loop runs. */
+ if (num_online_cpus() == 1 || smp) {
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+ console_blank_hook = apm_console_blank;
+#endif
+ apm_mainloop();
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+ console_blank_hook = NULL;
+#endif
+ }
+
+ return 0;
+}
+
+#ifndef MODULE
+static int __init apm_setup(char *str)
+{
+ int invert;
+
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "off", 3) == 0)
+ apm_disabled = 1;
+ if (strncmp(str, "on", 2) == 0)
+ apm_disabled = 0;
+ if ((strncmp(str, "bounce-interval=", 16) == 0) ||
+ (strncmp(str, "bounce_interval=", 16) == 0))
+ bounce_interval = simple_strtol(str + 16, NULL, 0);
+ if ((strncmp(str, "idle-threshold=", 15) == 0) ||
+ (strncmp(str, "idle_threshold=", 15) == 0))
+ idle_threshold = simple_strtol(str + 15, NULL, 0);
+ if ((strncmp(str, "idle-period=", 12) == 0) ||
+ (strncmp(str, "idle_period=", 12) == 0))
+ idle_period = simple_strtol(str + 12, NULL, 0);
+ invert = (strncmp(str, "no-", 3) == 0) ||
+ (strncmp(str, "no_", 3) == 0);
+ if (invert)
+ str += 3;
+ if (strncmp(str, "debug", 5) == 0)
+ debug = !invert;
+ if ((strncmp(str, "power-off", 9) == 0) ||
+ (strncmp(str, "power_off", 9) == 0))
+ power_off = !invert;
+ if (strncmp(str, "smp", 3) == 0) {
+ smp = !invert;
+ idle_threshold = 100;
+ }
+ if ((strncmp(str, "allow-ints", 10) == 0) ||
+ (strncmp(str, "allow_ints", 10) == 0))
+ apm_info.allow_ints = !invert;
+ if ((strncmp(str, "broken-psr", 10) == 0) ||
+ (strncmp(str, "broken_psr", 10) == 0))
+ apm_info.get_power_status_broken = !invert;
+ if ((strncmp(str, "realmode-power-off", 18) == 0) ||
+ (strncmp(str, "realmode_power_off", 18) == 0))
+ apm_info.realmode_power_off = !invert;
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("apm=", apm_setup);
+#endif
+
+/* File operations backing the APM misc device (see apm_device). */
+static const struct file_operations apm_bios_fops = {
+	.owner		= THIS_MODULE,
+	.open		= do_open,
+	.release	= do_release,
+	.read		= do_read,
+	.poll		= do_poll,
+	.unlocked_ioctl	= do_ioctl,
+	.llseek		= noop_llseek,
+};
+
+/* Misc device descriptor for the "apm_bios" character device. */
+static struct miscdevice apm_device = {
+	.minor	= APM_MINOR_DEV,
+	.name	= "apm_bios",
+	.fops	= &apm_bios_fops,
+};
+
+
+/*
+ * DMI callback that does nothing but print the matching entry's ident
+ * string on a line of its own. Always returns 0.
+ */
+static int __init print_if_true(const struct dmi_system_id *d)
+{
+	const char *message = d->ident;
+
+	printk("%s\n", message);
+	return 0;
+}
+
+/*
+ * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was
+ * disabled before the suspend. Linux used to get terribly confused by that.
+ *
+ * This DMI callback only logs that such a machine was detected; it
+ * changes no settings and always returns 0.
+ */
+static int __init broken_ps2_resume(const struct dmi_system_id *d)
+{
+	const char *machine = d->ident;
+
+	printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n",
+	       machine);
+	return 0;
+}
+
+/*
+ * Some bioses have a broken protected mode poweroff and need to use
+ * realmode. DMI callback: turn on realmode power off (and log it) unless
+ * it is already on. Always returns 0.
+ */
+static int __init set_realmode_power_off(const struct dmi_system_id *d)
+{
+	if (apm_info.realmode_power_off != 0)
+		return 0;
+
+	apm_info.realmode_power_off = 1;
+	printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n",
+	       d->ident);
+	return 0;
+}
+
+/*
+ * Some laptops require interrupts to be enabled during APM calls.
+ * DMI callback: set apm_info.allow_ints (and log it) unless it is
+ * already set. Always returns 0.
+ */
+static int __init set_apm_ints(const struct dmi_system_id *d)
+{
+	if (apm_info.allow_ints != 0)
+		return 0;
+
+	apm_info.allow_ints = 1;
+	printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n",
+	       d->ident);
+	return 0;
+}
+
+/*
+ * Some APM bioses corrupt memory or just plain do not work.
+ * DMI callback: mark APM as disabled (and log it) unless it already is.
+ * Always returns 0.
+ */
+static int __init apm_is_horked(const struct dmi_system_id *d)
+{
+	if (apm_info.disabled != 0)
+		return 0;
+
+	apm_info.disabled = 1;
+	printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+	return 0;
+}
+
+/*
+ * Variant of apm_is_horked() for the Intel D850MD entry: besides
+ * disabling APM it tells the user that a fixed BIOS is available.
+ * Always returns 0.
+ */
+static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
+{
+	if (apm_info.disabled != 0)
+		return 0;
+
+	apm_info.disabled = 1;
+	printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident);
+	printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
+	printk(KERN_INFO "download from support.intel.com\n");
+	return 0;
+}
+
+/*
+ * Some APM bioses hang on APM idle calls.
+ * DMI callback: forbid APM idle calls (and log it) unless they are
+ * already forbidden. Always returns 0.
+ */
+static int __init apm_likes_to_melt(const struct dmi_system_id *d)
+{
+	if (apm_info.forbid_idle != 0)
+		return 0;
+
+	apm_info.forbid_idle = 1;
+	printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n",
+	       d->ident);
+	return 0;
+}
+
+/*
+ * Check for clue free BIOS implementations who use
+ * the following QA technique
+ *
+ *      [ Write BIOS Code ]<------
+ *               |                ^
+ *      < Does it Compile >----N--
+ *               |Y               ^
+ *	< Does it Boot Win98 >-N--
+ *               |Y
+ *           [Ship It]
+ *
+ *	Phoenix A04  08/24/2000 is known bad (Dell Inspiron 5000e)
+ *	Phoenix A07  09/29/2000 is known good (Dell Inspiron 5000)
+ *
+ * DMI callback: unconditionally marks power status reporting as broken
+ * and warns about it. Always returns 0.
+ */
+static int __init broken_apm_power(const struct dmi_system_id *d)
+{
+	printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n");
+	apm_info.get_power_status_broken = 1;
+	return 0;
+}
+
+/*
+ * This bios swaps the APM minute reporting bytes over (Many sony laptops
+ * have this problem).
+ *
+ * DMI callback: unconditionally enables the byte-swap workaround used by
+ * apm_get_power_status() and warns about it. Always returns 0.
+ */
+static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
+{
+	printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n");
+	apm_info.get_power_status_swabinminutes = 1;
+	return 0;
+}
+
+/*
+ * DMI quirk table, handed to dmi_check_system() from apm_init().
+ *
+ * Each entry lists, in order: the callback to run on a match, an ident
+ * string (which the callbacks print via d->ident) and the DMI fields
+ * that identify the machine or BIOS. The empty entry at the end
+ * terminates the table.
+ */
+static const struct dmi_system_id apm_dmi_table[] __initconst = {
+ {
+ print_if_true,
+ KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
+ { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
+ },
+ { /* Handle problems with APM on the C600 */
+ broken_ps2_resume, "Dell Latitude C600",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
+ },
+ { /* Allow interrupts during suspend on Dell Latitude laptops*/
+ set_apm_ints, "Dell Latitude",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* Allow interrupts during suspend on Dell Inspiron laptops*/
+ set_apm_ints, "Dell Inspiron", {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
+ },
+ { /* Handle problems with APM on Inspiron 5000e */
+ broken_apm_power, "Dell Inspiron 5000e",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A04"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
+ },
+ { /* Handle problems with APM on Inspiron 2500 */
+ broken_apm_power, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Dimension 4100",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* Allow interrupts during suspend on Compaq Laptops*/
+ set_apm_ints, "Compaq 12XL125",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
+ },
+ { /* Allow interrupts during APM or the clock goes slow */
+ set_apm_ints, "ASUSTeK",
+ { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
+ },
+ { /* APM blows on shutdown */
+ apm_is_horked, "ABIT KX7-333[R]",
+ { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
+ DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Trigem Delhi3",
+ { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
+ },
+ { /* APM crashes */
+ /*
+ * NOTE(review): "hoenix" lacks a leading 'P'; presumably deliberate
+ * so the match does not depend on that character - confirm.
+ */
+ apm_is_horked, "Fujitsu-Siemens",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
+ },
+ { /* APM crashes */
+ apm_is_horked_d850md, "Intel D850MD",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Intel D810EMO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell XPS-Z",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Sharp PC-PJ/AX",
+ { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
+ DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
+ },
+ { /* APM crashes */
+ /*
+ * NOTE(review): this entry repeats the "Dell Inspiron 2500" / A11
+ * entry earlier in the table with identical match fields.
+ */
+ apm_is_horked, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* APM idle hangs */
+ apm_likes_to_melt, "Jabil AMD",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
+ },
+ { /* APM idle hangs */
+ apm_likes_to_melt, "AMI Bios",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-N505VX */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-XG29 */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
+ DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-F104K */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
+ },
+
+ { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
+ DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
+ DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
+ },
+ { /* broken PM poweroff bios */
+ set_realmode_power_off, "Award Software v4.60 PGMA",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
+ DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
+ },
+
+ /* Generic per vendor APM settings */
+
+ { /* Allow interrupts during suspend on IBM laptops */
+ set_apm_ints, "IBM",
+ { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
+ },
+
+ { }
+};
+
+/*
+ * Module init: validate the BIOS-reported APM data, apply quirks and user
+ * overrides, point CPU 0's APM GDT descriptors at the BIOS segments and
+ * start the "kapmd" thread. Returns 0 on success, -ENODEV when APM is
+ * absent, unsupported or disabled, or the kthread_create() error code.
+ *
+ * We do NOT want to do APM BIOS calls from anything but the APM thread,
+ * because we don't trust the APM BIOS: this way most common APM BIOS
+ * problems are at least somewhat contained, and we can kill the thread.
+ */
+static int __init apm_init(void)
+{
+ struct desc_struct *gdt;
+ int err;
+
+ dmi_check_system(apm_dmi_table);
+
+ if (apm_info.bios.version == 0 || machine_is_olpc()) {
+ printk(KERN_INFO "apm: BIOS not found.\n");
+ return -ENODEV;
+ }
+ printk(KERN_INFO
+ "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
+ ((apm_info.bios.version >> 8) & 0xff),
+ (apm_info.bios.version & 0xff),
+ apm_info.bios.flags,
+ driver_version);
+ if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
+ printk(KERN_INFO "apm: no 32 bit BIOS support\n");
+ return -ENODEV;
+ }
+
+ if (allow_ints)
+ apm_info.allow_ints = 1;
+ if (broken_psr)
+ apm_info.get_power_status_broken = 1;
+ if (realmode_power_off)
+ apm_info.realmode_power_off = 1;
+ /* User can override, but default is to trust DMI */
+ if (apm_disabled != -1)
+ apm_info.disabled = apm_disabled;
+
+ /*
+ * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
+ * but is reportedly a 1.0 BIOS.
+ */
+ if (apm_info.bios.version == 0x001)
+ apm_info.bios.version = 0x100;
+
+ /* BIOS < 1.2 doesn't set cseg_16_len */
+ if (apm_info.bios.version < 0x102)
+ apm_info.bios.cseg_16_len = 0; /* 64k */
+
+ if (debug) { /* one log line: the later pieces must be KERN_CONT (pr_cont) */
+ printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
+ apm_info.bios.cseg, apm_info.bios.offset,
+ apm_info.bios.cseg_16, apm_info.bios.dseg);
+ if (apm_info.bios.version > 0x100)
+ pr_cont(" cseg len %x, dseg len %x",
+ apm_info.bios.cseg_len,
+ apm_info.bios.dseg_len);
+ if (apm_info.bios.version > 0x101)
+ pr_cont(" cseg16 len %x", apm_info.bios.cseg_16_len);
+ pr_cont("\n");
+ }
+
+ if (apm_info.disabled) {
+ pr_notice("disabled on user request.\n");
+ return -ENODEV;
+ }
+ if ((num_online_cpus() > 1) && !power_off && !smp) {
+ pr_notice("disabled - APM is not SMP safe.\n");
+ apm_info.disabled = 1;
+ return -ENODEV;
+ }
+ if (!acpi_disabled) {
+ pr_notice("overridden by ACPI.\n");
+ apm_info.disabled = 1;
+ return -ENODEV;
+ }
+
+ /*
+ * Set up the long jump entry point to the APM BIOS, which is called
+ * from inline assembly.
+ */
+ apm_bios_entry.offset = apm_info.bios.offset;
+ apm_bios_entry.segment = APM_CS;
+
+ /*
+ * The APM 1.1 BIOS is supposed to provide limit information that it
+ * recognizes. Many machines do this correctly, but many others do
+ * not restrict themselves to their claimed limit. When this happens,
+ * they will cause a segmentation violation in the kernel at boot time.
+ * Most BIOS's, however, will respect a 64k limit, so we use that.
+ *
+ * Note we only set APM segments on CPU zero, since we pin the APM
+ * code to that CPU.
+ */
+ gdt = get_cpu_gdt_rw(0);
+ set_desc_base(&gdt[APM_CS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
+ set_desc_base(&gdt[APM_CS_16 >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4));
+ set_desc_base(&gdt[APM_DS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
+
+ proc_create_single("apm", 0, NULL, proc_apm_show);
+
+ kapmd_task = kthread_create(apm, NULL, "kapmd");
+ if (IS_ERR(kapmd_task)) {
+ pr_err("disabled - Unable to start kernel thread\n");
+ err = PTR_ERR(kapmd_task);
+ kapmd_task = NULL;
+ remove_proc_entry("apm", NULL);
+ return err;
+ }
+ wake_up_process(kapmd_task);
+
+ if (num_online_cpus() > 1 && !smp) {
+ printk(KERN_NOTICE
+ "apm: disabled - APM is not SMP safe (power off active).\n");
+ return 0;
+ }
+
+ /*
+ * Note we don't actually care if the misc_device cannot be registered.
+ * this driver can do its job without it, even if userspace can't
+ * control it. just log the error
+ */
+ if (misc_register(&apm_device))
+ printk(KERN_WARNING "apm: Could not register misc device.\n");
+
+ if (HZ != 100)
+ idle_period = (idle_period * HZ) / 100;
+ if (idle_threshold < 100) {
+ cpuidle_poll_state_init(&apm_idle_driver);
+ if (!cpuidle_register_driver(&apm_idle_driver))
+ if (cpuidle_register_device(&apm_cpuidle_device))
+ cpuidle_unregister_driver(&apm_idle_driver);
+ }
+
+ return 0;
+}
+/* Module exit: tear down what apm_init() set up and stop the kapmd thread. */
+static void __exit apm_exit(void)
+{
+ int error;
+ /* NOTE(review): everything below runs unconditionally, even where apm_init() returned early. */
+ cpuidle_unregister_device(&apm_cpuidle_device);
+ cpuidle_unregister_driver(&apm_idle_driver);
+ /* Presumably the 0 argument means "disengage" (it matches the error text) - confirm. */
+ if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
+ && (apm_info.connection_version > 0x0100)) {
+ error = apm_engage_power_management(APM_DEVICE_ALL, 0);
+ if (error)
+ apm_error("disengage power management", error);
+ }
+ misc_deregister(&apm_device);
+ remove_proc_entry("apm", NULL);
+ if (power_off)
+ pm_power_off = NULL;
+ if (kapmd_task) { /* NULL if the thread was never created */
+ kthread_stop(kapmd_task);
+ kapmd_task = NULL;
+ }
+}
+/* Module entry/exit hooks, metadata and load-time parameters. */
+module_init(apm_init);
+module_exit(apm_exit);
+/* Only "debug" is registered with mode 0644; every other parameter uses 0444. */
+MODULE_AUTHOR("Stephen Rothwell");
+MODULE_DESCRIPTION("Advanced Power Management");
+MODULE_LICENSE("GPL");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(debug, "Enable debug mode");
+module_param(power_off, bool, 0444);
+MODULE_PARM_DESC(power_off, "Enable power off");
+module_param(bounce_interval, int, 0444);
+MODULE_PARM_DESC(bounce_interval,
+ "Set the number of ticks to ignore suspend bounces");
+module_param(allow_ints, bool, 0444);
+MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
+module_param(broken_psr, bool, 0444);
+MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
+module_param(realmode_power_off, bool, 0444);
+MODULE_PARM_DESC(realmode_power_off,
+ "Switch to real mode before powering off");
+module_param(idle_threshold, int, 0444);
+MODULE_PARM_DESC(idle_threshold,
+ "System idle percentage above which to make APM BIOS idle calls");
+module_param(idle_period, int, 0444);
+MODULE_PARM_DESC(idle_period,
+ "Period (in sec/100) over which to calculate the idle percentage");
+module_param(smp, bool, 0444);
+MODULE_PARM_DESC(smp,
+ "Set this to enable APM use on an SMP platform. Use with caution on older systems");
+MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 0000000..01de31d
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/crypto.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <linux/kbuild.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/sigframe.h>
+#include <asm/bootparam.h>
+#include <asm/suspend.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#endif
+
+#ifdef CONFIG_X86_32
+# include "asm-offsets_32.c"
+#else
+# include "asm-offsets_64.c"
+#endif
+/* Emit the offsets and sizes that are common to the 32-bit and 64-bit builds. */
+void common(void) {
+ BLANK();
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
+ BLANK();
+ OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
+ OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
+
+ BLANK();
+ OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+ BLANK();
+ OFFSET(pbe_address, pbe, address);
+ OFFSET(pbe_orig_address, pbe, orig_address);
+ OFFSET(pbe_next, pbe, next);
+ /* 32-bit signal frame layout: native on X86_32, compat under IA32 emulation. */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ BLANK();
+ OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax);
+ OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx);
+ OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx);
+ OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx);
+ OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si);
+ OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di);
+ OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp);
+ OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp);
+ OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip);
+
+ BLANK();
+ OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ BLANK();
+ OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+ OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+ OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+ OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+ OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+ OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
+ OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
+#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
+ BLANK();
+ OFFSET(BP_scratch, boot_params, scratch);
+ OFFSET(BP_secure_boot, boot_params, secure_boot);
+ OFFSET(BP_loadflags, boot_params, hdr.loadflags);
+ OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
+ OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_init_size, boot_params, hdr.init_size);
+ OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+ OFFSET(BP_code32_start, boot_params, hdr.code32_start);
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+ /* TLB state for the entry code */
+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
+ DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); /* a clean mask only if the size is a power of two - TODO confirm */
+
+ /* Offset for sp0 and sp1 into the tss_struct */
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644
index 0000000..82826f2
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
+#include <asm/ucontext.h>
+/* One byte per listed i386 syscall number, so sizeof(syscalls) is the highest number + 1. */
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+/* Emit the 32-bit-only offsets: cpuinfo_x86, pt_regs, saved_context, syscall counts. */
+void foo(void)
+{
+ OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+ OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+ OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+ OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
+ OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+ OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+ OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+ BLANK();
+
+ OFFSET(PT_EBX, pt_regs, bx);
+ OFFSET(PT_ECX, pt_regs, cx);
+ OFFSET(PT_EDX, pt_regs, dx);
+ OFFSET(PT_ESI, pt_regs, si);
+ OFFSET(PT_EDI, pt_regs, di);
+ OFFSET(PT_EBP, pt_regs, bp);
+ OFFSET(PT_EAX, pt_regs, ax);
+ OFFSET(PT_DS, pt_regs, ds);
+ OFFSET(PT_ES, pt_regs, es);
+ OFFSET(PT_FS, pt_regs, fs);
+ OFFSET(PT_GS, pt_regs, gs);
+ OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
+ OFFSET(PT_EIP, pt_regs, ip);
+ OFFSET(PT_CS, pt_regs, cs);
+ OFFSET(PT_EFLAGS, pt_regs, flags);
+ OFFSET(PT_OLDESP, pt_regs, sp);
+ OFFSET(PT_OLDSS, pt_regs, ss);
+ BLANK();
+
+ OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
+ BLANK();
+
+ /*
+ * Offset from the entry stack to task stack stored in TSS. Kernel entry
+ * happens on the per-cpu entry-stack, and the asm code switches to the
+ * task-stack pointer stored in x86_tss.sp1, which is a copy of
+ * task->thread.sp0 where entry code can find it.
+ */
+ DEFINE(TSS_entry2task_stack,
+ offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
+ offsetofend(struct cpu_entry_area, entry_stack_page.stack));
+
+#ifdef CONFIG_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
+ BLANK();
+ DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); /* highest listed syscall number */
+ DEFINE(NR_syscalls, sizeof(syscalls)); /* table length: highest number + 1 */
+}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644
index 0000000..3b9405e
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
+#include <asm/ia32.h>
+/* Marker arrays: element [nr] is 1 for each listed syscall, so sizeof() is max nr + 1. */
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
+};
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#include <asm/kvm_para.h>
+#endif
+/* Emit the 64-bit-only offsets: paravirt, pt_regs, saved_context, TSS, syscall counts. */
+int main(void)
+{
+#ifdef CONFIG_PARAVIRT
+ OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+ OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
+ BLANK();
+#endif
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
+ BLANK();
+#endif
+ /* ENTRY(x) emits pt_regs_x as the offset of pt_regs.x. */
+#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
+ ENTRY(bx);
+ ENTRY(cx);
+ ENTRY(dx);
+ ENTRY(sp);
+ ENTRY(bp);
+ ENTRY(si);
+ ENTRY(di);
+ ENTRY(r8);
+ ENTRY(r9);
+ ENTRY(r10);
+ ENTRY(r11);
+ ENTRY(r12);
+ ENTRY(r13);
+ ENTRY(r14);
+ ENTRY(r15);
+ ENTRY(flags);
+ BLANK();
+#undef ENTRY
+ /* ENTRY is redefined here to emit saved_context_x for saved_context.x. */
+#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
+ ENTRY(cr0);
+ ENTRY(cr2);
+ ENTRY(cr3);
+ ENTRY(cr4);
+ ENTRY(cr8);
+ ENTRY(gdt_desc);
+ BLANK();
+#undef ENTRY
+
+ OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ BLANK();
+
+#ifdef CONFIG_STACKPROTECTOR
+ DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ BLANK();
+#endif
+
+ DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
+ DEFINE(NR_syscalls, sizeof(syscalls_64));
+
+ DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
+ DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
new file mode 100644
index 0000000..e1efe44
--- /dev/null
+++ b/arch/x86/kernel/audit_64.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <asm/unistd.h>
+/* Syscall-number tables, one per audit class; each list is terminated by ~0U. */
+static unsigned dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+static unsigned chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+static unsigned signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+/* Return 1 if @arch is AUDIT_ARCH_I386 and IA32 emulation is built in, else 0. */
+int audit_classify_arch(int arch)
+{
+ int compat = 0;
+#ifdef CONFIG_IA32_EMULATION
+ compat = (arch == AUDIT_ARCH_I386);
+#endif
+ return compat;
+}
+/* Map a syscall number to its classification code for the given ABI. */
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+ extern int ia32_classify_syscall(unsigned);
+
+ /* 32-bit compat callers are delegated to ia32_classify_syscall(). */
+ if (abi == AUDIT_ARCH_I386)
+ return ia32_classify_syscall(syscall);
+#endif
+ /* Native table: open -> 2, openat -> 3, execve/execveat -> 5, rest -> 0. */
+ if (syscall == __NR_open)
+ return 2;
+ if (syscall == __NR_openat)
+ return 3;
+ if (syscall == __NR_execve || syscall == __NR_execveat)
+ return 5;
+
+ return 0;
+}
+/* Initcall: register the per-class syscall tables (plus the ia32 ones if emulated). */
+static int __init audit_classes_init(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ extern __u32 ia32_dir_class[];
+ extern __u32 ia32_write_class[];
+ extern __u32 ia32_read_class[];
+ extern __u32 ia32_chattr_class[];
+ extern __u32 ia32_signal_class[];
+ audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+ audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
+ audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
+ audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
+ audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
+#endif
+ audit_register_class(AUDIT_CLASS_WRITE, write_class);
+ audit_register_class(AUDIT_CLASS_READ, read_class);
+ audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
+ audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
+ audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
+ return 0; /* return values of audit_register_class() are not checked */
+}
+
+__initcall(audit_classes_init);
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644
index 0000000..3fed7ae
--- /dev/null
+++ b/arch/x86/kernel/bootflag.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implement 'Simple Boot Flag Specification 2.0'
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/acpi.h>
+#include <asm/io.h>
+
+#include <linux/mc146818rtc.h>
+/* Bit layout of the Simple Boot Flag byte kept in CMOS. */
+#define SBF_RESERVED (0x78) /* bits 3-6 */
+#define SBF_PNPOS (1<<0) /* set by sbf_init() when CONFIG_ISAPNP is defined */
+#define SBF_BOOTING (1<<1) /* cleared by sbf_init() */
+#define SBF_DIAG (1<<2) /* cleared by sbf_init() */
+#define SBF_PARITY (1<<7) /* recomputed by sbf_write() */
+/* CMOS index of the boot flag byte; -1 means none is known. */
+int sbf_port __initdata = -1; /* set via acpi_boot_init() */
+/* Return 1 when @v has an odd number of set bits, 0 when the count is even. */
+static int __init parity(u8 v)
+{
+ /*
+ * XOR-fold the byte onto itself: after the three steps bit 0 holds
+ * the XOR of all eight input bits.
+ */
+ v ^= v >> 4;
+ v ^= v >> 2;
+ v ^= v >> 1;
+
+ return v & 1;
+}
+/* Recompute the parity bit of @v and store the byte at the boot-flag CMOS index. */
+static void __init sbf_write(u8 v)
+{
+ unsigned long irqflags;
+
+ if (sbf_port == -1) /* no boot-flag register known: nothing to do */
+ return;
+
+ v &= ~SBF_PARITY;
+ if (!parity(v))
+ v |= SBF_PARITY;
+ printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
+ sbf_port, v);
+
+ spin_lock_irqsave(&rtc_lock, irqflags);
+ CMOS_WRITE(v, sbf_port);
+ spin_unlock_irqrestore(&rtc_lock, irqflags);
+}
+/* Read the boot-flag byte from CMOS; yields 0 when no register is known. */
+static u8 __init sbf_read(void)
+{
+ unsigned long irqflags;
+ u8 val = 0;
+
+ if (sbf_port != -1) {
+ /* The CMOS access is done under rtc_lock with interrupts off. */
+ spin_lock_irqsave(&rtc_lock, irqflags);
+ val = CMOS_READ(sbf_port);
+ spin_unlock_irqrestore(&rtc_lock, irqflags);
+ }
+
+ return val;
+}
+
+static int __init sbf_value_valid(u8 v)
+{
+ /* Valid means: every reserved bit clear and odd overall parity. */
+ if ((v & SBF_RESERVED) == 0 && parity(v) != 0)
+ return 1;
+
+ /* A reserved bit is set, or the parity is even. */
+ return 0;
+}
+/* Initcall: sanitise the Simple Boot Flag byte and write it back to CMOS. */
+static int __init sbf_init(void)
+{
+ u8 flag;
+
+ if (sbf_port == -1)
+ return 0;
+
+ flag = sbf_read();
+ if (!sbf_value_valid(flag))
+ printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
+ "CMOS RAM was invalid\n", flag);
+
+ /*
+ * Drop the reserved, BOOTING and DIAG bits; an invalid value is
+ * only warned about and then rewritten, never rejected.
+ */
+ flag &= ~(SBF_RESERVED | SBF_BOOTING | SBF_DIAG);
+#if defined(CONFIG_ISAPNP)
+ flag |= SBF_PNPOS;
+#endif
+ sbf_write(flag);
+ return 0;
+}
+arch_initcall(sbf_init);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 0000000..cc8258a
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <linux/memblock.h>
+
+#include <asm/proto.h>
+
+/*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable. Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+#define MAX_SCAN_AREAS 8 /* capacity of scan_areas[] */
+
+static int __read_mostly memory_corruption_check = -1; /* -1: not set on the command line */
+
+static unsigned __read_mostly corruption_check_size = 64*1024; /* bytes */
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+static struct scan_area {
+ u64 addr; /* physical start; converted with __va() before it is read */
+ u64 size; /* length in bytes */
+} scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas; /* number of scan_areas[] entries in use */
+/* Parse "memory_corruption_check=N": 0 disables the scan, non-zero enables it. */
+static __init int set_corruption_check(char *arg)
+{
+ int ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check config string not provided\n");
+ return -EINVAL;
+ }
+
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
+ /* Store a plain 0/1: narrowing val to int could give 0 or the -1 "unset" marker. */
+ memory_corruption_check = !!val;
+ return 0;
+}
+early_param("memory_corruption_check", set_corruption_check);
+/* Parse "memory_corruption_check_period=SECONDS"; 0 turns the periodic scan off. */
+static __init int set_corruption_check_period(char *arg)
+{
+ int ret;
+ unsigned int val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check_period config string not provided\n");
+ return -EINVAL;
+ }
+ /* kstrtouint() rejects values that do not fit instead of truncating them. */
+ ret = kstrtouint(arg, 10, &val);
+ if (ret)
+ return ret;
+
+ corruption_check_period = val;
+ return 0;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+/* Parse "memory_corruption_check_size=SIZE" (memparse() syntax, e.g. 64K). */
+static __init int set_corruption_check_size(char *arg)
+{
+ char *end;
+ unsigned long long size;
+
+ if (!arg) {
+ pr_err("memory_corruption_check_size config string not provided\n");
+ return -EINVAL;
+ }
+
+ size = memparse(arg, &end);
+ /* Reject trailing junk and values that do not fit the unsigned setting. */
+ if (*end != '\0' || size > UINT_MAX)
+ return -EINVAL;
+ corruption_check_size = size;
+ return 0;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+/* Early boot: reserve and zero the free page ranges below corruption_check_size. */
+void __init setup_bios_corruption_check(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+ /* Not set on the command line: fall back to the Kconfig default. */
+ if (memory_corruption_check == -1) {
+ memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ 1
+#else
+ 0
+#endif
+ ;
+ }
+
+ if (corruption_check_size == 0)
+ memory_corruption_check = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+ /* Both bounds are clamped to [PAGE_SIZE, corruption_check_size]: page 0 is never taken. */
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
+ start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ if (start >= end)
+ continue;
+
+ memblock_reserve(start, end - start);
+ scan_areas[num_scan_areas].addr = start;
+ scan_areas[num_scan_areas].size = end - start;
+
+ /* Assume we've already mapped this early memory */
+ memset(__va(start), 0, end - start);
+ /* Stop once scan_areas[] is full. */
+ if (++num_scan_areas >= MAX_SCAN_AREAS)
+ break;
+ }
+
+ if (num_scan_areas)
+ printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
+}
+
+/* Scan every reserved area; report and re-zero each word that is no longer 0. */
+void check_for_bios_corruption(void)
+{
+ int i;
+ int corruption = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ for (i = 0; i < num_scan_areas; i++) {
+ unsigned long *addr = __va(scan_areas[i].addr);
+ unsigned long size = scan_areas[i].size;
+ /* Walk the area one unsigned long at a time. */
+ for (; size; addr++, size -= sizeof(unsigned long)) {
+ if (!*addr)
+ continue;
+ printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+ addr, __pa(addr), *addr);
+ corruption = 1;
+ *addr = 0; /* re-arm: the same word is reported again only if it changes again */
+ }
+ }
+
+ WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+/* Periodic worker: run one scan, then re-arm after corruption_check_period seconds. */
+static void check_corruption(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
+/* The delay is widened to unsigned long so period * HZ cannot wrap at 32 bits on 64-bit. */
+static void check_corruption(struct work_struct *dummy)
+{
+ check_for_bios_corruption();
+ schedule_delayed_work(&bios_check_work,
+ round_jiffies_relative((unsigned long)corruption_check_period * HZ));
+}
+/* Initcall: start the periodic scan if areas were reserved and the check is enabled. */
+static int start_periodic_check_for_corruption(void)
+{
+ if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
+ return 0;
+
+ printk(KERN_INFO "Scanning for low memory corruption every %u seconds\n",
+ corruption_check_period);
+
+ /* First time we run the checks right away */
+ schedule_delayed_work(&bios_check_work, 0);
+ return 0;
+}
+device_initcall(start_periodic_check_for_corruption);
+
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 0000000..667df55
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
+capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
new file mode 100644
index 0000000..347137e
--- /dev/null
+++ b/arch/x86/kernel/cpu/Makefile
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for x86-compatible CPU details, features and quirks
+#
+
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
+endif
+
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
+# Make sure load_percpu_segment has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_common.o := $(nostackp)
+# Objects that are always built
+obj-y := cacheinfo.o scattered.o topology.o
+obj-y += common.o
+obj-y += rdrand.o
+obj-y += match.o
+obj-y += bugs.o
+obj-y += aperfmperf.o
+obj-y += cpuid-deps.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+# Vendor-specific code, selected by the CONFIG_CPU_SUP_* options
+obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o
+obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
+obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
+obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
+obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o
+CFLAGS_intel_rdt_pseudo_lock.o = -I$(src)
+
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+
+obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+# capflags.c is generated from cpufeatures.h by mkcapflags.sh
+ifdef CONFIG_X86_FEATURE_NAMES
+quiet_cmd_mkcapflags = MKCAP $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+
+cpufeature = $(src)/../../include/asm/cpufeatures.h
+
+targets += capflags.c
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+ $(call if_changed,mkcapflags)
+endif
+clean-files += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
new file mode 100644
index 0000000..eeea634
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd.c
@@ -0,0 +1,1095 @@
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+
+#include <linux/io.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/random.h>
+#include <asm/processor.h>
+#include <asm/apic.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpu.h>
+#include <asm/spec-ctrl.h>
+#include <asm/smp.h>
+#include <asm/pci-direct.h>
+#include <asm/delay.h>
+
+#ifdef CONFIG_X86_64
+# include <asm/mmconfig.h>
+# include <asm/set_memory.h>
+#endif
+
+#include "cpu.h"
+/* Forward declarations; the definitions are not in this part of the file. */
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
+/*
+ * nodes_per_socket: Stores the number of nodes per socket.
+ * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
+ * Node Identifiers[10:8]
+ */
+static u32 nodes_per_socket = 1;
+/* Read @msr via rdmsr_safe_regs() and store the 64-bit value in *@p; returns its status. */
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ /* regs[1]: MSR number, regs[7]: key value; all other slots start as 0. */
+ u32 regs[8] = { [1] = msr, [7] = 0x9c5a203a };
+ int ret;
+
+ WARN_ONCE(boot_cpu_data.x86 != 0xf,
+ "%s should only be used on K8!\n", __func__);
+
+ ret = rdmsr_safe_regs(regs);
+
+ /* regs[0] holds the low and regs[2] the high 32 bits of the result. */
+ /* *p is written even when the read failed, so callers must check ret. */
+ *p = ((u64)regs[2] << 32) | regs[0];
+
+ return ret;
+}
+/* Write @val to @msr via wrmsr_safe_regs(); returns that helper's status. */
+static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+{
+ u32 regs[8] = {
+ [0] = (u32)val, /* low 32 bits */
+ [1] = msr,
+ [2] = val >> 32, /* high 32 bits */
+ [7] = 0x9c5a203a, /* key value, same as in rdmsrl_amd_safe() */
+ };
+
+ WARN_ONCE(boot_cpu_data.x86 != 0xf,
+ "%s should only be used on K8!\n", __func__);
+
+ return wrmsr_safe_regs(regs);
+}
+
+/*
+ * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
+ * misexecution of code under Linux. Owners of such processors should
+ * contact AMD for precise details and a CPU swap.
+ *
+ * See http://www.multimania.com/poulot/k6bug.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
+ *
+ * The following test is erm.. interesting. AMD neglected to up
+ * the chip setting when fixing the bug but they also tweaked some
+ * performance at the same time..
+ */
+
+extern __visible void vide(void); /* empty function (a lone "ret"), defined in the asm below */
+__asm__(".globl vide\n"
+ ".type vide, @function\n"
+ ".align 4\n"
+ "vide: ret\n");
+/* 32-bit only: on models 9 and 10, drop the CBAR alias if its enable bit is set. */
+static void init_amd_k5(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround : Remove the unneeded alias.
+ */
+#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
+#define CBAR_ENB (0x80000000)
+#define CBAR_KEY (0X000000CB)
+ if (c->x86_model == 9 || c->x86_model == 10) {
+ if (inl(CBAR) & CBAR_ENB) /* alias currently enabled */
+ outl(0 | CBAR_KEY, CBAR); /* writes the key with the enable bit clear */
+ }
+#endif
+}
+/* 32-bit only: K6 feature fixups, the stepping-B timing probe and WHCR write allocation. */
+static void init_amd_k6(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+ int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
+ /* mbytes: presumably physical memory in MB (page count >> (20 - PAGE_SHIFT)). */
+ if (c->x86_model < 6) {
+ /* Based on AMD doc 20734R - June 2000 */
+ if (c->x86_model == 0) {
+ clear_cpu_cap(c, X86_FEATURE_APIC);
+ set_cpu_cap(c, X86_FEATURE_PGE);
+ }
+ return;
+ }
+
+ if (c->x86_model == 6 && c->x86_stepping == 1) {
+ const int K6_BUG_LOOP = 1000000;
+ int n;
+ void (*f_vide)(void);
+ u64 d, d2;
+
+ pr_info("AMD K6 stepping B detected - ");
+
+ /*
+ * It looks like AMD fixed the 2.6.2 bug and improved indirect
+ * calls at the same time.
+ */
+ /* Time K6_BUG_LOOP indirect calls to the empty vide() with the TSC. */
+ n = K6_BUG_LOOP;
+ f_vide = vide;
+ OPTIMIZER_HIDE_VAR(f_vide);
+ d = rdtsc();
+ while (n--)
+ f_vide();
+ d2 = rdtsc();
+ d = d2-d;
+ /* More than 20 TSC ticks per call on average is taken as the buggy part. */
+ if (d > 20*K6_BUG_LOOP)
+ pr_cont("system stability may be impaired when more than 32 MB are used.\n");
+ else
+ pr_cont("probably OK (after B9730xxxx).\n");
+ }
+
+ /* K6 with old style WHCR */
+ if (c->x86_model < 8 ||
+ (c->x86_model == 8 && c->x86_stepping < 8)) {
+ /* We can only write allocate on the low 508Mb */
+ if (mbytes > 508)
+ mbytes = 508;
+
+ rdmsr(MSR_K6_WHCR, l, h);
+ if ((l&0x0000FFFF) == 0) { /* low 16 bits all clear: not programmed yet */
+ unsigned long flags;
+ l = (1<<0)|((mbytes/4)<<1); /* bit 0 set; size in 4 MB units from bit 1 up */
+ local_irq_save(flags);
+ wbinvd();
+ wrmsr(MSR_K6_WHCR, l, h);
+ local_irq_restore(flags);
+ pr_info("Enabling old style K6 write allocation for %d Mb\n",
+ mbytes);
+ }
+ return;
+ }
+
+ if ((c->x86_model == 8 && c->x86_stepping > 7) ||
+ c->x86_model == 9 || c->x86_model == 13) {
+ /* The more serious chips .. */
+
+ if (mbytes > 4092)
+ mbytes = 4092;
+
+ rdmsr(MSR_K6_WHCR, l, h);
+ if ((l&0xFFFF0000) == 0) { /* high 16 bits all clear: not programmed yet */
+ unsigned long flags;
+ l = ((mbytes>>2)<<22)|(1<<16); /* bit 16 set; size in 4 MB units from bit 22 up */
+ local_irq_save(flags);
+ wbinvd();
+ wrmsr(MSR_K6_WHCR, l, h);
+ local_irq_restore(flags);
+ pr_info("Enabling new style K6 write allocation for %d Mb\n",
+ mbytes);
+ }
+
+ return;
+ }
+
+ if (c->x86_model == 10) {
+ /* AMD Geode LX is model 10 */
+ /* placeholder for any needed mods */
+ return;
+ }
+#endif
+}
+/* 32-bit only: enable SSE if off, fix CLK_CTL, and flag non-MP parts used in SMP. */
+static void init_amd_k7(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ pr_info("Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) { /* top 12 bits are not 0x200 */
+ pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ /* calling is from identify_secondary_cpu() ? */
+ if (!c->cpu_index)
+ return;
+ /* cpu_index 0 returned above, so the MP checks below run only for other CPUs. */
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+ (c->x86_stepping == 1)))
+ return;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_stepping == 0))
+ return;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has(c, X86_FEATURE_MP))
+ return;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ WARN_ONCE(1, "WARNING: This combination of AMD"
+ " processors is not suitable for SMP.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+#endif
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Workaround for broken NUMA configurations, see the comment in
+ * srat_detect_node(): scan lower APIC IDs first, then higher ones.
+ */
+static int nearby_node(int apicid)
+{
+	int id = apicid, nid;
+
+	while (--id >= 0) {
+		nid = __apicid_to_node[id];
+		if (nid != NUMA_NO_NODE && node_online(nid))
+			return nid;
+	}
+	for (id = apicid + 1; id < MAX_LOCAL_APIC; id++) {
+		nid = __apicid_to_node[id];
+		if (nid != NUMA_NO_NODE && node_online(nid))
+			return nid;
+	}
+	return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * Pre-F17h only: wrap cpu_core_id into the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+	u32 per_node;
+
+	/* Family 0x17 and later keep their core id untouched. */
+	if (c->x86 < 0x17) {
+		per_node = c->x86_max_cores / nodes_per_socket;
+		c->cpu_core_id = c->cpu_core_id % per_node;
+	}
+}
+
+/* With TOPOEXT, set smp_num_siblings to CPUID 0x8000001e EBX[15:8] + 1. */
+static void amd_get_topology_early(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+}
+
+/*
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ * Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
+ */
+static void amd_get_topology(struct cpuinfo_x86 *c)
+{
+ u8 node_id;
+ int cpu = smp_processor_id();
+
+ /* TOPOEXT: node id and CU/core id come from CPUID leaf 0x8000001e */
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int err;
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+
+ node_id = ecx & 0xff;
+
+ if (c->x86 == 0x15)
+ c->cu_id = ebx & 0xff;
+
+ if (c->x86 >= 0x17) {
+ c->cpu_core_id = ebx & 0xff;
+
+ if (smp_num_siblings > 1)
+ c->x86_max_cores /= smp_num_siblings;
+ }
+
+ /*
+ * In case leaf B is available, use it to derive
+ * topology information.
+ */
+ err = detect_extended_topology(c);
+ if (!err)
+ c->x86_coreid_bits = get_count_order(c->x86_max_cores);
+
+ cacheinfo_amd_init_llc_id(c, cpu, node_id);
+
+ } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ node_id = value & 7;
+
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ } else
+ return;
+
+ if (nodes_per_socket > 1) {
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+ legacy_fixup_core_id(c);
+ }
+}
+
+/*
+ * On an AMD multi-core setup the lower x86_coreid_bits bits of the initial
+ * APIC id distinguish the cores. Assumes number of cores is a power of two.
+ */
+static void amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+ unsigned bits;
+ int cpu = smp_processor_id();
+
+ bits = c->x86_coreid_bits;
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+ /* use socket ID also for last level cache */
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+}
+
+u16 amd_get_nb_id(int cpu)
+{
+ return per_cpu(cpu_llc_id, cpu); /* the per-CPU LLC id is reported as NB id */
+}
+EXPORT_SYMBOL_GPL(amd_get_nb_id);
+
+u32 amd_get_nodes_per_socket(void)
+{
+ return nodes_per_socket; /* file-scope value, computed in bsp_init_amd() */
+}
+EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned apicid = c->apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = per_cpu(cpu_llc_id, cpu);
+
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs
+ * which the K8 northbridge parsing fills in. Assume
+ * they are all increased by a constant offset, but in
+ * the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Still offline: fall back to the node of a nearby APIC ID */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits, ecx;
+
+ /* Core count lives in CPUID 0x80000008; nothing to do without it */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute: smallest bits with (1 << bits) >= max cores */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+#endif
+}
+
+static void bsp_init_amd(struct cpuinfo_x86 *c)
+{
+ /* Hooked up as the .c_bsp_init callback of amd_cpu_dev. */
+#ifdef CONFIG_X86_64
+ if (c->x86 >= 0xf) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ pr_debug("tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
+
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
+
+ rdmsrl(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
+ }
+
+ if (c->x86 == 0x15) {
+ unsigned long upperbit;
+ u32 cpuid, assoc;
+
+ cpuid = cpuid_edx(0x80000005);
+ assoc = cpuid >> 16 & 0xff;
+ upperbit = ((cpuid >> 24) << 10) / assoc;
+
+ va_align.mask = (upperbit - 1) & PAGE_MASK;
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_int() & va_align.mask;
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ u32 ecx;
+
+ ecx = cpuid_ecx(0x8000001e);
+ nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes_per_socket = ((value >> 3) & 7) + 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
+ c->x86 >= 0x15 && c->x86 <= 0x17) {
+ unsigned int bit; /* per-family SSBD bit position in MSR_AMD64_LS_CFG */
+
+ switch (c->x86) {
+ case 0x15: bit = 54; break;
+ case 0x16: bit = 33; break;
+ case 0x17: bit = 10; break;
+ default: return;
+ }
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+ }
+ }
+}
+
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ /*
+ * BIOS support is required for SME and SEV.
+ * For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+ * the SME physical address space reduction value.
+ * If BIOS has not enabled SME then don't advertise the
+ * SME feature (set in scattered.c).
+ * For SEV: If BIOS has not enabled SEV then don't advertise the
+ * SEV feature (set in scattered.c).
+ *
+ * In all cases, since support for SME and SEV requires long mode,
+ * don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+ /* Drop both features unless SYSCFG has MEM_ENCRYPT set */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ goto clear_all;
+
+ /*
+ * Always adjust physical address bits. Even though this
+ * will be a value above 32-bits this is still done for
+ * CONFIG_X86_32 so that accurate values are reported.
+ */
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto clear_all;
+
+ rdmsrl(MSR_K7_HWCR, msr);
+ if (!(msr & MSR_K7_HWCR_SMMLOCK))
+ goto clear_sev;
+
+ return;
+
+clear_all:
+ clear_cpu_cap(c, X86_FEATURE_SME);
+clear_sev:
+ clear_cpu_cap(c, X86_FEATURE_SEV);
+ }
+}
+
+static void early_init_amd(struct cpuinfo_x86 *c)
+{
+ u64 value;
+ u32 dummy; /* receives the unused second half of the PATCH_LEVEL read */
+
+ early_init_amd_mc(c);
+
+#ifdef CONFIG_X86_32
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_K7);
+#endif
+
+ if (c->x86 >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#else
+ /* Set MTRR capability flag if appropriate */
+ if (c->x86 == 5)
+ if (c->x86_model == 13 || c->x86_model == 9 ||
+ (c->x86_model == 8 && c->x86_stepping >= 8))
+ set_cpu_cap(c, X86_FEATURE_K6_MTRR);
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /*
+ * ApicID can always be treated as an 8-bit value for AMD APIC versions
+ * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+ * after 16h.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ if (c->x86 > 0x16)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ else if (c->x86 >= 0xf) {
+ /* check CPU config space for extended APIC ID */
+ unsigned int val;
+
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val >> 17 & 0x3) == 0x3)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
+ }
+#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
+
+ /*
+ * Check whether the machine is affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the check
+ * whether the machine is affected in arch_post_acpi_init(), which
+ * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
+ set_cpu_bug(c, X86_BUG_AMD_E400);
+
+ early_detect_mem_encrypt(c);
+
+ /* F15h models 0x10-0x6f: re-enable TopologyExtensions if BIOS disabled it */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+
+ if (msr_set_bit(0xc0011005, 54) > 0) {
+ rdmsrl(0xc0011005, value);
+ if (value & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+ }
+ }
+
+ amd_get_topology_early(c);
+}
+
+static void init_amd_k8(struct cpuinfo_x86 *c)
+{
+ u32 level;
+ u64 value;
+
+ /* C+ stepping K8 (per CPUID.1 EAX): rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
+ */
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrl_amd_safe(0xc001100d, value);
+ }
+ }
+
+ if (!c->x86_model_id[0])
+ strcpy(c->x86_model_id, "Hammer");
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
+ set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MMCONF_FAM10H
+ /* the DMI based MMCONF check is only done for the boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
+
+ /*
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
+ */
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
+
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
+
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+}
+
+#define MSR_AMD64_DE_CFG 0xC0011029
+
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
+ /*
+ * Apply erratum 665 fix (set bit 31 of MSR_AMD64_DE_CFG)
+ * unconditionally so machines without a BIOS fix work.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
+
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+	u64 ic_cfg;
+
+	/* Only models 0x02..0x1f get the way access filter workaround. */
+	if (c->x86_model < 0x02 || c->x86_model >= 0x20)
+		return;
+
+	/* Leave the MSR alone if it is unreadable or any of bits 4:1 is set. */
+	if (rdmsrl_safe(MSR_F15H_IC_CFG, &ic_cfg) || (ic_cfg & 0x1E))
+		return;
+
+	/* The filter has a performance penalty on some workloads: disable it. */
+	wrmsrl_safe(MSR_F15H_IC_CFG, ic_cfg | 0x1E);
+}
+
+static void init_amd_zn(struct cpuinfo_x86 *c)
+{
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+ /*
+ * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
+ * all up to and including B1 (checked as model <= 1, stepping <= 1).
+ */
+ if (c->x86_model <= 1 && c->x86_stepping <= 1)
+ set_cpu_cap(c, X86_FEATURE_CPB);
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ /* K6s report MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
+ case 0x15: init_amd_bd(c); break;
+ case 0x17: init_amd_zn(c); break;
+ }
+
+ /*
+ * Enable workaround for FXSAVE leak on CPUs
+ * without a XSaveErPtr feature
+ */
+ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
+
+ cpu_detect_cache_sizes(c);
+
+ amd_detect_cmp(c);
+ amd_get_topology(c);
+ srat_detect_node(c);
+
+ init_amd_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_XMM2)) {
+ unsigned long long val;
+ int ret;
+
+ /*
+ * A serializing LFENCE has less overhead than MFENCE, so
+ * use it for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_F10H_DECFG,
+ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+
+ /*
+ * Verify that the MSR write was successful (could be running
+ * under a hypervisor) and only then assume that LFENCE is
+ * serializing.
+ */
+ ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
+ if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ } else {
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ }
+ }
+
+ /*
+ * Family 0x12 and above processors have APIC timer
+ * running in deep C states.
+ */
+ if (c->x86 > 0x11)
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_has(c, X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+	/* AMD errata T13 (order #21922) only concerns family 6 */
+	if (c->x86 != 6)
+		return size;
+	/* Tbird rev A1/A2 */
+	if (c->x86_model == 4 &&
+	    (c->x86_stepping == 0 || c->x86_stepping == 1))
+		return 256;
+	/* Duron Rev A0 */
+	if (c->x86_model == 3 && c->x86_stepping == 0)
+		return 64;
+	return size;
+}
+#endif
+
+static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff; /* field width for leaf 0x80000006; 0xff once K8 switches to 0x80000005 */
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+ tlb_lli_4k[ENTRIES] = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m[ENTRIES] = 1024;
+ } else {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m[ENTRIES] = eax & 0xff;
+ }
+ } else
+ tlb_lli_2m[ENTRIES] = eax & mask;
+
+ tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+}
+
+static const struct cpu_dev amd_cpu_dev = {
+ .c_vendor = "AMD",
+ .c_ident = { "AuthenticAMD" },
+#ifdef CONFIG_X86_32
+ .legacy_models = {
+ { .family = 4, .model_names =
+ {
+ [3] = "486 DX/2",
+ [7] = "486 DX/2-WB",
+ [8] = "486 DX/4",
+ [9] = "486 DX/4-WB",
+ [14] = "Am5x86-WT",
+ [15] = "Am5x86-WB"
+ }
+ },
+ },
+ .legacy_cache_size = amd_size_cache, /* errata T13 cache size overrides */
+#endif
+ .c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
+ .c_bsp_init = bsp_init_amd,
+ .c_init = init_amd, /* init_amd() itself calls early_init_amd() first */
+ .c_x86_vendor = X86_VENDOR_AMD,
+};
+
+cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(), which packs the family into bits 31:24, the start
+ * model/stepping into bits 23:12 and the end model/stepping into bits 11:0.
+ *
+ * Example: const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+	(((f) << 24) | ((m_start) << 16) | ((s_start) << 12) | \
+	 ((m_end) << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+static const int amd_erratum_400[] = /* OSVW id 1, else the F/M/S ranges below */
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+
+static const int amd_erratum_383[] = /* OSVW id 3, else any family 0x10 part */
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+
+/* Test @erratum via OSVW if its id is covered, else by family/model/stepping. */
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
+{
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 4) | cpu->x86_stepping;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
+void set_dr_addr_mask(unsigned long mask, int dr)
+{
+	unsigned int msr;
+
+	/* Nothing is written without the BPEXT feature. */
+	if (!boot_cpu_has(X86_FEATURE_BPEXT))
+		return;
+
+	/* DR0 has its own MSR; DR1..DR3 are indexed from the DR1 MSR. */
+	if (dr == 0)
+		msr = MSR_F16H_DR0_ADDR_MASK;
+	else if (dr >= 1 && dr <= 3)
+		msr = MSR_F16H_DR1_ADDR_MASK - 1 + dr;
+	else
+		return;
+
+	wrmsr(msr, mask, 0);
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 0000000..7eba34d
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,127 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+#include "cpu.h"
+
+struct aperfmperf_sample {
+ unsigned int khz; /* frequency computed at the last snapshot */
+ ktime_t time; /* ktime_get() at the last snapshot */
+ u64 aperf; /* MSR_IA32_APERF at the last snapshot */
+ u64 mperf; /* MSR_IA32_MPERF at the last snapshot */
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+#define APERFMPERF_CACHE_THRESHOLD_MS 10 /* younger samples are reused */
+#define APERFMPERF_REFRESH_DELAY_MS 10 /* msleep() time after a stale sample */
+#define APERFMPERF_STALE_THRESHOLD_MS 1000 /* older samples count as stale */
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF and ktime_get(),
+ * calculate kHz from the deltas and save the snapshot.
+ * (The 10ms caching check is done in aperfmperf_snapshot_cpu().)
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+ u64 aperf, aperf_delta;
+ u64 mperf, mperf_delta;
+ struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ local_irq_restore(flags);
+
+ aperf_delta = aperf - s->aperf;
+ mperf_delta = mperf - s->mperf;
+
+ /*
+ * There is no architectural guarantee that MPERF
+ * increments faster than we can read it.
+ */
+ if (mperf_delta == 0)
+ return;
+
+ s->time = ktime_get();
+ s->aperf = aperf;
+ s->mperf = mperf;
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+}
+
+static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
+{
+	s64 age_ms = ktime_ms_delta(now, per_cpu(samples.time, cpu));
+
+	/* A sample younger than the cache threshold is reused as-is. */
+	if (age_ms >= APERFMPERF_CACHE_THRESHOLD_MS) {
+		smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+
+		/* Report false when the previous sample was too old. */
+		return age_ms <= APERFMPERF_STALE_THRESHOLD_MS;
+	}
+	return true;
+}
+
+unsigned int aperfmperf_get_khz(int cpu)
+{
+	/* Report 0 when cpu_khz is zero or APERF/MPERF is not available. */
+	if (!cpu_khz || !static_cpu_has(X86_FEATURE_APERFMPERF))
+		return 0;
+
+	/* Refresh the sample, waiting for it, unless it is recent enough. */
+	aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
+
+	return per_cpu(samples.khz, cpu);
+}
+
+void arch_freq_prepare_all(void)
+{
+	ktime_t now = ktime_get();
+	bool stale = false;
+	int cpu;
+
+	if (!cpu_khz || !static_cpu_has(X86_FEATURE_APERFMPERF))
+		return;
+
+	/* Request a refresh on every online CPU without waiting for it. */
+	for_each_online_cpu(cpu) {
+		if (!aperfmperf_snapshot_cpu(cpu, now, false))
+			stale = true;
+	}
+
+	/* At least one previous sample was stale: sleep for the refresh delay. */
+	if (stale)
+		msleep(APERFMPERF_REFRESH_DELAY_MS);
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+	if (!cpu_khz || !static_cpu_has(X86_FEATURE_APERFMPERF))
+		return 0;
+
+	/*
+	 * The previous sample was stale: sleep for the refresh delay
+	 * and take a second snapshot before reporting.
+	 */
+	if (!aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) {
+		msleep(APERFMPERF_REFRESH_DELAY_MS);
+		smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+	}
+
+	return per_cpu(samples.khz, cpu);
+}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
new file mode 100644
index 0000000..78928f5
--- /dev/null
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -0,0 +1,1174 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Cyrix stuff, June 1998 by:
+ * - Rafael R. Reilova (moved everything from head.S),
+ * <rreilova@ececs.uc.edu>
+ * - Channing Corn (tests & fixes),
+ * - Andrew D. Balsa (code cleanup).
+ */
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/smt.h>
+
+#include <asm/spec-ctrl.h>
+#include <asm/cmdline.h>
+#include <asm/bugs.h>
+#include <asm/processor.h>
+#include <asm/processor-flags.h>
+#include <asm/fpu/internal.h>
+#include <asm/msr.h>
+#include <asm/vmx.h>
+#include <asm/paravirt.h>
+#include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/intel-family.h>
+#include <asm/e820/api.h>
+#include <asm/hypervisor.h>
+
+static void __init spectre_v2_select_mitigation(void);
+static void __init ssb_select_mitigation(void);
+static void __init l1tf_select_mitigation(void);
+
+/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+u64 x86_spec_ctrl_base;
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+static DEFINE_MUTEX(spec_ctrl_mutex);
+
+/*
+ * The vendor and possibly platform specific bits which can be modified in
+ * x86_spec_ctrl_base.
+ */
+static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+
+/* Control conditional STIBP in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+void __init check_bugs(void)
+{
+ identify_boot_cpu();
+
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_check_topology_early();
+
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /* Allow STIBP in MSR_SPEC_CTRL if supported */
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+
+ /* Select the proper spectre mitigation before patching alternatives */
+ spectre_v2_select_mitigation();
+
+ /*
+ * Select proper mitigation for any exposure to the Speculative Store
+ * Bypass vulnerability.
+ */
+ ssb_select_mitigation();
+
+ l1tf_select_mitigation();
+
+#ifdef CONFIG_X86_32
+ /*
+ * Check whether we are able to run this kernel safely on SMP.
+ *
+ * - i386 is no longer supported (family < 4 panics below).
+ * - In order to run on anything without a TSC, we need to be
+ * compiled for an i486.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ alternative_instructions();
+
+ fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+ alternative_instructions();
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+#endif
+}
+
+void
+x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+{
+ u64 msrval, guestval, hostval = x86_spec_ctrl_base;
+ struct thread_info *ti = current_thread_info();
+
+ /* Is MSR_SPEC_CTRL implemented ? */
+ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+ /*
+ * Restrict guest_spec_ctrl to supported values. Clear the
+ * modifiable bits in the host base value and or the
+ * modifiable bits from the guest value.
+ */
+ guestval = hostval & ~x86_spec_ctrl_mask;
+ guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
+
+ /* SSBD controlled in MSR_SPEC_CTRL */
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD))
+ hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Conditional STIBP enabled? */
+ if (static_branch_unlikely(&switch_to_cond_stibp))
+ hostval |= stibp_tif_to_spec_ctrl(ti->flags);
+
+ if (hostval != guestval) {
+ msrval = setguest ? guestval : hostval;
+ wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
+ }
+ }
+
+ /*
+ * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+ * MSR_AMD64_LS_CFG or MSR_VIRT_SPEC_CTRL if supported.
+ */
+ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ return;
+
+ /*
+ * If the host has SSBD mitigation enabled, force it in the host's
+ * virtual MSR value. If it's not permanently enabled, evaluate
+ * current's TIF_SSBD thread flag.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+ hostval = SPEC_CTRL_SSBD;
+ else
+ hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Sanitize the guest value */
+ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+ if (hostval != guestval) {
+ unsigned long tif;
+
+ tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+ ssbd_spec_ctrl_to_tif(hostval);
+
+ speculation_ctrl_update(tif);
+ }
+}
+EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+ u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+ /* Prefer the VIRT_SPEC_CTRL MSR; else write LS_CFG base | SSBD mask. */
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ wrmsrl(MSR_AMD64_LS_CFG, msrval);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt /* prefix for the pr_*() calls that follow */
+
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+ SPECTRE_V2_NONE; /* initial value: no mitigation selected */
+
+static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+ SPECTRE_V2_USER_NONE; /* initial value: no user space mitigation selected */
+
+#ifdef RETPOLINE
+static bool spectre_v2_bad_module;
+
+bool retpoline_module_ok(bool has_retpoline)
+{
+	/* Complain about, and remember, a non-retpoline module while mitigated. */
+	if (!has_retpoline && spectre_v2_enabled != SPECTRE_V2_NONE) {
+		pr_err("System may be vulnerable to spectre v2\n");
+		spectre_v2_bad_module = true;
+		return false;
+	}
+	return true;
+}
+
+static inline const char *spectre_v2_module_string(void)
+{
+ return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
+}
+#else /* !RETPOLINE: no bad-module tracking, the suffix is always empty */
+static inline const char *spectre_v2_module_string(void) { return ""; }
+#endif /* RETPOLINE */
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+	const int optlen = strlen(opt);
+	/* Exact match only: @opt must be @arglen long and equal @arg's prefix. */
+	return (optlen != arglen) ? false : strncmp(arg, opt, optlen) == 0;
+}
+
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE, /* also forces SPECTRE_V2_USER_CMD_NONE */
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE, /* also forces SPECTRE_V2_USER_CMD_FORCE */
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_AMD,
+};
+
+enum spectre_v2_user_cmd {
+ SPECTRE_V2_USER_CMD_NONE, /* spectre_v2_user=off */
+ SPECTRE_V2_USER_CMD_AUTO, /* spectre_v2_user=auto, or option absent */
+ SPECTRE_V2_USER_CMD_FORCE, /* spectre_v2_user=on */
+ SPECTRE_V2_USER_CMD_PRCTL, /* spectre_v2_user=prctl */
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB, /* spectre_v2_user=prctl,ibpb */
+ SPECTRE_V2_USER_CMD_SECCOMP, /* spectre_v2_user=seccomp */
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB, /* spectre_v2_user=seccomp,ibpb */
+};
+
+/*
+ * Log messages for the user space mitigation modes, indexed by
+ * enum spectre_v2_user_mitigation.
+ */
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+/*
+ * Accepted values of the "spectre_v2_user=" command line option.
+ *
+ * @option: the option text matched with match_option()
+ * @cmd:    the command returned by spectre_v2_parse_user_cmdline()
+ * @secure: compared against the CPU's X86_BUG_SPECTRE_V2 state by
+ *          spec_v2_user_print_cond() to decide whether the choice is logged
+ */
+static const struct {
+ const char *option;
+ enum spectre_v2_user_cmd cmd;
+ bool secure;
+} v2_user_options[] __initdata = {
+ { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
+ { "off", SPECTRE_V2_USER_CMD_NONE, false },
+ { "on", SPECTRE_V2_USER_CMD_FORCE, true },
+ { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
+ { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
+ { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
+ { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
+};
+
+/*
+ * Log the "spectre_v2_user=" selection @reason, but only when @secure
+ * differs from whether the boot CPU has X86_BUG_SPECTRE_V2.
+ */
+static void __init spec_v2_user_print_cond(const char *reason, bool secure)
+{
+	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) == secure)
+		return;
+
+	pr_info("spectre_v2_user=%s forced on command line.\n", reason);
+}
+
+/*
+ * Determine the user space mitigation command.
+ *
+ * A spectre v2 command of NONE or FORCE dictates the result. Otherwise the
+ * "spectre_v2_user=" option is looked up in v2_user_options; a missing or
+ * unknown option yields SPECTRE_V2_USER_CMD_AUTO.
+ */
+static enum spectre_v2_user_cmd __init
+spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
+{
+	char opt[20];
+	int len, idx;
+
+	if (v2_cmd == SPECTRE_V2_CMD_NONE)
+		return SPECTRE_V2_USER_CMD_NONE;
+	if (v2_cmd == SPECTRE_V2_CMD_FORCE)
+		return SPECTRE_V2_USER_CMD_FORCE;
+
+	len = cmdline_find_option(boot_command_line, "spectre_v2_user",
+				  opt, sizeof(opt));
+	if (len < 0)
+		return SPECTRE_V2_USER_CMD_AUTO;
+
+	for (idx = 0; idx < ARRAY_SIZE(v2_user_options); idx++) {
+		if (!match_option(opt, len, v2_user_options[idx].option))
+			continue;
+
+		spec_v2_user_print_cond(v2_user_options[idx].option,
+					v2_user_options[idx].secure);
+		return v2_user_options[idx].cmd;
+	}
+
+	pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", opt);
+	return SPECTRE_V2_USER_CMD_AUTO;
+}
+
+/*
+ * Select the user space spectre v2 mitigation and set up IBPB.
+ *
+ * @v2_cmd is the kernel spectre v2 command; it is handed to
+ * spectre_v2_parse_user_cmdline() to obtain the user space command. The
+ * resulting mode is stored in spectre_v2_user, except when Enhanced IBRS is
+ * the kernel mitigation, in which case the function returns before the
+ * store. Nothing is done when the CPU has neither IBPB nor STIBP.
+ */
+static void __init
+spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
+{
+ enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
+ bool smt_possible = IS_ENABLED(CONFIG_SMP);
+ enum spectre_v2_user_cmd cmd;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ smt_possible = false;
+
+ cmd = spectre_v2_parse_user_cmdline(v2_cmd);
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ /* Skips the IBPB setup below; mode is still SPECTRE_V2_USER_NONE */
+ goto set_mode;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ mode = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ /* Fall back to prctl only control when seccomp is not built in */
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ mode = SPECTRE_V2_USER_SECCOMP;
+ else
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ }
+
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+
+ /* "on" and the ",ibpb" variants select always-on IBPB, the rest conditional */
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_FORCE:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+
+ /* If enhanced IBRS is enabled no STIBP required */
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ return;
+
+ /*
+ * If SMT is not possible or STIBP is not available clear the STIBP
+ * mode.
+ */
+ if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+ mode = SPECTRE_V2_USER_NONE;
+set_mode:
+ spectre_v2_user = mode;
+ /* Only print the STIBP mode when SMT possible */
+ if (smt_possible)
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
+}
+
+/*
+ * Status text for each kernel spectre v2 mitigation, indexed by
+ * enum spectre_v2_mitigation. Used for the boot log and by cpu_show_common().
+ */
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
+ [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
+ [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+};
+
+/*
+ * Accepted values of the "spectre_v2=" command line option.
+ *
+ * @option: the option text matched with match_option()
+ * @cmd:    the command returned by spectre_v2_parse_cmdline()
+ * @secure: compared against the CPU's X86_BUG_SPECTRE_V2 state by
+ *          spec_v2_print_cond() to decide whether the choice is logged
+ */
+static const struct {
+ const char *option;
+ enum spectre_v2_mitigation_cmd cmd;
+ bool secure;
+} mitigation_options[] __initdata = {
+ { "off", SPECTRE_V2_CMD_NONE, false },
+ { "on", SPECTRE_V2_CMD_FORCE, true },
+ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "auto", SPECTRE_V2_CMD_AUTO, false },
+};
+
+/*
+ * Log the "spectre_v2=" selection @reason, but only when @secure differs
+ * from whether the boot CPU has X86_BUG_SPECTRE_V2.
+ */
+static void __init spec_v2_print_cond(const char *reason, bool secure)
+{
+	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) == secure)
+		return;
+
+	pr_info("%s selected on command line.\n", reason);
+}
+
+/*
+ * Parse "nospectre_v2" and "spectre_v2=" from the kernel command line.
+ *
+ * Returns SPECTRE_V2_CMD_NONE for "nospectre_v2", the command matching the
+ * "spectre_v2=" value otherwise, and SPECTRE_V2_CMD_AUTO when the option is
+ * absent, unknown, asks for retpoline without CONFIG_RETPOLINE, or asks for
+ * the AMD retpoline on a non-AMD CPU.
+ */
+static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+{
+	enum spectre_v2_mitigation_cmd sel;
+	bool wants_retpoline;
+	char opt[20];
+	int len, idx;
+
+	if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+		return SPECTRE_V2_CMD_NONE;
+
+	len = cmdline_find_option(boot_command_line, "spectre_v2", opt, sizeof(opt));
+	if (len < 0)
+		return SPECTRE_V2_CMD_AUTO;
+
+	/* Stop at the first table entry whose text equals the argument */
+	for (idx = 0; idx < ARRAY_SIZE(mitigation_options); idx++) {
+		if (match_option(opt, len, mitigation_options[idx].option))
+			break;
+	}
+
+	if (idx >= ARRAY_SIZE(mitigation_options)) {
+		pr_err("unknown option (%s). Switching to AUTO select\n", opt);
+		return SPECTRE_V2_CMD_AUTO;
+	}
+
+	sel = mitigation_options[idx].cmd;
+
+	wants_retpoline = sel == SPECTRE_V2_CMD_RETPOLINE ||
+			  sel == SPECTRE_V2_CMD_RETPOLINE_AMD ||
+			  sel == SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+
+	if (wants_retpoline && !IS_ENABLED(CONFIG_RETPOLINE)) {
+		pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[idx].option);
+		return SPECTRE_V2_CMD_AUTO;
+	}
+
+	if (sel == SPECTRE_V2_CMD_RETPOLINE_AMD &&
+	    boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+		pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+		return SPECTRE_V2_CMD_AUTO;
+	}
+
+	spec_v2_print_cond(mitigation_options[idx].option,
+			   mitigation_options[idx].secure);
+	return sel;
+}
+
+/*
+ * Select and enable the kernel's spectre v2 mitigation.
+ *
+ * The command line choice is parsed first. Enhanced IBRS is preferred for
+ * the "on" and "auto" commands when the CPU has it; otherwise one of the
+ * retpoline flavours is used if CONFIG_RETPOLINE is enabled. Once a mode is
+ * chosen, RSB filling on context switch is forced, IBRS around firmware
+ * calls is enabled where applicable and the user space mitigation is set up.
+ */
+static void __init spectre_v2_select_mitigation(void)
+{
+	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
+	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+
+	/*
+	 * If the CPU is not affected and the command line mode is NONE or AUTO
+	 * then nothing to do.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+	    (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+		return;
+
+	switch (cmd) {
+	case SPECTRE_V2_CMD_NONE:
+		return;
+
+	case SPECTRE_V2_CMD_FORCE:
+	case SPECTRE_V2_CMD_AUTO:
+		if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+			mode = SPECTRE_V2_IBRS_ENHANCED;
+			/* Force it so VMEXIT will restore correctly */
+			x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+			wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+			goto specv2_set_mode;
+		}
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_auto;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE_AMD:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_amd;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_generic;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_auto;
+		break;
+	}
+	/* Only reached when no mitigation could be chosen above */
+	pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!\n");
+	return;
+
+retpoline_auto:
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+	retpoline_amd:
+		/* The AMD retpoline is only used when LFENCE is serializing */
+		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+			pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
+			goto retpoline_generic;
+		}
+		mode = SPECTRE_V2_RETPOLINE_AMD;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+	} else {
+	retpoline_generic:
+		mode = SPECTRE_V2_RETPOLINE_GENERIC;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+	}
+
+specv2_set_mode:
+	spectre_v2_enabled = mode;
+	pr_info("%s\n", spectre_v2_strings[mode]);
+
+	/*
+	 * If spectre v2 protection has been enabled, unconditionally fill
+	 * RSB during a context switch; this protects against two independent
+	 * issues:
+	 *
+	 *	- RSB underflow (and switch to BTB) on Skylake+
+	 *	- SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
+	 */
+	setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+	pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
+
+	/*
+	 * Retpoline means the kernel is safe because it has no indirect
+	 * branches. Enhanced IBRS protects firmware too, so, enable restricted
+	 * speculation around firmware calls only when Enhanced IBRS isn't
+	 * supported.
+	 *
+	 * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
+	 * the user might select retpoline on the kernel command line and if
+	 * the CPU supports Enhanced IBRS, kernel might un-intentionally not
+	 * enable IBRS around firmware calls.
+	 */
+	if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) {
+		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+		pr_info("Enabling Restricted Speculation for firmware calls\n");
+	}
+
+	/* Set up IBPB and STIBP depending on the general spectre V2 command */
+	spectre_v2_user_select_mitigation(cmd);
+
+	/* Enable STIBP if appropriate */
+	arch_smt_update();
+}
+
+/*
+ * Write the current x86_spec_ctrl_base to the SPEC_CTRL MSR. Run on every
+ * CPU via on_each_cpu() from update_stibp_strict(); the argument is unused.
+ */
+static void update_stibp_msr(void * __unused)
+{
+	u64 base = x86_spec_ctrl_base;
+
+	wrmsrl(MSR_IA32_SPEC_CTRL, base);
+}
+
+/*
+ * Bring the STIBP bit of x86_spec_ctrl_base in line with the current SMT
+ * state and, if the value changed, push it to the MSR on all CPUs.
+ */
+static void update_stibp_strict(void)
+{
+	u64 new_base = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+	bool stibp_on = sched_smt_active();
+
+	if (stibp_on)
+		new_base |= SPEC_CTRL_STIBP;
+
+	/* Nothing to do when the STIBP bit already has the wanted state */
+	if (new_base == x86_spec_ctrl_base)
+		return;
+
+	pr_info("Update user space SMT mitigation: STIBP %s\n",
+		stibp_on ? "always-on" : "off");
+	x86_spec_ctrl_base = new_base;
+	on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/*
+ * Switch the static key controlling the evaluation of TIF_SPEC_IB on while
+ * SMT is active and off otherwise.
+ */
+static void update_indir_branch_cond(void)
+{
+	if (!sched_smt_active()) {
+		static_branch_disable(&switch_to_cond_stibp);
+		return;
+	}
+
+	static_branch_enable(&switch_to_cond_stibp);
+}
+
+/*
+ * Re-evaluate the STIBP handling for the current SMT state, according to the
+ * selected user space mitigation mode. Serialized by spec_ctrl_mutex.
+ */
+void arch_smt_update(void)
+{
+	/* Enhanced IBRS implies STIBP. No update required. */
+	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+		return;
+
+	mutex_lock(&spec_ctrl_mutex);
+
+	if (spectre_v2_user == SPECTRE_V2_USER_STRICT) {
+		update_stibp_strict();
+	} else if (spectre_v2_user == SPECTRE_V2_USER_PRCTL ||
+		   spectre_v2_user == SPECTRE_V2_USER_SECCOMP) {
+		update_indir_branch_cond();
+	}
+	/* SPECTRE_V2_USER_NONE: nothing to update */
+
+	mutex_unlock(&spec_ctrl_mutex);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
+
+/* Speculative Store Bypass mitigation mode; assigned by ssb_select_mitigation(). */
+static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE;
+
+/* The kernel command line selection */
+enum ssb_mitigation_cmd {
+ SPEC_STORE_BYPASS_CMD_NONE, /* "spec_store_bypass_disable=off" or "nospec_store_bypass_disable" */
+ SPEC_STORE_BYPASS_CMD_AUTO, /* "spec_store_bypass_disable=auto"; also the fallback for missing/unknown input */
+ SPEC_STORE_BYPASS_CMD_ON, /* "spec_store_bypass_disable=on" */
+ SPEC_STORE_BYPASS_CMD_PRCTL, /* "spec_store_bypass_disable=prctl" */
+ SPEC_STORE_BYPASS_CMD_SECCOMP, /* "spec_store_bypass_disable=seccomp" */
+};
+
+/*
+ * Status text for each Speculative Store Bypass mitigation mode, indexed by
+ * enum ssb_mitigation. Used for the boot log and by cpu_show_common().
+ */
+static const char * const ssb_strings[] = {
+ [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
+ [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
+ [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
+ [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+/*
+ * Accepted values of the "spec_store_bypass_disable=" command line option
+ * and the command each one maps to in ssb_parse_cmdline().
+ */
+static const struct {
+ const char *option;
+ enum ssb_mitigation_cmd cmd;
+} ssb_mitigation_options[] __initdata = {
+ { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
+ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
+ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+ { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
+ { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */
+};
+
+/*
+ * Parse "nospec_store_bypass_disable" and "spec_store_bypass_disable=" from
+ * the kernel command line.
+ *
+ * Returns SPEC_STORE_BYPASS_CMD_NONE for "nospec_store_bypass_disable", the
+ * command matching the option value otherwise, and SPEC_STORE_BYPASS_CMD_AUTO
+ * when the option is absent or its value is unknown.
+ */
+static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+{
+	char opt[20];
+	int len, idx;
+
+	if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable"))
+		return SPEC_STORE_BYPASS_CMD_NONE;
+
+	len = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+				  opt, sizeof(opt));
+	if (len < 0)
+		return SPEC_STORE_BYPASS_CMD_AUTO;
+
+	for (idx = 0; idx < ARRAY_SIZE(ssb_mitigation_options); idx++) {
+		if (match_option(opt, len, ssb_mitigation_options[idx].option))
+			return ssb_mitigation_options[idx].cmd;
+	}
+
+	pr_err("unknown option (%s). Switching to AUTO select\n", opt);
+	return SPEC_STORE_BYPASS_CMD_AUTO;
+}
+
+/*
+ * Choose the Speculative Store Bypass mitigation mode from the CPU features
+ * and the command line, and engage it right away when the mode is
+ * SPEC_STORE_BYPASS_DISABLE.
+ *
+ * Returns the selected mode; SPEC_STORE_BYPASS_NONE when the CPU lacks
+ * X86_FEATURE_SSBD, or is not affected and nothing was forced.
+ */
+static enum ssb_mitigation __init __ssb_select_mitigation(void)
+{
+	enum ssb_mitigation_cmd cmd;
+	enum ssb_mitigation mode;
+
+	if (!boot_cpu_has(X86_FEATURE_SSBD))
+		return SPEC_STORE_BYPASS_NONE;
+
+	cmd = ssb_parse_cmdline();
+	if ((cmd == SPEC_STORE_BYPASS_CMD_NONE ||
+	     cmd == SPEC_STORE_BYPASS_CMD_AUTO) &&
+	    !boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		return SPEC_STORE_BYPASS_NONE;
+
+	switch (cmd) {
+	case SPEC_STORE_BYPASS_CMD_ON:
+		mode = SPEC_STORE_BYPASS_DISABLE;
+		break;
+	case SPEC_STORE_BYPASS_CMD_PRCTL:
+		mode = SPEC_STORE_BYPASS_PRCTL;
+		break;
+	case SPEC_STORE_BYPASS_CMD_AUTO:
+	case SPEC_STORE_BYPASS_CMD_SECCOMP:
+		/*
+		 * Choose prctl+seccomp as the default mode if seccomp is
+		 * enabled.
+		 */
+		mode = IS_ENABLED(CONFIG_SECCOMP) ? SPEC_STORE_BYPASS_SECCOMP :
+						    SPEC_STORE_BYPASS_PRCTL;
+		break;
+	case SPEC_STORE_BYPASS_CMD_NONE:
+	default:
+		mode = SPEC_STORE_BYPASS_NONE;
+		break;
+	}
+
+	/*
+	 * We have three CPU feature flags that are in play here:
+	 *  - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+	 *  - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+	 *  - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+	 */
+	if (mode != SPEC_STORE_BYPASS_DISABLE)
+		return mode;
+
+	setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+
+	/*
+	 * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+	 * use a completely different MSR and bit dependent on family.
+	 */
+	if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+	    static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+		x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+		x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+		wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+	} else {
+		x86_amd_ssb_disable();
+	}
+
+	return mode;
+}
+
+/*
+ * Store the selected Speculative Store Bypass mode in ssb_mode and log it
+ * when the boot CPU is affected by the bug.
+ */
+static void ssb_select_mitigation(void)
+{
+	ssb_mode = __ssb_select_mitigation();
+
+	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		return;
+
+	pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculation prctl: " fmt
+
+/*
+ * Flag @tsk so that its real speculation control TIF bits get updated, and
+ * apply the update immediately when @tsk is the current task.
+ */
+static void task_update_spec_tif(struct task_struct *tsk)
+{
+	/* Force the update of the real TIF bits */
+	set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+
+	/*
+	 * A non-current task gets its CPU mitigation set when it is
+	 * scheduled next.
+	 *
+	 * This can only happen for SECCOMP mitigation. For PRCTL it's
+	 * always the current task.
+	 */
+	if (tsk != current)
+		return;
+
+	/* Immediately update the speculation control MSRs for current */
+	speculation_ctrl_update_current();
+}
+
+/*
+ * Apply the Speculative Store Bypass control @ctrl to @task.
+ *
+ * Returns 0 on success, -ENXIO when the mitigation mode is not one of the
+ * per task modes (prctl or seccomp), -EPERM when enabling is requested after
+ * a force disable, and -ERANGE for an unknown @ctrl value.
+ */
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+	bool per_task_mode = ssb_mode == SPEC_STORE_BYPASS_PRCTL ||
+			     ssb_mode == SPEC_STORE_BYPASS_SECCOMP;
+
+	if (!per_task_mode)
+		return -ENXIO;
+
+	switch (ctrl) {
+	case PR_SPEC_ENABLE:
+		/* If speculation is force disabled, enable is not allowed */
+		if (task_spec_ssb_force_disable(task))
+			return -EPERM;
+		task_clear_spec_ssb_disable(task);
+		break;
+	case PR_SPEC_FORCE_DISABLE:
+		task_set_spec_ssb_disable(task);
+		task_set_spec_ssb_force_disable(task);
+		break;
+	case PR_SPEC_DISABLE:
+		task_set_spec_ssb_disable(task);
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	/* Every accepted request ends with propagating the new state */
+	task_update_spec_tif(task);
+	return 0;
+}
+
+/*
+ * Apply the indirect branch speculation control @ctrl (PR_SPEC_ENABLE,
+ * PR_SPEC_DISABLE or PR_SPEC_FORCE_DISABLE) to @task.
+ *
+ * Returns 0 on success or when the request is already implied by the boot
+ * time mode, -EPERM when the request conflicts with the boot time mode or
+ * with an earlier PR_SPEC_FORCE_DISABLE, and -ERANGE for an unknown @ctrl.
+ */
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+	switch (ctrl) {
+	case PR_SPEC_ENABLE:
+		if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+			return 0;
+		/*
+		 * Indirect branch speculation is always disabled in strict
+		 * mode. It can neither be re-enabled once it was force
+		 * disabled by a previous prctl call; this is the same rule
+		 * ssb_prctl_set() enforces for PR_SPEC_ENABLE.
+		 */
+		if (spectre_v2_user == SPECTRE_V2_USER_STRICT ||
+		    task_spec_ib_force_disable(task))
+			return -EPERM;
+		task_clear_spec_ib_disable(task);
+		task_update_spec_tif(task);
+		break;
+	case PR_SPEC_DISABLE:
+	case PR_SPEC_FORCE_DISABLE:
+		/*
+		 * Indirect branch speculation is always allowed when
+		 * mitigation is force disabled.
+		 */
+		if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+			return -EPERM;
+		/* Strict mode already has speculation disabled for everyone */
+		if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+			return 0;
+		task_set_spec_ib_disable(task);
+		if (ctrl == PR_SPEC_FORCE_DISABLE)
+			task_set_spec_ib_force_disable(task);
+		task_update_spec_tif(task);
+		break;
+	default:
+		return -ERANGE;
+	}
+	return 0;
+}
+
+/*
+ * Dispatch a speculation control set request for @task to the handler of
+ * the speculation feature @which. Returns the handler's result, or -ENODEV
+ * for an unknown feature.
+ */
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl)
+{
+	if (which == PR_SPEC_STORE_BYPASS)
+		return ssb_prctl_set(task, ctrl);
+
+	if (which == PR_SPEC_INDIRECT_BRANCH)
+		return ib_prctl_set(task, ctrl);
+
+	return -ENODEV;
+}
+
+#ifdef CONFIG_SECCOMP
+/*
+ * Force-disable store bypass and/or indirect branch speculation for @task,
+ * each only when the corresponding mitigation runs in its seccomp mode.
+ * The results of the set helpers are not checked.
+ */
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+	bool ssb_seccomp = ssb_mode == SPEC_STORE_BYPASS_SECCOMP;
+	bool ib_seccomp = spectre_v2_user == SPECTRE_V2_USER_SECCOMP;
+
+	if (ssb_seccomp)
+		ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+
+	if (ib_seccomp)
+		ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
+#endif
+
+/*
+ * Report the Speculative Store Bypass state of @task as a PR_SPEC_* value.
+ * In the per task modes PR_SPEC_PRCTL is combined with the task's own state.
+ */
+static int ssb_prctl_get(struct task_struct *task)
+{
+	if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+		return PR_SPEC_DISABLE;
+
+	if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP ||
+	    ssb_mode == SPEC_STORE_BYPASS_PRCTL) {
+		if (task_spec_ssb_force_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+		if (task_spec_ssb_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+	}
+
+	/* No mitigation: speculation is enabled on affected CPUs */
+	if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+		return PR_SPEC_ENABLE;
+
+	return PR_SPEC_NOT_AFFECTED;
+}
+
+/*
+ * Report the indirect branch speculation state of @task as a PR_SPEC_*
+ * value. In the per task modes PR_SPEC_PRCTL is combined with the task's own
+ * state.
+ */
+static int ib_prctl_get(struct task_struct *task)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		return PR_SPEC_NOT_AFFECTED;
+
+	if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+		return PR_SPEC_ENABLE;
+
+	if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+		return PR_SPEC_DISABLE;
+
+	/* Anything but the two per task modes is reported as not affected */
+	if (spectre_v2_user != SPECTRE_V2_USER_PRCTL &&
+	    spectre_v2_user != SPECTRE_V2_USER_SECCOMP)
+		return PR_SPEC_NOT_AFFECTED;
+
+	if (task_spec_ib_force_disable(task))
+		return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+	if (task_spec_ib_disable(task))
+		return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+	return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+}
+
+/*
+ * Dispatch a speculation control query for @task to the handler of the
+ * speculation feature @which. Returns the handler's result, or -ENODEV for
+ * an unknown feature.
+ */
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+	if (which == PR_SPEC_STORE_BYPASS)
+		return ssb_prctl_get(task);
+
+	if (which == PR_SPEC_INDIRECT_BRANCH)
+		return ib_prctl_get(task);
+
+	return -ENODEV;
+}
+
+/*
+ * Apply the speculation control settings on the calling CPU: write
+ * x86_spec_ctrl_base to the SPEC_CTRL MSR when that MSR is available, then
+ * use the AMD specific SSBD interface when store bypass is disabled
+ * globally.
+ */
+void x86_spec_ctrl_setup_ap(void)
+{
+	if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+		u64 base = x86_spec_ctrl_base;
+
+		wrmsrl(MSR_IA32_SPEC_CTRL, base);
+	}
+
+	if (ssb_mode != SPEC_STORE_BYPASS_DISABLE)
+		return;
+
+	x86_amd_ssb_disable();
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/*
+ * Default mitigation for L1TF-affected CPUs. Can be overridden with the
+ * "l1tf=" command line option, see l1tf_cmdline().
+ */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+/* Exported only when the Intel KVM code is enabled */
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(l1tf_mitigation);
+#endif
+/* L1D flush state on VMENTER; reported by l1tf_show_state() */
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+
+/*
+ * These CPUs all support 44bits physical address space internally in the
+ * cache but CPUID can report a smaller number of physical address bits.
+ *
+ * The L1TF mitigation uses the top most address bit for the inversion of
+ * non present PTEs. When the installed memory reaches into the top most
+ * address bit due to memory holes, which has been observed on machines
+ * which report 36bits physical address bits and have 32G RAM installed,
+ * then the mitigation range check in l1tf_select_mitigation() triggers.
+ * This is a false positive because the mitigation is still possible due to
+ * the fact that the cache uses 44bit internally. Use the cache bits
+ * instead of the reported physical bits and adjust them on the affected
+ * machines to 44bit if the reported bits are less than 44.
+ */
+/*
+ * Raise @c->x86_cache_bits to 44 on the family 6 models listed below when
+ * the stored value is smaller. Other CPUs are left untouched.
+ */
+static void override_cache_bits(struct cpuinfo_x86 *c)
+{
+	if (c->x86 != 6)
+		return;
+
+	switch (c->x86_model) {
+	case INTEL_FAM6_NEHALEM:
+	case INTEL_FAM6_WESTMERE:
+	case INTEL_FAM6_SANDYBRIDGE:
+	case INTEL_FAM6_IVYBRIDGE:
+	case INTEL_FAM6_HASWELL_CORE:
+	case INTEL_FAM6_HASWELL_ULT:
+	case INTEL_FAM6_HASWELL_GT3E:
+	case INTEL_FAM6_BROADWELL_CORE:
+	case INTEL_FAM6_BROADWELL_GT3E:
+	case INTEL_FAM6_SKYLAKE_MOBILE:
+	case INTEL_FAM6_SKYLAKE_DESKTOP:
+	case INTEL_FAM6_KABYLAKE_MOBILE:
+	case INTEL_FAM6_KABYLAKE_DESKTOP:
+		break;
+	default:
+		/* Not one of the models that need the override */
+		return;
+	}
+
+	if (c->x86_cache_bits < 44)
+		c->x86_cache_bits = 44;
+}
+
+/*
+ * Set up the L1TF mitigation on affected CPUs: disable SMT when the selected
+ * mode asks for it, and force X86_FEATURE_L1TF_PTEINV unless the kernel is
+ * built with 2-level page tables or RAM is mapped at or above the
+ * l1tf_pfn_limit() boundary.
+ */
+static void __init l1tf_select_mitigation(void)
+{
+	u64 half_pa;
+
+	if (!boot_cpu_has_bug(X86_BUG_L1TF))
+		return;
+
+	override_cache_bits(&boot_cpu_data);
+
+	/* Only the nosmt/full modes touch SMT; off and the flush modes do not */
+	if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+		cpu_smt_disable(true);
+	} else if (l1tf_mitigation == L1TF_MITIGATION_FLUSH_NOSMT ||
+		   l1tf_mitigation == L1TF_MITIGATION_FULL) {
+		cpu_smt_disable(false);
+	}
+
+#if CONFIG_PGTABLE_LEVELS == 2
+	pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+	return;
+#endif
+
+	half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+	if (!e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+		setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+		return;
+	}
+
+	pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+	pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
+		half_pa);
+	pr_info("However, doing so will make a part of your RAM unusable.\n");
+	pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
+}
+early_param("l1tf", l1tf_cmdline);
+
+#undef pr_fmt
+
+#ifdef CONFIG_SYSFS
+
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+/* Text for each VMX L1D flush state, indexed by enum vmx_l1d_flush_state */
+static const char * const l1tf_vmx_states[] = {
+	[VMENTER_L1D_FLUSH_AUTO]		= "auto",
+	[VMENTER_L1D_FLUSH_NEVER]		= "vulnerable",
+	[VMENTER_L1D_FLUSH_COND]		= "conditional cache flushes",
+	[VMENTER_L1D_FLUSH_ALWAYS]		= "cache flushes",
+	[VMENTER_L1D_FLUSH_EPT_DISABLED]	= "EPT disabled",
+	[VMENTER_L1D_FLUSH_NOT_REQUIRED]	= "flush not necessary"
+};
+
+/*
+ * Format the L1TF status line into @buf and return the sprintf() result.
+ * The VMX and SMT details are appended unless the flush state is still
+ * "auto".
+ */
+static ssize_t l1tf_show_state(char *buf)
+{
+	bool omit_smt;
+
+	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+		return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+	/* No SMT part with EPT disabled, or with no flushing while SMT is on */
+	omit_smt = l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+		   (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+		    sched_smt_active());
+
+	if (omit_smt)
+		return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+			       l1tf_vmx_states[l1tf_vmx_mitigation]);
+
+	return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+		       l1tf_vmx_states[l1tf_vmx_mitigation],
+		       sched_smt_active() ? "vulnerable" : "disabled");
+}
+#else
+/* Without the Intel KVM code only the default message is reported */
+static ssize_t l1tf_show_state(char *buf)
+{
+	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+#endif
+
+/*
+ * Return the STIBP part of the spectre v2 status line. Empty with Enhanced
+ * IBRS, and in the per task modes while the conditional STIBP static key is
+ * off.
+ */
+static char *stibp_state(void)
+{
+	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+		return "";
+
+	if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+		return ", STIBP: disabled";
+
+	if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+		return ", STIBP: forced";
+
+	if ((spectre_v2_user == SPECTRE_V2_USER_PRCTL ||
+	     spectre_v2_user == SPECTRE_V2_USER_SECCOMP) &&
+	    static_key_enabled(&switch_to_cond_stibp))
+		return ", STIBP: conditional";
+
+	return "";
+}
+
+/*
+ * Return the IBPB part of the spectre v2 status line; empty when the CPU
+ * has no IBPB support.
+ */
+static char *ibpb_state(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_IBPB))
+		return "";
+
+	if (static_key_enabled(&switch_mm_always_ibpb))
+		return ", IBPB: always-on";
+
+	if (static_key_enabled(&switch_mm_cond_ibpb))
+		return ", IBPB: conditional";
+
+	return ", IBPB: disabled";
+}
+
+/*
+ * Format the vulnerability status of @bug into @buf.
+ *
+ * Writes "Not affected" when the boot CPU lacks the bug, the mitigation
+ * description when one is known for @bug, and "Vulnerable" otherwise.
+ * Returns the sprintf() result. @dev and @attr are not used.
+ */
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+			       char *buf, unsigned int bug)
+{
+	if (!boot_cpu_has_bug(bug))
+		return sprintf(buf, "Not affected\n");
+
+	if (bug == X86_BUG_CPU_MELTDOWN) {
+		if (boot_cpu_has(X86_FEATURE_PTI))
+			return sprintf(buf, "Mitigation: PTI\n");
+
+		if (hypervisor_is_type(X86_HYPER_XEN_PV))
+			return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+	} else if (bug == X86_BUG_SPECTRE_V1) {
+		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+	} else if (bug == X86_BUG_SPECTRE_V2) {
+		return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+			       ibpb_state(),
+			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+			       stibp_state(),
+			       boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+			       spectre_v2_module_string());
+	} else if (bug == X86_BUG_SPEC_STORE_BYPASS) {
+		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+	} else if (bug == X86_BUG_L1TF) {
+		if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+			return l1tf_show_state(buf);
+	}
+
+	/* Affected and no mitigation was reported above */
+	return sprintf(buf, "Vulnerable\n");
+}
+
+/* Show handler reporting the Meltdown status via cpu_show_common(). */
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
+/* Show handler reporting the Spectre v1 status via cpu_show_common(). */
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
+}
+
+/* Show handler reporting the Spectre v2 status via cpu_show_common(). */
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
+
+/* Show handler reporting the Speculative Store Bypass status via cpu_show_common(). */
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
+
+/* Show handler reporting the L1TF status via cpu_show_common(). */
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
+#endif
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
new file mode 100644
index 0000000..0c5fcbd
--- /dev/null
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Routines to identify caches on Intel CPU.
+ *
+ * Changes:
+ * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
+ * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
+ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
+ */
+
+#include <linux/slab.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/capability.h>
+#include <linux/sysfs.h>
+#include <linux/pci.h>
+
+#include <asm/cpufeature.h>
+#include <asm/amd_nb.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+#define LVL_1_INST 1
+#define LVL_1_DATA 2
+#define LVL_2 3
+#define LVL_3 4
+#define LVL_TRACE 5
+
+struct _cache_table {
+ unsigned char descriptor;
+ char cache_type;
+ short size;
+};
+
+#define MB(x) ((x) * 1024)
+
+/* All the cpuid(2) cache descriptor types we care about (TLB
+ descriptors are left out; the trace cache ones are listed) */
+
+static const struct _cache_table cache_table[] =
+{
+ { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
+ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
+ { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
+ { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
+ { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
+ { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
+ { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
+ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
+ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
+ { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
+ { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
+ { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
+ { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
+ { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
+ { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
+ { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
+ { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
+ { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
+ { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
+ { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
+ { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
+ { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
+ { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
+ { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
+ { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
+ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
+ { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
+ { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
+ { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
+ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
+ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
+ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
+ { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
+ { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
+ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
+ { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
+ { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
+ { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
+ { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
+ { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
+ { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
+ { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
+ { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
+ { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
+ { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
+ { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
+ { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
+ { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
+ { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
+ { 0x00, 0, 0}
+};
+
+
+enum _cache_type {
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
+};
+
+union _cpuid4_leaf_eax {
+ struct {
+ enum _cache_type type:5;
+ unsigned int level:3;
+ unsigned int is_self_initializing:1;
+ unsigned int is_fully_associative:1;
+ unsigned int reserved:4;
+ unsigned int num_threads_sharing:12;
+ unsigned int num_cores_on_die:6;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ebx {
+ struct {
+ unsigned int coherency_line_size:12;
+ unsigned int physical_line_partition:10;
+ unsigned int ways_of_associativity:10;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ecx {
+ struct {
+ unsigned int number_of_sets:32;
+ } split;
+ u32 full;
+};
+
+struct _cpuid4_info_regs {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
+ unsigned long size;
+ struct amd_northbridge *nb;
+};
+
+static unsigned short num_cache_leaves;
+
+/* AMD doesn't have CPUID4. Emulate it here to report the same
+ information to the user. This makes some assumptions about the machine:
+ L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+
+ In theory the TLBs could be reported as fake type (they are in "dummy").
+ Maybe later */
+union l1_cache {
+ struct {
+ unsigned line_size:8;
+ unsigned lines_per_tag:8;
+ unsigned assoc:8;
+ unsigned size_in_kb:8;
+ };
+ unsigned val;
+};
+
+union l2_cache {
+ struct {
+ unsigned line_size:8;
+ unsigned lines_per_tag:4;
+ unsigned assoc:4;
+ unsigned size_in_kb:16;
+ };
+ unsigned val;
+};
+
+union l3_cache {
+ struct {
+ unsigned line_size:8;
+ unsigned lines_per_tag:4;
+ unsigned assoc:4;
+ unsigned res:2;
+ unsigned size_encoded:14;
+ };
+ unsigned val;
+};
+
+/*
+ * Number of ways for each 4-bit L2/L3 associativity encoding reported by
+ * CPUID 0x80000006 (used by amd_cpuid4()).  Encodings 3 (3-way) and 5 (6-way)
+ * are included; encodings without an entry (0, 7, 9) read back as 0.
+ */
+static const unsigned short assocs[] = {
+	[1] = 1,	[2] = 2,	[3] = 3,	[4] = 4,
+	[5] = 6,	[6] = 8,	[8] = 16,
+	[0xa] = 32,	[0xb] = 48,	[0xc] = 64,
+	[0xd] = 96,	[0xe] = 128,
+	/* fully associative - no way to show this currently */
+	[0xf] = 0xffff
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
+
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+/* Emulate CPUID(4) for cache @leaf on AMD: 0 = L1d, 1 = L1i, 2 = L2, 3 = L3.
+   The outputs stay zeroed when that cache is absent or @leaf is out of range. */
+static void amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+		       union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx)
+{
+	unsigned dummy;
+	unsigned line_size, lines_per_tag, assoc, size_in_kb;
+	union l1_cache l1i, l1d;
+	union l2_cache l2;
+	union l3_cache l3;
+	union l1_cache *l1 = &l1d;
+
+	eax->full = 0;
+	ebx->full = 0;
+	ecx->full = 0;
+
+	cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+	cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
+
+	switch (leaf) {
+	case 1:
+		l1 = &l1i;	/* fall through */
+	case 0:
+		if (!l1->val)
+			return;
+		assoc = l1->assoc == 0xff ? 0xffff : l1->assoc; /* raw way count */
+		line_size = l1->line_size;
+		lines_per_tag = l1->lines_per_tag;
+		size_in_kb = l1->size_in_kb;
+		break;
+	case 2:
+		if (!l2.val)
+			return;
+		assoc = assocs[l2.assoc];
+		line_size = l2.line_size;
+		lines_per_tag = l2.lines_per_tag;
+		/* cpu_data has errata corrections for K7 applied */
+		size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+		break;
+	case 3:
+		if (!l3.val)
+			return;
+		assoc = assocs[l3.assoc];
+		line_size = l3.line_size;
+		lines_per_tag = l3.lines_per_tag;
+		size_in_kb = l3.size_encoded * 512;
+		if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+			size_in_kb = size_in_kb >> 1;
+			assoc = assoc >> 1;
+		}
+		break;
+	default:
+		return;
+	}
+
+	eax->split.is_self_initializing = 1;
+	eax->split.type = types[leaf];
+	eax->split.level = levels[leaf];
+	eax->split.num_threads_sharing = 0;
+	eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
+
+	/* 0xffff (assocs[0xf], or an L1 field of 0xff) marks full associativity */
+	if (assoc == 0xffff)
+		eax->split.is_fully_associative = 1;
+	ebx->split.coherency_line_size = line_size - 1;
+	ebx->split.ways_of_associativity = assoc - 1;
+	ebx->split.physical_line_partition = lines_per_tag - 1;
+	ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+		(ebx->split.ways_of_associativity + 1) - 1;
+}
+
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
+/*
+ * L3 cache descriptors: fill nb->l3_cache (subcaches[] and indices) from val
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes: each clear bit tested below adds one unit */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) { /* family 0x15: bits 1 and 5 count too */
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; /* largest subcache decides */
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @nb: northbridge whose misc device holds the disable-slot registers
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
+{
+	unsigned int val = 0;
+
+	pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &val);
+
+	/* check whether this slot is activated already */
+	if (val & (3UL << 30))
+		return val & 0xfff;
+
+	return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
+				  unsigned int slot)
+{
+	/* priv holds the amd_northbridge pointer for this leaf */
+	struct amd_northbridge *nb = this_leaf->priv;
+	int disabled = amd_get_l3_disable_slot(nb, slot);
+
+	if (disabled < 0)
+		return sprintf(buf, "FREE\n");
+
+	return sprintf(buf, "%d\n", disabled);
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
+ return show_cache_disable(this_leaf, buf, slot); \
+}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30); /* bit 30 is part of every value written below */
+
+ /*
+ * disable index in all 4 subcaches; subcaches of size 0 are skipped
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20); /* subcache number goes in at bit 20 */
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache whose indices we disable, therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31); /* second write: same value with bit 31 added */
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @nb: northbridge whose L3 cache is affected
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, -EEXIST if @slot is in use or the other slot already
+ *          disables @index, -EINVAL if @index exceeds nb->l3_cache.indices
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+				   unsigned slot, unsigned long index)
+{
+	int other_slot = !slot;
+
+	/* a non-negative result means @slot already holds a disabled index */
+	if (amd_get_l3_disable_slot(nb, slot) >= 0)
+		return -EEXIST;
+
+	if (index > nb->l3_cache.indices)
+		return -EINVAL;
+
+	/* check whether the other slot has disabled the same index already */
+	if (index == amd_get_l3_disable_slot(nb, other_slot))
+		return -EEXIST;
+
+	amd_l3_disable_index(nb, cpu, slot, index);
+
+	return 0;
+}
+
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+				   const char *buf, size_t count,
+				   unsigned int slot)
+{
+	struct amd_northbridge *nb = this_leaf->priv;
+	unsigned long index = 0;
+	int target_cpu, rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (kstrtoul(buf, 10, &index) < 0)
+		return -EINVAL;
+
+	/* first CPU sharing this leaf; handed on for the WBINVD step */
+	target_cpu = cpumask_first(&this_leaf->shared_cpu_map);
+	rc = amd_set_l3_disable_slot(nb, target_cpu, slot, index);
+	if (!rc)
+		return count;
+
+	if (rc == -EEXIST)
+		pr_warn("L3 slot %d in use/index already disabled!\n", slot);
+
+	return rc;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
+ return store_cache_disable(this_leaf, buf, count, slot); \
+}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
+
+ return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct cacheinfo *leaf = dev_get_drvdata(dev);
+	unsigned long mask;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* parsed as hexadecimal, the base subcaches_show() prints in */
+	if (kstrtoul(buf, 16, &mask) < 0)
+		return -EINVAL;
+
+	if (amd_set_subcaches(cpumask_first(&leaf->shared_cpu_map), mask))
+		return -EINVAL;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+			       struct attribute *attr, int unused)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cacheinfo *leaf = dev_get_drvdata(dev);
+	umode_t mode = attr->mode;
+	bool wanted = false;
+
+	/* leaves without a northbridge expose none of the private files */
+	if (!leaf->priv)
+		return 0;
+
+	/* each file is shown only if the matching northbridge feature exists */
+	if (attr == &dev_attr_subcaches.attr)
+		wanted = amd_nb_has_feature(AMD_NB_L3_PARTITIONING);
+	else if (attr == &dev_attr_cache_disable_0.attr ||
+		 attr == &dev_attr_cache_disable_1.attr)
+		wanted = amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE);
+
+	return wanted ? mode : 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ int n = 1; /* one extra zeroed entry that is never filled (NULL end marker) */
+ static struct attribute **amd_l3_attrs;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2; /* cache_disable_0 and cache_disable_1 */
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1; /* subcaches */
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return; /* cache_private_group.attrs is left unset */
+
+ n = 0; /* reused as the fill position */
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+	struct amd_northbridge *l3_nb = this_leaf->priv;
+
+	/* only leaves of level 3 and up that carry a northbridge qualify */
+	if (!l3_nb || this_leaf->level < 3)
+		return NULL;
+	if (l3_nb->l3_cache.indices)
+		init_amd_l3_attrs();
+
+	return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+ int node;
+
+ /* only for L3; NOTE(review): no virtualization check is visible here */
+ if (index < 3)
+ return;
+
+ node = amd_get_nb_id(smp_processor_id());
+ this_leaf->nb = node_to_amd_nb(node); /* may be NULL, checked below */
+ if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+ amd_calc_l3_indices(this_leaf->nb); /* only while indices is still 0 */
+}
+#else
+#define amd_init_l3_cache(x, y)
+#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
+
+static int
+cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned edx; /* written by cpuid_count() but not used afterwards */
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) /* real leaf 0x8000001d */
+ cpuid_count(0x8000001d, index, &eax.full,
+ &ebx.full, &ecx.full, &edx);
+ else /* emulate the leaf from 0x80000005/0x80000006 */
+ amd_cpuid4(index, &eax, &ebx, &ecx);
+ amd_init_l3_cache(this_leaf, index);
+ } else {
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+ }
+
+ if (eax.split.type == CTYPE_NULL) /* no cache described at this index */
+ return -EIO; /* better error ? */
+
+ this_leaf->eax = eax;
+ this_leaf->ebx = ebx;
+ this_leaf->ecx = ecx;
+ this_leaf->size = (ecx.split.number_of_sets + 1) * /* fields are stored minus one */
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
+ return 0;
+}
+
+static int find_num_cache_leaves(struct cpuinfo_x86 *c)
+{
+	union _cpuid4_leaf_eax cache_eax;
+	unsigned int ebx, ecx, edx;
+	unsigned int op = 4;
+	int count = 0;
+
+	/* AMD is queried through leaf 0x8000001d instead of leaf 4 */
+	if (c->x86_vendor == X86_VENDOR_AMD)
+		op = 0x8000001d;
+
+	/* Count subleaves of cpuid(op) up to the first NULL-type one */
+	for (;;) {
+		cpuid_count(op, count, &cache_eax.full, &ebx, &ecx, &edx);
+		if (cache_eax.split.type == CTYPE_NULL)
+			return count;
+		count++;
+	}
+}
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+{
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (!cpuid_edx(0x80000006))
+ return; /* cpu_llc_id is left untouched */
+
+ if (c->x86 < 0x17) {
+ /* LLC is at the node level. */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ } else if (c->x86 == 0x17 &&
+ c->x86_model >= 0 && c->x86_model <= 0x1F) { /* NOTE(review): ">= 0" looks redundant if x86_model is unsigned */
+ /*
+ * LLC is at the core complex level.
+ * Core complex ID is taken as ApicId >> 3 for these processors.
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ } else {
+ /*
+ * LLC ID is calculated from the number of threads sharing the
+ * last cache leaf enumerated through CPUID 0x8000001d.
+ */
+ u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
+ u32 llc_index = find_num_cache_leaves(c) - 1; /* last non-NULL leaf */
+
+ cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx);
+ if (eax)
+ num_sharing_cache = ((eax >> 14) & 0xfff) + 1; /* EAX[25:14] + 1 */
+
+ if (num_sharing_cache) {
+ int bits = get_count_order(num_sharing_cache);
+
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+ }
+ }
+}
+
+void init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Set num_cache_leaves: enumerated with TOPOEXT, otherwise 3 or 4 */
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ num_cache_leaves = find_num_cache_leaves(c);
+ } else if (c->extended_cpuid_level >= 0x80000006) {
+ if (cpuid_edx(0x80000006) & 0xf000) /* L3 assoc field (bits 15:12) non-zero */
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+ }
+}
+
+/* Size the caches via cpuid(4), else cpuid(2), and record this CPU's LLC id. */
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes: from cpuid(2) first, then the cpuid(4) ones (new_*) */
+	unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+	unsigned int new_l1d = 0, new_l1i = 0, new_l2 = 0, new_l3 = 0, i;
+	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+	unsigned int cpu = c->cpu_index;
+#endif
+
+	if (c->cpuid_level > 3) {
+		static int is_initialized;
+
+		if (is_initialized == 0) {
+			/* Init num_cache_leaves from boot CPU */
+			num_cache_leaves = find_num_cache_leaves(c);
+			is_initialized++;
+		}
+
+		/*
+		 * Whenever possible use cpuid(4), deterministic cache
+		 * parameters cpuid leaf to find the cache details
+		 */
+		for (i = 0; i < num_cache_leaves; i++) {
+			struct _cpuid4_info_regs this_leaf = {};
+			int retval;
+
+			retval = cpuid4_cache_lookup_regs(i, &this_leaf);
+			if (retval < 0)
+				continue;
+
+			switch (this_leaf.eax.split.level) {
+			case 1:
+				if (this_leaf.eax.split.type == CTYPE_DATA)
+					new_l1d = this_leaf.size/1024;
+				else if (this_leaf.eax.split.type == CTYPE_INST)
+					new_l1i = this_leaf.size/1024;
+				break;
+			case 2:
+				new_l2 = this_leaf.size/1024;
+				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+				index_msb = get_count_order(num_threads_sharing);
+				l2_id = c->apicid & ~((1 << index_msb) - 1);
+				break;
+			case 3:
+				new_l3 = this_leaf.size/1024;
+				num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+				index_msb = get_count_order(num_threads_sharing);
+				l3_id = c->apicid & ~((1 << index_msb) - 1);
+				break;
+			default:
+				break;
+			}
+		}
+	}
+	/*
+	 * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
+	 * trace cache
+	 */
+	if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
+		/* supports eax=2 call */
+		int j, n;
+		unsigned int out[4];
+		unsigned char *dp = (unsigned char *)out;
+		int only_trace = 0;
+
+		if (num_cache_leaves != 0 && c->x86 == 15)
+			only_trace = 1;
+
+		/* Number of times to iterate */
+		n = cpuid_eax(2) & 0xFF;
+
+		for (i = 0 ; i < n ; i++) {
+			cpuid(2, &out[0], &out[1], &out[2], &out[3]);
+
+			/* Bit 31 set in any of the 4 registers: unknown format */
+			for (j = 0 ; j < 4 ; j++)
+				if (out[j] & (1U << 31))
+					out[j] = 0;
+
+			/* Byte 0 is level count, not a descriptor */
+			for (j = 1 ; j < 16 ; j++) {
+				unsigned char des = dp[j];
+				unsigned char k = 0;
+
+				/* look up this descriptor in the table */
+				while (cache_table[k].descriptor != 0) {
+					if (cache_table[k].descriptor == des) {
+						if (only_trace && cache_table[k].cache_type != LVL_TRACE)
+							break;
+						switch (cache_table[k].cache_type) {
+						case LVL_1_INST:
+							l1i += cache_table[k].size;
+							break;
+						case LVL_1_DATA:
+							l1d += cache_table[k].size;
+							break;
+						case LVL_2:
+							l2 += cache_table[k].size;
+							break;
+						case LVL_3:
+							l3 += cache_table[k].size;
+							break;
+						case LVL_TRACE:
+							trace += cache_table[k].size;
+							break;
+						}
+
+						break;
+					}
+
+					k++;
+				}
+			}
+		}
+	}
+
+	if (new_l1d)
+		l1d = new_l1d;
+
+	if (new_l1i)
+		l1i = new_l1i;
+
+	if (new_l2) {
+		l2 = new_l2;
+#ifdef CONFIG_SMP
+		per_cpu(cpu_llc_id, cpu) = l2_id;
+#endif
+	}
+
+	if (new_l3) {
+		l3 = new_l3;
+#ifdef CONFIG_SMP
+		per_cpu(cpu_llc_id, cpu) = l3_id;
+#endif
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+	 * turn means that the only possibility is SMT (as indicated in
+	 * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+	 * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+	 * c->phys_proc_id.
+	 */
+	if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+		per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
+	c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
+
+	if (!l2)
+		cpu_detect_cache_sizes(c);
+}
+
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf;
+ int i, sibling;
+
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list) /* CPU i has no leaves to update */
+ continue;
+ this_leaf = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ unsigned int apicid, nshared, first, last;
+
+ nshared = base->eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).apicid;
+ first = apicid - (apicid % nshared); /* apicid rounded down to a multiple of nshared */
+ last = first + nshared - 1; /* inclusive end of the apicid window */
+
+ for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ apicid = cpu_data(i).apicid;
+ if ((apicid < first) || (apicid > last)) /* CPU i is outside the window */
+ continue;
+
+ this_leaf = this_cpu_ci->info_list + index;
+
+ for_each_online_cpu(sibling) {
+ apicid = cpu_data(sibling).apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else
+ return 0; /* not handled here; the caller continues with its own setup */
+
+ return 1; /* one of the two branches above ran */
+}
+
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf, *sibling_leaf;
+ unsigned long num_threads_sharing;
+ int index_msb, i;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (__cache_amd_cpumap_setup(cpu, index, base))
+ return; /* the AMD helper handled this leaf */
+ }
+
+ this_leaf = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+
+ cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); /* a CPU always shares with itself */
+ if (num_threads_sharing == 1)
+ return;
+
+ index_msb = get_count_order(num_threads_sharing);
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { /* same apicid above index_msb bits */
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue; /* skip if itself or no cacheinfo */
+ sibling_leaf = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &this_leaf->shared_cpu_map); /* record the sharing ... */
+ cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); /* ... in both directions */
+ }
+}
+
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+ struct _cpuid4_info_regs *base)
+{
+ this_leaf->id = base->id;
+ this_leaf->attributes = CACHE_ID;
+ this_leaf->level = base->eax.split.level;
+ this_leaf->type = cache_type_map[base->eax.split.type]; /* NOTE(review): type is a 5-bit field, the map has 4 entries */
+ this_leaf->coherency_line_size = /* the register fields below are stored minus one */
+ base->ebx.split.coherency_line_size + 1;
+ this_leaf->ways_of_associativity =
+ base->ebx.split.ways_of_associativity + 1;
+ this_leaf->size = base->size;
+ this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+ this_leaf->physical_line_partition =
+ base->ebx.split.physical_line_partition + 1;
+ this_leaf->priv = base->nb; /* amd_northbridge pointer; stays NULL when none was looked up */
+}
+
+static int __init_cache_level(unsigned int cpu)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+
+ if (!num_cache_leaves) /* nothing was detected by the init_*_cacheinfo() code */
+ return -ENOENT;
+ if (!this_cpu_ci)
+ return -EINVAL;
+ this_cpu_ci->num_levels = 3; /* fixed value, not probed */
+ this_cpu_ci->num_leaves = num_cache_leaves;
+ return 0;
+}
+
+/*
+ * Derive the cache id for @cpu: CPUID.4:EAX[25-14] (with input ECX as cache
+ * index) gives the max number of sharing threads minus one; the apicid is
+ * shifted right by the order of that count and stored in @id4_regs->id.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+	unsigned long sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+	struct cpuinfo_x86 *info = &cpu_data(cpu);
+	int shift;
+
+	/* CPUs whose apicid agrees above @shift bits get the same id */
+	shift = get_count_order(sharing);
+	id4_regs->id = info->apicid >> shift;
+}
+
+static int __populate_cache_leaves(unsigned int cpu)
+{
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+	struct _cpuid4_info_regs id4_regs = {};
+	unsigned int idx;
+	int ret;	/* signed: the lookup fails with a negative errno */
+
+	for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+		ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+		if (ret)
+			return ret;
+		get_cache_id(cpu, &id4_regs);
+		ci_leaf_init(this_leaf++, &id4_regs);
+		__cache_cpumap_setup(cpu, idx, &id4_regs);
+	}
+	this_cpu_ci->cpu_map_populated = true;
+	return 0;
+}
+
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
new file mode 100644
index 0000000..14433ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+/*
+ * ACE unit: init_c3() tests PRESENT/ENABLED in CPUID 0xC0000001 EDX and
+ * sets ACE_FCR in the low word of MSR_VIA_FCR to enable the unit.
+ */
+#define ACE_PRESENT (1 << 6)
+#define ACE_ENABLED (1 << 7)
+#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
+
+/*
+ * RNG unit: init_c3() tests PRESENT/ENABLED in CPUID 0xC0000001 EDX and
+ * sets RNG_ENABLE in the low word of MSR_VIA_RNG to enable the unit.
+ */
+#define RNG_PRESENT (1 << 2)
+#define RNG_ENABLED (1 << 3)
+#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
+
+/*
+ * Masks applied by centaur_detect_vmx_virtcap() to the values read from
+ * MSR_IA32_VMX_PROCBASED_CTLS (PROC_CTLS_*) and
+ * MSR_IA32_VMX_PROCBASED_CTLS2 (PROC_CTLS2_*).
+ */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+
+/*
+ * Family-6 setup, called from init_centaur(): enable the ACE and RNG units
+ * when CPUID 0xC0000001 reports them present but not enabled, record the
+ * Centaur extended feature word, apply model-specific fixups and finally
+ * call cpu_detect_cache_sizes().
+ */
+static void init_c3(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ /* Test for Centaur Extended Feature Flags presence */
+ if (cpuid_eax(0xC0000000) >= 0xC0000001) {
+ u32 tmp = cpuid_edx(0xC0000001);
+
+ /* enable ACE unit, if present and disabled */
+ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
+ rdmsr(MSR_VIA_FCR, lo, hi);
+ lo |= ACE_FCR; /* enable ACE unit */
+ wrmsr(MSR_VIA_FCR, lo, hi);
+ pr_info("CPU: Enabled ACE h/w crypto\n");
+ }
+
+ /* enable RNG unit, if present and disabled */
+ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
+ rdmsr(MSR_VIA_RNG, lo, hi);
+ lo |= RNG_ENABLE; /* enable RNG unit */
+ wrmsr(MSR_VIA_RNG, lo, hi);
+ pr_info("CPU: Enabled h/w RNG\n");
+ }
+
+ /* store Centaur Extended Feature Flags as
+ * word 5 of the CPU capability bit array
+ */
+ /*
+ * CPUID is queried again instead of reusing tmp, so the stored
+ * word is read after the MSR writes above.
+ */
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
+ }
+#ifdef CONFIG_X86_32
+ /* Cyrix III family needs CX8 & PGE explicitly enabled. */
+ if (c->x86_model >= 6 && c->x86_model <= 13) {
+ rdmsr(MSR_VIA_FCR, lo, hi);
+ lo |= (1<<1 | 1<<7);
+ wrmsr(MSR_VIA_FCR, lo, hi);
+ set_cpu_cap(c, X86_FEATURE_CX8);
+ }
+
+ /* Before Nehemiah, the C3's had 3dNOW! */
+ if (c->x86_model >= 6 && c->x86_model < 9)
+ set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+ /* Models 0xf and up: double the cache alignment and flag REP_GOOD. */
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+
+ cpu_detect_cache_sizes(c);
+}
+
+/*
+ * Bit masks combined into fcr_set/fcr_clr by init_centaur() and applied to
+ * the low word of MSR_IDT_FCR1. DTLOCK and EDCTLB intentionally share
+ * the same value (bit 8).
+ */
+enum {
+ ECX8 = 1<<1,
+ EIERRINT = 1<<2,
+ DPM = 1<<3,
+ DMCE = 1<<4,
+ DSTPCLK = 1<<5,
+ ELINEAR = 1<<6,
+ DSMC = 1<<7,
+ DTLOCK = 1<<8,
+ EDCTLB = 1<<8,
+ EMMX = 1<<9,
+ DPDC = 1<<11,
+ EBRPRED = 1<<12,
+ DIC = 1<<13,
+ DDC = 1<<14,
+ DNA = 1<<15,
+ ERETSTK = 1<<16,
+ E2MMX = 1<<19,
+ EAMD3D = 1<<20,
+};
+
+/*
+ * Early capability setup for Centaur CPUs (installed as .c_early_init and
+ * also called from init_centaur()). Only sets feature bits on @c.
+ */
+static void early_init_centaur(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+	/* Emulate MTRRs using Centaur's MCR. */
+	if (c->x86 == 5)
+		set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
+#endif
+	/* Family 6, model 0xf and later: TSC is flagged constant. */
+	if (c->x86 == 6 && c->x86_model >= 0xf)
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
+
+	/* Bit 8 of x86_power: flag the TSC as both constant and non-stop. */
+	if (c->x86_power & (1 << 8)) {
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+	}
+}
+
+/*
+ * Translate the VMX processor-based control MSRs into X86_FEATURE_* bits.
+ *
+ * For each MSR the two 32-bit halves are OR-ed together and the result is
+ * tested against the X86_VMX_FEATURE_* masks. The secondary-controls MSR
+ * is only read when the primary value has the 2ND_CTLS bit (bit 31) set.
+ */
+static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+	u32 lo, hi, ctls, ctls2;
+
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, lo, hi);
+	ctls = hi | lo;
+
+	if (ctls & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	if (ctls & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+		set_cpu_cap(c, X86_FEATURE_VNMI);
+
+	if (!(ctls & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS))
+		return;
+
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, lo, hi);
+	ctls2 = hi | lo;
+
+	/* FlexPriority needs both APIC virtualization and the TPR shadow. */
+	if ((ctls2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+	    (ctls & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+		set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+	if (ctls2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+		set_cpu_cap(c, X86_FEATURE_EPT);
+	if (ctls2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+		set_cpu_cap(c, X86_FEATURE_VPID);
+}
+
+/*
+ * Main Centaur init hook (installed as .c_init in centaur_cpu_dev).
+ *
+ * Runs the early init, cache/core detection and the architectural perfmon
+ * check, then dispatches on the CPU family: family 5 (32-bit builds only)
+ * programs MSR_IDT_FCR1 and names the WinChip model, family 6 is handed to
+ * init_c3(). VMX sub-features are probed last when VMX is present.
+ */
+static void init_centaur(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ char *name;
+ /* Bits to force on / off in the low word of MSR_IDT_FCR1 (family 5). */
+ u32 fcr_set = 0;
+ u32 fcr_clr = 0;
+ u32 lo, hi, newlo;
+ u32 aa, bb, cc, dd;
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+#endif
+ early_init_centaur(c);
+ init_intel_cacheinfo(c);
+ detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ switch (c->x86) {
+#ifdef CONFIG_X86_32
+ case 5:
+ /* Pick the WinChip name suffix and FCR bits for this model. */
+ switch (c->x86_model) {
+ case 4:
+ name = "C6";
+ fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
+ fcr_clr = DPDC;
+ pr_notice("Disabling bugged TSC.\n");
+ clear_cpu_cap(c, X86_FEATURE_TSC);
+ break;
+ case 8:
+ switch (c->x86_stepping) {
+ default:
+ name = "2";
+ break;
+ case 7 ... 9:
+ name = "2A";
+ break;
+ case 10 ... 15:
+ name = "2B";
+ break;
+ }
+ fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
+ E2MMX|EAMD3D;
+ fcr_clr = DPDC;
+ break;
+ case 9:
+ name = "3";
+ fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
+ E2MMX|EAMD3D;
+ fcr_clr = DPDC;
+ break;
+ default:
+ /* Unknown model: fcr_set/fcr_clr stay 0, FCR is left as is. */
+ name = "??";
+ }
+
+ rdmsr(MSR_IDT_FCR1, lo, hi);
+ newlo = (lo|fcr_set) & (~fcr_clr);
+
+ /* Only write the MSR back when the low word actually changes. */
+ if (newlo != lo) {
+ pr_info("Centaur FCR was 0x%X now 0x%X\n",
+ lo, newlo);
+ wrmsr(MSR_IDT_FCR1, newlo, hi);
+ } else {
+ pr_info("Centaur FCR is 0x%X\n", lo);
+ }
+ /* Emulate MTRRs using Centaur's MCR. */
+ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
+ /* Report CX8 */
+ set_cpu_cap(c, X86_FEATURE_CX8);
+ /* Set 3DNow! on Winchip 2 and above. */
+ if (c->x86_model >= 8)
+ set_cpu_cap(c, X86_FEATURE_3DNOW);
+ /* See if we can find out some more. */
+ if (cpuid_eax(0x80000000) >= 0x80000005) {
+ /* Yes, we can. */
+ cpuid(0x80000005, &aa, &bb, &cc, &dd);
+ /* Add L1 data and code cache sizes. */
+ c->x86_cache_size = (cc>>24)+(dd>>24);
+ }
+ sprintf(c->x86_model_id, "WinChip %s", name);
+ break;
+#endif
+ case 6:
+ init_c3(c);
+ break;
+ }
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ centaur_detect_vmx_virtcap(c);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Legacy cache-size fixup for family-6 Centaur parts (installed as
+ * .legacy_cache_size). Returns @size adjusted for the model, or unchanged
+ * when no fixup applies.
+ */
+static unsigned int
+centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+	if (c->x86 != 6)
+		return size;
+
+	switch (c->x86_model) {
+	case 7:
+	case 8:
+		/* VIA C3 CPUs (670-68F) need further shifting. */
+		size >>= 8;
+		break;
+	case 9:
+		/*
+		 * Erratum in Nehemiah stepping 1: it reports '65KB' instead
+		 * of '64KB'. This may only affect engineering samples.
+		 */
+		if (c->x86_stepping == 1 && size == 65)
+			size = 64;
+		break;
+	}
+
+	return size;
+}
+#endif
+
+/*
+ * Vendor descriptor for Centaur CPUs: matched on the "CentaurHauls"
+ * identification string and wired to the init hooks defined above. The
+ * legacy cache-size fixup only exists on 32-bit builds.
+ */
+static const struct cpu_dev centaur_cpu_dev = {
+ .c_vendor = "Centaur",
+ .c_ident = { "CentaurHauls" },
+ .c_early_init = early_init_centaur,
+ .c_init = init_centaur,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
+ .c_x86_vendor = X86_VENDOR_CENTAUR,
+};
+
+cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
new file mode 100644
index 0000000..44c4ef3
--- /dev/null
+++ b/arch/x86/kernel/cpu/common.c
@@ -0,0 +1,1895 @@
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/bootmem.h>
+#include <linux/linkage.h>
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/init.h>
+#include <linux/kprobes.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/stackprotector.h>
+#include <asm/perf_event.h>
+#include <asm/mmu_context.h>
+#include <asm/archrandom.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/debugreg.h>
+#include <asm/sections.h>
+#include <asm/vsyscall.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <asm/pgtable.h>
+#include <linux/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/fpu/internal.h>
+#include <asm/mtrr.h>
+#include <asm/hwcap2.h>
+#include <linux/numa.h>
+#include <asm/asm.h>
+#include <asm/bugs.h>
+#include <asm/cpu.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/uv/uv.h>
+#endif
+
+#include "cpu.h"
+
+/* NOTE(review): presumably the value exported as the ELF HWCAP2 word -- confirm. */
+u32 elf_hwcap2 __read_mostly;
+
+/* all of these masks are initialized in setup_cpu_local_masks() */
+cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
+
+/*
+ * representing cpus for which sibling maps can be computed
+ * (also allocated in setup_cpu_local_masks())
+ */
+cpumask_var_t cpu_sibling_setup_mask;
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+
+/*
+ * Correctly size the local cpu masks: calls alloc_bootmem_cpumask_var()
+ * on each of the four cpumask variables declared above.
+ */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+ alloc_bootmem_cpumask_var(&cpu_callin_mask);
+ alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
+/*
+ * Init hook of the generic "Unknown" vendor descriptor.
+ *
+ * 64-bit builds just detect cache sizes. 32-bit builds can only supply a
+ * model name, and only for CPUs that have no CPUID (cpuid_level == -1).
+ */
+static void default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+	cpu_detect_cache_sizes(c);
+#else
+	/* With CPUID available there is not much we can do here. */
+	if (c->cpuid_level != -1)
+		return;
+
+	/* No cpuid. It must be an ancient CPU. */
+	switch (c->x86) {
+	case 4:
+		strcpy(c->x86_model_id, "486");
+		break;
+	case 3:
+		strcpy(c->x86_model_id, "386");
+		break;
+	}
+#endif
+}
+
+/* Fallback descriptor, selected by get_cpu_vendor() when no vendor matches. */
+static const struct cpu_dev default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+ .c_x86_vendor = X86_VENDOR_UNKNOWN,
+};
+
+/* Vendor descriptor currently in effect; updated by get_cpu_vendor(). */
+static const struct cpu_dev *this_cpu = &default_cpu;
+
+/*
+ * Initial contents of the per-CPU GDT page. The set of descriptors
+ * depends on whether this is a 64-bit or a 32-bit build.
+ */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ *
+ * TLS descriptors are currently at a different place compared to i386.
+ * Hopefully nobody expects them at a fixed place (Wine?)
+ */
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+#else
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ /*
+ * Segments used for calling PnP BIOS have byte granularity.
+ * The code segments and data segments have fixed 64k limits,
+ * the transfer segment sizes are set at run time.
+ */
+ /* 32-bit code */
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+ /* 16-bit code */
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ /*
+ * The APM segments have byte granularity and their bases
+ * are set at run time. All have 64k limits.
+ */
+ /* 32-bit code */
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+ /* 16-bit code */
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+ /* data */
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ GDT_STACK_CANARY_INIT
+#endif
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+/*
+ * "nompx" handler: clear the MPX capability.
+ *
+ * Returns 0 (not handled) when anything follows the option name, and 1
+ * otherwise. Nothing is logged when the boot CPU lacks MPX.
+ */
+static int __init x86_mpx_setup(char *s)
+{
+	/* require an exact match without trailing characters */
+	if (*s != '\0')
+		return 0;
+
+	if (boot_cpu_has(X86_FEATURE_MPX)) {
+		setup_clear_cpu_cap(X86_FEATURE_MPX);
+		pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
+	}
+
+	return 1;
+}
+__setup("nompx", x86_mpx_setup);
+
+#ifdef CONFIG_X86_64
+/*
+ * "nopcid" early parameter: clear the PCID capability.
+ *
+ * Returns -EINVAL when a value was supplied, 0 otherwise. Nothing is
+ * logged when the boot CPU lacks PCID.
+ */
+static int __init x86_nopcid_setup(char *s)
+{
+	/* nopcid doesn't accept parameters */
+	if (s)
+		return -EINVAL;
+
+	if (boot_cpu_has(X86_FEATURE_PCID)) {
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
+		pr_info("nopcid: PCID feature disabled\n");
+	}
+
+	return 0;
+}
+early_param("nopcid", x86_nopcid_setup);
+#endif
+
+/*
+ * "noinvpcid" early parameter: clear the INVPCID capability.
+ *
+ * Returns -EINVAL when a value was supplied, 0 otherwise. Nothing is
+ * logged when the boot CPU lacks INVPCID.
+ */
+static int __init x86_noinvpcid_setup(char *s)
+{
+	/* noinvpcid doesn't accept parameters */
+	if (s)
+		return -EINVAL;
+
+	if (boot_cpu_has(X86_FEATURE_INVPCID)) {
+		setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+		pr_info("noinvpcid: INVPCID feature disabled\n");
+	}
+
+	return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
+#ifdef CONFIG_X86_32
+/* L2 size override from "cachesize="; -1 means no override was given. */
+static int cachesize_override = -1;
+/* Non-zero: disable the processor serial number; cleared by "serialnumber". */
+static int disable_x86_serial_nr = 1;
+
+/*
+ * "cachesize=" handler: parse an integer into cachesize_override. The
+ * result of get_option() is not checked, so the override keeps whatever
+ * get_option() left in it.
+ */
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+/* "nosep" handler: unconditionally clear the SEP capability. */
+static int __init x86_sep_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SEP);
+ return 1;
+}
+__setup("nosep", x86_sep_setup);
+
+/*
+ * Standard macro to see if a specific flag is changeable.
+ *
+ * Saves EFLAGS, toggles the bits in @flag, reads EFLAGS back and restores
+ * the saved value. Returns non-zero when any bit of @flag differs between
+ * the original (f2) and the re-read (f1) value.
+ */
+static inline int flag_is_changeable_p(u32 flag)
+{
+ u32 f1, f2;
+
+ /*
+ * Cyrix and IDT cpus allow disabling of CPUID
+ * so the code below may return different results
+ * when it is executed before and after enabling
+ * the CPUID. Add "volatile" to not allow gcc to
+ * optimize the subsequent calls to this function.
+ */
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
+ : "=&r" (f1), "=&r" (f2)
+ : "ir" (flag));
+
+ return ((f1^f2) & flag) != 0;
+}
+
+/*
+ * Probe for the CPUID instruction: reports whether the EFLAGS ID bit can
+ * be toggled.
+ */
+int have_cpuid_p(void)
+{
+ return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+/*
+ * Disable the processor serial number on CPUs that advertise it, unless
+ * disabling was turned off (disable_x86_serial_nr == 0). The PN capability
+ * is cleared and cpuid_level is refreshed afterwards.
+ */
+static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+	unsigned long msr_lo, msr_hi;
+
+	if (!(cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr))
+		return;
+
+	/* Disable processor serial number: */
+	rdmsr(MSR_IA32_BBL_CR_CTL, msr_lo, msr_hi);
+	msr_lo |= 0x200000;
+	wrmsr(MSR_IA32_BBL_CR_CTL, msr_lo, msr_hi);
+
+	pr_notice("CPU serial number disabled.\n");
+	clear_cpu_cap(c, X86_FEATURE_PN);
+
+	/* Disabling the serial number may affect the cpuid level */
+	c->cpuid_level = cpuid_eax(0);
+}
+
+/*
+ * "serialnumber" handler: clear disable_x86_serial_nr so that
+ * squash_the_stupid_serial_number() leaves the serial number alone.
+ */
+static int __init x86_serial_nr_setup(char *s)
+{
+ disable_x86_serial_nr = 0;
+ return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+#else
+/* Non-32-bit build: no probing is done, every flag is reported changeable. */
+static inline int flag_is_changeable_p(u32 flag)
+{
+ return 1;
+}
+/* Non-32-bit build: nothing to do. */
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+/* "nosmep" handler: clear the SMEP capability, then re-run the MPX erratum check. */
+static __init int setup_disable_smep(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SMEP);
+ /* Check for things that depend on SMEP being enabled: */
+ check_mpx_erratum(&boot_cpu_data);
+ return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+/* Set CR4.SMEP when @c has the SMEP capability. */
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ cr4_set_bits(X86_CR4_SMEP);
+}
+
+/* "nosmap" handler: clear the SMAP capability. */
+static __init int setup_disable_smap(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SMAP);
+ return 1;
+}
+__setup("nosmap", setup_disable_smap);
+
+/*
+ * Configure CR4.SMAP for @c. On SMAP-capable CPUs the bit is set when the
+ * kernel is built with CONFIG_X86_SMAP and explicitly cleared otherwise.
+ * EFLAGS.AC being set at this point is treated as a fatal bug.
+ */
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+{
+	unsigned long flags = native_save_fl();
+
+	/* This should have been cleared long ago */
+	BUG_ON(flags & X86_EFLAGS_AC);
+
+	if (!cpu_has(c, X86_FEATURE_SMAP))
+		return;
+
+#ifdef CONFIG_X86_SMAP
+	cr4_set_bits(X86_CR4_SMAP);
+#else
+	cr4_clear_bits(X86_CR4_SMAP);
+#endif
+}
+
+/*
+ * Enable UMIP in CR4 when both the boot CPU/build configuration and this
+ * CPU's cpuid bits allow it; otherwise make sure the CR4 bit is clear.
+ */
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+	/* Boot processor plus build option first, then this CPU's cpuid bits. */
+	if (cpu_feature_enabled(X86_FEATURE_UMIP) &&
+	    cpu_has(c, X86_FEATURE_UMIP)) {
+		cr4_set_bits(X86_CR4_UMIP);
+
+		pr_info("x86/cpu: Activated the Intel User Mode Instruction Prevention (UMIP) CPU feature\n");
+
+		return;
+	}
+
+	/*
+	 * Make sure UMIP is disabled in case it was enabled in a
+	 * previous boot (e.g., via kexec).
+	 */
+	cr4_clear_bits(X86_CR4_UMIP);
+}
+
+/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+/*
+ * Enable protection keys on @c by setting CR4.PKE, then re-read the CPU
+ * capabilities. Does nothing when PKU is unavailable on the boot CPU or
+ * in the build, missing from this CPU, or pku_disabled is set.
+ */
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+{
+	/*
+	 * In order: the boot processor plus compile options, the actual
+	 * processor's cpuid bits, and the pku_disabled flag.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PKU) ||
+	    !cpu_has(c, X86_FEATURE_PKU) ||
+	    pku_disabled)
+		return;
+
+	cr4_set_bits(X86_CR4_PKE);
+
+	/*
+	 * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+	 * cpuid bit to be set. We need to ensure that we
+	 * update that bit in this CPU's "cpu_info".
+	 */
+	get_cpu_cap(c);
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/* "nopku" handler: set pku_disabled so that setup_pku() leaves CR4.PKE alone. */
+static __init int setup_disable_pku(char *arg)
+{
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
+ return 1;
+}
+__setup("nopku", setup_disable_pku);
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+
+/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+ u32 feature; /* X86_FEATURE_* bit that depends on @level */
+ u32 level; /* CPUID level that must be available */
+};
+
+/* Terminated by an all-zero entry; filter_cpuid_features() stops at feature == 0. */
+static const struct cpuid_dependent_feature
+cpuid_dependent_features[] = {
+ { X86_FEATURE_MWAIT, 0x00000005 },
+ { X86_FEATURE_DCA, 0x00000009 },
+ { X86_FEATURE_XSAVE, 0x0000000d },
+ { 0, 0 }
+};
+
+/*
+ * Clear every feature listed in cpuid_dependent_features[] that @c claims
+ * to have although the CPUID level it depends on is not available.
+ * A warning is printed for each cleared feature when @warn is true.
+ */
+static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+	const struct cpuid_dependent_feature *df;
+
+	for (df = cpuid_dependent_features; df->feature; df++) {
+		bool level_missing;
+
+		if (!cpu_has(c, df->feature))
+			continue;
+
+		/*
+		 * Note: cpuid_level is set to -1 if unavailable, but
+		 * extended_cpuid_level is set to 0 if unavailable
+		 * and the legitimate extended levels are all negative
+		 * when signed. So extended levels are compared as
+		 * unsigned and basic levels as signed values.
+		 */
+		if ((s32)df->level < 0)
+			level_missing = (u32)df->level >
+					(u32)c->extended_cpuid_level;
+		else
+			level_missing = (s32)df->level > (s32)c->cpuid_level;
+
+		if (!level_missing)
+			continue;
+
+		clear_cpu_cap(c, df->feature);
+
+		if (warn)
+			pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+				x86_cap_flag(df->feature), df->level);
+	}
+}
+
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * The legacy model tables are only used if init_<vendor>() below doesn't
+ * set the name; in particular, if CPUID levels 0x80000002..4 are
+ * supported, they aren't used.
+ */
+
+/*
+ * Look up the CPU name in the current vendor's legacy model table.
+ *
+ * Returns the table entry for @c's family and model, or NULL when the
+ * model number is 16 or above, no vendor descriptor is set, the family
+ * is not listed, or this is not a 32-bit build.
+ */
+static const char *table_lookup_model(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+	const struct legacy_cpu_model_info *info;
+
+	/* Range check on the model, then make sure a descriptor exists. */
+	if (c->x86_model >= 16 || !this_cpu)
+		return NULL;
+
+	/* The table ends at the first entry whose family is 0. */
+	for (info = this_cpu->legacy_models; info->family; info++) {
+		if (info->family == c->x86)
+			return info->model_names[c->x86_model];
+	}
+#endif
+	return NULL;		/* Not found */
+}
+
+/*
+ * Forced capability overrides, one word per x86_capability[] word:
+ * apply_forced_caps() masks out cpu_caps_cleared and ORs in cpu_caps_set.
+ */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+
+/*
+ * Load the per-cpu segment for @cpu: %fs with __KERNEL_PERCPU on 32-bit;
+ * on 64-bit %gs is loaded with 0 and MSR_GS_BASE is written with
+ * cpu_kernelmode_gs_base(cpu). The stack canary segment is loaded last.
+ */
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ loadsegment(fs, __KERNEL_PERCPU);
+#else
+ __loadsegment_simple(gs, 0);
+ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#endif
+ load_stack_canary_segment();
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * The 32-bit entry code needs to find cpu_entry_area: per-CPU pointer to
+ * a struct cpu_entry_area.
+ */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ *
+ * The range initializer gives every slot EXCEPTION_STKSZ; the later
+ * designated entry then overrides the debug stack slot with DEBUG_STKSZ.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+#endif
+
+/*
+ * Load the original GDT from the per-cpu structure: point GDTR at the
+ * address returned by get_cpu_gdt_rw() for @cpu.
+ */
+void load_direct_gdt(int cpu)
+{
+	struct desc_ptr gdt_descr = {
+		.address = (long)get_cpu_gdt_rw(cpu),
+		.size	 = GDT_SIZE - 1,
+	};
+
+	load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_direct_gdt);
+
+/*
+ * Load a fixmap remapping of the per-cpu GDT: point GDTR at the address
+ * returned by get_cpu_gdt_ro() for @cpu.
+ */
+void load_fixmap_gdt(int cpu)
+{
+	struct desc_ptr gdt_descr = {
+		.address = (long)get_cpu_gdt_ro(cpu),
+		.size	 = GDT_SIZE - 1,
+	};
+
+	load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ *
+ * The GDT is loaded first and the per-cpu segment is reloaded afterwards.
+ */
+void switch_to_new_gdt(int cpu)
+{
+ /* Load the original GDT */
+ load_direct_gdt(cpu);
+ /* Reload the per-cpu base */
+ load_percpu_segment(cpu);
+}
+
+/*
+ * Vendor descriptors, scanned in index order by get_cpu_vendor(), which
+ * stops at the first NULL slot.
+ */
+static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+/*
+ * Read the brand string from CPUID leaves 0x80000002..0x80000004 into
+ * c->x86_model_id and strip leading spaces and trailing whitespace.
+ *
+ * Does nothing when the extended CPUID level is below 0x80000004. A brand
+ * string that is empty or consists only of whitespace yields an empty
+ * x86_model_id.
+ */
+static void get_model_name(struct cpuinfo_x86 *c)
+{
+	unsigned int *v;
+	char *src, *dst, *end;
+
+	if (c->extended_cpuid_level < 0x80000004)
+		return;
+
+	/* Each leaf supplies 16 bytes: 3 leaves * 4 registers = 48 bytes. */
+	v = (unsigned int *)c->x86_model_id;
+	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+	c->x86_model_id[48] = 0;
+
+	/* Trim whitespace, compacting the string in place. */
+	src = dst = end = &c->x86_model_id[0];
+
+	while (*src == ' ')
+		src++;
+
+	while (*src) {
+		*dst++ = *src;
+		/*
+		 * Track one past the last non-whitespace character copied.
+		 * Starting 'end' at the buffer start makes an all-blank
+		 * string trim to "" rather than to a single stray space.
+		 */
+		if (!isspace(*src))
+			end = dst;
+		src++;
+	}
+
+	*end = '\0';
+}
+
+/*
+ * Set c->x86_max_cores. The default is 1; on SMP builds with CPUID level 4
+ * or above it is taken from CPUID(4, 0) EAX[31:26] + 1, provided the low
+ * five bits of EAX are non-zero.
+ */
+void detect_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	c->x86_max_cores = 1;
+
+	if (IS_ENABLED(CONFIG_SMP) && c->cpuid_level >= 4) {
+		cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
+		if (eax & 0x1f)
+			c->x86_max_cores = (eax >> 26) + 1;
+	}
+}
+
+/*
+ * Fill in c->x86_cache_size (and, on 64-bit, c->x86_tlbsize) from the
+ * extended CPUID leaves 0x80000005 and 0x80000006.
+ *
+ * With only leaf 0x80000005 available the cache size is the sum of the
+ * two L1 figures. With leaf 0x80000006 it is replaced by the L2 figure,
+ * which on 32-bit may first be adjusted by the vendor hook and by the
+ * "cachesize=" override; a resulting L2 size of 0 keeps the L1 value.
+ */
+void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
+{
+	unsigned int ext_level = c->extended_cpuid_level;
+	unsigned int eax, ebx, ecx, edx, l2;
+
+	if (ext_level >= 0x80000005) {
+		cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+		c->x86_cache_size = (ecx >> 24) + (edx >> 24);
+#ifdef CONFIG_X86_64
+		/* On K8 L1 TLB is inclusive, so don't count it */
+		c->x86_tlbsize = 0;
+#endif
+	}
+
+	/* Some chips just have a large L1. */
+	if (ext_level < 0x80000006)
+		return;
+
+	cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+	l2 = ecx >> 16;
+
+#ifdef CONFIG_X86_64
+	c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+#else
+	/* do processor-specific cache resizing */
+	if (this_cpu->legacy_cache_size)
+		l2 = this_cpu->legacy_cache_size(c, l2);
+
+	/* Allow user to override all this if necessary. */
+	if (cachesize_override != -1)
+		l2 = cachesize_override;
+
+	/* Again, no L2 cache is possible */
+	if (l2 == 0)
+		return;
+#endif
+
+	c->x86_cache_size = l2;
+}
+
+/*
+ * Last-level TLB information, printed by cpu_detect_tlb(): the tlb_lli_*
+ * arrays are reported as iTLB and the tlb_lld_* arrays as dTLB figures,
+ * one array per page size, indexed up to NR_INFO.
+ */
+u16 __read_mostly tlb_lli_4k[NR_INFO];
+u16 __read_mostly tlb_lli_2m[NR_INFO];
+u16 __read_mostly tlb_lli_4m[NR_INFO];
+u16 __read_mostly tlb_lld_4k[NR_INFO];
+u16 __read_mostly tlb_lld_2m[NR_INFO];
+u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
+
+/*
+ * Run the vendor's TLB detection hook, if it has one, then log the
+ * ENTRIES slot of each last-level TLB array. The log lines are printed
+ * even when no hook is installed.
+ */
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
+{
+ if (this_cpu->c_detect_tlb)
+ this_cpu->c_detect_tlb(c);
+
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
+ tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+ tlb_lli_4m[ENTRIES]);
+
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
+ tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
+}
+
+/*
+ * Read the sibling count from CPUID(1) EBX[23:16] into smp_num_siblings.
+ *
+ * Returns -1 without touching smp_num_siblings when @c lacks HT or has
+ * CMP_LEGACY or XTOPOLOGY set; returns 0 otherwise. On non-SMP builds
+ * this is a no-op that returns 0.
+ */
+int detect_ht_early(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	u32 eax, ebx, ecx, edx;
+
+	if (!cpu_has(c, X86_FEATURE_HT) ||
+	    cpu_has(c, X86_FEATURE_CMP_LEGACY) ||
+	    cpu_has(c, X86_FEATURE_XTOPOLOGY))
+		return -1;
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+
+	smp_num_siblings = (ebx >> 16) & 0xff;
+	if (smp_num_siblings == 1)
+		pr_info_once("CPU0: Hyper-Threading is disabled\n");
+#endif
+	return 0;
+}
+
+/*
+ * Derive c->phys_proc_id and c->cpu_core_id from the initial APIC id,
+ * using the sibling count found by detect_ht_early(). As a side effect
+ * smp_num_siblings is divided by the number of cores. No-op on non-SMP
+ * builds or when detect_ht_early() fails.
+ */
+void detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	int pkg_shift, sibling_shift, core_bits;
+
+	if (detect_ht_early(c) < 0)
+		return;
+
+	/* Package id: shift by the order of the total sibling count. */
+	pkg_shift = get_count_order(smp_num_siblings);
+	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, pkg_shift);
+
+	smp_num_siblings /= c->x86_max_cores;
+
+	/* Core id: shift by the per-core sibling order, mask to core_bits. */
+	sibling_shift = get_count_order(smp_num_siblings);
+	core_bits = get_count_order(c->x86_max_cores);
+
+	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, sibling_shift) &
+			 ((1 << core_bits) - 1);
+#endif
+}
+
+/*
+ * Select the vendor descriptor whose identification string matches
+ * c->x86_vendor_id and record it in this_cpu and c->x86_vendor. When no
+ * descriptor matches, fall back to default_cpu and log an error once.
+ */
+static void get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+	char *id = c->x86_vendor_id;
+	int i;
+
+	/* The scan ends at the first empty slot. */
+	for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
+		const struct cpu_dev *dev = cpu_devs[i];
+		bool match;
+
+		/* A descriptor may carry a second, optional ident string. */
+		match = !strcmp(id, dev->c_ident[0]) ||
+			(dev->c_ident[1] && !strcmp(id, dev->c_ident[1]));
+		if (!match)
+			continue;
+
+		this_cpu = dev;
+		c->x86_vendor = this_cpu->c_x86_vendor;
+		return;
+	}
+
+	pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n"
+		    "CPU: Your system may be unstable.\n", id);
+
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
+	this_cpu = &default_cpu;
+}
+
+/*
+ * Basic CPUID probing: fills in cpuid_level and the vendor id string from
+ * leaf 0 and, when leaf 1 exists, family/model/stepping and the CLFLUSH
+ * line size. The family defaults to 4 when leaf 1 is unavailable.
+ */
+void cpu_detect(struct cpuinfo_x86 *c)
+{
+ /* Get vendor name */
+ /*
+ * The output registers are written straight into the structure: the
+ * first into cpuid_level, the remaining three into bytes 0-3, 8-11
+ * and 4-7 of x86_vendor_id (note the swapped last two).
+ */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ c->x86 = 4;
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ u32 junk, tfms, cap0, misc;
+
+ cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_stepping = x86_stepping(tfms);
+
+ /* Bit 19 set: derive the line size from bits 15:8 of misc, times 8. */
+ if (cap0 & (1<<19)) {
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ }
+ }
+}
+
+/*
+ * Apply the forced capability overrides to @c: for every capability and
+ * bug word, drop the bits in cpu_caps_cleared[] and then add the bits in
+ * cpu_caps_set[] (so a bit present in both ends up set).
+ */
+static void apply_forced_caps(struct cpuinfo_x86 *c)
+{
+	int word;
+
+	for (word = 0; word < NCAPINTS + NBUGINTS; word++)
+		c->x86_capability[word] =
+			(c->x86_capability[word] & ~cpu_caps_cleared[word]) |
+			cpu_caps_set[word];
+}
+
+/*
+ * Synthesize the vendor-neutral speculation-control feature bits (IBRS,
+ * IBPB, STIBP, SSBD, MSR_SPEC_CTRL) on @c from the Intel- and AMD-specific
+ * CPUID bits.
+ */
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+ /*
+ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+ * and they also have a different bit for STIBP support. Also,
+ * a hypervisor might have set the individual AMD bits even on
+ * Intel CPUs, for finer-grained selection of what's available.
+ */
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+ cpu_has(c, X86_FEATURE_VIRT_SSBD))
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+
+ if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ /* With AMD_SSBD present, VIRT_SSBD is dropped after SSBD was set. */
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
+}
+
+/*
+ * (Re)read the CPUID feature leaves into c->x86_capability[].
+ *
+ * Also records extended_cpuid_level, x86_power and the cache QoS limits,
+ * then runs the scattered-feature and speculation-control setup and
+ * finally applies the forced capability overrides. Each basic or extended
+ * leaf is only queried when the corresponding CPUID level allows it.
+ */
+void get_cpu_cap(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
+ }
+
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+ c->x86_capability[CPUID_7_ECX] = ecx;
+ c->x86_capability[CPUID_7_EDX] = edx;
+ }
+
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_D_1_EAX] = eax;
+ }
+
+ /* Additional Intel-defined flags: level 0x0000000F */
+ if (c->cpuid_level >= 0x0000000F) {
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_F_0_EDX] = edx;
+
+ if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = ebx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_F_1_EDX] = edx;
+
+ if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
+ ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
+ (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+ } else {
+ /* No LLC QoS monitoring: mark both values as invalid. */
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ }
+ }
+
+ /* AMD-defined flags: level 0x80000001 */
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = eax;
+
+ /* Only trust the extended range when the top 16 bits read 0x8000. */
+ if ((eax & 0xffff0000) == 0x80000000) {
+ if (eax >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
+ }
+ }
+
+ if (c->extended_cpuid_level >= 0x80000007) {
+ cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0007_EBX] = ebx;
+ c->x86_power = edx;
+ }
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+ }
+
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
+ init_scattered_cpuid_features(c);
+ init_speculation_control(c);
+
+ /*
+ * Clear/Set all flags overridden by options, after probe.
+ * This needs to happen each time we re-probe, which may happen
+ * several times during CPU initialization.
+ */
+ apply_forced_caps(c);
+}
+
+/*
+ * Set the virtual and physical address widths of @c from CPUID leaf
+ * 0x80000008 when it is available. On 32-bit builds without that leaf,
+ * PAE or PSE36 implies 36 physical bits. x86_cache_bits always mirrors
+ * x86_phys_bits afterwards.
+ */
+void get_cpu_address_sizes(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+#ifdef CONFIG_X86_32
+ /* This "else" pairs with the "if" above across the #ifdef. */
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+#endif
+ c->x86_cache_bits = c->x86_phys_bits;
+}
+
+/*
+ * Identify a CPU that has no CPUID instruction (32-bit builds only;
+ * a no-op otherwise).
+ *
+ * The family is set to 4 or 3 depending on whether the AC flag can be
+ * changed. Each registered vendor's c_identify hook is then tried in turn
+ * until one fills in the vendor id string, at which point the vendor
+ * descriptor is resolved through get_cpu_vendor().
+ */
+static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+	int i;
+
+	/*
+	 * First of all, decide if this is a 486 or higher
+	 * It's a 486 if we can modify the AC flag
+	 */
+	c->x86 = flag_is_changeable_p(X86_EFLAGS_AC) ? 4 : 3;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		if (!cpu_devs[i] || !cpu_devs[i]->c_identify)
+			continue;
+
+		/* Start from an empty id so a hit is detectable afterwards. */
+		c->x86_vendor_id[0] = 0;
+		cpu_devs[i]->c_identify(c);
+
+		if (c->x86_vendor_id[0]) {
+			get_cpu_vendor(c);
+			break;
+		}
+	}
+#endif
+}
+
+/*
+ * CPUs for which cpu_set_bug_bits() forces no speculation-related bug bit at
+ * all: a match in this table makes it return before anything is set.
+ */
+static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
+ { X86_VENDOR_CENTAUR, 5 },
+ { X86_VENDOR_INTEL, 5 },
+ { X86_VENDOR_NSC, 5 },
+ { X86_VENDOR_ANY, 4 },
+ {}
+};
+
+/*
+ * CPUs which cpu_set_bug_bits() does not flag with X86_BUG_CPU_MELTDOWN. The
+ * L1TF check there comes after the Meltdown one, so a match here also skips
+ * X86_BUG_L1TF.
+ */
+static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
+ { X86_VENDOR_AMD },
+ {}
+};
+
+/*
+ * Only list CPUs which speculate but are not susceptible to SSB.
+ * cpu_set_bug_bits() consults this table before forcing
+ * X86_BUG_SPEC_STORE_BYPASS.
+ */
+static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+ { X86_VENDOR_AMD, 0x12, },
+ { X86_VENDOR_AMD, 0x11, },
+ { X86_VENDOR_AMD, 0x10, },
+ { X86_VENDOR_AMD, 0xf, },
+ {}
+};
+
+/* CPUs for which cpu_set_bug_bits() does not force X86_BUG_L1TF. */
+static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
+ /* in addition to cpu_no_speculation */
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
+ {}
+};
+
+/*
+ * Force the speculation-related X86_BUG_* bits (and X86_FEATURE_IBRS_ENHANCED)
+ * that apply to the boot CPU, based on the exemption tables above and, when
+ * X86_FEATURE_ARCH_CAPABILITIES is set, the IA32_ARCH_CAPABILITIES MSR.
+ */
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+	u64 arch_caps = 0;
+	bool ssb_immune;
+
+	/* CPUs on the no-speculation list get none of the bits below. */
+	if (x86_match_cpu(cpu_no_speculation))
+		return;
+
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
+	/* Without the MSR arch_caps stays 0, so no "not affected" bit is seen. */
+	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, arch_caps);
+
+	ssb_immune = x86_match_cpu(cpu_no_spec_store_bypass) ||
+		     (arch_caps & ARCH_CAP_SSB_NO) ||
+		     cpu_has(c, X86_FEATURE_AMD_SSB_NO);
+	if (!ssb_immune)
+		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+	if (arch_caps & ARCH_CAP_IBRS_ALL)
+		setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+
+	/* Rogue Data Cache Load? Not on exempt vendors or with RDCL_NO set. */
+	if (x86_match_cpu(cpu_no_meltdown) || (arch_caps & ARCH_CAP_RDCL_NO))
+		return;
+
+	setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+	/* L1TF is only considered once Meltdown has been flagged. */
+	if (!x86_match_cpu(cpu_no_l1tf))
+		setup_force_cpu_bug(X86_BUG_L1TF);
+}
+
+/*
+ * NOPL should exist on every CPU of family >= 6, but in practice early VIA
+ * chips and (more importantly) broken virtualizers that are hard to detect
+ * get it wrong. With the latter it does not even *fail* reliably, so probing
+ * for it does not work either. Hence 32-bit kernels never use NOPL until all
+ * broken cases can be detected reliably, while 64-bit kernels force the flag
+ * on explicitly for non-constant inputs of cpu_has().
+ */
+static void detect_nopl(void)
+{
+#ifndef CONFIG_X86_32
+	setup_force_cpu_cap(X86_FEATURE_NOPL);
+#else
+	setup_clear_cpu_cap(X86_FEATURE_NOPL);
+#endif
+}
+
+/*
+ * Do minimum CPU detection early.
+ * Fields really needed: vendor, cpuid_level, family, model, mask,
+ * cache alignment.
+ * The others are not touched to avoid unwanted side effects.
+ *
+ * WARNING: this function is only called on the boot CPU. Don't add code
+ * here that is supposed to run on all CPUs.
+ */
+static void __init early_identify_cpu(struct cpuinfo_x86 *c)
+{
+ /*
+ * Per-architecture defaults. get_cpu_address_sizes() below replaces the
+ * address widths when CPUID leaf 0x80000008 is available.
+ */
+#ifdef CONFIG_X86_64
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+#else
+ c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
+#endif
+ c->x86_cache_alignment = c->x86_clflush_size;
+
+ /* Start from an empty feature set and no known extended CPUID level. */
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ c->extended_cpuid_level = 0;
+
+ /* cyrix could have cpuid enabled via c_identify() */
+ if (have_cpuid_p()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ get_cpu_cap(c);
+ get_cpu_address_sizes(c);
+ setup_force_cpu_cap(X86_FEATURE_CPUID);
+
+ /* Optional vendor hook, run before the CPUID feature filtering. */
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
+
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
+
+ /* Optional vendor hook for boot-CPU-only setup. */
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ } else {
+ /* No CPUID: fall back to the legacy probe and record its absence. */
+ identify_cpu_without_cpuid(c);
+ setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+ cpu_set_bug_bits(c);
+
+ fpu__init_system(c);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
+ /*
+ * Later in the boot process pgtable_l5_enabled() relies on
+ * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+ * enabled by this point we need to clear the feature bit to avoid
+ * false-positives at the later stage.
+ *
+ * pgtable_l5_enabled() can be false here for several reasons:
+ * - 5-level paging is disabled compile-time;
+ * - it's 32-bit kernel;
+ * - machine doesn't support 5-level paging;
+ * - user specified 'no5lvl' in kernel command line.
+ */
+ if (!pgtable_l5_enabled())
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
+
+ detect_nopl();
+}
+
+/*
+ * Copy the cpu_dev pointers found between __x86_cpu_dev_start and
+ * __x86_cpu_dev_end into cpu_devs[] (at most X86_VENDOR_NUM of them), listing
+ * them when CONFIG_PROCESSOR_SELECT is set, then run early identification on
+ * the boot CPU.
+ */
+void __init early_cpu_init(void)
+{
+	const struct cpu_dev *const *slot;
+	int nr_devs = 0;
+#ifdef CONFIG_PROCESSOR_SELECT
+	unsigned int idx;
+
+	pr_info("KERNEL supported cpus:\n");
+#endif
+
+	for (slot = __x86_cpu_dev_start;
+	     slot < __x86_cpu_dev_end && nr_devs < X86_VENDOR_NUM; slot++) {
+		const struct cpu_dev *dev = *slot;
+
+		cpu_devs[nr_devs++] = dev;
+
+#ifdef CONFIG_PROCESSOR_SELECT
+		/* A vendor may carry up to two CPUID identification strings. */
+		for (idx = 0; idx < 2; idx++) {
+			if (dev->c_ident[idx])
+				pr_info(" %s %s\n", dev->c_vendor,
+					dev->c_ident[idx]);
+		}
+#endif
+	}
+
+	early_identify_cpu(&boot_cpu_data);
+}
+
+/*
+ * On 64-bit kernels, probe whether loading a NULL selector into FS leaves the
+ * FS base untouched and set X86_BUG_NULL_SEG on @c if it does. The FS base
+ * found on entry is restored before returning. No-op on 32-bit kernels.
+ */
+static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * Empirically, writing zero to a segment selector on AMD does
+ * not clear the base, whereas writing zero to a segment
+ * selector on Intel does clear the base. Intel's behavior
+ * allows slightly faster context switches in the common case
+ * where GS is unused by the prev and next threads.
+ *
+ * Since neither vendor documents this anywhere that I can see,
+ * detect it directly instead of hardcoding the choice by
+ * vendor.
+ *
+ * I've designated AMD's behavior as the "bug" because it's
+ * counterintuitive and less friendly.
+ */
+
+ unsigned long old_base, tmp;
+ rdmsrl(MSR_FS_BASE, old_base);
+ /* Plant a nonzero base, load the NULL selector, then see what is left. */
+ wrmsrl(MSR_FS_BASE, 1);
+ loadsegment(fs, 0);
+ rdmsrl(MSR_FS_BASE, tmp);
+ if (tmp != 0)
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ wrmsrl(MSR_FS_BASE, old_base);
+#endif
+}
+
+/*
+ * Vendor-independent identification of @c: vendor, capabilities, address
+ * sizes, initial APIC id, default model name, NULL segment behavior and, on
+ * 32-bit kernels, the ESPFIX bug flag. Returns early if the CPU has no CPUID
+ * even after the legacy probe.
+ */
+static void generic_identify(struct cpuinfo_x86 *c)
+{
+ c->extended_cpuid_level = 0;
+
+ if (!have_cpuid_p())
+ identify_cpu_without_cpuid(c);
+
+ /* cyrix could have cpuid enabled via c_identify() */
+ if (!have_cpuid_p())
+ return;
+
+ cpu_detect(c);
+
+ get_cpu_vendor(c);
+
+ get_cpu_cap(c);
+
+ get_cpu_address_sizes(c);
+
+ if (c->cpuid_level >= 0x00000001) {
+ /* Bits 31:24 of CPUID(1).EBX carry the initial APIC id. */
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_SMP
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+# else
+ c->apicid = c->initial_apicid;
+# endif
+#endif
+ c->phys_proc_id = c->initial_apicid;
+ }
+
+ get_model_name(c); /* Default name */
+
+ detect_null_seg_behavior(c);
+
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_PARAVIRT
+ do {
+ extern void native_iret(void);
+ /* Only flag the bug when the native IRET is actually in use. */
+ if (pv_cpu_ops.iret == native_iret)
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+ } while (0);
+# else
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+# endif
+#endif
+}
+
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+	/*
+	 * get_cpu_cap() has already done the heavy lifting for max_rmid and
+	 * cache_occ_scale. What is left is to lower the boot CPU's max_rmid
+	 * to this CPU's value when that is smaller, in case CQM bits really
+	 * aren't there in this CPU.
+	 */
+	if (c == &boot_cpu_data)
+		return;
+
+	boot_cpu_data.x86_cache_max_rmid = min(boot_cpu_data.x86_cache_max_rmid,
+					       c->x86_cache_max_rmid);
+}
+
+/*
+ * Validate that ACPI/mptables have the same information about the
+ * effective APIC id and update the package map.
+ *
+ * On !SMP kernels this only sets c->logical_proc_id to 0.
+ */
+static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int apicid, cpu = smp_processor_id();
+
+ apicid = apic->cpu_present_to_apicid(cpu);
+
+ /*
+ * NOTE(review): the check compares against c->apicid, but the message
+ * prints c->initial_apicid - confirm which value is meant to be shown.
+ */
+ if (apicid != c->apicid) {
+ pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
+ cpu, apicid, c->initial_apicid);
+ }
+ /* A nonzero return from the package map update is treated as fatal. */
+ BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
+#else
+ c->logical_proc_id = 0;
+#endif
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ *
+ * Resets the identification fields of @c to defaults, runs the generic and
+ * vendor-specific probes, applies forced capability overrides, and for CPUs
+ * other than the boot CPU folds the result into boot_cpu_data.
+ */
+static void identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ /* Reset everything that the probes below are expected to fill in. */
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = 0;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_stepping = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+ c->cu_id = 0xff;
+#ifdef CONFIG_X86_64
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+#else
+ c->cpuid_level = -1; /* CPUID not detected */
+ c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
+#endif
+ c->x86_cache_alignment = c->x86_clflush_size;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ generic_identify(c);
+
+ if (this_cpu->c_identify)
+ this_cpu->c_identify(c);
+
+ /* Clear/Set all flags overridden by options, after probe */
+ apply_forced_caps(c);
+
+#ifdef CONFIG_X86_64
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+#endif
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ /* Disable the PN if appropriate */
+ squash_the_stupid_serial_number(c);
+
+ /* Set up SMEP/SMAP/UMIP */
+ setup_smep(c);
+ setup_smap(c);
+ setup_umip(c);
+
+ /*
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
+ */
+
+ /* Filter out anything that depends on CPUID levels we don't have */
+ filter_cpuid_features(c, true);
+
+ /* If the model name is still unset, do table lookup. */
+ if (!c->x86_model_id[0]) {
+ const char *p;
+ p = table_lookup_model(c);
+ if (p)
+ strcpy(c->x86_model_id, p);
+ else
+ /* Last resort... */
+ sprintf(c->x86_model_id, "%02x/%02x",
+ c->x86, c->x86_model);
+ }
+
+#ifdef CONFIG_X86_64
+ detect_ht(c);
+#endif
+
+ x86_init_rdrand(c);
+ x86_init_cache_qos(c);
+ setup_pku(c);
+
+ /*
+ * Clear/Set all flags overridden by options. This has to be done
+ * before the capabilities are ANDed across all CPUs below.
+ */
+ apply_forced_caps(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+
+ /* OR, i.e. replicate the bug flags */
+ for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++)
+ c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
+ }
+
+ /* Init Machine Check Exception if available. */
+ mcheck_cpu_init(c);
+
+ select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+#endif
+}
+
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
+{
+ struct tss_struct *tss;
+ int cpu;
+
+ /* Nothing to program if the boot CPU does not have SEP. */
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ return;
+
+ /* get_cpu()/put_cpu() bracket the per-CPU TSS access and MSR writes. */
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+ /* SYSENTER stack: the address just past this CPU's entry stack. */
+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
+
+ put_cpu();
+}
+#endif
+
+/*
+ * Run full identification on the boot CPU (boot_cpu_data). 32-bit kernels
+ * additionally set up SYSENTER here; TLB detection follows in both cases.
+ */
+void __init identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+#ifdef CONFIG_X86_32
+ sysenter_setup();
+ enable_sep_cpu();
+#endif
+ cpu_detect_tlb(&boot_cpu_data);
+}
+
+/*
+ * Run full identification on a secondary CPU. @c must not be boot_cpu_data
+ * (enforced with BUG_ON). Afterwards SYSENTER is enabled on 32-bit kernels,
+ * then MTRR setup, APIC/package id validation and speculation control setup
+ * are run for this CPU.
+ */
+void identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+#ifdef CONFIG_X86_32
+ enable_sep_cpu();
+#endif
+ mtrr_ap_init();
+ validate_apic_and_package_id(c);
+ x86_spec_ctrl_setup_ap();
+}
+
+/*
+ * Handler for the "noclflush" boot parameter: clears the CLFLUSH and
+ * CLFLUSHOPT feature bits. @arg is unused; always returns 1.
+ */
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+/*
+ * Continue the current printk line with a description of @c: optional vendor
+ * prefix, model name (or "<family>86" when no name is set), family, model
+ * and, when known, stepping.
+ */
+void print_cpu_info(struct cpuinfo_x86 *c)
+{
+	const char *vendor = NULL;
+
+	if (c->x86_vendor < X86_VENDOR_NUM)
+		vendor = this_cpu->c_vendor;
+	else if (c->cpuid_level >= 0)
+		vendor = c->x86_vendor_id;
+
+	/* Skip the vendor prefix when the model string already contains it. */
+	if (vendor && !strstr(c->x86_model_id, vendor))
+		pr_cont("%s ", vendor);
+
+	if (!c->x86_model_id[0])
+		pr_cont("%d86", c->x86);
+	else
+		pr_cont("%s", c->x86_model_id);
+
+	pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
+
+	/* The stepping is omitted only when it is 0 and there is no CPUID. */
+	if (!c->x86_stepping && c->cpuid_level < 0)
+		pr_cont(")\n");
+	else
+		pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
+}
+
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around, otherwise it would
+ * show up as an environment variable for init.
+ *
+ * @arg is ignored; always returns 1.
+ */
+static __init int setup_clearcpuid(char *arg)
+{
+ return 1;
+}
+__setup("clearcpuid=", setup_clearcpuid);
+
+#ifdef CONFIG_X86_64
+/*
+ * Per-CPU IRQ stack union, page aligned. NOTE(review): DEFINE_PER_CPU_FIRST
+ * presumably places it at the very start of the per-CPU area - confirm
+ * against the macro's definition.
+ */
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+ irq_stack_union) __aligned(PAGE_SIZE) __visible;
+EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union);
+
+/*
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+ &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+/* Starts out pointing just past the end of irq_stack_union.irq_stack. */
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
+
+/* Initial value is -1. */
+DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+/*
+ * Program the SYSCALL/SYSENTER entry MSRs of the calling CPU.
+ *
+ * May not be marked __init: used by software suspend
+ */
+void syscall_init(void)
+{
+ extern char _entry_trampoline[];
+ extern char entry_SYSCALL_64_trampoline[];
+
+ int cpu = smp_processor_id();
+ /*
+ * Address of entry_SYSCALL_64_trampoline inside this CPU's entry area:
+ * the area's entry_trampoline base plus the symbol's offset from
+ * _entry_trampoline.
+ */
+ unsigned long SYSCALL64_entry_trampoline =
+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+ /* With PTI, 64-bit SYSCALL enters through the per-CPU trampoline. */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+ else
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+#ifdef CONFIG_IA32_EMULATION
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+#else
+ /* No 32-bit emulation: compat SYSCALL goes to ignore_sysret, SYSENTER MSRs are cleared. */
+ wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+#endif
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
+}
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+/* Set by cpu_init() to the IST address stored for DEBUG_STACK; read by is_debug_stack(). */
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+/* Nonzero makes is_debug_stack() report true for any address. */
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+/*
+ * Return nonzero if this CPU's debug_stack_usage is set, or if @addr lies in
+ * the DEBUG_STKSZ bytes that end at this CPU's debug_stack_addr (upper bound
+ * inclusive, lower bound exclusive).
+ */
+int is_debug_stack(unsigned long addr)
+{
+	unsigned long top;
+
+	if (__this_cpu_read(debug_stack_usage))
+		return 1;
+
+	top = __this_cpu_read(debug_stack_addr);
+
+	return addr > top - DEBUG_STKSZ && addr <= top;
+}
+NOKPROBE_SYMBOL(is_debug_stack);
+
+/* Per-CPU nesting counter: raised by debug_stack_set_zero(), dropped by debug_stack_reset(). */
+DEFINE_PER_CPU(u32, debug_idt_ctr);
+
+/* Increment this CPU's debug_idt_ctr and reload the current IDT. */
+void debug_stack_set_zero(void)
+{
+ this_cpu_inc(debug_idt_ctr);
+ load_current_idt();
+}
+NOKPROBE_SYMBOL(debug_stack_set_zero);
+
+/*
+ * Undo one debug_stack_set_zero(): decrement this CPU's debug_idt_ctr and
+ * reload the current IDT once it reaches zero. Warns and does nothing if the
+ * counter is already zero (unbalanced call).
+ */
+void debug_stack_reset(void)
+{
+ if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
+ return;
+ if (this_cpu_dec_return(debug_idt_ctr) == 0)
+ load_current_idt();
+}
+NOKPROBE_SYMBOL(debug_stack_reset);
+
+#else /* CONFIG_X86_64 */
+
+/* 32-bit variants of the per-CPU current task pointer and preempt count. */
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ *
+ * Initially THREAD_SIZE bytes above the start of init_thread_union.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
+#ifdef CONFIG_STACKPROTECTOR
+/* Per-CPU stack canary storage, only present with CONFIG_STACKPROTECTOR. */
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#endif
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Zero the six debug registers db0-db3, db6 and db7.
+ */
+static void clear_all_debug_regs(void)
+{
+	int reg;
+
+	for (reg = 0; reg < 8; reg++) {
+		/* db4 and db5 are left alone. */
+		if (reg != 4 && reg != 5)
+			set_debugreg(0, reg);
+	}
+}
+
+#ifdef CONFIG_KGDB
+/*
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
+ *
+ * Only acts when kgdb is connected and the arch ops provide a
+ * correct_hw_break() callback. Without CONFIG_KGDB this expands to nothing.
+ */
+static void dbg_restore_debug_regs(void)
+{
+ if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+ arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
+
+/*
+ * On SMP kernels, mark @cpu in cpu_initialized_mask (warning if it was
+ * already set) and spin until @cpu appears in cpu_callout_mask. No-op on
+ * !SMP kernels.
+ */
+static void wait_for_master_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+ /*
+ * wait for ACK from master CPU before continuing
+ * with AP initialization
+ */
+ WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
+ while (!cpumask_test_cpu(cpu, cpu_callout_mask))
+ cpu_relax();
+#endif
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init for 64 bit
+ */
+#ifdef CONFIG_X86_64
+
+/* 64-bit variant: bring the calling CPU's per-CPU state into a known shape. */
+void cpu_init(void)
+{
+ struct orig_ist *oist;
+ struct task_struct *me;
+ struct tss_struct *t;
+ unsigned long v;
+ int cpu = raw_smp_processor_id();
+ int i;
+
+ wait_for_master_cpu(cpu);
+
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /* CPU 0 is skipped here; only the other CPUs load microcode. */
+ if (cpu)
+ load_ucode_ap();
+
+ t = &per_cpu(cpu_tss_rw, cpu);
+ oist = &per_cpu(orig_ist, cpu);
+
+#ifdef CONFIG_NUMA
+ /* Adopt the early node mapping if numa_node is still 0 and a node is known. */
+ if (this_cpu_read(numa_node) == 0 &&
+ early_cpu_to_node(cpu) != NUMA_NO_NODE)
+ set_numa_node(early_cpu_to_node(cpu));
+#endif
+
+ me = current;
+
+ pr_debug("Initializing CPU#%d\n", cpu);
+
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt(cpu);
+ loadsegment(fs, 0);
+
+ load_current_idt();
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ x86_configure_nx();
+ x2apic_setup();
+
+ /*
+ * set up and load the per-CPU TSS
+ */
+ /* The IST entries are filled in only while ist[0] is still zero. */
+ if (!oist->ist[0]) {
+ char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
+
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ /* Advance first: each entry holds the address just past its stack. */
+ estacks += exception_stack_sizes[v];
+ oist->ist[v] = t->x86_tss.ist[v] =
+ (unsigned long)estacks;
+ if (v == DEBUG_STACK-1)
+ per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
+ }
+ }
+
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+
+ /* Run on init_mm in lazy-TLB mode; having a user mm here is a bug. */
+ mmgrab(&init_mm);
+ me->active_mm = &init_mm;
+ BUG_ON(me->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, me);
+
+ /*
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
+
+ load_mm_ldt(&init_mm);
+
+ clear_all_debug_regs();
+ dbg_restore_debug_regs();
+
+ fpu__init_cpu();
+
+ if (is_uv_system())
+ uv_cpu_init();
+
+ load_fixmap_gdt(cpu);
+}
+
+#else
+
+/* 32-bit variant: bring the calling CPU's per-CPU state into a known shape. */
+void cpu_init(void)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
+
+ wait_for_master_cpu(cpu);
+
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ show_ucode_info_early();
+
+ pr_info("Initializing CPU#%d\n", cpu);
+
+ /* CR4 is only touched if at least one of VME, TSC or DE is available. */
+ if (cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) ||
+ boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ load_current_idt();
+ switch_to_new_gdt(cpu);
+
+ /*
+ * Set up and load the per-CPU TSS and LDT
+ */
+ /* Run on init_mm in lazy-TLB mode; having a user mm here is a bug. */
+ mmgrab(&init_mm);
+ curr->active_mm = &init_mm;
+ BUG_ON(curr->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, curr);
+
+ /*
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
+
+ load_mm_ldt(&init_mm);
+
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+
+#ifdef CONFIG_DOUBLEFAULT
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+#endif
+
+ clear_all_debug_regs();
+ dbg_restore_debug_regs();
+
+ fpu__init_cpu();
+
+ load_fixmap_gdt(cpu);
+}
+#endif
+
+/* syscore resume callback: run the vendor's optional c_bsp_resume() hook on boot_cpu_data. */
+static void bsp_resume(void)
+{
+ if (this_cpu->c_bsp_resume)
+ this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+/* Only a resume callback is provided; registered by init_cpu_syscore(). */
+static struct syscore_ops cpu_syscore_ops = {
+ .resume = bsp_resume,
+};
+
+/* core_initcall: register cpu_syscore_ops. Always returns 0. */
+static int __init init_cpu_syscore(void)
+{
+ register_syscore_ops(&cpu_syscore_ops);
+ return 0;
+}
+core_initcall(init_cpu_syscore);
+
+/*
+ * The microcode loader calls this upon late microcode load to recheck features,
+ * only when microcode has been updated. Caller holds microcode_mutex and CPU
+ * hotplug lock.
+ *
+ * Re-probes the capability words and warns if they no longer match
+ * boot_cpu_data; nothing is changed in boot_cpu_data itself.
+ */
+void microcode_check(void)
+{
+	struct cpuinfo_x86 probe;
+
+	perf_check_microcode();
+
+	/* The maximum CPUID function may differ after the update; re-read it. */
+	probe.cpuid_level = cpuid_eax(0);
+
+	/*
+	 * Seed every capability word from the boot CPU so that the synthetic
+	 * ones compare equal below; get_cpu_cap() then overwrites the words
+	 * that come straight from CPUID.
+	 */
+	memcpy(&probe.x86_capability, &boot_cpu_data.x86_capability,
+	       sizeof(probe.x86_capability));
+
+	get_cpu_cap(&probe);
+
+	if (memcmp(&probe.x86_capability, &boot_cpu_data.x86_capability,
+		   sizeof(probe.x86_capability)) != 0) {
+		pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
+		pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+	}
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
new file mode 100644
index 0000000..7b229af
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_CPU_H
+#define ARCH_X86_CPU_H
+
+/*
+ * attempt to consolidate cpu attributes
+ *
+ * One descriptor per CPU vendor. The hooks are optional: the callers visible
+ * in common.c (c_early_init, c_bsp_init, c_init, c_identify, c_bsp_resume)
+ * test each pointer for NULL before calling it.
+ */
+struct cpu_dev {
+ /* Vendor name as printed by early_cpu_init() and print_cpu_info(). */
+ const char *c_vendor;
+
+ /* some have two possibilities for cpuid string */
+ const char *c_ident[2];
+
+ void (*c_early_init)(struct cpuinfo_x86 *);
+ void (*c_bsp_init)(struct cpuinfo_x86 *);
+ void (*c_init)(struct cpuinfo_x86 *);
+ void (*c_identify)(struct cpuinfo_x86 *);
+ void (*c_detect_tlb)(struct cpuinfo_x86 *);
+ void (*c_bsp_resume)(struct cpuinfo_x86 *);
+ int c_x86_vendor;
+#ifdef CONFIG_X86_32
+ /* Optional vendor specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
+
+ /*
+ * Family/stepping-based lookup table for model names.
+ * NOTE(review): the array is called model_names - confirm whether it is
+ * indexed by model rather than by stepping.
+ */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
+};
+
+/*
+ * One row of a TLB description table.
+ * NOTE(review): presumably looked up by the descriptor byte - confirm against
+ * the users of this struct, none of which are visible here.
+ */
+struct _tlb_table {
+ unsigned char descriptor;
+ char tlb_type;
+ unsigned int entries;
+ /* unsigned int ways; */
+ char info[128];
+};
+
+/*
+ * Emit a const pointer to @cpu_devX, named __cpu_dev_<cpu_devX>, into the
+ * ".x86_cpu_dev.init" section. The pointer is marked __used, as nothing
+ * refers to it by name.
+ */
+#define cpu_dev_register(cpu_devX) \
+ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
+ __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ &cpu_devX;
+
+/*
+ * Bounds of the cpu_dev pointer array that early_cpu_init() walks.
+ * NOTE(review): presumably these delimit the ".x86_cpu_dev.init" section
+ * filled by cpu_dev_register() - confirm against the linker script.
+ */
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+ *const __x86_cpu_dev_end[];
+
+/* CPU identification helpers shared between the files in this directory. */
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
+extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
+extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern u32 get_scattered_cpuid_leaf(unsigned int level,
+ unsigned int sub_leaf,
+ enum cpuid_regs_idx reg);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+
+/* Topology detection helpers. */
+extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
+extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
+extern int detect_extended_topology(struct cpuinfo_x86 *c);
+extern int detect_ht_early(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
+
+unsigned int aperfmperf_get_khz(int cpu);
+
+/* Called by identify_secondary_cpu() on each secondary CPU. */
+extern void x86_spec_ctrl_setup_ap(void);
+
+#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 0000000..2c0bd38
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,121 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+/* One dependency edge: clearing @depends also clears @feature. */
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * Each entry reads { feature, feature it depends on }: when the second one is
+ * cleared, do_clear_cpu_cap() clears the first one as well. The table ends
+ * with an all-zero entry, which the walker detects through ->feature == 0.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Every { feature, depends } pair is listed once; a repeated pair would never
+ * take effect because the walker skips features it has already disabled.
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+static const struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ {}
+};
+
+/*
+ * Clear @feature in @c's capability words. With @c == NULL the feature is
+ * cleared on boot_cpu_data and additionally recorded in cpu_caps_cleared.
+ */
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	/*
+	 * The non-atomic __*_bit() variants would do here, but the rest of
+	 * the cpufeature code uses atomics as well, so keep it for
+	 * consistency. Cleanup all of it separately.
+	 */
+	if (c) {
+		clear_bit(feature, (unsigned long *)c->x86_capability);
+		return;
+	}
+
+	clear_cpu_cap(&boot_cpu_data, feature);
+	set_bit(feature, (unsigned long *)cpu_caps_cleared);
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+/*
+ * Clear @feature and, transitively, every feature that cpuid_deps[] lists as
+ * depending on an already cleared one. @c selects the target as described for
+ * clear_feature(). Out-of-range feature numbers trigger a warning and are
+ * ignored.
+ */
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+	const struct cpuid_dep *dep;
+	bool progress = true;
+
+	if (WARN_ON(feature >= MAX_FEATURE_BITS))
+		return;
+
+	clear_feature(c, feature);
+
+	/* "disable" collects every feature cleared so far in this call. */
+	memset(disable, 0, sizeof(disable));
+	__set_bit(feature, disable);
+
+	/* Sweep the table until a full pass clears nothing new. */
+	while (progress) {
+		progress = false;
+		for (dep = cpuid_deps; dep->feature; dep++) {
+			if (test_bit(dep->depends, disable) &&
+			    !__test_and_set_bit(dep->feature, disable)) {
+				progress = true;
+				clear_feature(c, dep->feature);
+			}
+		}
+	}
+}
+
+/* Clear @feature, and everything cpuid_deps[] makes depend on it, in @c. */
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+/*
+ * Clear @feature, and everything cpuid_deps[] makes depend on it, on the boot
+ * CPU and record each cleared bit in cpu_caps_cleared (the NULL-CPU path of
+ * clear_feature()).
+ */
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
new file mode 100644
index 0000000..8949b7a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -0,0 +1,466 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <asm/dma.h>
+#include <linux/io.h>
+#include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
+#include <linux/timer.h>
+#include <asm/pci-direct.h>
+#include <asm/tsc.h>
+#include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include "cpu.h"
+/*
+ * Read NSC/Cyrix DEVID registers (DIR0/DIR1) for detailed CPU identification.
+ * Without DEVID only a pseudo *dir0 (0xfd/0xfe) is stored; *dir1 is untouched.
+ */
+static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+{
+ unsigned char ccr2, ccr3;
+
+ /* we test for DEVID by checking whether CCR3 is writable */
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, ccr3 ^ 0x80); /* try to flip bit 7 */
+ getCx86(0xc0); /* dummy to change bus */
+
+ if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
+ ccr2 = getCx86(CX86_CCR2);
+ setCx86(CX86_CCR2, ccr2 ^ 0x04); /* try to flip bit 2 */
+ getCx86(0xc0); /* dummy */
+
+ if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
+ *dir0 = 0xfd;
+ else { /* Cx486S A step */
+ setCx86(CX86_CCR2, ccr2); /* restore CCR2 */
+ *dir0 = 0xfe;
+ }
+ } else {
+ setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
+
+ /* read DIR0 and DIR1 CPU registers */
+ *dir0 = getCx86(CX86_DIR0);
+ *dir1 = getCx86(CX86_DIR1);
+ }
+}
+/* Run __do_cyrix_devid() with local interrupts disabled. */
+static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __do_cyrix_devid(dir0, dir1);
+ local_irq_restore(flags);
+}
+/*
+ * Cx86_dir0_msb caches the high nibble of DIR0 (the CPU "family"). It is
+ * written by init_cyrix() and read by check_cx686_slop() in this file.
+ *
+ * It was once described as a HACK needed by check_cx686_cpuid/slop in
+ * bugs.h; the variable is static to this file, so that note is stale.
+ */
+static unsigned char Cx86_dir0_msb = 0;
+/* Name fragments that init_cyrix() combines into c->x86_model_id. */
+static const char Cx86_model[][9] = {
+ "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
+ "M II ", "Unknown"
+};
+static const char Cx486_name[][5] = {
+ "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
+ "SRx2", "DRx2"
+};
+static const char Cx486S_name[][4] = {
+ "S", "S2", "Se", "S2e"
+};
+static const char Cx486D_name[][4] = {
+ "DX", "DX2", "?", "?", "?", "DX4"
+};
+static char Cx86_cb[] = "?.5x Core/Bus Clock"; /* init_cyrix() patches chars 0-2 */
+static const char cyrix_model_mult1[] = "12??43"; /* indexed by dir0_lsn & 5 */
+static const char cyrix_model_mult2[] = "12233445"; /* indexed by dir0_lsn & 7 */
+
+/*
+ * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
+ * BIOSes for compatibility with DOS games. This makes the udelay loop
+ * work correctly, and improves performance. If the bit was set, the delay
+ * loop is recalibrated and c->loops_per_jiffy refreshed.
+ *
+ * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
+ */
+static void check_cx686_slop(struct cpuinfo_x86 *c)
+{
+ unsigned long flags;
+
+ if (Cx86_dir0_msb == 3) { /* value stored by init_cyrix() for 6x86/6x86L */
+ unsigned char ccr3, ccr5;
+
+ local_irq_save(flags);
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+ ccr5 = getCx86(CX86_CCR5);
+ if (ccr5 & 2) /* SLOP is currently set */
+ setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+ local_irq_restore(flags);
+
+ if (ccr5 & 2) { /* possible wrong calibration done */
+ pr_info("Recalibrating delay loop with SLOP bit reset\n");
+ calibrate_delay();
+ c->loops_per_jiffy = loops_per_jiffy;
+ }
+ }
+}
+
+/* Clear bit 7 of PCR0 (load/store serialize) and set bits 7:5 of CCR3. */
+static void set_cx86_reorder(void)
+{
+ u8 ccr3;
+
+ pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+
+ /* Load/Store Serialize to mem access disable (=reorder it) */
+ setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
+ /* set load/store serialize from 1GB to 4GB */
+ ccr3 |= 0xe0;
+ setCx86(CX86_CCR3, ccr3); /* writes the original CCR3 value with bits 7:5 set */
+}
+/* Set CR0.NW while CCR2 bit 2 is cleared, then set CCR2 bits 2 and 4. */
+static void set_cx86_memwb(void)
+{
+ pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+
+ /* CCR2 bit 2: unlock NW bit */
+ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
+ /* set 'Not Write-through' */
+ write_cr0(read_cr0() | X86_CR0_NW);
+ /* CCR2 bit 2: lock NW bit and set WT1 */
+ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
+}
+
+/*
+ * Configure later MediaGX and/or Geode processor: CCR2/CCR4 tweaks, then
+ * write-back caching and memory reordering, all with local IRQs disabled.
+ */
+static void geode_configure(void)
+{
+ unsigned long flags;
+ u8 ccr3;
+ local_irq_save(flags);
+
+ /* Suspend on halt power saving and enable #SUSP pin */
+ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
+
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+
+
+ /* FPU fast, DTE cache, Mem bypass */
+ setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+
+ set_cx86_memwb();
+ set_cx86_reorder();
+
+ local_irq_restore(flags);
+}
+/* Early init: advertise ARR-based MTRR emulation on the 6x86 families. */
+static void early_init_cyrix(struct cpuinfo_x86 *c)
+{
+	unsigned char dir0, dir1 = 0;
+
+	__do_cyrix_devid(&dir0, &dir1);
+
+	/* The high nibble of DIR0 identifies the CPU "family". */
+	switch (dir0 >> 4) {
+	case 3: /* 6x86/6x86L */
+	case 5: /* 6x86MX/M II */
+		/* Emulate MTRRs using Cyrix's ARRs. */
+		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+		break;
+	default:
+		/* Other families get no early capability tweaks. */
+		break;
+	}
+}
+/* Fix up capability bits, then decode DIR0/DIR1 into model, stepping and model name. */
+static void init_cyrix(struct cpuinfo_x86 *c)
+{
+ unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
+ char *buf = c->x86_model_id;
+ const char *p = NULL;
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
+ if (test_cpu_cap(c, 1*32+24)) {
+ clear_cpu_cap(c, 1*32+24);
+ set_cpu_cap(c, X86_FEATURE_CXMMX);
+ }
+
+ do_cyrix_devid(&dir0, &dir1);
+
+ check_cx686_slop(c); /* NOTE(review): reads Cx86_dir0_msb, which is only assigned below */
+
+ Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
+ dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
+
+ /* common case step number/rev -- exceptions handled below */
+ c->x86_model = (dir1 >> 4) + 1;
+ c->x86_stepping = dir1 & 0xf;
+
+ /* Now cook; the original recipe is by Channing Corn, from Cyrix.
+ * We do the same thing for each generation: we work out
+ * the model, multiplier and stepping. Black magic included,
+ * to make the silicon step/rev numbers match the printed ones.
+ */
+
+ switch (dir0_msn) {
+ unsigned char tmp;
+
+ case 0: /* Cx486SLC/DLC/SRx/DRx */
+ p = Cx486_name[dir0_lsn & 7];
+ break;
+
+ case 1: /* Cx486S/DX/DX2/DX4 */
+ p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
+ : Cx486S_name[dir0_lsn & 3];
+ break;
+
+ case 2: /* 5x86 */
+ Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
+ p = Cx86_cb+2;
+ break;
+
+ case 3: /* 6x86/6x86L */
+ Cx86_cb[1] = ' ';
+ Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
+ if (dir1 > 0x21) { /* 686L */
+ Cx86_cb[0] = 'L';
+ p = Cx86_cb;
+ (c->x86_model)++;
+ } else /* 686 */
+ p = Cx86_cb+1;
+ /* Emulate MTRRs using Cyrix's ARRs. */
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+ /* 6x86's contain this bug */
+ set_cpu_bug(c, X86_BUG_COMA);
+ break;
+
+ case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
+#ifdef CONFIG_PCI
+ {
+ u32 vendor, device;
+ /*
+ * It isn't really a PCI quirk directly, but the cure is the
+ * same. The MediaGX has deep magic SMM stuff that handles the
+ * SB emulation. It throws away the fifo on disable_dma() which
+ * is wrong and ruins the audio.
+ *
+ * Bug2: VSA1 has a wrap bug so that using maximum sized DMA
+ * causes bad things. According to NatSemi VSA2 has another
+ * bug to do with 'hlt'. I've not seen any boards using VSA2
+ * and X doesn't seem to support it either so who cares 8).
+ * VSA1 we work around however.
+ */
+
+ pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
+ isa_dma_bridge_buggy = 2;
+
+ /* We do this before the PCI layer is running. However we
+ are safe here as we know the bridge must be a Cyrix
+ companion and must be present */
+ vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
+ device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
+
+ /*
+ * The 5510/5520 companion chips have a funky PIT.
+ */
+ if (vendor == PCI_VENDOR_ID_CYRIX &&
+ (device == PCI_DEVICE_ID_CYRIX_5510 ||
+ device == PCI_DEVICE_ID_CYRIX_5520))
+ mark_tsc_unstable("cyrix 5510/5520 detected");
+ }
+#endif
+ c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+
+ /* GXm supports extended cpuid levels 'ala' AMD */
+ if (c->cpuid_level == 2) {
+ /* Enable cxMMX extensions (GX1 Datasheet 54) */
+ setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
+
+ /*
+ * GXm : 0x30 ... 0x5f GXm datasheet 51
+ * GXlv: 0x6x GXlv datasheet 54
+ * ? : 0x7x
+ * GX1 : 0x8x GX1 datasheet 56
+ */
+ if ((0x30 <= dir1 && dir1 <= 0x6f) ||
+ (0x80 <= dir1 && dir1 <= 0x8f))
+ geode_configure();
+ return; /* returns before the model name is composed at the end */
+ } else { /* MediaGX */
+ Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
+ p = Cx86_cb+2;
+ c->x86_model = (dir1 & 0x20) ? 1 : 2;
+ }
+ break;
+
+ case 5: /* 6x86MX/M II */
+ if (dir1 > 7) {
+ dir0_msn++; /* M II */
+ /* Enable MMX extensions (App note 108) */
+ setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+ } else {
+ /* A 6x86MX - it has the bug. */
+ set_cpu_bug(c, X86_BUG_COMA);
+ }
+ tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
+ Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
+ p = Cx86_cb+tmp;
+ if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
+ (c->x86_model)++;
+ /* Emulate MTRRs using Cyrix's ARRs. */
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+ break;
+
+ case 0xf: /* Cyrix 486 without DEVID registers */
+ switch (dir0_lsn) {
+ case 0xd: /* either a 486SLC or DLC w/o DEVID */
+ dir0_msn = 0;
+ p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)];
+ break;
+
+ case 0xe: /* a 486S A step */
+ dir0_msn = 0;
+ p = Cx486S_name[0];
+ break;
+ }
+ break;
+
+ default: /* unknown (shouldn't happen, we know everyone ;-) */
+ dir0_msn = 7;
+ break;
+ }
+ strcpy(buf, Cx86_model[dir0_msn & 7]); /* family name first ... */
+ if (p)
+ strcat(buf, p); /* ... then the model/multiplier suffix, if any */
+ return;
+}
+
+/*
+ * Handle National Semiconductor branded processors
+ */
+static void init_nsc(struct cpuinfo_x86 *c)
+{
+	/*
+	 * There may be GX1 processors in the wild that are branded
+	 * NSC and not Cyrix.
+	 *
+	 * Only the GX is handled here; everything else is kicked to the
+	 * Cyrix init function above, which should cover any processors
+	 * that might have been branded differently after NSC acquired
+	 * Cyrix.
+	 *
+	 * If this breaks your GX1 horribly, please e-mail
+	 * info-linux@ldcmail.amd.com to tell us.
+	 */
+	if (c->x86 != 5 || c->x86_model != 5) {
+		init_cyrix(c);
+		return;
+	}
+
+	/* The GX (formerly known as the GX2): family 5, model 5 */
+	cpu_detect_cache_sizes(c);
+}
+
+/*
+ * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
+ * by the fact that they preserve the flags across the division of 5/2.
+ * PII and PPro exhibit this behavior too, but they have cpuid available.
+ */
+
+/*
+ * Perform the Cyrix 5/2 test. Returns 1 if AH reads back as 0x02 after the
+ * divide (a Cyrix won't change the flags), 0 otherwise (other 486 chips).
+ */
+static inline int test_cyrix_52div(void)
+{
+ unsigned int test;
+
+ __asm__ __volatile__(
+ "sahf\n\t" /* clear flags (%eax = 0x0005) */
+ "div %b2\n\t" /* divide 5 by 2 */
+ "lahf" /* store flags into %ah */
+ : "=a" (test)
+ : "0" (5), "q" (2)
+ : "cc");
+
+ /* AH is 0x02 on Cyrix after the divide.. */
+ return (unsigned char) (test >> 8) == 0x02;
+}
+/* Spot a family 4 Cyrix with CPUID disabled; on DIR0 families 3 and 5 turn CPUID on. */
+static void cyrix_identify(struct cpuinfo_x86 *c)
+{
+ /* Detect Cyrix with disabled CPUID */
+ if (c->x86 == 4 && test_cyrix_52div()) {
+ unsigned char dir0, dir1;
+
+ strcpy(c->x86_vendor_id, "CyrixInstead");
+ c->x86_vendor = X86_VENDOR_CYRIX;
+
+ /* Actually enable cpuid on the older cyrix */
+
+ /* Retrieve CPU revisions */
+
+ do_cyrix_devid(&dir0, &dir1);
+
+ dir0 >>= 4; /* keep only the high nibble */
+
+ /* Check it is an affected model */
+
+ if (dir0 == 5 || dir0 == 3) {
+ unsigned char ccr3;
+ unsigned long flags;
+ pr_info("Enabling CPUID on Cyrix processor.\n");
+ local_irq_save(flags);
+ ccr3 = getCx86(CX86_CCR3);
+ /* enable MAPEN */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
+ /* enable cpuid */
+ setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
+ /* disable MAPEN */
+ setCx86(CX86_CCR3, ccr3);
+ local_irq_restore(flags);
+ }
+ }
+}
+/* Cyrix vendor descriptor; c_ident is the vendor ID string cyrix_identify() also stores. */
+static const struct cpu_dev cyrix_cpu_dev = {
+ .c_vendor = "Cyrix",
+ .c_ident = { "CyrixInstead" },
+ .c_early_init = early_init_cyrix,
+ .c_init = init_cyrix,
+ .c_identify = cyrix_identify,
+ .c_x86_vendor = X86_VENDOR_CYRIX,
+};
+
+cpu_dev_register(cyrix_cpu_dev);
+/* NSC ("Geode by NSC") vendor descriptor; only an init hook is provided. */
+static const struct cpu_dev nsc_cpu_dev = {
+ .c_vendor = "NSC",
+ .c_ident = { "Geode by NSC" },
+ .c_init = init_nsc,
+ .c_x86_vendor = X86_VENDOR_NSC,
+};
+
+cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 0000000..479ca47
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,102 @@
+/*
+ * Common hypervisor code
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+/* Hypervisor descriptors defined outside this file. */
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
+extern const struct hypervisor_x86 x86_hyper_jailhouse;
+/* Candidates probed by detect_hypervisor_vendor(); several are config-dependent. */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
+{
+#ifdef CONFIG_XEN_PV
+ &x86_hyper_xen_pv,
+#endif
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
+ &x86_hyper_vmware,
+ &x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+ &x86_hyper_kvm,
+#endif
+#ifdef CONFIG_JAILHOUSE_GUEST
+ &x86_hyper_jailhouse,
+#endif
+};
+/* Type of the detected hypervisor; assigned in init_hypervisor_platform(). */
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
+/* Run every detect() hook; return the entry with the largest non-zero result. */
+static inline const struct hypervisor_x86 * __init
+detect_hypervisor_vendor(void)
+{
+	const struct hypervisor_x86 *best = NULL;
+	uint32_t best_pri = 0;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(hypervisors); i++) {
+		uint32_t pri = hypervisors[i]->detect();
+
+		if (pri > best_pri) {
+			best_pri = pri;
+			best = hypervisors[i];
+		}
+	}
+	if (best)
+		pr_info("Hypervisor detected: %s\n", best->name);
+	return best;
+}
+/* Copy each non-NULL pointer-sized slot of @src over the same slot of @target. */
+static void __init copy_array(const void *src, void *target, unsigned int size)
+{
+	const void * const *in = src;
+	const void **out = target;
+	unsigned int slots = size / sizeof(void *);
+
+	for (; slots; slots--, in++, out++)
+		if (*in)
+			*out = *in;
+}
+/* Detect the hypervisor, install its non-NULL init/runtime hooks, call init_platform(). */
+void __init init_hypervisor_platform(void)
+{
+ const struct hypervisor_x86 *h;
+
+ h = detect_hypervisor_vendor();
+
+ if (!h)
+ return;
+
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
new file mode 100644
index 0000000..fc3c07f
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel.c
@@ -0,0 +1,1031 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/thread_info.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/pgtable.h>
+#include <asm/msr.h>
+#include <asm/bugs.h>
+#include <asm/cpu.h>
+#include <asm/intel-family.h>
+#include <asm/microcode_intel.h>
+#include <asm/hwcap2.h>
+#include <asm/elf.h>
+
+#ifdef CONFIG_X86_64
+#include <linux/topology.h>
+#endif
+
+#include "cpu.h"
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#endif
+
+/*
+ * Just in case our CPU detection goes bad, or you have a weird system,
+ * allow a way to override the automatic disabling of MPX.
+ */
+static int forcempx;
+/* Handler for the "intel-skd-046-workaround=disable" boot option: sets forcempx. */
+static int __init forcempx_setup(char *__unused)
+{
+ forcempx = 1;
+
+ return 1;
+}
+__setup("intel-skd-046-workaround=disable", forcempx_setup);
+/* Disable MPX when @c reports MPX without SMEP, unless forcempx is set. */
+void check_mpx_erratum(struct cpuinfo_x86 *c)
+{
+	/*
+	 * Turn off the MPX feature on CPUs where SMEP is not
+	 * available or disabled.
+	 *
+	 * Works around Intel Erratum SKD046: "Branch Instructions
+	 * May Initialize MPX Bound Registers Incorrectly".
+	 *
+	 * This might falsely disable MPX on systems without
+	 * SMEP, like Atom processors without SMEP. But there
+	 * is no such hardware known at the moment.
+	 */
+	if (forcempx || !cpu_has(c, X86_FEATURE_MPX) ||
+	    cpu_has(c, X86_FEATURE_SMEP))
+		return;
+
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
+}
+/* Set by the "ring3mwait=disable" boot option; checked in probe_xeon_phi_r3mwait(). */
+static bool ring3mwait_disabled __read_mostly;
+
+static int __init ring3mwait_disable(char *__unused)
+{
+	ring3mwait_disabled = true;
+	return 1;	/* option handled; 0 would report it as unknown */
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
+/* Enable ring 3 MONITOR/MWAIT on Xeon Phi KNL/KNM unless disabled at boot. */
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
+{
+	/*
+	 * Ring 3 MONITOR/MWAIT feature cannot be detected without
+	 * cpu model and family comparison.
+	 */
+	if (c->x86 != 6)
+		return;
+	if (c->x86_model != INTEL_FAM6_XEON_PHI_KNL &&
+	    c->x86_model != INTEL_FAM6_XEON_PHI_KNM)
+		return;
+
+	/* "ring3mwait=disable" on the command line vetoes the feature. */
+	if (ring3mwait_disabled)
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+
+	/* Record the enable bit in the MISC_FEATURES shadow value. */
+	this_cpu_or(msr_misc_features_shadow,
+		    1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
+
+	/* Advertise through ELF_HWCAP2 from the boot CPU only. */
+	if (c == &boot_cpu_data)
+		ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
+/*
+ * Early microcode releases for the Spectre v2 mitigation were broken.
+ * Information taken from:
+ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf
+ * - https://kb.vmware.com/s/article/52345
+ * - Microcode revisions observed in the wild
+ * - Release note from 20180108 microcode release
+ */
+struct sku_microcode {
+ u8 model; /* compared with c->x86_model */
+ u8 stepping; /* compared with c->x86_stepping */
+ u32 microcode; /* revisions up to and including this one count as bad */
+};
+static const struct sku_microcode spectre_bad_microcodes[] = {
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 },
+ { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
+ { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
+ { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
+ { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
+ { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
+ { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
+ { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
+ { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
+ { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
+ { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
+ /* Observed in the wild */
+ { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
+};
+/* Return true if @c matches a blacklist entry and runs that revision or older. */
+static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
+{
+	const struct sku_microcode *sku = spectre_bad_microcodes;
+	const struct sku_microcode *end = sku + ARRAY_SIZE(spectre_bad_microcodes);
+
+	/*
+	 * We know that the hypervisor lie to us on the microcode version so
+	 * we may as well hope that it is running the correct version.
+	 */
+	if (cpu_has(c, X86_FEATURE_HYPERVISOR) || c->x86 != 6)
+		return false;
+
+	for (; sku < end; sku++) {
+		if (c->x86_model != sku->model ||
+		    c->x86_stepping != sku->stepping)
+			continue;
+		/* The first matching model/stepping entry decides. */
+		return c->microcode <= sku->microcode;
+	}
+	return false;
+}
+/* Early Intel setup: CPUID unmasking, microcode revision, erratum fixups, early topology. */
+static void early_init_intel(struct cpuinfo_x86 *c)
+{
+ u64 misc_enable;
+
+ /* Unmask CPUID levels if masked: */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
+ c->cpuid_level = cpuid_eax(0);
+ get_cpu_cap(c);
+ }
+ }
+
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
+
+ /* Now if any of them are set, check the blacklist and clear the lot */
+ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
+ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
+ setup_clear_cpu_cap(X86_FEATURE_IBRS);
+ setup_clear_cpu_cap(X86_FEATURE_IBPB);
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SSBD);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
+ }
+
+ /*
+ * Atom erratum AAE44/AAF40/AAG38/AAH41:
+ *
+ * A race condition between speculative fetches and invalidating
+ * a large page. This is worked around in microcode, but we
+ * need the microcode to have already been loaded... so if it is
+ * not, recommend a BIOS update and disable large pages.
+ */
+ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
+ c->microcode < 0x20e) {
+ pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
+ clear_cpu_cap(c, X86_FEATURE_PSE);
+ }
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#else
+ /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
+ if (c->x86 == 15 && c->x86_cache_alignment == 64)
+ c->x86_cache_alignment = 128;
+#endif
+
+ /* CPUID workaround for 0F33/0F34 CPU */
+ if (c->x86 == 0xF && c->x86_model == 0x3
+ && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
+ c->x86_phys_bits = 36;
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states.
+ *
+ * It is also reliable across cores and sockets. (but not across
+ * cabinets - we turn it off in that case explicitly.)
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Penwell, Cloverview and Merrifield have a TSC which doesn't sleep on S3 */
+ if (c->x86 == 6) {
+ switch (c->x86_model) {
+ case 0x27: /* Penwell */
+ case 0x35: /* Cloverview */
+ case 0x4a: /* Merrifield */
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * There is a known erratum on Pentium III and Core Solo
+ * and Core Duo CPUs.
+ * " Page with PAT set to WC while associated MTRR is UC
+ * may consolidate to UC "
+ * Because of this erratum, it is better to stick with
+ * setting WC in MTRR rather than using PAT on these CPUs.
+ *
+ * Enable PAT WC only on P4, Core 2 or later CPUs.
+ */
+ if (c->x86 == 6 && c->x86_model < 15)
+ clear_cpu_cap(c, X86_FEATURE_PAT);
+
+ /*
+ * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+ * clear the fast string and enhanced fast string CPU capabilities.
+ */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ pr_info("Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
+ }
+ }
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
+ * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
+ * to be modified.
+ */
+ if (c->x86 == 5 && c->x86_model == 9) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
+
+ if (c->cpuid_level >= 0x00000001) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+ /*
+ * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+ * apicids which are reserved per package. Store the resulting
+ * shift value for the package management code.
+ */
+ if (edx & (1U << 28))
+ c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+ }
+
+ check_mpx_erratum(c);
+
+ /*
+ * Get the number of SMT siblings early from the extended topology
+ * leaf, if available. Otherwise try the legacy SMT detection.
+ */
+ if (detect_extended_topology_early(c) < 0)
+ detect_ht_early(c);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Early probe support logic for ppro memory erratum #50
+ *
+ * Returns 1 on an Intel family 6, model 1 CPU with stepping below 8, else 0.
+ * This is called before we do cpu ident work
+ */
+int ppro_with_ram_bug(void)
+{
+	/* Uses data from early_cpu_detect now */
+	const struct cpuinfo_x86 *cpu = &boot_cpu_data;
+
+	if (cpu->x86_vendor != X86_VENDOR_INTEL || cpu->x86 != 6 ||
+	    cpu->x86_model != 1 || cpu->x86_stepping >= 8)
+		return 0;
+
+	pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+	return 1;
+}
+/* Warn once if a CPU with a non-zero cpu_index is a B-stepping Pentium. */
+static void intel_smp_check(struct cpuinfo_x86 *c)
+{
+	/* calling is from identify_secondary_cpu() ? */
+	if (!c->cpu_index)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * B step Pentia have bugs; note the space joining the halves
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
+			     "with B stepping processors.\n");
+	}
+}
+/* Set by the "forcepae" boot option; makes intel_workarounds() force the PAE capability. */
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
+{
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+/* CONFIG_X86_32 only: erratum workarounds and capability fixups for older Intel CPUs. */
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ /*
+ * All models of Pentium and Pentium with MMX technology CPUs
+ * have the F0 0F bug, which lets nonprivileged users lock up the
+ * system. Announce that the fault handler will be checking for it.
+ * The Quark is also family 5, but does not have the same bug.
+ */
+ clear_cpu_bug(c, X86_BUG_F00F);
+ if (c->x86 == 5 && c->x86_model < 9) {
+ static int f00f_workaround_enabled; /* makes the notice print only once */
+
+ set_cpu_bug(c, X86_BUG_F00F);
+ if (!f00f_workaround_enabled) {
+ pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
+ f00f_workaround_enabled = 1;
+ }
+ }
+#endif
+
+ /*
+ * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
+ * model 3 mask 3
+ */
+ if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
+ clear_cpu_cap(c, X86_FEATURE_SEP);
+
+ /*
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ pr_warn("PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
+ * P4 Xeon erratum 037 workaround.
+ * Hardware prefetcher may cause stale data to be loaded into the cache.
+ */
+ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
+ }
+ }
+
+ /*
+ * See if we have a good local APIC by checking for buggy Pentia,
+ * i.e. all B steppings and the C2 stepping of P54C when using their
+ * integrated APIC (see 11AP erratum in "Pentium Processor
+ * Specification Update").
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
+ set_cpu_bug(c, X86_BUG_11AP);
+
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+ /*
+ * Set up the preferred alignment for movsl bulk memory moves
+ */
+ switch (c->x86) {
+ case 4: /* 486: untested */
+ break;
+ case 5: /* Old Pentia: untested */
+ break;
+ case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ movsl_mask.mask = 7;
+ break;
+ case 15: /* P4 is OK down to 8-byte alignment */
+ movsl_mask.mask = 7;
+ break;
+ }
+#endif
+
+ intel_smp_check(c);
+}
+#else
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{ /* empty stub used when CONFIG_X86_32 is not set */
+}
+#endif
+/* With CONFIG_NUMA, set the current CPU's node, falling back to cpu_to_node(). */
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ unsigned node;
+ int cpu = smp_processor_id();
+
+ /* Don't do the funky fallback heuristics the AMD version employs
+ for now. */
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE || !node_online(node)) {
+ /* reuse the value from init_cpu_to_node() */
+ node = cpu_to_node(cpu);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+/* Recompute the VMX-related capability bits of @c from the VMX capability MSRs. */
+static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+ /* Intel VMX MSR indicated features */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
+
+ u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+ u32 msr_vpid_cap, msr_ept_cap;
+
+ clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); /* start from a clean slate */
+ clear_cpu_cap(c, X86_FEATURE_VNMI);
+ clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ clear_cpu_cap(c, X86_FEATURE_EPT);
+ clear_cpu_cap(c, X86_FEATURE_VPID);
+ clear_cpu_cap(c, X86_FEATURE_EPT_AD);
+
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+ msr_ctl = vmx_msr_high | vmx_msr_low; /* a bit set in either MSR half counts */
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ vmx_msr_low, vmx_msr_high);
+ msr_ctl2 = vmx_msr_high | vmx_msr_low;
+ if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+ (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
+ msr_ept_cap, msr_vpid_cap);
+ if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ }
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+ set_cpu_cap(c, X86_FEATURE_VPID);
+ }
+}
+
+#define MSR_IA32_TME_ACTIVATE		0x982
+
+/* Helpers to access TME_ACTIVATE MSR; arguments are parenthesized for safe expansion */
+#define TME_ACTIVATE_LOCKED(x)		((x) & 0x1)
+#define TME_ACTIVATE_ENABLED(x)		((x) & 0x2)
+
+#define TME_ACTIVATE_POLICY(x)		(((x) >> 4) & 0xf)		/* Bits 7:4 */
+#define TME_ACTIVATE_POLICY_AES_XTS_128	0
+
+#define TME_ACTIVATE_KEYID_BITS(x)	(((x) >> 32) & 0xf)	/* Bits 35:32 */
+
+#define TME_ACTIVATE_CRYPTO_ALGS(x)	(((x) >> 48) & 0xffff)	/* Bits 63:48 */
+#define TME_ACTIVATE_CRYPTO_AES_XTS_128	1
+
+/* Values for mktme_status (SW only construct) */
+#define MKTME_ENABLED			0
+#define MKTME_DISABLED			1
+#define MKTME_UNINITIALIZED		2
+static int mktme_status = MKTME_UNINITIALIZED;
+/* Read TME_ACTIVATE, update mktme_status and take the KeyID bits off c->x86_phys_bits. */
+static void detect_tme(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate, tme_policy, tme_crypto_algs;
+ int keyid_bits = 0, nr_keyids = 0;
+ static u64 tme_activate_cpu0 = 0; /* MSR value seen on the first call */
+
+ rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (mktme_status != MKTME_UNINITIALIZED) {
+ if (tme_activate != tme_activate_cpu0) {
+ /* Broken BIOS? */
+ pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
+ pr_err_once("x86/tme: MKTME is not usable\n");
+ mktme_status = MKTME_DISABLED;
+
+ /* Proceed. We may need to exclude bits from x86_phys_bits. */
+ }
+ } else {
+ tme_activate_cpu0 = tme_activate;
+ }
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ mktme_status = MKTME_DISABLED;
+ return; /* x86_phys_bits is left unchanged on this path */
+ }
+
+ if (mktme_status != MKTME_UNINITIALIZED) /* policy/algorithm already checked */
+ goto detect_keyid_bits;
+
+ pr_info("x86/tme: enabled by BIOS\n");
+
+ tme_policy = TME_ACTIVATE_POLICY(tme_activate);
+ if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
+ pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+
+ tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
+ if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
+ pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
+ tme_crypto_algs);
+ mktme_status = MKTME_DISABLED;
+ }
+detect_keyid_bits:
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ nr_keyids = (1UL << keyid_bits) - 1;
+ if (nr_keyids) {
+ pr_info_once("x86/mktme: enabled by BIOS\n");
+ pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
+ } else {
+ pr_info_once("x86/mktme: disabled by BIOS\n");
+ }
+
+ if (mktme_status == MKTME_UNINITIALIZED) {
+ /* MKTME is usable */
+ mktme_status = MKTME_ENABLED;
+ }
+
+ /*
+ * KeyID bits effectively lower the number of physical address
+ * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+}
+
+/*
+ * Demote a boot-time ENERGY_PERF_BIAS hint of 'performance' to 'normal'.
+ *
+ * Nothing happens unless @c advertises X86_FEATURE_EPB and the low nibble
+ * of MSR_IA32_ENERGY_PERF_BIAS currently reads ENERGY_PERF_BIAS_PERFORMANCE;
+ * any other value is left untouched.
+ * (x86_energy_perf_policy(8) is available to change it at run-time.)
+ */
+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+	u64 bias;
+
+	if (cpu_has(c, X86_FEATURE_EPB)) {
+		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, bias);
+
+		if ((bias & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
+			pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+			pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+
+			/* Replace only the low nibble, keep the upper bits */
+			bias &= ~0xF;
+			bias |= ENERGY_PERF_BIAS_NORMAL;
+			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, bias);
+		}
+	}
+}
+/*
+ * Resume hook, installed as .c_bsp_resume in intel_cpu_dev: re-runs the
+ * ENERGY_PERF_BIAS initialization for @c.
+ */
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+ /*
+ * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+ * so reinitialize it properly like during bootup:
+ */
+ init_intel_energy_perf(c);
+}
+
+/*
+ * Set X86_FEATURE_CPUID_FAULT on @c when MSR_PLATFORM_INFO can be read and
+ * has MSR_PLATFORM_INFO_CPUID_FAULT set. A failing MSR read is ignored.
+ */
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
+{
+	u64 platform_info;
+
+	if (rdmsrl_safe(MSR_PLATFORM_INFO, &platform_info))
+		return;
+
+	if (platform_info & MSR_PLATFORM_INFO_CPUID_FAULT)
+		set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+}
+
+/*
+ * Rebuild MSR_MISC_FEATURES_ENABLES from the per-CPU shadow value.
+ *
+ * Does nothing when the MSR cannot be read. Otherwise the shadow is zeroed,
+ * the individual feature probes run (they check features and update
+ * capabilities and shadow control bits), and the resulting shadow value is
+ * written to the MSR.
+ */
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
+{
+	u64 val;
+
+	if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &val) != 0)
+		return;
+
+	/* Start from an all-clear shadow */
+	this_cpu_write(msr_misc_features_shadow, 0);
+
+	init_cpuid_fault(c);
+	probe_xeon_phi_r3mwait(c);
+
+	/* Push whatever the probes left in the shadow out to the hardware */
+	val = this_cpu_read(msr_misc_features_shadow);
+	wrmsrl(MSR_MISC_FEATURES_ENABLES, val);
+}
+/*
+ * init_intel() - per-CPU setup for Intel processors (the .c_init hook of
+ * intel_cpu_dev).
+ *
+ * Runs early_init_intel() and intel_workarounds() first, then topology and
+ * cache detection, then sets feature and bug flags on @c from CPUID leaf 10,
+ * MSR_IA32_MISC_ENABLE and family/model checks. It ends with VMX and TME
+ * detection (only when the respective feature flag is set) and the
+ * ENERGY_PERF_BIAS and MISC_FEATURES MSR setup.
+ */
+static void init_intel(struct cpuinfo_x86 *c)
+{
+ early_init_intel(c);
+
+ intel_workarounds(c);
+
+ /*
+ * Detect the extended topology information if available. This
+ * will reinitialise the initial_apicid which will be used
+ * in init_intel_cacheinfo()
+ */
+ detect_extended_topology(c);
+
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) { /* CPUID leaf 10 is available */
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (cpu_has(c, X86_FEATURE_XMM2))
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
+ unsigned int l1, l2;
+
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11))) /* bit 11 clear: report BTS */
+ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12))) /* bit 12 clear: report PEBS */
+ set_cpu_cap(c, X86_FEATURE_PEBS);
+ }
+
+ if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
+
+ if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
+ ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ set_cpu_bug(c, X86_BUG_MONITOR);
+
+#ifdef CONFIG_X86_64
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else /* !CONFIG_X86_64 */
+ /*
+ * Names for the Pentium II/Celeron processors
+ * detectable only by also checking the cache size.
+ * Dixon is NOT a Celeron.
+ */
+ if (c->x86 == 6) {
+ unsigned int l2 = c->x86_cache_size;
+ char *p = NULL; /* stays NULL when no special-case name applies */
+
+ switch (c->x86_model) {
+ case 5:
+ if (l2 == 0)
+ p = "Celeron (Covington)";
+ else if (l2 == 256)
+ p = "Mobile Pentium II (Dixon)";
+ break;
+
+ case 6:
+ if (l2 == 128)
+ p = "Celeron (Mendocino)";
+ else if (c->x86_stepping == 0 || c->x86_stepping == 5)
+ p = "Celeron-A";
+ break;
+
+ case 8:
+ if (l2 == 128)
+ p = "Celeron (Coppermine)";
+ break;
+ }
+
+ if (p)
+ strcpy(c->x86_model_id, p);
+ }
+
+ if (c->x86 == 15)
+ set_cpu_cap(c, X86_FEATURE_P4);
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_P3);
+#endif /* CONFIG_X86_64 */
+
+ /* Work around errata */
+ srat_detect_node(c);
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ detect_vmx_virtcap(c);
+
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme(c);
+
+ init_intel_energy_perf(c);
+
+ init_intel_misc_features(c);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Legacy cache-size fixup (the .legacy_cache_size hook of intel_cpu_dev).
+ *
+ * Returns @size unchanged except for the two special cases below.
+ */
+static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+	const bool tualatin = (c->x86 == 6) && (c->x86_model == 11);
+	const bool quark = (c->x86 == 5) && (c->x86_model == 9);
+
+	/*
+	 * Intel Quark SoC X1000 contains a 4-way set associative
+	 * 16K cache with a 16 byte cache line and 256 lines per tag
+	 */
+	if (quark)
+		return 16;
+
+	/*
+	 * Intel PIII Tualatin comes in a 256kb and a 512kb flavour which
+	 * cannot be told apart here: a boot-time override supplies the
+	 * size for the 512kb model, and 256 is assumed when none was given.
+	 */
+	if (tualatin && size == 0)
+		return 256;
+
+	return size;
+}
+#endif
+/*
+ * Descriptor classes stored in the tlb_type column of intel_tlb_table[].
+ * intel_tlb_lookup() switches on them to decide which of the
+ * tlb_lli_* / tlb_lld_* entry counts a descriptor may raise.
+ */
+#define TLB_INST_4K 0x01
+#define TLB_INST_4M 0x02
+#define TLB_INST_2M_4M 0x03
+
+#define TLB_INST_ALL 0x05
+#define TLB_INST_1G 0x06
+
+#define TLB_DATA_4K 0x11
+#define TLB_DATA_4M 0x12
+#define TLB_DATA_2M_4M 0x13
+#define TLB_DATA_4K_4M 0x14
+
+#define TLB_DATA_1G 0x16
+
+#define TLB_DATA0_4K 0x21
+#define TLB_DATA0_4M 0x22
+#define TLB_DATA0_2M_4M 0x23
+
+#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
+
+/*
+ * CPUID leaf 2 descriptor bytes that describe a TLB.
+ *
+ * Columns: descriptor byte, descriptor class (one of the TLB_* / STLB_*
+ * values), entry count, human-readable description. The all-zero entry
+ * terminates the table; intel_tlb_lookup() relies on it to stop scanning.
+ */
+static const struct _tlb_table intel_tlb_table[] = {
+	{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
+	{ 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
+	{ 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
+	{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
+	{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
+	{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
+	{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
+	{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+	{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+	{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+	{ 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+	{ 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
+	{ 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
+	{ 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
+	{ 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
+	{ 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
+	{ 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+	{ 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
+	{ 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
+	{ 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
+	{ 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+	{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
+	{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
+	{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
+	{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
+	{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+	{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+	{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+	{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
+	{ 0x00, 0, 0 }	/* terminator */
+};
+
+static void intel_tlb_lookup(const unsigned char desc)
+{
+ unsigned char k;
+ if (desc == 0)
+ return;
+
+ /* look up this descriptor in the table */
+ for (k = 0; intel_tlb_table[k].descriptor != desc && \
+ intel_tlb_table[k].descriptor != 0; k++)
+ ;
+
+ if (intel_tlb_table[k].tlb_type == 0)
+ return;
+
+ switch (intel_tlb_table[k].tlb_type) {
+ case STLB_4K:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case STLB_4K_2M:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_ALL:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_4K:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_4M:
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_2M_4M:
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4K:
+ case TLB_DATA0_4K:
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4M:
+ case TLB_DATA0_4M:
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_2M_4M:
+ case TLB_DATA0_2M_4M:
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4K_4M:
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_1G:
+ if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ }
+}
+
+/*
+ * intel_detect_tlb() - collect TLB sizes from CPUID leaf 2 (the
+ * .c_detect_tlb hook of intel_cpu_dev).
+ *
+ * Does nothing when @c does not support leaf 2. Otherwise the leaf is read
+ * as many times as the low byte of EAX requests, and every descriptor byte
+ * of the four output registers is passed to intel_tlb_lookup().
+ */
+static void intel_detect_tlb(struct cpuinfo_x86 *c)
+{
+	int i, j, n;
+	unsigned int regs[4];
+	/* The 16 register bytes are walked as individual descriptors */
+	unsigned char *desc = (unsigned char *)regs;
+
+	if (c->cpuid_level < 2)
+		return;
+
+	/* Number of times to iterate */
+	n = cpuid_eax(2) & 0xFF;
+
+	for (i = 0; i < n; i++) {
+		cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
+
+		/*
+		 * If bit 31 is set, this is an unknown format. The flag
+		 * exists in each of EAX, EBX, ECX and EDX, so all four
+		 * registers are checked.
+		 */
+		for (j = 0; j < 4; j++)
+			if (regs[j] & (1U << 31))
+				regs[j] = 0;
+
+		/* Byte 0 is level count, not a descriptor */
+		for (j = 1; j < 16; j++)
+			intel_tlb_lookup(desc[j]);
+	}
+}
+/*
+ * Vendor descriptor for Intel CPUs: vendor/ident strings, the 32-bit-only
+ * legacy model-name tables and cache-size fixup, and the detection/init
+ * hooks defined in this file. Registered with cpu_dev_register() below.
+ */
+static const struct cpu_dev intel_cpu_dev = {
+ .c_vendor = "Intel",
+ .c_ident = { "GenuineIntel" },
+#ifdef CONFIG_X86_32
+ .legacy_models = { /* model names indexed by model number, one table per family */
+ { .family = 4, .model_names =
+ {
+ [0] = "486 DX-25/33",
+ [1] = "486 DX-50",
+ [2] = "486 SX",
+ [3] = "486 DX/2",
+ [4] = "486 SL",
+ [5] = "486 SX/2",
+ [7] = "486 DX/2-WB",
+ [8] = "486 DX/4",
+ [9] = "486 DX/4-WB"
+ }
+ },
+ { .family = 5, .model_names =
+ {
+ [0] = "Pentium 60/66 A-step",
+ [1] = "Pentium 60/66",
+ [2] = "Pentium 75 - 200",
+ [3] = "OverDrive PODP5V83",
+ [4] = "Pentium MMX",
+ [7] = "Mobile Pentium 75 - 200",
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
+ }
+ },
+ { .family = 6, .model_names =
+ {
+ [0] = "Pentium Pro A-step",
+ [1] = "Pentium Pro",
+ [3] = "Pentium II (Klamath)",
+ [4] = "Pentium II (Deschutes)",
+ [5] = "Pentium II (Deschutes)",
+ [6] = "Mobile Pentium II",
+ [7] = "Pentium III (Katmai)",
+ [8] = "Pentium III (Coppermine)",
+ [10] = "Pentium III (Cascades)",
+ [11] = "Pentium III (Tualatin)",
+ }
+ },
+ { .family = 15, .model_names =
+ {
+ [0] = "Pentium 4 (Unknown)",
+ [1] = "Pentium 4 (Willamette)",
+ [2] = "Pentium 4 (Northwood)",
+ [4] = "Pentium 4 (Foster)",
+ [5] = "Pentium 4 (Foster)",
+ }
+ },
+ },
+ .legacy_cache_size = intel_size_cache,
+#endif /* CONFIG_X86_32 */
+ .c_detect_tlb = intel_detect_tlb,
+ .c_early_init = early_init_intel,
+ .c_init = init_intel,
+ .c_bsp_resume = intel_bsp_resume,
+ .c_x86_vendor = X86_VENDOR_INTEL,
+};
+
+cpu_dev_register(intel_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
new file mode 100644
index 0000000..0771a90
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pconfig.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel PCONFIG instruction support.
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/intel_pconfig.h>
+
+#define PCONFIG_CPUID 0x1b /* CPUID leaf scanned by intel_pconfig_init() */
+
+#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) /* low 12 bits of EAX hold the subleaf type */
+
+/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
+enum {
+ PCONFIG_CPUID_SUBLEAF_INVALID = 0,
+ PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
+};
+
+/*
+ * Bitmask of supported targets: bit N is set when target ID N was reported
+ * in EBX, ECX or EDX of a TARGETID subleaf. Filled in by
+ * intel_pconfig_init() and queried through pconfig_target_supported().
+ */
+static u64 targets_supported __read_mostly;
+
+/*
+ * pconfig_target_supported() - was @target enumerated by CPUID.PCONFIG?
+ *
+ * Returns 1 when the bit for @target is set in targets_supported and 0
+ * otherwise. A @target of 64 or more cannot be represented in the bitmask;
+ * it triggers a one-time warning and also yields 0.
+ */
+int pconfig_target_supported(enum pconfig_target target)
+{
+	/*
+	 * We would need to re-think the implementation once we get > 64
+	 * PCONFIG targets. Spec allows up to 2^32 targets.
+	 */
+	BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
+
+	if (WARN_ON_ONCE(target >= 64))
+		return 0;
+
+	/*
+	 * Normalize to 0/1. The masked value is a u64; returning it directly
+	 * through the int return type would drop bits 32..63 and report
+	 * those targets as unsupported.
+	 */
+	return !!(targets_supported & (1ULL << target));
+}
+
+/*
+ * intel_pconfig_init() - enumerate the PCONFIG targets of the boot CPU.
+ *
+ * Does nothing unless X86_FEATURE_PCONFIG is set. Otherwise the subleafs of
+ * CPUID leaf PCONFIG_CPUID are walked and every target ID below 64 reported
+ * by a TARGETID subleaf is recorded in targets_supported.
+ *
+ * Always returns 0. Runs as an arch_initcall.
+ */
+static int __init intel_pconfig_init(void)
+{
+	int subleaf;
+
+	if (!boot_cpu_has(X86_FEATURE_PCONFIG))
+		return 0;
+
+	/*
+	 * Scan subleafs of PCONFIG CPUID leaf.
+	 *
+	 * Subleafs of the same type need not to be consecutive.
+	 *
+	 * Stop on the first invalid subleaf type. All subleafs after the first
+	 * invalid are invalid too.
+	 */
+	for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
+		struct cpuid_regs regs;
+
+		cpuid_count(PCONFIG_CPUID, subleaf,
+			    &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
+
+		switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
+		case PCONFIG_CPUID_SUBLEAF_INVALID:
+			/* Stop on the first invalid subleaf */
+			goto out;
+		case PCONFIG_CPUID_SUBLEAF_TARGETID:
+			/*
+			 * Mark supported PCONFIG targets. IDs that do not
+			 * fit the 64-bit mask are skipped.
+			 */
+			if (regs.ebx < 64)
+				targets_supported |= (1ULL << regs.ebx);
+			if (regs.ecx < 64)
+				targets_supported |= (1ULL << regs.ecx);
+			if (regs.edx < 64)
+				targets_supported |= (1ULL << regs.edx);
+			break;
+		default:
+			/* Unknown CPUID.PCONFIG subleaf: ignore */
+			break;
+		}
+	}
+out:
+	return 0;
+}
+arch_initcall(intel_pconfig_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 0000000..abb71ac
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,904 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
+
+#define MBA_IS_LINEAR 0x4 /* tested against ECX of CPUID leaf 0x10, subleaf 3 */
+#define MBA_MAX_MBPS U32_MAX /* default stored in every mbps_val slot */
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+/*
+ * Used to store the max resource name width and max resource data width
+ * to display the schemata in a tabular format
+ */
+int max_name_width, max_data_width;
+
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
+static void /* forward declaration: referenced by rdt_resources_all[] */
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+static void /* forward declaration: referenced by rdt_resources_all[] */
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) /* static init of a resource's domain list head */
+/*
+ * Static description of every RDT resource, indexed by RDT_RESOURCE_*.
+ *
+ * Each entry names its base MSR, the routine that writes control values to
+ * it (.msr_update), the cache level it is tied to, and the schemata
+ * parse/format helpers. For the cache resources, cbm_idx_mult and
+ * cbm_idx_offset feed cbm_idx(): the plain L2/L3 entries map a CLOSID
+ * straight to its MSR index, while the DATA/CODE entries use a multiplier
+ * of 2 with offsets 0 and 1 respectively.
+ */
+struct rdt_resource rdt_resources_all[] = {
+ [RDT_RESOURCE_L3] =
+ {
+ .rid = RDT_RESOURCE_L3,
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L3DATA] =
+ {
+ .rid = RDT_RESOURCE_L3DATA,
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2, /* with offset 0: even MSR indices */
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L3CODE] =
+ {
+ .rid = RDT_RESOURCE_L3CODE,
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2, /* with offset 1: odd MSR indices */
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2] =
+ {
+ .rid = RDT_RESOURCE_L2,
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2DATA] =
+ {
+ .rid = RDT_RESOURCE_L2DATA,
+ .name = "L2DATA",
+ .domains = domain_init(RDT_RESOURCE_L2DATA),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2, /* with offset 0: even MSR indices */
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2CODE] =
+ {
+ .rid = RDT_RESOURCE_L2CODE,
+ .name = "L2CODE",
+ .domains = domain_init(RDT_RESOURCE_L2CODE),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2, /* with offset 1: odd MSR indices */
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_MBA] =
+ {
+ .rid = RDT_RESOURCE_MBA,
+ .name = "MB",
+ .domains = domain_init(RDT_RESOURCE_MBA),
+ .msr_base = IA32_MBA_THRTL_BASE,
+ .msr_update = mba_wrmsr,
+ .cache_level = 3, /* get_cache_id() is called with this level to find the domain id */
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*u",
+ .fflags = RFTYPE_RES_MB,
+ },
+};
+
+/*
+ * Translate @closid into the index of its bitmask MSR relative to
+ * r->msr_base: the CLOSID is scaled by cbm_idx_mult and then shifted by
+ * cbm_idx_offset.
+ */
+static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
+{
+	unsigned int idx = closid * r->cache.cbm_idx_mult;
+
+	idx += r->cache.cbm_idx_offset;
+	return idx;
+}
+
+/*
+ * cache_alloc_hsw_probe() - probe for L3 cache allocation on Intel Haswell
+ * server CPUs, which lack CPUID enumeration for it.
+ *
+ * Vendor/Family/Model alone does not guarantee that the MSRs won't #GP
+ * fault, because only the following SKUs support CAT:
+ *	Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ *	Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ *	Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ *	Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ *	Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ *	Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * The probe writes a full 20-bit mask to the first L3 cache mask register
+ * and checks that the bits stick. On HSW server parts the number of CLOSids
+ * is always 4, the cbm length always 20 and the minimum bitmask length
+ * always 2 bits, so all of them are hardcoded once the probe succeeds.
+ */
+static inline void cache_alloc_hsw_probe(void)
+{
+	struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+	u32 full_mask = BIT_MASK(20) - 1;
+	u32 lo, hi;
+
+	/* A faulting write means the MSR does not exist on this SKU */
+	if (wrmsr_safe(IA32_L3_CBM_BASE, full_mask, 0))
+		return;
+
+	rdmsr(IA32_L3_CBM_BASE, lo, hi);
+
+	/* Only when every bit stuck is the feature considered present */
+	if (lo == full_mask) {
+		l3->num_closid = 4;
+		l3->default_ctrl = full_mask;
+		l3->cache.cbm_len = 20;
+		l3->cache.shareable_bits = 0xc0000;
+		l3->cache.min_cbm_bits = 2;
+		l3->alloc_capable = true;
+		l3->alloc_enabled = true;
+
+		rdt_alloc_capable = true;
+	}
+}
+
+/*
+ * Report the membw.mba_sc flag of @r. A NULL @r selects the MBA entry of
+ * rdt_resources_all[].
+ */
+bool is_mba_sc(struct rdt_resource *r)
+{
+	struct rdt_resource *res = r ? r : &rdt_resources_all[RDT_RESOURCE_MBA];
+
+	return res->membw.mba_sc;
+}
+
+/*
+ * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
+ * exposed to user interface and the h/w understandable delay values.
+ *
+ * The non-linear delay values have the granularity of power of two
+ * and also the h/w does not guarantee a curve for configured delay
+ * values vs. actual b/w enforced.
+ * Hence we need a mapping that is pre calibrated so the user can
+ * express the memory b/w as a percentage value.
+ *
+ * No such table is implemented: the function logs the family/model of the
+ * boot CPU and always returns false. @r is not used.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+	/*
+	 * There are no Intel SKUs as of now to support non-linear delay.
+	 */
+	pr_info("MBA b/w map not implemented for cpu:%d, model:%d\n",
+		boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+	return false;
+}
+
+/*
+ * Fill in @r from CPUID leaf 0x10, subleaf 3 (memory bandwidth allocation).
+ *
+ * Returns true and marks @r allocation capable/enabled when the delay
+ * values are linear. In the non-linear case the outcome is whatever
+ * rdt_get_mb_table() reports; on false, @r is left without the
+ * capable/enabled flags.
+ */
+static bool rdt_get_mem_config(struct rdt_resource *r)
+{
+	union cpuid_0x10_3_eax leaf_eax;
+	union cpuid_0x10_x_edx leaf_edx;
+	u32 leaf_ebx, leaf_ecx;
+
+	cpuid_count(0x00000010, 3, &leaf_eax.full, &leaf_ebx, &leaf_ecx,
+		    &leaf_edx.full);
+
+	r->num_closid = leaf_edx.split.cos_max + 1;
+	r->membw.max_delay = leaf_eax.split.max_delay + 1;
+	r->default_ctrl = MAX_MBA_BW;
+
+	if (!(leaf_ecx & MBA_IS_LINEAR)) {
+		if (!rdt_get_mb_table(r))
+			return false;
+	} else {
+		r->membw.delay_linear = true;
+		r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay;
+		r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay;
+	}
+
+	r->data_width = 3;
+	r->alloc_capable = true;
+	r->alloc_enabled = true;
+
+	return true;
+}
+
+/*
+ * Fill in cache resource @r from CPUID leaf 0x10, subleaf @idx: number of
+ * CLOSIDs, bitmask length, the all-ones default bitmask, the shareable
+ * bits (EBX limited to the default bitmask) and the number of hex digits
+ * needed to print a bitmask. @r is marked allocation capable and enabled.
+ */
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
+{
+	union cpuid_0x10_1_eax leaf_eax;
+	union cpuid_0x10_x_edx leaf_edx;
+	u32 leaf_ebx, leaf_ecx;
+
+	cpuid_count(0x00000010, idx, &leaf_eax.full, &leaf_ebx, &leaf_ecx,
+		    &leaf_edx.full);
+
+	r->cache.cbm_len = leaf_eax.split.cbm_len + 1;
+	r->data_width = (r->cache.cbm_len + 3) / 4;
+
+	r->default_ctrl = BIT_MASK(leaf_eax.split.cbm_len + 1) - 1;
+	r->cache.shareable_bits = leaf_ebx & r->default_ctrl;
+
+	r->num_closid = leaf_edx.split.cos_max + 1;
+
+	r->alloc_capable = true;
+	r->alloc_enabled = true;
+}
+
+/*
+ * Derive the CDP resource rdt_resources_all[@type] from its plain cache
+ * resource rdt_resources_all[@level]: half the CLOSIDs, same bitmask
+ * geometry. The result is allocation capable but starts out disabled.
+ */
+static void rdt_get_cdp_config(int level, int type)
+{
+	struct rdt_resource *parent = &rdt_resources_all[level];
+	struct rdt_resource *cdp = &rdt_resources_all[type];
+
+	cdp->num_closid = parent->num_closid / 2;
+	cdp->default_ctrl = parent->default_ctrl;
+	cdp->cache.shareable_bits = parent->cache.shareable_bits;
+	cdp->cache.cbm_len = parent->cache.cbm_len;
+	cdp->data_width = (cdp->cache.cbm_len + 3) / 4;
+
+	cdp->alloc_capable = true;
+	/*
+	 * By default, CDP is disabled. CDP can be enabled by mount parameter
+	 * "cdp" during resctrl file system mount time.
+	 */
+	cdp->alloc_enabled = false;
+}
+/*
+ * Set up both L3 CDP resources (L3DATA and L3CODE) from the plain L3
+ * resource via rdt_get_cdp_config().
+ */
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE);
+}
+/*
+ * Set up both L2 CDP resources (L2DATA and L2CODE) from the plain L2
+ * resource via rdt_get_cdp_config().
+ */
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA);
+ rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
+}
+
+/*
+ * Return the id of the first cache leaf of @cpu whose level equals @level,
+ * or -1 when @cpu has no leaf at that level.
+ */
+static int get_cache_id(int cpu, int level)
+{
+	struct cpu_cacheinfo *cci = get_cpu_cacheinfo(cpu);
+	int leaf = 0;
+
+	while (leaf < cci->num_leaves) {
+		if (cci->info_list[leaf].level == level)
+			return cci->info_list[leaf].id;
+		leaf++;
+	}
+
+	return -1;
+}
+
+/*
+ * Convert the memory bandwidth percentage @bw into the delay value that is
+ * written to the QOS MSRs.
+ *
+ * Only the linear mapping exists (no SKU currently uses non-linear delay
+ * values). A non-linear resource triggers a one-time warning and yields
+ * r->default_ctrl.
+ */
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+{
+	if (!r->membw.delay_linear) {
+		pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+		return r->default_ctrl;
+	}
+
+	return MAX_MBA_BW - bw;
+}
+
+/*
+ * Write the delay values of domain @d for the CLOSIDs in [m->low, m->high)
+ * to the MBA MSRs starting at r->msr_base. Each stored control value is
+ * converted with delay_bw_map() first.
+ */
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+	unsigned int closid = m->low;
+
+	while (closid < m->high) {
+		wrmsrl(r->msr_base + closid, delay_bw_map(d->ctrl_val[closid], r));
+		closid++;
+	}
+}
+
+/*
+ * Write the cache bitmasks of domain @d for the CLOSIDs in
+ * [m->low, m->high) to their MSRs. The MSR of a CLOSID sits at r->msr_base
+ * plus the index computed by cbm_idx().
+ */
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+	unsigned int closid;
+
+	for (closid = m->low; closid < m->high; closid++) {
+		unsigned int msr_idx = cbm_idx(r, closid);
+
+		wrmsrl(r->msr_base + msr_idx, d->ctrl_val[closid]);
+	}
+}
+
+/*
+ * Return the domain of resource @r whose cpu_mask contains @cpu, or NULL
+ * when no domain of @r contains it.
+ */
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+	struct rdt_domain *found = NULL;
+	struct rdt_domain *dom;
+
+	list_for_each_entry(dom, &r->domains, list) {
+		if (cpumask_test_cpu(cpu, &dom->cpu_mask)) {
+			found = dom;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Apply the control values described by @arg (a struct msr_param) on the
+ * executing CPU: the domain of m->res containing this CPU is looked up and
+ * handed to the resource's msr_update routine. If this CPU belongs to no
+ * domain of the resource, a one-time warning is logged instead.
+ */
+void rdt_ctrl_update(void *arg)
+{
+	struct msr_param *m = arg;
+	struct rdt_resource *res = m->res;
+	int this_cpu = smp_processor_id();
+	struct rdt_domain *dom = get_domain_from_cpu(this_cpu, res);
+
+	if (!dom) {
+		pr_warn_once("cpu %d not found in any domain for resource %s\n",
+			     this_cpu, res->name);
+		return;
+	}
+
+	res->msr_update(dom, m, res);
+}
+
+/*
+ * rdt_find_domain - look up the domain with a given id in a resource
+ *
+ * The domain list of @r is sorted by id in ascending order. Return values:
+ *   - ERR_PTR(@id) when @id is negative;
+ *   - the matching domain when one with id == @id exists;
+ *   - NULL otherwise. In this case, if @pos is not NULL, *@pos is set to
+ *     the list node where the search stopped: the first domain with a
+ *     bigger id, or the list head when there is none.
+ */
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+				   struct list_head **pos)
+{
+	struct list_head *cur;
+
+	if (id < 0)
+		return ERR_PTR(id);
+
+	list_for_each(cur, &r->domains) {
+		struct rdt_domain *dom = list_entry(cur, struct rdt_domain, list);
+
+		/* Exact hit */
+		if (dom->id == id)
+			return dom;
+		/* Sorted list: everything from here on has a bigger id */
+		if (dom->id > id)
+			break;
+	}
+
+	if (pos)
+		*pos = cur;
+
+	return NULL;
+}
+
+/*
+ * Fill the first r->num_closid slots of @dc and @dm with the "no control"
+ * defaults:
+ *   - @dc gets r->default_ctrl (cache allocation: all bits set in the cbm;
+ *     memory allocation: b/w requested set to 100%),
+ *   - @dm gets MBA_MAX_MBPS (bandwidth in MBps set to U32_MAX).
+ */
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+{
+	int closid;
+
+	for (closid = 0; closid < r->num_closid; closid++) {
+		dc[closid] = r->default_ctrl;
+		dm[closid] = MBA_MAX_MBPS;
+	}
+}
+
+/*
+ * Allocate the ctrl_val and mbps_val arrays of domain @d (one slot per
+ * CLOSID of @r), fill them with defaults and write the defaults to the
+ * MSRs through r->msr_update().
+ *
+ * Returns 0 on success or -ENOMEM, in which case nothing stays allocated.
+ */
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+{
+	struct msr_param m;
+	u32 *ctrl, *mbps;
+
+	ctrl = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
+	if (!ctrl)
+		return -ENOMEM;
+
+	mbps = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
+	if (!mbps)
+		goto free_ctrl;
+
+	d->ctrl_val = ctrl;
+	d->mbps_val = mbps;
+	setup_default_ctrlval(r, ctrl, mbps);
+
+	/* Program every CLOSID of the new domain */
+	m.low = 0;
+	m.high = r->num_closid;
+	r->msr_update(d, &m, r);
+
+	return 0;
+
+free_ctrl:
+	kfree(ctrl);
+	return -ENOMEM;
+}
+
+/*
+ * Allocate the monitoring state of domain @d for whichever of LLC
+ * occupancy, MBM total and MBM local monitoring is enabled, and set up the
+ * matching delayed work items. When any MBM monitoring is enabled, the
+ * overflow handler is also started.
+ *
+ * Returns 0 on success or -ENOMEM. On failure the arrays allocated so far
+ * are freed again.
+ */
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+	if (is_llc_occupancy_enabled()) {
+		d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
+					   sizeof(unsigned long),
+					   GFP_KERNEL);
+		if (!d->rmid_busy_llc)
+			return -ENOMEM;
+		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+	}
+
+	if (is_mbm_total_enabled()) {
+		d->mbm_total = kcalloc(r->num_rmid, sizeof(*d->mbm_total),
+				       GFP_KERNEL);
+		if (!d->mbm_total)
+			goto err_free;
+	}
+
+	if (is_mbm_local_enabled()) {
+		d->mbm_local = kcalloc(r->num_rmid, sizeof(*d->mbm_local),
+				       GFP_KERNEL);
+		if (!d->mbm_local)
+			goto err_free;
+	}
+
+	if (is_mbm_enabled()) {
+		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+	}
+
+	return 0;
+
+err_free:
+	/* kfree() ignores pointers that are still NULL */
+	kfree(d->rmid_busy_llc);
+	kfree(d->mbm_total);
+	return -ENOMEM;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ *
+ * Failures (no cache id for the cpu, out of memory) leave the domain list
+ * unchanged; nothing is reported to the caller.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+	int id = get_cache_id(cpu, r->cache_level);
+	struct list_head *add_pos = NULL;
+	struct rdt_domain *d;
+
+	d = rdt_find_domain(r, id, &add_pos);
+	if (IS_ERR(d)) {
+		pr_warn("Couldn't find cache id for cpu %d\n", cpu);
+		return;
+	}
+
+	if (d) {
+		cpumask_set_cpu(cpu, &d->cpu_mask);
+		return;
+	}
+
+	/* Zeroed allocation: pointers not set up below stay NULL */
+	d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+	if (!d)
+		return;
+
+	d->id = id;
+	cpumask_set_cpu(cpu, &d->cpu_mask);
+
+	if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
+		kfree(d);
+		return;
+	}
+
+	if (r->mon_capable && domain_setup_mon_state(r, d)) {
+		/*
+		 * Release what domain_setup_ctrlval() allocated as well.
+		 * Both pointers are NULL when @r is not alloc capable.
+		 */
+		kfree(d->ctrl_val);
+		kfree(d->mbps_val);
+		kfree(d);
+		return;
+	}
+
+	/* add_pos was set by rdt_find_domain(): insert in front of it */
+	list_add_tail(&d->list, add_pos);
+
+	/*
+	 * If resctrl is mounted, add
+	 * per domain monitor data directories.
+	 */
+	if (static_branch_unlikely(&rdt_mon_enable_key))
+		mkdir_mondata_subdir_allrdtgrp(r, d);
+}
+
+/*
+ * domain_remove_cpu - Remove a cpu from its domain of resource @r.
+ *
+ * The cpu is cleared from the domain's cpu_mask. When that leaves the
+ * domain empty, the domain is unlinked, its pending work is cancelled and
+ * all of its memory is freed. Otherwise, for the L3 resource only, the MBM
+ * overflow and CQM limbo handlers are re-armed through their setup helpers
+ * if the departing cpu was the one running them.
+ */
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+	int id = get_cache_id(cpu, r->cache_level);
+	struct rdt_domain *d;
+
+	d = rdt_find_domain(r, id, NULL);
+	if (IS_ERR_OR_NULL(d)) {
+		pr_warn("Couldn't find cache id for cpu %d\n", cpu);
+		return;
+	}
+
+	cpumask_clear_cpu(cpu, &d->cpu_mask);
+	if (cpumask_empty(&d->cpu_mask)) {
+		/*
+		 * If resctrl is mounted, remove all the
+		 * per domain monitor data directories.
+		 */
+		if (static_branch_unlikely(&rdt_mon_enable_key))
+			rmdir_mondata_subdir_allrdtgrp(r, d->id);
+		list_del(&d->list);
+		if (is_mbm_enabled())
+			cancel_delayed_work(&d->mbm_over);
+		if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+			/*
+			 * When a package is going down, forcefully
+			 * decrement rmid->ebusy. There is no way to know
+			 * that the L3 was flushed and hence may lead to
+			 * incorrect counts in rare scenarios, but leaving
+			 * the RMID as busy creates RMID leaks if the
+			 * package never comes back.
+			 */
+			__check_limbo(d, true);
+			cancel_delayed_work(&d->cqm_limbo);
+		}
+
+		kfree(d->ctrl_val);
+		kfree(d->mbps_val);
+		kfree(d->rmid_busy_llc);
+		kfree(d->mbm_total);
+		kfree(d->mbm_local);
+		kfree(d);
+		return;
+	}
+
+	/* Domain still has CPUs: re-arm the handlers this cpu was running */
+	if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+			cancel_delayed_work(&d->mbm_over);
+			mbm_setup_overflow_handler(d, 0);
+		}
+		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+		    has_busy_rmid(r, d)) {
+			cancel_delayed_work(&d->cqm_limbo);
+			cqm_setup_limbo_handler(d, 0);
+		}
+	}
+}
+
+/*
+ * Zero the cached CLOSID/RMID values in the executing cpu's pqr_state and
+ * write zero to IA32_PQR_ASSOC. The @cpu argument is not used: the state
+ * is looked up with this_cpu_ptr().
+ */
+static void clear_closid_rmid(int cpu)
+{
+	struct intel_pqr_state *pqr = this_cpu_ptr(&pqr_state);
+
+	pqr->cur_closid = 0;
+	pqr->cur_rmid = 0;
+	pqr->default_closid = 0;
+	pqr->default_rmid = 0;
+
+	wrmsr(IA32_PQR_ASSOC, 0, 0);
+}
+
+/*
+ * Hotplug online callback (registered in intel_rdt_late_init()): under
+ * rdtgroup_mutex, add @cpu to the domain of every capable resource, put it
+ * in the default group's cpu mask and reset its CLOSID/RMID state.
+ * Always returns 0.
+ */
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+	struct rdt_resource *res;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	for_each_capable_rdt_resource(res) {
+		domain_add_cpu(cpu, res);
+	}
+
+	/* The cpu is set in default rdtgroup after online. */
+	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+	clear_closid_rmid(cpu);
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+/*
+ * Walk the child groups on @r's mon.crdtgrp_list and clear @cpu from the
+ * first child whose cpu mask contains it; the walk stops there.
+ */
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+	struct rdtgroup *child;
+
+	list_for_each_entry(child, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+		if (cpumask_test_and_clear_cpu(cpu, &child->cpu_mask))
+			return;
+}
+
+/*
+ * Hotplug offline callback (registered in intel_rdt_late_init()): under
+ * rdtgroup_mutex, remove @cpu from the domain of every capable resource,
+ * clear it from the first group in rdt_all_groups that contains it (and
+ * from that group's children) and reset its CLOSID/RMID state.
+ * Always returns 0.
+ */
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+	struct rdt_resource *res;
+	struct rdtgroup *grp;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	for_each_capable_rdt_resource(res) {
+		domain_remove_cpu(cpu, res);
+	}
+
+	list_for_each_entry(grp, &rdt_all_groups, rdtgroup_list) {
+		if (!cpumask_test_and_clear_cpu(cpu, &grp->cpu_mask))
+			continue;
+		clear_childcpus(grp, cpu);
+		break;
+	}
+
+	clear_closid_rmid(cpu);
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+/*
+ * Grow max_name_width and max_data_width so that they cover the longest
+ * name and the widest data_width among all alloc capable resources.
+ */
+static __init void rdt_init_padding(void)
+{
+	struct rdt_resource *res;
+	int name_len;
+
+	for_each_alloc_capable_rdt_resource(res) {
+		name_len = strlen(res->name);
+
+		if (max_name_width < name_len)
+			max_name_width = name_len;
+		if (max_data_width < res->data_width)
+			max_data_width = res->data_width;
+	}
+}
+
+/*
+ * Indices into rdt_options[]: one entry per RDT feature that can be forced
+ * on or off through the "rdt" boot option.
+ */
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_L2_CDP,
+ RDT_FLAG_MBA,
+};
+
+/*
+ * Designated initializer for one rdt_options[] slot: @idx is the array
+ * index, @n the option name and @f the feature flag it controls.
+ */
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+/*
+ * One "rdt" boot option.
+ * name: token compared against the comma separated option string
+ * flag: feature flag tested by rdt_cpu_has()
+ * force_off: set when the token was given with a leading '!'
+ * force_on: set when the token was given without a leading '!'
+ */
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+/* Table of all "rdt" boot options, indexed by the RDT_FLAG_* values. */
+static struct rdt_options rdt_options[] __initdata = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+};
+/* Number of entries in rdt_options[]. */
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+/*
+ * Parse the "rdt" boot option: a comma separated list of option names,
+ * each optionally prefixed with '!'. A plain name marks the matching
+ * rdt_options[] entry force_on, a '!' prefixed name marks it force_off.
+ * Unknown names are ignored. @str is modified by strsep(). Returns 1.
+ */
+static int __init set_rdt_options(char *str)
+{
+	struct rdt_options *opt, *end = &rdt_options[NUM_RDT_OPTIONS];
+	bool negate;
+	char *name;
+
+	if (*str == '=')
+		str++;
+
+	for (name = strsep(&str, ","); name; name = strsep(&str, ",")) {
+		negate = (*name == '!');
+		if (negate)
+			name++;
+
+		for (opt = rdt_options; opt < end; opt++) {
+			if (strcmp(name, opt->name) != 0)
+				continue;
+
+			if (negate)
+				opt->force_off = true;
+			else
+				opt->force_on = true;
+			break;
+		}
+	}
+
+	return 1;
+}
+__setup("rdt", set_rdt_options);
+
+/*
+ * Report whether feature @flag should be used: the boot cpu must have it,
+ * and a matching rdt_options[] entry may veto it with force_off unless
+ * force_on is set as well (force_on wins when both are set).
+ */
+static bool __init rdt_cpu_has(int flag)
+{
+	struct rdt_options *opt, *end = &rdt_options[NUM_RDT_OPTIONS];
+
+	if (!boot_cpu_has(flag))
+		return false;
+
+	for (opt = rdt_options; opt < end; opt++) {
+		if (opt->flag != flag)
+			continue;
+
+		if (opt->force_on)
+			return true;
+		return !opt->force_off;
+	}
+
+	return true;
+}
+
+/*
+ * Probe the allocation features (L3 CAT/CDP, L2 CAT/CDP, MBA) and fetch
+ * the configuration of each usable one. Returns true when at least one
+ * was found, or right away when rdt_alloc_capable is already set.
+ */
+static __init bool get_rdt_alloc_resources(void)
+{
+	bool found = false;
+
+	if (rdt_alloc_capable)
+		return true;
+	if (!boot_cpu_has(X86_FEATURE_RDT_A))
+		return false;
+
+	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+		found = true;
+		rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+		if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+			rdt_get_cdp_l3_config();
+	}
+
+	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
+		found = true;
+		/* CPUID 0x10.2 fields have the same format as 0x10.1 */
+		rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+		if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+			rdt_get_cdp_l2_config();
+	}
+
+	if (rdt_cpu_has(X86_FEATURE_MBA) &&
+	    rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
+		found = true;
+
+	return found;
+}
+
+/*
+ * Record each usable monitoring event as a bit in rdt_mon_features.
+ * Returns false when no event is usable; otherwise returns true only if
+ * rdt_get_mon_l3_config() reports success (0) for the L3 resource.
+ */
+static __init bool get_rdt_mon_resources(void)
+{
+	struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+		rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+		rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+		rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+	if (rdt_mon_features == 0)
+		return false;
+
+	return rdt_get_mon_l3_config(l3) == 0;
+}
+
+/*
+ * Model specific adjustments, keyed on boot_cpu_data.x86_model:
+ *  - Haswell-X: run cache_alloc_hsw_probe() unless "l3cat" was forced off.
+ *  - Skylake-X: force features off through set_rdt_options(); steppings
+ *    up to 4 lose monitoring as well as L3 CAT, later ones only L3 CAT.
+ *
+ * NOTE(review): set_rdt_options() tokenizes its argument with strsep(),
+ * which writes into the string; here it is handed string literals.
+ */
+static __init void rdt_quirks(void)
+{
+	switch (boot_cpu_data.x86_model) {
+	case INTEL_FAM6_SKYLAKE_X:
+		if (boot_cpu_data.x86_stepping > 4)
+			set_rdt_options("!l3cat");
+		else
+			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+		break;
+	case INTEL_FAM6_HASWELL_X:
+		if (rdt_options[RDT_FLAG_L3_CAT].force_off)
+			break;
+		cache_alloc_hsw_probe();
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Apply the model quirks, then probe allocation and monitoring support.
+ * Returns true when either kind of support was found.
+ */
+static __init bool get_rdt_resources(void)
+{
+	rdt_quirks();
+
+	rdt_alloc_capable = get_rdt_alloc_resources();
+	rdt_mon_capable = get_rdt_mon_resources();
+
+	if (rdt_mon_capable)
+		return true;
+	return rdt_alloc_capable;
+}
+
+/* Hotplug state returned by cpuhp_setup_state(); removed again on exit. */
+static enum cpuhp_state rdt_online;
+
+/*
+ * Late initcall: probe RDT support, register the cpu hotplug callbacks and
+ * initialize the rdtgroup layer, then log every capable resource.
+ *
+ * Returns -ENODEV when no RDT resource is found, the negative value from
+ * cpuhp_setup_state() or the error from rdtgroup_init() on failure (the
+ * hotplug state is removed again in the latter case), 0 on success.
+ */
+static int __init intel_rdt_late_init(void)
+{
+	struct rdt_resource *res;
+	int hp_state, err;
+
+	if (!get_rdt_resources())
+		return -ENODEV;
+
+	rdt_init_padding();
+
+	hp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				     "x86/rdt/cat:online:",
+				     intel_rdt_online_cpu,
+				     intel_rdt_offline_cpu);
+	if (hp_state < 0)
+		return hp_state;
+
+	err = rdtgroup_init();
+	if (err) {
+		cpuhp_remove_state(hp_state);
+		return err;
+	}
+	rdt_online = hp_state;
+
+	for_each_alloc_capable_rdt_resource(res) {
+		pr_info("Intel RDT %s allocation detected\n", res->name);
+	}
+
+	for_each_mon_capable_rdt_resource(res) {
+		pr_info("Intel RDT %s monitoring detected\n", res->name);
+	}
+
+	return 0;
+}
+
+late_initcall(intel_rdt_late_init);
+
+/*
+ * Exit handler: remove the cpu hotplug state saved in rdt_online by
+ * intel_rdt_late_init(), then tear down the rdtgroup layer.
+ */
+static void __exit intel_rdt_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+ rdtgroup_exit();
+}
+
+__exitcall(intel_rdt_exit);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
new file mode 100644
index 0000000..3736f6d
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,571 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#include <linux/sched.h>
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+/* NOTE(review): presumably MSR addresses of the QoS config/mask/throttle registers - confirm against the SDM */
+#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L2_QOS_CFG 0xc82
+#define IA32_L3_CBM_BASE 0xc90
+#define IA32_L2_CBM_BASE 0xd10
+#define IA32_MBA_THRTL_BASE 0xd50
+
+/* NOTE(review): presumably the CDP enable bit of IA32_L3_QOS_CFG - confirm */
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+/* NOTE(review): presumably the CDP enable bit of IA32_L2_QOS_CFG - confirm */
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR. They also serve as bit positions in
+ * rdt_mon_features.
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
+
+/* NOTE(review): presumably a delay in ms for cqm_setup_limbo_handler() - confirm */
+#define CQM_LIMBOCHECK_INTERVAL 1000
+
+#define MBM_CNTR_WIDTH 24
+/* Passed as the delay_ms argument of mbm_setup_overflow_handler() */
+#define MBM_OVERFLOW_INTERVAL 1000
+#define MAX_MBA_BW 100u
+
+/* NOTE(review): presumably status bits in a value read from IA32_QM_CTR - confirm */
+#define RMID_VAL_ERROR BIT_ULL(63)
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid: event id
+ * @name: name of the event
+ * @list: list linkage of this entry (NOTE(review): presumably chained on
+ * rdt_resource.evt_list - confirm)
+ */
+struct mon_evt {
+ u32 evtid;
+ char *name;
+ struct list_head list;
+};
+
+/**
+ * union mon_data_bits - Monitoring details for each event file
+ * @priv: The same storage viewed as a pointer (NOTE(review): presumably so
+ * the packed ids can be carried in a private-data pointer - confirm)
+ * @rid: Resource id associated with the event file.
+ * @evtid: Event id associated with the event file
+ * @domid: The domain to which the event file belongs
+ *
+ * The three bit-fields in @u add up to 32 bits (10 + 8 + 14).
+ */
+union mon_data_bits {
+ void *priv;
+ struct {
+ unsigned int rid : 10;
+ unsigned int evtid : 8;
+ unsigned int domid : 14;
+ } u;
+};
+
+/*
+ * Argument block for mon_event_read(), whose parameters mirror the first
+ * four members.
+ * NOTE(review): @val presumably receives the counter value read for
+ * @evtid - confirm against mon_event_count().
+ */
+struct rmid_read {
+ struct rdtgroup *rgrp;
+ struct rdt_domain *d;
+ int evtid;
+ bool first;
+ u64 val;
+};
+
+extern unsigned int intel_cqm_threshold;
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
+
+/*
+ * Type of a resource group, stored in rdtgroup.type: either a ctrl_mon
+ * group or a monitor only group. RDT_NUM_GROUP is the number of types.
+ */
+enum rdt_group_type {
+ RDTCTRL_GROUP = 0,
+ RDTMON_GROUP,
+ RDT_NUM_GROUP,
+};
+
+/**
+ * enum rdtgrp_mode - Mode of a RDT resource group
+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
+ * allowed AND the allocations are Cache Pseudo-Locked
+ * @RDT_NUM_MODES: Number of modes; not a mode itself, must stay last
+ *
+ * The mode of a resource group enables control over the allowed overlap
+ * between allocations associated with different resource groups (classes
+ * of service). User is able to modify the mode of a resource group by
+ * writing to the "mode" resctrl file associated with the resource group.
+ *
+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
+ * writing the appropriate text to the "mode" file. A resource group enters
+ * "pseudo-locked" mode after the schemata is written while the resource
+ * group is in "pseudo-locksetup" mode.
+ */
+enum rdtgrp_mode {
+ RDT_MODE_SHAREABLE = 0,
+ RDT_MODE_EXCLUSIVE,
+ RDT_MODE_PSEUDO_LOCKSETUP,
+ RDT_MODE_PSEUDO_LOCKED,
+
+ /* Must be last */
+ RDT_NUM_MODES,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn: kernfs node for the mon_data directory
+ * @parent: parent rdtgrp
+ * @crdtgrp_list: child rdtgroup node list
+ * @rmid: rmid for this rdtgroup
+ */
+struct mongroup {
+ struct kernfs_node *mon_data_kn;
+ struct rdtgroup *parent;
+ struct list_head crdtgrp_list;
+ u32 rmid;
+};
+
+/**
+ * struct pseudo_lock_region - pseudo-lock region information
+ * @r: RDT resource to which this pseudo-locked region
+ * belongs
+ * @d: RDT domain to which this pseudo-locked region
+ * belongs; set together with rdt_domain.plr when a schemata
+ * line is parsed for a group in pseudo-locksetup mode
+ * @cbm: bitmask of the pseudo-locked region
+ * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
+ * completion
+ * @thread_done: variable used by waitqueue to test if pseudo-locking
+ * thread completed
+ * @cpu: core associated with the cache on which the setup code
+ * will be run
+ * @line_size: size of the cache lines
+ * @size: size of pseudo-locked region in bytes
+ * @kmem: the kernel memory associated with pseudo-locked region
+ * @minor: minor number of character device associated with this
+ * region
+ * @debugfs_dir: pointer to this region's directory in the debugfs
+ * filesystem
+ * @pm_reqs: Power management QoS requests related to this region
+ */
+struct pseudo_lock_region {
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ u32 cbm;
+ wait_queue_head_t lock_thread_wq;
+ int thread_done;
+ int cpu;
+ unsigned int line_size;
+ unsigned int size;
+ void *kmem;
+ unsigned int minor;
+ struct dentry *debugfs_dir;
+ struct list_head pm_reqs;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits (see RDT_DELETED)
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ * @type: indicates type of this rdtgroup - either
+ * monitor only or ctrl_mon group
+ * @mon: mongroup related data
+ * @mode: mode of resource group
+ * @plr: pseudo-locked region
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+ enum rdtgrp_mode mode;
+ struct pseudo_lock_region *plr;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
+/*
+ * Define the file type flags for base and info directories.
+ * These values are stored in rftype.fflags and rdt_resource.fflags; the
+ * RF_* names are combinations of the single-bit RFTYPE_* flags.
+ */
+#define RFTYPE_INFO BIT(0)
+#define RFTYPE_BASE BIT(1)
+#define RF_CTRLSHIFT 4
+#define RF_MONSHIFT 5
+#define RF_TOPSHIFT 6
+#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
+#define RFTYPE_MON BIT(RF_MONSHIFT)
+#define RFTYPE_TOP BIT(RF_TOPSHIFT)
+#define RFTYPE_RES_CACHE BIT(8)
+#define RFTYPE_RES_MB BIT(9)
+#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
+#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width, max_data_width;
+
+int __init rdtgroup_init(void);
+void __exit rdtgroup_exit(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: File name
+ * @mode: Access mode
+ * @kf_ops: File operations
+ * @flags: File specific RFTYPE_FLAGS_* flags
+ * @fflags: File specific RF_* or RFTYPE_* flags
+ * @seq_show: Show content of the file
+ * @write: Write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ struct kernfs_ops *kf_ops;
+ unsigned long flags;
+ unsigned long fflags;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ * NOTE(review): this struct has no max_write_len member; the text
+ * presumably refers to a kernfs_ops field - confirm.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @chunks: Total data moved (multiply by rdt_resource.mon_scale to get bytes)
+ * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it
+ * @chunks_bw: Total local data moved. Used for bandwidth calculation
+ * @prev_bw_msr: Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw: The most recent bandwidth in MBps
+ * @delta_bw: Difference between the current and previous bandwidth
+ * @delta_comp: Indicates whether to compute the delta_bw
+ */
+struct mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+ u64 chunks_bw;
+ u64 prev_bw_msr;
+ u32 prev_bw;
+ u32 delta_bw;
+ bool delta_comp;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list: all instances of this resource
+ * @id: unique id for this instance
+ * @cpu_mask: which cpus share this resource
+ * @rmid_busy_llc:
+ * bitmap of which limbo RMIDs are above threshold
+ * @mbm_total: saved state for MBM total bandwidth
+ * @mbm_local: saved state for MBM local bandwidth
+ * @mbm_over: worker to periodically read MBM h/w counters
+ * @cqm_limbo: worker to periodically read CQM h/w counters
+ * @mbm_work_cpu:
+ * worker cpu for MBM h/w counters
+ * @cqm_work_cpu:
+ * worker cpu for CQM h/w counters
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
+ * @new_ctrl: new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
+ * @plr: pseudo-locked region (if any) associated with domain; assigned
+ * when a schemata line is parsed for a group in
+ * pseudo-locksetup mode
+ */
+struct rdt_domain {
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ unsigned long *rmid_busy_llc;
+ struct mbm_state *mbm_total;
+ struct mbm_state *mbm_local;
+ struct delayed_work mbm_over;
+ struct delayed_work cqm_limbo;
+ int mbm_work_cpu;
+ int cqm_work_cpu;
+ u32 *ctrl_val;
+ u32 *mbps_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
+ struct pseudo_lock_region *plr;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @low: Beginning index from base MSR
+ * @high: End index (update_domains() passes low + 1 for a single CLOSID;
+ * NOTE(review): so presumably exclusive - confirm in msr_update)
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ int low;
+ int high;
+};
+
+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len: Length of the cache bit mask
+ * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * in a cache bit mask
+ * @cbm_idx_mult: Multiplier of CBM index
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
+ * closid * cbm_idx_mult + cbm_idx_offset
+ * @shareable_bits: Bitmask of shareable resource with other
+ * executing entities
+ */
+struct rdt_cache {
+ unsigned int cbm_len;
+ unsigned int min_cbm_bits;
+ unsigned int cbm_idx_mult;
+ unsigned int cbm_idx_offset;
+ unsigned int shareable_bits;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay: Max throttle delay. Delay is the hardware
+ * representation for memory bandwidth.
+ * @min_bw: Minimum memory bandwidth percentage user can request
+ * @bw_gran: Granularity at which the memory bandwidth is allocated
+ * @delay_linear: Non-zero if memory B/W delay is in linear scale
+ * (stored as u32, tested as a boolean)
+ * @mba_sc: True if MBA software controller(mba_sc) is enabled
+ * @mb_map: Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+ u32 max_delay;
+ u32 min_bw;
+ u32 bw_gran;
+ u32 delay_linear;
+ bool mba_sc;
+ u32 *mb_map;
+};
+
+/* True when the L3 occupancy event bit is set in rdt_mon_features. */
+static inline bool is_llc_occupancy_enabled(void)
+{
+	return !!(rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
+/* True when the MBM total event bit is set in rdt_mon_features. */
+static inline bool is_mbm_total_enabled(void)
+{
+	return !!(rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
+}
+
+/* True when the MBM local event bit is set in rdt_mon_features. */
+static inline bool is_mbm_local_enabled(void)
+{
+	return !!(rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
+}
+
+/* True when either MBM event (total or local) is enabled. */
+static inline bool is_mbm_enabled(void)
+{
+	if (is_mbm_total_enabled())
+		return true;
+
+	return is_mbm_local_enabled();
+}
+
+/* True when event id @e lies in the MBM range [MBM_TOTAL, MBM_LOCAL]. */
+static inline bool is_mbm_event(int e)
+{
+	return !(e < QOS_L3_MBM_TOTAL_EVENT_ID ||
+		 e > QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/*
+ * Argument bundle handed to rdt_resource.parse_ctrlval (parse_cbm() and
+ * parse_bw()): the resource group being written and the text of the
+ * control value for one domain.
+ */
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @rid: The index of the resource
+ * @alloc_enabled: Is allocation enabled on this machine
+ * @mon_enabled: Is monitoring enabled for this feature
+ * @alloc_capable: Is allocation available on this machine
+ * @mon_capable: Is monitor feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @cache_level: Which cache level defines scope of this resource
+ * @default_ctrl: Specifies default cache cbm or memory B/W percent.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @data_width: Character width of data when displaying
+ * @domains: All domains for this resource
+ * @cache: Cache allocation related data
+ * @membw: Memory bandwidth allocation related data
+ * @format_str: Per resource format string to show domain value
+ * @parse_ctrlval: Per resource function pointer to parse control values
+ * @evt_list: List of monitoring events
+ * @num_rmid: Number of RMIDs available
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @fflags: flags to choose base and info files
+ */
+struct rdt_resource {
+ int rid;
+ bool alloc_enabled;
+ bool mon_enabled;
+ bool alloc_capable;
+ bool mon_capable;
+ char *name;
+ int num_closid;
+ int cache_level;
+ u32 default_ctrl;
+ unsigned int msr_base;
+ void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
+ struct rdt_resource *r);
+ int data_width;
+ struct list_head domains;
+ struct rdt_cache cache;
+ struct rdt_membw membw;
+ const char *format_str;
+ int (*parse_ctrlval)(struct rdt_parse_data *data,
+ struct rdt_resource *r,
+ struct rdt_domain *d);
+ struct list_head evt_list;
+ int num_rmid;
+ unsigned int mon_scale;
+ unsigned long fflags;
+};
+
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+extern struct dentry *debugfs_resctrl;
+
+/*
+ * Indices into rdt_resources_all[]. RDT_NUM_RESOURCES is the array bound
+ * used by the for_each_*_rdt_resource() iterators.
+ */
+enum {
+ RDT_RESOURCE_L3,
+ RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE,
+ RDT_RESOURCE_L2,
+ RDT_RESOURCE_L2DATA,
+ RDT_RESOURCE_L2CODE,
+ RDT_RESOURCE_MBA,
+
+ /* Must be the last */
+ RDT_NUM_RESOURCES,
+};
+
+/*
+ * Iterators over rdt_resources_all[]. Each one visits every array slot and
+ * runs the statement that follows only for resources passing its filter.
+ *
+ * Each macro expands to "for (...) if (...)", so the body attaches to
+ * the trailing if: an "else" written right after the body would bind to
+ * that hidden if.
+ */
+
+/* Resources that are alloc capable or mon capable. */
+#define for_each_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable || r->mon_capable)
+
+/* Resources that are alloc capable. */
+#define for_each_alloc_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable)
+
+/* Resources that are mon capable. */
+#define for_each_mon_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_capable)
+
+/* Resources with allocation enabled. */
+#define for_each_alloc_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_enabled)
+
+/* Resources with monitoring enabled. */
+#define for_each_mon_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_enabled)
+
+/*
+ * CPUID.(EAX=10H, ECX=ResID=1).EAX
+ * The low 5 bits are exposed as cbm_len; "full" is the whole register.
+ */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * CPUID.(EAX=10H, ECX=ResID=3).EAX
+ * The low 12 bits are exposed as max_delay; "full" is the whole register.
+ */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * CPUID.(EAX=10H, ECX=ResID).EDX
+ * The low 16 bits are exposed as cos_max; "full" is the whole register.
+ */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+void rdt_last_cmd_clear(void);
+void rdt_last_cmd_puts(const char *s);
+void rdt_last_cmd_printf(const char *fmt, ...);
+
+void rdt_ctrl_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask);
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm, int closid, bool exclusive);
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm);
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
+int rdtgroup_tasks_assigned(struct rdtgroup *r);
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int update_domains(struct rdt_resource *r, int closid);
+int closids_supported(void);
+void closid_free(int closid);
+int alloc_rmid(void);
+void free_rmid(u32 rmid);
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+void mon_event_count(void *info);
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ unsigned int dom_id);
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d);
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_domain *dom,
+ unsigned long delay_ms);
+void mbm_handle_overflow(struct work_struct *work);
+bool is_mba_sc(struct rdt_resource *r);
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_handle_limbo(struct work_struct *work);
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+void __check_limbo(struct rdt_domain *d, bool force_free);
+
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
new file mode 100644
index 0000000..627e5c8
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -0,0 +1,479 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "intel_rdt.h"
+
+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ *
+ * @buf: decimal text of the requested value
+ * @data: receives the value rounded up to a multiple of membw.bw_gran
+ * @r: the MBA resource the value is for
+ *
+ * The range check is skipped when the MBA software controller is enabled.
+ * In every case the rounded value must fit in 32 bits, because the caller
+ * stores it in the u32 rdt_domain.new_ctrl.
+ *
+ * Returns true and sets *@data on success; returns false, leaves *@data
+ * alone and records a message via rdt_last_cmd_*() on failure.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+	unsigned long bw, rounded;
+	int ret;
+
+	/*
+	 * Only linear delay values is supported for current Intel SKUs.
+	 */
+	if (!r->membw.delay_linear) {
+		rdt_last_cmd_puts("No support for non-linear MB domains\n");
+		return false;
+	}
+
+	ret = kstrtoul(buf, 10, &bw);
+	if (ret) {
+		rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
+		return false;
+	}
+
+	if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
+	    !is_mba_sc(r)) {
+		/* All three values are unsigned: print them as such. */
+		rdt_last_cmd_printf("MB value %lu out of range [%u,%u]\n", bw,
+				    r->membw.min_bw, r->default_ctrl);
+		return false;
+	}
+
+	rounded = roundup(bw, (unsigned long)r->membw.bw_gran);
+
+	/*
+	 * Reject values whose rounding wrapped around or which would be
+	 * silently truncated when stored in a u32 control value.
+	 */
+	if (rounded < bw || rounded != (u32)rounded) {
+		rdt_last_cmd_printf("MB value %lu is too large\n", bw);
+		return false;
+	}
+
+	*data = rounded;
+	return true;
+}
+
+/*
+ * Parse one memory bandwidth value for domain @d and stage it in
+ * d->new_ctrl. Returns -EINVAL if the domain already has a staged value or
+ * the text fails bw_validate(), 0 otherwise.
+ */
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+	     struct rdt_domain *d)
+{
+	unsigned long bw;
+
+	if (d->have_new_ctrl) {
+		rdt_last_cmd_printf("duplicate domain %d\n", d->id);
+		return -EINVAL;
+	}
+
+	if (bw_validate(data->buf, &bw, r)) {
+		d->new_ctrl = bw;
+		d->have_new_ctrl = true;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Validate a cache bit mask given as hex text. The SDM says:
+ * Please note that all (and only) contiguous '1' combinations
+ * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ *
+ * The mask must be non-zero, must not exceed r->default_ctrl, must form a
+ * single run of 1-bits within cache.cbm_len bits and that run must be at
+ * least cache.min_cbm_bits long. On success the mask is stored in *@data
+ * and true is returned; otherwise a message is recorded and false returned.
+ */
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
+{
+	unsigned int nbits = r->cache.cbm_len;
+	unsigned long mask, run_start, run_end;
+
+	if (kstrtoul(buf, 16, &mask)) {
+		rdt_last_cmd_printf("non-hex character in mask %s\n", buf);
+		return false;
+	}
+
+	if (mask == 0 || mask > r->default_ctrl) {
+		rdt_last_cmd_puts("mask out of range\n");
+		return false;
+	}
+
+	/* [run_start, run_end) is the lowest run of consecutive 1-bits. */
+	run_start = find_first_bit(&mask, nbits);
+	run_end = find_next_zero_bit(&mask, nbits, run_start);
+
+	/* Any 1-bit above that run means the mask is not contiguous. */
+	if (find_next_bit(&mask, nbits, run_end) < nbits) {
+		rdt_last_cmd_printf("mask %lx has non-consecutive 1-bits\n", mask);
+		return false;
+	}
+
+	if (run_end - run_start < r->cache.min_cbm_bits) {
+		rdt_last_cmd_printf("Need at least %d bits in mask\n",
+				    r->cache.min_cbm_bits);
+		return false;
+	}
+
+	*data = mask;
+	return true;
+}
+
+/*
+ * Parse one cache bit mask (hex) for domain @d, check it against the
+ * resource limits and the mode of the group being written, and stage it
+ * in d->new_ctrl. Returns 0 on success and -EINVAL, with a message
+ * recorded, when any check fails.
+ */
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+	      struct rdt_domain *d)
+{
+	struct rdtgroup *grp = data->rdtgrp;
+	u32 mask;
+
+	if (d->have_new_ctrl) {
+		rdt_last_cmd_printf("duplicate domain %d\n", d->id);
+		return -EINVAL;
+	}
+
+	/*
+	 * Only one pseudo-locked region may exist in a cache hierarchy.
+	 */
+	if (grp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+	    rdtgroup_pseudo_locked_in_hierarchy(d)) {
+		rdt_last_cmd_printf("pseudo-locked region in hierarchy\n");
+		return -EINVAL;
+	}
+
+	if (!cbm_validate(data->buf, &mask, r))
+		return -EINVAL;
+
+	switch (grp->mode) {
+	case RDT_MODE_EXCLUSIVE:
+	case RDT_MODE_SHAREABLE:
+		if (rdtgroup_cbm_overlaps_pseudo_locked(d, mask)) {
+			rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n");
+			return -EINVAL;
+		}
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Overlap with the CBM of another closid is refused as soon as
+	 * either side is exclusive.
+	 */
+	if (rdtgroup_cbm_overlaps(r, d, mask, grp->closid, true)) {
+		rdt_last_cmd_printf("overlaps with exclusive group\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_cbm_overlaps(r, d, mask, grp->closid, false) &&
+	    (grp->mode == RDT_MODE_EXCLUSIVE ||
+	     grp->mode == RDT_MODE_PSEUDO_LOCKSETUP)) {
+		rdt_last_cmd_printf("overlaps with other group\n");
+		return -EINVAL;
+	}
+
+	d->new_ctrl = mask;
+	d->have_new_ctrl = true;
+
+	return 0;
+}
+
+/*
+ * Parse the value part of one schemata line for resource @r. The line is
+ * a series of
+ *	id=mask
+ * entries separated by ";". Each "id" is decimal and must match the id of
+ * one of the resource's domains; the value is handed to r->parse_ctrlval.
+ *
+ * Returns 0 when the whole line was consumed, or right after the first
+ * entry for a group in pseudo-locksetup mode; -EINVAL on malformed input,
+ * an unknown domain id or a rejected value. @line is modified in place.
+ */
+static int parse_line(char *line, struct rdt_resource *r,
+		      struct rdtgroup *rdtgrp)
+{
+	struct rdt_parse_data data;
+	unsigned long dom_id;
+	struct rdt_domain *d;
+	char *entry, *id;
+	bool matched;
+
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+	    r->rid == RDT_RESOURCE_MBA) {
+		rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+		return -EINVAL;
+	}
+
+	while (line && line[0] != '\0') {
+		entry = strsep(&line, ";");
+		id = strsep(&entry, "=");
+		if (!entry || kstrtoul(id, 10, &dom_id)) {
+			rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
+			return -EINVAL;
+		}
+		entry = strim(entry);
+
+		matched = false;
+		list_for_each_entry(d, &r->domains, list) {
+			if (d->id != dom_id)
+				continue;
+
+			data.buf = entry;
+			data.rdtgrp = rdtgrp;
+			if (r->parse_ctrlval(&data, r, d))
+				return -EINVAL;
+
+			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+				/*
+				 * A valid CBM was just parsed for a group
+				 * in pseudo-locking setup mode. There is
+				 * only one locked region per resource group
+				 * and domain, so initialize that single
+				 * region and stop here.
+				 */
+				rdtgrp->plr->r = r;
+				rdtgrp->plr->d = d;
+				rdtgrp->plr->cbm = d->new_ctrl;
+				d->plr = rdtgrp->plr;
+				return 0;
+			}
+
+			matched = true;
+			break;
+		}
+
+		/* No domain of this resource carries the given id. */
+		if (!matched)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Commit the staged control values of @closid for every domain of @r.
+ *
+ * For each domain with a staged value that differs from the stored one, the
+ * stored value is updated and one cpu of the domain is selected. Unless
+ * the MBA software controller is enabled, rdt_ctrl_update() is then run on
+ * the selected cpus (directly on the current cpu if it is among them).
+ *
+ * Returns 0, or -ENOMEM if the temporary cpumask cannot be allocated.
+ */
+int update_domains(struct rdt_resource *r, int closid)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	bool mba_sc;
+	u32 *vals;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	msr_param.res = r;
+	msr_param.low = closid;
+	msr_param.high = msr_param.low + 1;
+
+	mba_sc = is_mba_sc(r);
+	list_for_each_entry(d, &r->domains, list) {
+		vals = mba_sc ? d->mbps_val : d->ctrl_val;
+		if (!d->have_new_ctrl || d->new_ctrl == vals[closid])
+			continue;
+
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+		vals[closid] = d->new_ctrl;
+	}
+
+	/*
+	 * The control msr is not written with control values when the
+	 * MBA software controller is enabled.
+	 */
+	if (!cpumask_empty(cpu_mask) && !mba_sc) {
+		cpu = get_cpu();
+		/* Update CBM on this cpu if it's in cpu_mask. */
+		if (cpumask_test_cpu(cpu, cpu_mask))
+			rdt_ctrl_update(&msr_param);
+		/* Update CBM on other cpus. */
+		smp_call_function_many(cpu_mask, rdt_ctrl_update,
+				       &msr_param, 1);
+		put_cpu();
+	}
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+/*
+ * Find the alloc-enabled resource named @resname that has room for the
+ * closid of @rdtgrp and let parse_line() parse @tok for it.
+ *
+ * Return: the result of parse_line(), or -EINVAL (with a last_cmd_status
+ * message) when no such resource exists.
+ */
+static int rdtgroup_parse_resource(char *resname, char *tok,
+				   struct rdtgroup *rdtgrp)
+{
+	struct rdt_resource *r;
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		if (strcmp(resname, r->name) != 0)
+			continue;
+		if (rdtgrp->closid >= r->num_closid)
+			continue;
+
+		return parse_line(tok, r, rdtgrp);
+	}
+
+	rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname);
+	return -EINVAL;
+}
+
+/*
+ * Write handler for a resource group's schemata file.
+ *
+ * @buf holds one "resource:id=value;..." line per resource and must end
+ * in a newline. All staged per-domain values are cleared, every line is
+ * parsed via rdtgroup_parse_resource(), and the result is committed with
+ * update_domains(). A group in pseudo-locksetup mode additionally goes
+ * through rdtgroup_pseudo_lock_create().
+ *
+ * Return: @nbytes on success, a negative errno otherwise.
+ */
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_domain *dom;
+	struct rdt_resource *r;
+	char *tok, *resname;
+	int ret = 0;
+
+	/* Valid input requires a trailing newline */
+	if (!nbytes || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+	buf[nbytes - 1] = '\0';
+
+	cpus_read_lock();
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto out;
+	}
+	rdt_last_cmd_clear();
+
+	/*
+	 * No changes to pseudo-locked region allowed. It has to be removed
+	 * and re-created instead.
+	 */
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+		rdt_last_cmd_puts("resource group is pseudo-locked\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Forget any values staged by an earlier write. */
+	for_each_alloc_enabled_rdt_resource(r) {
+		list_for_each_entry(dom, &r->domains, list)
+			dom->have_new_ctrl = false;
+	}
+
+	for (tok = strsep(&buf, "\n"); tok; tok = strsep(&buf, "\n")) {
+		resname = strim(strsep(&tok, ":"));
+		if (!tok) {
+			rdt_last_cmd_puts("Missing ':'\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		if (tok[0] == '\0') {
+			rdt_last_cmd_printf("Missing '%s' value\n", resname);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
+		if (ret)
+			goto out;
+	}
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		ret = update_domains(r, rdtgrp->closid);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * If pseudo-locking fails we keep the resource group in
+	 * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
+	 * active and updated for just the domain the pseudo-locked
+	 * region was requested for.
+	 */
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP)
+		ret = rdtgroup_pseudo_lock_create(rdtgrp);
+
+out:
+	rdtgroup_kn_unlock(of->kn);
+	cpus_read_unlock();
+	if (ret)
+		return ret;
+	return nbytes;
+}
+
+/*
+ * Print one schemata line for resource @r and class of service @closid:
+ * the padded resource name, a ':' and one r->format_str entry per domain,
+ * entries separated by ';'. The value shown per domain is mbps_val when
+ * the MBA software controller is active for @r, ctrl_val otherwise.
+ */
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+	struct rdt_domain *dom;
+	bool need_sep = false;
+	u32 ctrl_val;
+
+	seq_printf(s, "%*s:", max_name_width, r->name);
+
+	list_for_each_entry(dom, &r->domains, list) {
+		if (need_sep)
+			seq_puts(s, ";");
+		need_sep = true;
+
+		if (is_mba_sc(r))
+			ctrl_val = dom->mbps_val[closid];
+		else
+			ctrl_val = dom->ctrl_val[closid];
+
+		seq_printf(s, r->format_str, dom->id, max_data_width,
+			   ctrl_val);
+	}
+
+	seq_puts(s, "\n");
+}
+
+/*
+ * Show handler for a resource group's schemata file.
+ *
+ * - pseudo-locksetup mode: "<resource>:uninitialized" for every
+ *   alloc-enabled resource
+ * - pseudo-locked mode: the single locked region (resource, domain, cbm)
+ * - any other mode: one show_doms() line for each alloc-enabled resource
+ *   that covers the group's closid
+ *
+ * Return: 0 on success, -ENOENT if the group is no longer live.
+ */
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	int ret = 0;
+	u32 closid;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	switch (rdtgrp->mode) {
+	case RDT_MODE_PSEUDO_LOCKSETUP:
+		for_each_alloc_enabled_rdt_resource(r) {
+			seq_printf(s, "%s:uninitialized\n", r->name);
+		}
+		break;
+	case RDT_MODE_PSEUDO_LOCKED:
+		seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name,
+			   rdtgrp->plr->d->id, rdtgrp->plr->cbm);
+		break;
+	default:
+		closid = rdtgrp->closid;
+		for_each_alloc_enabled_rdt_resource(r) {
+			if (closid < r->num_closid)
+				show_doms(s, r, closid);
+		}
+		break;
+	}
+
+unlock:
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
+
+/*
+ * Fill in @rr for event @evtid of group @rdtgrp on domain @d, with the
+ * accumulated value reset to 0, then run mon_event_count() on any CPU of
+ * the domain's cpu_mask (smp_call_function_any() with wait set to 1).
+ */
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+		    struct rdtgroup *rdtgrp, int evtid, int first)
+{
+	/* Parameters handed to the IPI that reads the data. */
+	rr->d = d;
+	rr->rgrp = rdtgrp;
+	rr->evtid = evtid;
+	rr->first = first;
+	rr->val = 0;
+
+	smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+}
+
+/*
+ * Show handler for a monitoring data file.
+ *
+ * The resource id, domain id and event id are decoded from the kernfs
+ * node's private data. The event is read through mon_event_read() and
+ * printed as "Error", "Unavailable" or the counter value scaled by
+ * r->mon_scale.
+ *
+ * Return: 0 on success, -ENOENT if the resource group is no longer live
+ * or the domain cannot be found.
+ */
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	u32 resid, evtid, domid;
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	union mon_data_bits md;
+	struct rdt_domain *d;
+	struct rmid_read rr;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		/*
+		 * The group is gone; mon_event_count() would dereference
+		 * the NULL pointer via rr.rgrp->mon.rmid.
+		 */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	md.priv = of->kn->priv;
+	resid = md.u.rid;
+	domid = md.u.domid;
+	evtid = md.u.evtid;
+
+	r = &rdt_resources_all[resid];
+	d = rdt_find_domain(r, domid, NULL);
+	if (!d) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	mon_event_read(&rr, d, rdtgrp, evtid, false);
+
+	if (rr.val & RMID_VAL_ERROR)
+		seq_puts(m, "Error\n");
+	else if (rr.val & RMID_VAL_UNAVAIL)
+		seq_puts(m, "Unavailable\n");
+	else
+		seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+
+out:
+	/* Paired with rdtgroup_kn_lock_live() even when it returned NULL. */
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 0000000..b0f3aed
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,655 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "intel_rdt.h"
+
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+struct rmid_entry {
+ u32 rmid; /* the RMID; equals this entry's index in rmid_ptrs */
+ int busy; /* number of domains that still mark this RMID busy */
+ struct list_head list; /* link on rmid_free_lru while the RMID is free */
+};
+
+/**
+ * rmid_free_lru - A least recently used list of free RMIDs.
+ * These RMIDs are guaranteed to have an occupancy less than the
+ * threshold occupancy.
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/**
+ * rmid_limbo_count - Count of currently unused but (potentially)
+ * dirty RMIDs.
+ * This counts RMIDs that no one is currently using but that
+ * may have an occupancy value > intel_cqm_threshold. The user can change
+ * the threshold occupancy value.
+ */
+static unsigned int rmid_limbo_count;
+
+/**
+ * rmid_ptrs - Array of all RMID entries (limbo and free), indexed by RMID.
+ */
+static struct rmid_entry *rmid_ptrs;
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+/*
+ * Global to indicate which monitoring events are enabled.
+ */
+unsigned int rdt_mon_features;
+
+/*
+ * This is the threshold cache occupancy at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int intel_cqm_threshold;
+
+/*
+ * Return the rmid_ptrs slot for @rmid, warning if the entry stored there
+ * does not carry the same RMID.
+ */
+static inline struct rmid_entry *__rmid_entry(u32 rmid)
+{
+	struct rmid_entry *slot = rmid_ptrs + rmid;
+
+	WARN_ON(slot->rmid != rmid);
+
+	return slot;
+}
+
+/*
+ * Select (@eventid, @rmid) in IA32_QM_EVTSEL and return the raw contents
+ * of IA32_QM_CTR, error bits included.
+ */
+static u64 __rmid_read(u32 rmid, u32 eventid)
+{
+	u64 ctr;
+
+	/*
+	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+	 * with a valid event code for supported resource type and the bits
+	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+	 * are error bits.
+	 */
+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, ctr);
+
+	return ctr;
+}
+
+/*
+ * An RMID is dirty while the raw L3 occupancy counter read for it is at
+ * or above intel_cqm_threshold.
+ */
+static bool rmid_dirty(struct rmid_entry *entry)
+{
+	u64 occupancy;
+
+	occupancy = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+	if (occupancy >= intel_cqm_threshold)
+		return true;
+
+	return false;
+}
+
+/*
+ * Walk the RMIDs marked busy in @d->rmid_busy_llc, starting at RMID 1
+ * (RMID 0 is skipped). For each one that is no longer dirty, or for all
+ * of them when @force_free is set, clear the domain's busy bit and drop
+ * the entry's busy count. An entry whose count reaches zero leaves limbo
+ * and is appended to the free list.
+ */
+void __check_limbo(struct rdt_domain *d, bool force_free)
+{
+	struct rmid_entry *entry;
+	struct rdt_resource *r;
+	u32 nrmid;
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	for (nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, 1);
+	     nrmid < r->num_rmid;
+	     nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, nrmid + 1)) {
+		entry = __rmid_entry(nrmid);
+
+		/* Still above the threshold: stays busy in this domain. */
+		if (!force_free && rmid_dirty(entry))
+			continue;
+
+		clear_bit(entry->rmid, d->rmid_busy_llc);
+		entry->busy--;
+		if (entry->busy == 0) {
+			rmid_limbo_count--;
+			list_add_tail(&entry->list, &rmid_free_lru);
+		}
+	}
+}
+
+/* True when at least one bit is set in @d's rmid_busy_llc bitmap. */
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+{
+	if (find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid)
+		return true;
+
+	return false;
+}
+
+/*
+ * As of now the RMIDs allocation is global.
+ * However we keep track of which packages the RMIDs
+ * are used to optimize the limbo list management.
+ *
+ * Takes the first entry off rmid_free_lru; rdtgroup_mutex must be held.
+ *
+ * Return: the allocated RMID, -EBUSY if the free list is empty while
+ * RMIDs are still in limbo, -ENOSPC if it is empty and none are.
+ */
+int alloc_rmid(void)
+{
+	struct rmid_entry *first;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	if (list_empty(&rmid_free_lru)) {
+		if (rmid_limbo_count)
+			return -EBUSY;
+		return -ENOSPC;
+	}
+
+	first = list_first_entry(&rmid_free_lru, struct rmid_entry, list);
+	list_del(&first->list);
+
+	return first->rmid;
+}
+
+/*
+ * Decide where a released RMID goes.
+ *
+ * The occupancy can only be read for the domain containing the current
+ * CPU; if it is at or below the threshold there, that domain is skipped.
+ * Every other L3 domain gets the RMID marked busy, and the limbo worker
+ * is started for a domain receiving its first busy RMID. If no domain
+ * marked it busy the entry goes straight back to the free list,
+ * otherwise it is counted as being in limbo.
+ */
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+	struct rdt_domain *d;
+	int cpu;
+
+	entry->busy = 0;
+	cpu = get_cpu();
+	list_for_each_entry(d, &r->domains, list) {
+		if (cpumask_test_cpu(cpu, &d->cpu_mask) &&
+		    __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID) <=
+		    intel_cqm_threshold)
+			continue;
+
+		/*
+		 * For the first limbo RMID in the domain,
+		 * set up the limbo worker.
+		 */
+		if (!has_busy_rmid(r, d))
+			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
+		set_bit(entry->rmid, d->rmid_busy_llc);
+		entry->busy++;
+	}
+	put_cpu();
+
+	if (!entry->busy) {
+		list_add_tail(&entry->list, &rmid_free_lru);
+		return;
+	}
+
+	rmid_limbo_count++;
+}
+
+/*
+ * Release @rmid. RMID 0 is never released. With LLC occupancy monitoring
+ * enabled the RMID goes through add_rmid_to_limbo(); otherwise it is
+ * appended to the free list directly. rdtgroup_mutex must be held.
+ */
+void free_rmid(u32 rmid)
+{
+	struct rmid_entry *entry;
+
+	if (rmid == 0)
+		return;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	entry = __rmid_entry(rmid);
+
+	if (!is_llc_occupancy_enabled()) {
+		list_add_tail(&entry->list, &rmid_free_lru);
+		return;
+	}
+
+	add_rmid_to_limbo(entry);
+}
+
+/*
+ * Number of chunks counted between two reads of an MBM_CNTR_WIDTH-bit
+ * wide counter. Both values are shifted to the top of a u64 before the
+ * subtraction so that a counter wrap-around cancels out.
+ */
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
+{
+	const u64 shift = 64 - MBM_CNTR_WIDTH;
+	u64 delta;
+
+	delta = (cur_msr << shift) - (prev_msr << shift);
+
+	return delta >> shift;
+}
+
+/*
+ * Read event rr->evtid for @rmid and add the result to rr->val.
+ *
+ * - Occupancy: the raw counter value is added.
+ * - MBM total/local: the per-RMID mbm_state of domain rr->d is advanced by
+ *   the chunks counted since the previous read and the running total is
+ *   added. With rr->first set, the state is reset to the current counter
+ *   and nothing is added.
+ *
+ * Return: 0 on success. -EINVAL if the counter reports Error/Unavailable
+ * (rr->val is then overwritten with the raw value) or the event id is
+ * unknown.
+ */
+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+{
+	struct mbm_state *state;
+	u64 raw;
+
+	raw = __rmid_read(rmid, rr->evtid);
+	if (raw & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
+		rr->val = raw;
+		return -EINVAL;
+	}
+
+	switch (rr->evtid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		rr->val += raw;
+		return 0;
+	case QOS_L3_MBM_TOTAL_EVENT_ID:
+		state = &rr->d->mbm_total[rmid];
+		break;
+	case QOS_L3_MBM_LOCAL_EVENT_ID:
+		state = &rr->d->mbm_local[rmid];
+		break;
+	default:
+		/*
+		 * Not expected to be reached: an invalid event id
+		 * would already have failed the __rmid_read.
+		 */
+		return -EINVAL;
+	}
+
+	if (rr->first) {
+		memset(state, 0, sizeof(*state));
+		state->prev_msr = raw;
+		state->prev_bw_msr = state->prev_msr;
+		return 0;
+	}
+
+	state->chunks += mbm_overflow_count(state->prev_msr, raw);
+	state->prev_msr = raw;
+
+	rr->val += state->chunks;
+	return 0;
+}
+
+/*
+ * Supporting function to calculate the memory bandwidth
+ * and delta bandwidth in MBps.
+ *
+ * Reads event rr->evtid for @rmid and updates the mbm_local state of
+ * domain rr->d: the chunk totals, the bandwidth of the last interval
+ * (prev_bw) and, when a delta computation was requested through
+ * delta_comp, the change against the previous interval (delta_bw).
+ * Nothing is updated if the counter reports Error/Unavailable.
+ */
+static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+	struct mbm_state *m = &rr->d->mbm_local[rmid];
+	u64 tval, cur_bw, chunks;
+
+	tval = __rmid_read(rmid, rr->evtid);
+	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	chunks = mbm_overflow_count(m->prev_bw_msr, tval);
+	m->prev_bw_msr = tval;
+
+	m->chunks_bw += chunks;
+	m->chunks = m->chunks_bw;
+
+	/* Scale chunks by mon_scale, then shift down by 20 bits. */
+	cur_bw = (chunks * r->mon_scale) >> 20;
+	if (m->delta_comp)
+		m->delta_bw = abs(cur_bw - m->prev_bw);
+	m->prev_bw = cur_bw;
+	m->delta_comp = false;
+}
+
+/*
+ * This is called via IPI to read the CQM/MBM counters
+ * on a domain.
+ *
+ * @info is the struct rmid_read prepared by the caller. The group's own
+ * RMID is counted first; for a control group the RMIDs of its child
+ * monitor groups are added. Counting stops at the first failing read.
+ */
+void mon_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	struct rdtgroup *rdtgrp = rr->rgrp;
+	struct rdtgroup *child;
+
+	if (__mon_event_count(rdtgrp->mon.rmid, rr))
+		return;
+
+	if (rdtgrp->type != RDTCTRL_GROUP)
+		return;
+
+	/* For Ctrl groups read data from child monitor groups. */
+	list_for_each_entry(child, &rdtgrp->mon.crdtgrp_list,
+			    mon.crdtgrp_list) {
+		if (__mon_event_count(child->mon.rmid, rr))
+			break;
+	}
+}
+
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwidth(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	struct rdt_resource *r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	struct list_head *children = &rgrp->mon.crdtgrp_list;
+	u32 closid = rgrp->closid;
+	struct mbm_state *parent, *child_state;
+	u32 cur_bw, delta_bw, user_bw;
+	u32 cur_msr, cur_val, new_val;
+	struct rdt_domain *dom_mba;
+	struct rdtgroup *child;
+
+	parent = &dom_mbm->mbm_local[rgrp->mon.rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	user_bw = dom_mba->mbps_val[closid];
+	cur_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * Start from the ctrl group's own numbers and, for Ctrl groups,
+	 * add the data of the child monitor groups.
+	 */
+	cur_bw = parent->prev_bw;
+	delta_bw = parent->delta_bw;
+	list_for_each_entry(child, children, mon.crdtgrp_list) {
+		child_state = &dom_mbm->mbm_local[child->mon.rmid];
+		cur_bw += child_state->prev_bw;
+		delta_bw += child_state->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group. The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * don't alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
+	 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
+	 * switching between 90 and 110 continuously if we only check
+	 * cur_bw < user_bw.
+	 */
+	if (cur_val > r_mba->membw.min_bw && user_bw < cur_bw)
+		new_val = cur_val - r_mba->membw.bw_gran;
+	else if (cur_val < MAX_MBA_BW && (user_bw > (cur_bw + delta_bw)))
+		new_val = cur_val + r_mba->membw.bw_gran;
+	else
+		return;
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_val;
+
+	/*
+	 * Delta values are updated dynamically package wise for each
+	 * rdtgrp every time the throttle MSR changes value.
+	 *
+	 * This is because (1) the increase in bandwidth is not perfectly
+	 * linear and only "approximately" linear even when the hardware
+	 * says it is linear. (2) Also since MBA is a core specific
+	 * mechanism, the delta values vary based on number of cores used
+	 * by the rdtgrp.
+	 */
+	parent->delta_comp = true;
+	list_for_each_entry(child, children, mon.crdtgrp_list) {
+		child_state = &dom_mbm->mbm_local[child->mon.rmid];
+		child_state->delta_comp = true;
+	}
+}
+
+/*
+ * Refresh the MBM state of @rmid on domain @d for every enabled MBM event.
+ *
+ * The total event always goes through __mon_event_count(). The local
+ * event goes through mbm_bw_count() instead when the MBA software
+ * controller is enabled. The value accumulated in the local rmid_read is
+ * discarded; only the per-RMID mbm_state updates matter here.
+ */
+static void mbm_update(struct rdt_domain *d, int rmid)
+{
+	/*
+	 * Zero-initialize everything not set explicitly:
+	 * __mon_event_count() does "rr->val += ..." and must not read an
+	 * indeterminate stack value.
+	 */
+	struct rmid_read rr = {
+		.d = d,
+		.first = false,
+	};
+
+	/*
+	 * This is protected from concurrent reads from user
+	 * as both the user and we hold the global mutex.
+	 */
+	if (is_mbm_total_enabled()) {
+		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+		__mon_event_count(rmid, &rr);
+	}
+	if (is_mbm_local_enabled()) {
+		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
+	}
+}
+
+/*
+ * Delayed-work handler that scans the limbo RMIDs of the L3 domain the
+ * current CPU belongs to and moves those whose occupancy dropped below
+ * the threshold to the free list. The work re-arms itself on the same
+ * CPU for as long as the domain still has busy RMIDs.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+	int cpu = smp_processor_id();
+	struct rdt_resource *r;
+	struct rdt_domain *d;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	r = &rdt_resources_all[RDT_RESOURCE_L3];
+	d = get_domain_from_cpu(cpu, r);
+
+	if (d) {
+		__check_limbo(d, false);
+
+		if (has_busy_rmid(r, d))
+			schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+	} else {
+		pr_warn_once("Failure to get domain for limbo worker\n");
+	}
+
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * Schedule the limbo worker of @dom to run after @delay_ms milliseconds
+ * on a CPU picked from the domain's cpu_mask, and record that CPU in
+ * dom->cqm_work_cpu.
+ */
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	int cpu;
+
+	cpu = cpumask_any(&dom->cpu_mask);
+	dom->cqm_work_cpu = cpu;
+
+	schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+/*
+ * Delayed-work handler that refreshes the MBM state of every resource
+ * group (and of its child monitor groups) on the L3 domain of the current
+ * CPU, runs the MBA software controller for the group when enabled, and
+ * re-arms itself on the same CPU. Does nothing when rdt_enable_key is off
+ * or the domain cannot be found.
+ */
+void mbm_handle_overflow(struct work_struct *work)
+{
+	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+	struct rdtgroup *parent, *child;
+	int cpu = smp_processor_id();
+	struct rdt_domain *d;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	if (!static_branch_likely(&rdt_enable_key))
+		goto out_unlock;
+
+	d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
+	if (!d)
+		goto out_unlock;
+
+	list_for_each_entry(parent, &rdt_all_groups, rdtgroup_list) {
+		mbm_update(d, parent->mon.rmid);
+
+		list_for_each_entry(child, &parent->mon.crdtgrp_list,
+				    mon.crdtgrp_list) {
+			mbm_update(d, child->mon.rmid);
+		}
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(parent, d);
+	}
+
+	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * Schedule the MBM overflow worker of @dom to run after @delay_ms
+ * milliseconds on a CPU picked from the domain's cpu_mask, recording that
+ * CPU in dom->mbm_work_cpu. Nothing is scheduled while rdt_enable_key is
+ * off.
+ */
+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+	unsigned long delay = msecs_to_jiffies(delay_ms);
+	int cpu;
+
+	if (static_branch_likely(&rdt_enable_key)) {
+		cpu = cpumask_any(&dom->cpu_mask);
+		dom->mbm_work_cpu = cpu;
+		schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+	}
+}
+
+/*
+ * Allocate rmid_ptrs with one entry per RMID of @r and put every entry on
+ * the free list, then take RMID 0 back off that list.
+ *
+ * Return: 0 on success, -ENOMEM if the array cannot be allocated.
+ */
+static int dom_data_init(struct rdt_resource *r)
+{
+	struct rmid_entry *entry;
+	int rmid, nr_rmids = r->num_rmid;
+
+	rmid_ptrs = kcalloc(nr_rmids, sizeof(*rmid_ptrs), GFP_KERNEL);
+	if (!rmid_ptrs)
+		return -ENOMEM;
+
+	for (rmid = 0; rmid < nr_rmids; rmid++) {
+		entry = rmid_ptrs + rmid;
+		INIT_LIST_HEAD(&entry->list);
+
+		entry->rmid = rmid;
+		list_add_tail(&entry->list, &rmid_free_lru);
+	}
+
+	/*
+	 * RMID 0 is special and is always allocated. It's used for all
+	 * tasks that are not monitored.
+	 */
+	list_del(&__rmid_entry(0)->list);
+
+	return 0;
+}
+
+static struct mon_evt llc_occupancy_event = { /* listed by l3_mon_evt_init() if is_llc_occupancy_enabled() */
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = { /* listed by l3_mon_evt_init() if is_mbm_total_enabled() */
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = { /* listed by l3_mon_evt_init() if is_mbm_local_enabled() */
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ *
+ * Enabled events are appended in the order: LLC occupancy, MBM total,
+ * MBM local.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+	struct list_head *events = &r->evt_list;
+
+	INIT_LIST_HEAD(events);
+
+	if (is_llc_occupancy_enabled()) {
+		list_add_tail(&llc_occupancy_event.list, events);
+	}
+	if (is_mbm_total_enabled()) {
+		list_add_tail(&mbm_total_event.list, events);
+	}
+	if (is_mbm_local_enabled()) {
+		list_add_tail(&mbm_local_event.list, events);
+	}
+}
+
+/*
+ * Set up L3 monitoring for @r: take the occupancy scale and RMID count
+ * from boot_cpu_data, derive the default intel_cqm_threshold, allocate
+ * the RMID entries and build the event list.
+ *
+ * Return: 0 on success, or the error from dom_data_init().
+ */
+int rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+	int err;
+
+	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+	r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
+
+	/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+	intel_cqm_threshold /= r->mon_scale;
+
+	err = dom_data_init(r);
+	if (err)
+		return err;
+
+	l3_mon_evt_init(r);
+
+	r->mon_enabled = true;
+	r->mon_capable = true;
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
new file mode 100644
index 0000000..f8c260d
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -0,0 +1,1524 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/mman.h>
+#include <linux/pm_qos.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/intel-family.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/perf_event.h>
+
+#include "intel_rdt.h"
+
+#define CREATE_TRACE_POINTS
+#include "intel_rdt_pseudo_lock_event.h"
+
+/*
+ * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
+ * prefetcher state. Details about this register can be found in the MSR
+ * tables for specific platforms found in Intel's SDM.
+ */
+#define MSR_MISC_FEATURE_CONTROL 0x000001a4
+
+/*
+ * The bits needed to disable hardware prefetching varies based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/*
+ * Major number assigned to and shared by all devices exposing
+ * pseudo-locked regions.
+ */
+static unsigned int pseudo_lock_major;
+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
+static struct class *pseudo_lock_class;
+
+/**
+ * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under variety of load conditions
+ * as well as that these pseudo-locked regions can maintain their low cache
+ * miss rates under variety of load conditions for significant lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here also add support for its cache events to
+ * measure_cycles_perf_fn()
+ *
+ * Return:
+ * If platform is supported, the bits to disable hardware prefetchers, 0
+ * if platform is not supported.
+ */
+static u64 get_prefetch_disable_bits(void)
+{
+	/* Only Intel family 6 parts are candidates. */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (boot_cpu_data.x86 != 6)
+		return 0;
+
+	switch (boot_cpu_data.x86_model) {
+	case INTEL_FAM6_BROADWELL_X:
+		/*
+		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+		 * as:
+		 * 0    L2 Hardware Prefetcher Disable (R/W)
+		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
+		 * 2    DCU Hardware Prefetcher Disable (R/W)
+		 * 3    DCU IP Prefetcher Disable (R/W)
+		 * 63:4 Reserved
+		 */
+		return 0xF;
+	case INTEL_FAM6_ATOM_GOLDMONT:
+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
+		/*
+		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+		 * as:
+		 * 0     L2 Hardware Prefetcher Disable (R/W)
+		 * 1     Reserved
+		 * 2     DCU Hardware Prefetcher Disable (R/W)
+		 * 63:3  Reserved
+		 */
+		return 0x5;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Helper to write 64bit value to MSR without tracing. Used when
+ * use of the cache should be restricted and use of registers used
+ * for local variables avoided.
+ *
+ * @val is split into its low and high 32-bit halves, which are passed
+ * straight to __wrmsr() without intermediate local variables.
+ */
+static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
+{
+ __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
+}
+
+/**
+ * pseudo_lock_minor_get - Obtain available minor number
+ * @minor: Pointer to where new minor number will be stored
+ *
+ * A bitmask is used to track available minor numbers. Here the next free
+ * minor number is marked as unavailable and returned. Only the first
+ * MINORBITS bits of the mask are searched.
+ *
+ * Return: 0 on success, -ENOSPC if no minor number is available.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+	unsigned long bit;
+
+	bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+	if (bit != MINORBITS) {
+		__clear_bit(bit, &pseudo_lock_minor_avail);
+		*minor = bit;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to available
+ * @minor: The minor number made available
+ *
+ * Sets bit @minor in the pseudo_lock_minor_avail bitmask again, undoing
+ * the __clear_bit() done by pseudo_lock_minor_get().
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+ __set_bit(minor, &pseudo_lock_minor_avail);
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region it belongs.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ * region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+	struct rdtgroup *rdtgrp;
+
+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+		/* Groups without a pseudo-lock region cannot match. */
+		if (!rdtgrp->plr)
+			continue;
+		if (rdtgrp->plr->minor == minor)
+			return rdtgrp;
+	}
+
+	return NULL;
+}
+
+/**
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
+ * @list: Entry within the @pm_reqs list for a pseudo-locked region
+ * @req: PM QoS request
+ */
+struct pseudo_lock_pm_req {
+ struct list_head list;
+ struct dev_pm_qos_request req;
+};
+
+/*
+ * Undo pseudo_lock_cstates_constrain(): remove every PM QoS request
+ * queued on plr->pm_reqs, unlink it and free it, leaving the list empty.
+ */
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+	struct pseudo_lock_pm_req *pm_req;
+
+	while (!list_empty(&plr->pm_reqs)) {
+		pm_req = list_first_entry(&plr->pm_reqs,
+					  struct pseudo_lock_pm_req, list);
+		dev_pm_qos_remove_request(&pm_req->req);
+		list_del(&pm_req->list);
+		kfree(pm_req);
+	}
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ *
+ * To prevent the cache from being affected by power management entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In this case the latency
+ * requirement needs to prevent entering C2 also.
+ *
+ * One resume-latency request is added per CPU of the region's domain and
+ * queued on plr->pm_reqs. On any failure all requests added so far are
+ * removed again.
+ *
+ * Return: 0 on success, -ENOMEM if a request cannot be allocated, -1 if
+ * adding a request fails.
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+	struct pseudo_lock_pm_req *pm_req;
+	int err = 0;
+	int cpu;
+	int ret;
+
+	for_each_cpu(cpu, &plr->d->cpu_mask) {
+		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+		if (!pm_req) {
+			rdt_last_cmd_puts("fail allocating mem for PM QoS\n");
+			err = -ENOMEM;
+			break;
+		}
+
+		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+					     &pm_req->req,
+					     DEV_PM_QOS_RESUME_LATENCY,
+					     30);
+		if (ret < 0) {
+			rdt_last_cmd_printf("fail to add latency req cpu%d\n",
+					    cpu);
+			kfree(pm_req);
+			err = -1;
+			break;
+		}
+
+		list_add(&pm_req->list, &plr->pm_reqs);
+	}
+
+	if (err)
+		pseudo_lock_cstates_relax(plr);
+
+	return err;
+}
+
+/**
+ * pseudo_lock_region_clear - Reset pseudo-lock region data
+ * @plr: pseudo-lock region
+ *
+ * All content of the pseudo-locked region is reset - any memory allocated
+ * is freed, and the back-pointer from the associated domain (if any) is
+ * cleared.
+ *
+ * Return: void
+ */
+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
+{
+	/* Release the backing memory first. */
+	kfree(plr->kmem);
+	plr->kmem = NULL;
+
+	/* Detach from the domain before forgetting it. */
+	if (plr->d)
+		plr->d->plr = NULL;
+	plr->d = NULL;
+	plr->r = NULL;
+
+	plr->cbm = 0;
+	plr->size = 0;
+	plr->line_size = 0;
+	plr->debugfs_dir = NULL;
+}
+
+/**
+ * pseudo_lock_region_init - Initialize pseudo-lock region information
+ * @plr: pseudo-lock region
+ *
+ * Called after user provided a schemata to be pseudo-locked. From the
+ * schemata the &struct pseudo_lock_region is on entry already initialized
+ * with the resource, domain, and capacity bitmask. Here the information
+ * required for pseudo-locking is deduced from this data and &struct
+ * pseudo_lock_region initialized further. This information includes:
+ * - size in bytes of the region to be pseudo-locked
+ * - cache line size to know the stride with which data needs to be accessed
+ *   to be pseudo-locked
+ * - a cpu associated with the cache instance on which the pseudo-locking
+ *   flow can be executed
+ *
+ * On failure the region is reset with pseudo_lock_region_clear().
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
+{
+	struct cpu_cacheinfo *ci;
+	int leaf;
+
+	/* Pick the first cpu we find that is associated with the cache. */
+	plr->cpu = cpumask_first(&plr->d->cpu_mask);
+
+	if (!cpu_online(plr->cpu)) {
+		rdt_last_cmd_printf("cpu %u associated with cache not online\n",
+				    plr->cpu);
+		pseudo_lock_region_clear(plr);
+		return -ENODEV;
+	}
+
+	ci = get_cpu_cacheinfo(plr->cpu);
+
+	plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm);
+
+	/* The line size comes from the leaf at the resource's cache level. */
+	for (leaf = 0; leaf < ci->num_leaves; leaf++) {
+		if (ci->info_list[leaf].level != plr->r->cache_level)
+			continue;
+
+		plr->line_size = ci->info_list[leaf].coherency_line_size;
+		return 0;
+	}
+
+	rdt_last_cmd_puts("unable to determine cache line size\n");
+	pseudo_lock_region_clear(plr);
+	return -1;
+}
+
+/**
+ * pseudo_lock_init - Initialize a pseudo-lock region
+ * @rdtgrp: resource group to which new pseudo-locked region will belong
+ *
+ * Allocates a zeroed &struct pseudo_lock_region, initializes its waitqueue
+ * and its pm_reqs list, and attaches it to @rdtgrp. Details of the region
+ * (size, domain, bitmask) are not known at this time and are left
+ * untouched.
+ *
+ * Return: 0 on success, -ENOMEM if the region could not be allocated
+ */
+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
+{
+	struct pseudo_lock_region *region;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->pm_reqs);
+	init_waitqueue_head(&region->lock_thread_wq);
+	rdtgrp->plr = region;
+	return 0;
+}
+
+/**
+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
+ * @plr: pseudo-lock region
+ *
+ * Derives the remaining region details with pseudo_lock_region_init() and
+ * then allocates the zeroed, contiguous kernel memory that will be
+ * pseudo-locked to the cache. The region size is bounded by
+ * KMALLOC_MAX_SIZE because the buffer comes from kzalloc().
+ *
+ * When a step after the initialization fails, @plr is reset with
+ * pseudo_lock_region_clear().
+ *
+ * Return: 0 on success, -E2BIG if the region is larger than
+ * KMALLOC_MAX_SIZE, -ENOMEM if the allocation fails, or the error from
+ * pseudo_lock_region_init(). A descriptive error is written to the
+ * last_cmd_status buffer on failure.
+ */
+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
+{
+	int ret = pseudo_lock_region_init(plr);
+
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * We do not yet support contiguous regions larger than
+	 * KMALLOC_MAX_SIZE.
+	 */
+	if (plr->size > KMALLOC_MAX_SIZE) {
+		rdt_last_cmd_puts("requested region exceeds maximum size\n");
+		pseudo_lock_region_clear(plr);
+		return -E2BIG;
+	}
+
+	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
+	if (plr->kmem)
+		return 0;
+
+	rdt_last_cmd_puts("unable to allocate memory\n");
+	pseudo_lock_region_clear(plr);
+	return -ENOMEM;
+}
+
+/**
+ * pseudo_lock_free - Free a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-locked region belonged
+ *
+ * Resets the region, frees it and clears @rdtgrp's pointer to it.
+ *
+ * Return: void
+ */
+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
+{
+	struct pseudo_lock_region *region = rdtgrp->plr;
+
+	pseudo_lock_region_clear(region);
+	kfree(region);
+	rdtgrp->plr = NULL;
+}
+
+/**
+ * pseudo_lock_fn - Load kernel memory into cache
+ * @_rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while core is running
+ * with class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+static int pseudo_lock_fn(void *_rdtgrp)
+{
+ struct rdtgroup *rdtgrp = _rdtgrp;
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Make sure none of the allocated memory is cached. If it is we
+ * will get a cache hit in below loop from outside of pseudo-locked
+ * region.
+ * wbinvd (as opposed to clflush/clflushopt) is required to
+ * increase likelihood that allocated cache portion will be filled
+ * with associated memory.
+ */
+ native_wbinvd();
+
+ /*
+ * Always called with interrupts enabled. By disabling interrupts
+ * ensure that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated rdtgrp->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+
+ plr->thread_done = 1; /* the condition the waiter sleeps on */
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * rdtgroup_monitor_in_progress - Test if monitoring in progress
+ * @rdtgrp: resource group being queried
+ *
+ * Return: 1 if the group's list of child monitor groups is non-empty,
+ * 0 otherwise.
+ */
+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
+{
+	return list_empty(&rdtgrp->mon.crdtgrp_list) ? 0 : 1;
+}
+
+/**
+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
+ * @rdtgrp: resource group needing access restricted
+ *
+ * A resource group used for cache pseudo-locking cannot have cpus or tasks
+ * assigned to it. This is communicated to the user by restricting access
+ * to the files through which such changes are made: "tasks", "cpus",
+ * "cpus_list" and, when rdt_mon_capable is set, "mon_groups".
+ *
+ * The counterpart that lifts these restrictions again is
+ * rdtgroup_locksetup_user_restore().
+ *
+ * Return: 0 on success, otherwise the error of the first restriction that
+ * failed. In that case the files restricted so far are handed to
+ * rdtgroup_kn_mode_restore() with mode 0777 in an attempt to roll back.
+ * The result of the rollback is not checked, so the state of the mode of
+ * these files is uncertain after a failure.
+ */
+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
+{
+	int err;
+
+	err = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+	if (err)
+		return err;
+
+	err = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+	if (err)
+		goto undo_tasks;
+
+	err = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+	if (err)
+		goto undo_cpus;
+
+	if (!rdt_mon_capable)
+		return 0;
+
+	err = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
+	if (!err)
+		return 0;
+
+	/* Unwind in reverse order of restriction. */
+	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+undo_cpus:
+	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+undo_tasks:
+	rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+	return err;
+}
+
+/**
+ * rdtgroup_locksetup_user_restore - Restore user access to group
+ * @rdtgrp: resource group needing access restored
+ *
+ * Lifts the restrictions that rdtgroup_locksetup_user_restrict() placed
+ * on the group: "tasks", "cpus", "cpus_list" and, when rdt_mon_capable
+ * is set, "mon_groups" are each handed to rdtgroup_kn_mode_restore() with
+ * mode 0777.
+ *
+ * Return: 0 on success, otherwise the error of the first restoration that
+ * failed. In that case the files restored so far are restricted again in
+ * an attempt to roll back. The result of the rollback is not checked, so
+ * the state of the mode of these files is uncertain after a failure.
+ */
+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
+{
+	int err;
+
+	err = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+	if (err)
+		return err;
+
+	err = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+	if (err)
+		goto redo_tasks;
+
+	err = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+	if (err)
+		goto redo_cpus;
+
+	if (!rdt_mon_capable)
+		return 0;
+
+	err = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
+	if (!err)
+		return 0;
+
+	/* Put the restrictions back in reverse order of restoration. */
+	rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+
+redo_cpus:
+	rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+redo_tasks:
+	rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+	return err;
+}
+
+/**
+ * rdtgroup_locksetup_enter - Resource group enters locksetup mode
+ * @rdtgrp: resource group requested to enter locksetup mode
+ *
+ * Locksetup mode marks a resource group as being prepared to represent a
+ * pseudo-locked region. Such a group would lose the closid associated
+ * with it, so it may not have tasks or cpus assigned now, nor accept any
+ * in the future. Monitoring of a pseudo-locked region is not allowed
+ * either.
+ *
+ * These restrictions, and more, are checked for and enforced before the
+ * group enters locksetup mode. Once all checks pass, access to the files
+ * used for assignment is restricted, an empty
+ * &struct pseudo_lock_region is attached to the group and the group's
+ * rmid is freed.
+ *
+ * Return: 0 if the resource group successfully entered locksetup mode, <0
+ * on failure. On failure the last_cmd_status buffer is updated with text to
+ * communicate details of failure to the user.
+ */
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+	int ret;
+
+	/*
+	 * The default resource group can neither be removed nor lose the
+	 * default closid associated with it.
+	 */
+	if (rdtgrp == &rdtgroup_default) {
+		rdt_last_cmd_puts("cannot pseudo-lock default group\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Cache Pseudo-locking not supported when CDP is enabled.
+	 *
+	 * Some things to consider if you would like to enable this
+	 * support (using L3 CDP as example):
+	 * - When CDP is enabled two separate resources are exposed,
+	 *   L3DATA and L3CODE, but they are actually on the same cache.
+	 *   The implication for pseudo-locking is that if a
+	 *   pseudo-locked region is created on a domain of one
+	 *   resource (eg. L3CODE), then a pseudo-locked region cannot
+	 *   be created on that same domain of the other resource
+	 *   (eg. L3DATA). This is because the creation of a
+	 *   pseudo-locked region involves a call to wbinvd that will
+	 *   affect all cache allocations on particular domain.
+	 * - Considering the previous, it may be possible to only
+	 *   expose one of the CDP resources to pseudo-locking and
+	 *   hide the other. For example, we could consider to only
+	 *   expose L3DATA and since the L3 cache is unified it is
+	 *   still possible to place instructions there and execute it.
+	 * - If only one region is exposed to pseudo-locking we should
+	 *   still keep in mind that availability of a portion of cache
+	 *   for pseudo-locking should take into account both resources.
+	 *   Similarly, if a pseudo-locked region is created in one
+	 *   resource, the portion of cache used by it should be made
+	 *   unavailable to all future allocations from both resources.
+	 */
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled ||
+	    rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) {
+		rdt_last_cmd_puts("CDP enabled\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Not knowing the bits to disable prefetching implies that this
+	 * platform does not support Cache Pseudo-Locking.
+	 */
+	prefetch_disable_bits = get_prefetch_disable_bits();
+	if (!prefetch_disable_bits) {
+		rdt_last_cmd_puts("pseudo-locking not supported\n");
+		return -EINVAL;
+	}
+
+	/* The group must be idle: no monitor groups, tasks or cpus. */
+	if (rdtgroup_monitor_in_progress(rdtgrp)) {
+		rdt_last_cmd_puts("monitoring in progress\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_tasks_assigned(rdtgrp)) {
+		rdt_last_cmd_puts("tasks assigned to resource group\n");
+		return -EINVAL;
+	}
+
+	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+		rdt_last_cmd_puts("CPUs assigned to resource group\n");
+		return -EINVAL;
+	}
+
+	/* Keep user space from assigning tasks or cpus from here on. */
+	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+		rdt_last_cmd_puts("unable to modify resctrl permissions\n");
+		return -EIO;
+	}
+
+	ret = pseudo_lock_init(rdtgrp);
+	if (ret) {
+		rdt_last_cmd_puts("unable to init pseudo-lock region\n");
+		/* Undo the permission changes made above. */
+		rdtgroup_locksetup_user_restore(rdtgrp);
+		return ret;
+	}
+
+	/*
+	 * If this system is capable of monitoring a rmid would have been
+	 * allocated when the control group was created. This is not needed
+	 * anymore when this group would be used for pseudo-locking. This
+	 * is safe to call on platforms not capable of monitoring.
+	 */
+	free_rmid(rdtgrp->mon.rmid);
+
+	return 0;
+}
+
+/**
+ * rdtgroup_locksetup_exit - resource group exits locksetup mode
+ * @rdtgrp: resource group
+ *
+ * Lifts the restrictions of locksetup mode: when rdt_mon_capable is set a
+ * new rmid is allocated, file permissions are restored and the region is freed.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+	int rmid, err;
+
+	if (rdt_mon_capable) {
+		rmid = alloc_rmid();
+		if (rmid < 0) {
+			rdt_last_cmd_puts("out of RMIDs\n");
+			return rmid;
+		}
+		rdtgrp->mon.rmid = rmid;
+	}
+
+	err = rdtgroup_locksetup_user_restore(rdtgrp);
+	if (err) {
+		free_rmid(rdtgrp->mon.rmid);
+		return err;
+	}
+
+	pseudo_lock_free(rdtgrp);
+	return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
+ * @d: RDT domain
+ * @cbm: CBM to test
+ *
+ * @d stands for one cache instance and @cbm for a capacity bitmask that is
+ * being considered for it. The test is whether @cbm shares any bit with
+ * the bitmask of the pseudo-locked region, if any, that exists on @d.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with pseudo-locked region on @d, false
+ * otherwise.
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
+{
+	unsigned long locked_cbm;
+	unsigned int nbits;
+
+	/* No pseudo-locked region on this domain, nothing to overlap with. */
+	if (!d->plr)
+		return false;
+
+	nbits = d->plr->r->cache.cbm_len;
+	locked_cbm = d->plr->cbm;
+	return bitmap_intersects(&cbm, &locked_cbm, nbits);
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * Setting up a pseudo-locked region affects every cache instance in the
+ * hierarchy of that region. Before a new region is created it must thus
+ * be known whether the hierarchy of @d already contains one, which is
+ * decided by comparing the cpus of @d with the cpus of all domains that
+ * have a region.
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ * if the temporary cpumask could not be allocated, false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
+{
+	cpumask_var_t locked_cpus;
+	struct rdt_resource *res;
+	struct rdt_domain *dom;
+	bool found;
+
+	/* Cannot test without the scratch mask: report true as documented. */
+	if (!zalloc_cpumask_var(&locked_cpus, GFP_KERNEL))
+		return true;
+
+	/*
+	 * Collect the cpus of every domain, across all alloc enabled
+	 * resources, that has a pseudo-locked region associated with it.
+	 */
+	for_each_alloc_enabled_rdt_resource(res) {
+		list_for_each_entry(dom, &res->domains, list) {
+			if (!dom->plr)
+				continue;
+			cpumask_or(locked_cpus, locked_cpus, &dom->cpu_mask);
+		}
+	}
+
+	/*
+	 * A new region on @d would intersect with an existing region when
+	 * @d shares at least one cpu with those domains.
+	 */
+	found = cpumask_intersects(&d->cpu_mask, locked_cpus);
+
+	free_cpumask_var(locked_cpus);
+	return found;
+}
+
+/**
+ * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory, the speed of
+ * access is a good way to learn how close to the cpu the data was. Even
+ * more, if the prefetcher is disabled and the memory is read at a stride
+ * of half the cache line, then a cache miss will be easy to spot since the
+ * read of the first half would be significantly slower than the read of
+ * the second half. Each read is reported via the mem_latency tracepoint.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+static int measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ unsigned long i;
+ u64 start, end;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer to access memory.
+ * The cost is that accessing this pointer, which could be in
+ * cache, will be included in the measurement of memory read latency.
+ */
+ void *mem_r;
+#else
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ local_irq_disable();
+ /*
+ * The wrmsr call may be reordered with the assignment below it.
+ * Call wrmsr as directly as possible to avoid tracing clobbering
+ * local register variable used for memory pointer.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ mem_r = plr->kmem;
+ /*
+ * Dummy execute of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
+ for (i = 0; i < plr->size; i += 32) { /* presumably half a 64 byte line */
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); /* undo the prefetch disable */
+ local_irq_enable();
+ plr->thread_done = 1; /* the condition the waiter sleeps on */
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/*
+ * Count cache hits and misses, via performance counters, of one pass over
+ * the pseudo-locked memory and report them through tracepoints.
+ */
+static int measure_cycles_perf_fn(void *_plr)
+{
+ unsigned long long l3_hits = 0, l3_miss = 0;
+ u64 l3_hit_bits = 0, l3_miss_bits = 0;
+ struct pseudo_lock_region *plr = _plr;
+ unsigned long long l2_hits, l2_miss;
+ u64 l2_hit_bits, l2_miss_bits;
+ unsigned long i;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use regular variables
+ * at the cost of including cache access latency to these variables
+ * in the measurements.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L1_HIT 01H
+ * L2_HIT 02H
+ * L1_MISS 08H
+ * L2_MISS 10H
+ *
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform we use the following events instead:
+ * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
+ * REFERENCES FFH
+ * MISS 3FH
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ *
+ * Start by setting flags for IA32_PERFEVTSELx:
+ * OS (Operating system mode) 0x2
+ * INT (APIC interrupt enable) 0x10
+ * EN (Enable counter) 0x40
+ *
+ * Then add the Umask value and event number to select performance
+ * event.
+ */
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GEMINI_LAKE:
+ l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
+ l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
+ break;
+ case INTEL_FAM6_BROADWELL_X:
+ /* On BDW the l2_hit_bits count references, not hits */
+ l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
+ l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
+ /* On BDW the l3_hit_bits count references, not hits */
+ l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
+ l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
+ break;
+ default:
+ goto out;
+ }
+
+ local_irq_disable();
+ /*
+ * Call wrmsr directly to avoid the local register variables from
+ * being overwritten due to reordering of their assignment with
+ * the wrmsr calls.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ /* Disable events and reset counters */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
+ }
+ /* Set and enable the L2 counters */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
+ l3_hit_bits);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ l3_miss_bits);
+ }
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ for (i = 0; i < size; i += line_size) {
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Call wrmsr directly (no tracing) to not influence
+ * the cache access counters as they are disabled.
+ */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
+ l2_hit_bits & ~(0x40ULL << 16));
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
+ l2_miss_bits & ~(0x40ULL << 16));
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
+ l3_hit_bits & ~(0x40ULL << 16));
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ l3_miss_bits & ~(0x40ULL << 16));
+ }
+ l2_hits = native_read_pmc(0);
+ l2_miss = native_read_pmc(1);
+ if (l3_hit_bits > 0) {
+ l3_hits = native_read_pmc(2);
+ l3_miss = native_read_pmc(3);
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+ /*
+ * On BDW we count references and misses, need to adjust. Sometimes
+ * the "hits" counter is a bit more than the references, for
+ * example, x references but x + 1 hits. To not report invalid
+ * hit values in this case we treat that as misses equal to
+ * references.
+ */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
+ l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
+ trace_pseudo_lock_l2(l2_hits, l2_miss);
+ if (l3_hit_bits > 0) {
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
+ l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
+ trace_pseudo_lock_l3(l3_hits, l3_miss);
+ }
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: resource group owning the pseudo-locked region
+ * @sel: 1 for measure_cycles_lat_fn(), 2 for measure_cycles_perf_fn()
+ *
+ * Starts the selected measurement in a kernel thread bound to the first cpu
+ * of the region's cache domain and waits for that thread to complete.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+	struct pseudo_lock_region *plr = rdtgrp->plr;
+	int (*measure_fn)(void *data);
+	struct task_struct *worker;
+	unsigned int cpu;
+	int ret = -1;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	if (rdtgrp->flags & RDT_DELETED) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	plr->thread_done = 0;
+	cpu = cpumask_first(&plr->d->cpu_mask);
+	if (!cpu_online(cpu)) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	switch (sel) {
+	case 1:
+		measure_fn = measure_cycles_lat_fn;
+		break;
+	case 2:
+		measure_fn = measure_cycles_perf_fn;
+		break;
+	default:
+		/* Unknown measurement: fail with the initial ret of -1. */
+		goto unlock;
+	}
+
+	worker = kthread_create_on_node(measure_fn, plr, cpu_to_node(cpu),
+					"pseudo_lock_measure/%u", cpu);
+	if (IS_ERR(worker)) {
+		ret = PTR_ERR(worker);
+		goto unlock;
+	}
+	kthread_bind(worker, cpu);
+	wake_up_process(worker);
+
+	ret = wait_event_interruptible(plr->lock_thread_wq,
+				       plr->thread_done == 1);
+	if (ret > 0)
+		ret = 0;
+
+unlock:
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+	return ret;
+}
+
+/* debugfs write: "1" runs the latency, "2" the perf counter measurement. */
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+					   const char __user *user_buf,
+					   size_t count, loff_t *ppos)
+{
+	struct rdtgroup *rdtgrp = file->private_data;
+	size_t buf_size;
+	char buf[32];
+	int ret, sel;
+
+	buf_size = min(count, (sizeof(buf) - 1));
+	if (copy_from_user(buf, user_buf, buf_size))
+		return -EFAULT;
+
+	buf[buf_size] = '\0';
+	ret = kstrtoint(buf, 10, &sel);
+	if (ret)
+		return ret;
+	/* Accept every selector pseudo_lock_measure_cycles() implements. */
+	if (sel != 1 && sel != 2)
+		return -EINVAL;
+	ret = debugfs_file_get(file->f_path.dentry);
+	if (ret)
+		return ret;
+	ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+	if (ret == 0)
+		ret = count;
+	debugfs_file_put(file->f_path.dentry);
+	return ret;
+}
+
+static const struct file_operations pseudo_measure_fops = {
+	.open = simple_open,
+	.write = pseudo_lock_measure_trigger,	/* a write starts a measurement */
+	.llseek = default_llseek,
+};
+
+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared from all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.
+ */
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int new_minor;
+ struct device *dev;
+ int ret;
+
+ ret = pseudo_lock_region_alloc(plr);
+ if (ret < 0)
+ return ret;
+
+ ret = pseudo_lock_cstates_constrain(plr);
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto out_region;
+ }
+
+ plr->thread_done = 0;
+
+ thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
+ cpu_to_node(plr->cpu),
+ "pseudo_lock/%u", plr->cpu);
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ rdt_last_cmd_printf("locking thread returned error %d\n", ret);
+ goto out_cstates;
+ }
+
+ kthread_bind(thread, plr->cpu);
+ wake_up_process(thread);
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0) {
+ /*
+ * If the thread does not get on the CPU for whatever
+ * reason and the process which sets up the region is
+ * interrupted then this will leave the thread in runnable
+ * state and once it gets on the CPU it will dereference
+ * the cleared, but not freed, plr struct resulting in an
+ * empty pseudo-locking loop.
+ */
+ rdt_last_cmd_puts("locking thread interrupted\n");
+ goto out_cstates;
+ }
+
+ ret = pseudo_lock_minor_get(&new_minor);
+ if (ret < 0) {
+ rdt_last_cmd_puts("unable to obtain a new minor number\n");
+ goto out_cstates;
+ }
+
+ /*
+ * Unlock access but do not release the reference. The
+ * pseudo-locked region will still be here on return.
+ *
+ * The mutex has to be released temporarily to avoid a potential
+ * deadlock with the mm->mmap_sem semaphore which is obtained in
+ * the device_create() and debugfs_create_dir() callpath below
+ * as well as before the mmap() callback is called.
+ */
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
+ plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
+ debugfs_resctrl);
+ if (!IS_ERR_OR_NULL(plr->debugfs_dir))
+ debugfs_create_file("pseudo_lock_measure", 0200,
+ plr->debugfs_dir, rdtgrp,
+ &pseudo_measure_fops);
+ }
+
+ dev = device_create(pseudo_lock_class, NULL,
+ MKDEV(pseudo_lock_major, new_minor),
+ rdtgrp, "%s", rdtgrp->kn->name);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ rdt_last_cmd_printf("failed to create character device: %d\n",
+ ret);
+ goto out_debugfs;
+ }
+
+ /* We released the mutex - check if group was removed while we did so */
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out_device;
+ }
+
+ plr->minor = new_minor;
+
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
+ closid_free(rdtgrp->closid);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
+
+ ret = 0;
+ goto out;
+
+out_device: /* the labels below undo the setup steps in reverse order */
+ device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
+out_debugfs:
+ debugfs_remove_recursive(plr->debugfs_dir);
+ pseudo_lock_minor_release(new_minor);
+out_cstates:
+ pseudo_lock_cstates_relax(plr);
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
+ * @rdtgrp: resource group to which the pseudo-locked region belongs
+ *
+ * Removal is initiated when the resource group is removed from user space
+ * with "rmdir" or when the resctrl filesystem is unmounted. The group is
+ * removed directly, without first returning to pseudo-locksetup mode. This
+ * makes removal asymmetric to creation: the &struct pseudo_lock_region is
+ * freed here although rdtgroup_pseudo_lock_create() did not allocate it.
+ *
+ * A group still in pseudo-locksetup mode has only a closid to give back.
+ *
+ * Return: void
+ */
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
+{
+	struct pseudo_lock_region *plr = rdtgrp->plr;
+
+	if (rdtgrp->mode != RDT_MODE_PSEUDO_LOCKSETUP) {
+		/* Tear down what rdtgroup_pseudo_lock_create() set up. */
+		pseudo_lock_cstates_relax(plr);
+		debugfs_remove_recursive(plr->debugfs_dir);
+		device_destroy(pseudo_lock_class,
+			       MKDEV(pseudo_lock_major, plr->minor));
+		pseudo_lock_minor_release(plr->minor);
+	} else {
+		/*
+		 * Default group cannot be a pseudo-locked region so we can
+		 * free closid here.
+		 */
+		closid_free(rdtgrp->closid);
+	}
+
+	pseudo_lock_free(rdtgrp);
+}
+
+/*
+ * Open a pseudo-lock device: tie the file to the group owning the minor.
+ */
+static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = -ENODEV;
+
+	mutex_lock(&rdtgroup_mutex);
+	rdtgrp = region_find_by_minor(iminor(inode));
+	if (rdtgrp) {
+		filp->private_data = rdtgrp;
+		atomic_inc(&rdtgrp->waitcount);
+		/* Perform a non-seekable open - llseek is not supported */
+		filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+		ret = 0;
+	}
+	mutex_unlock(&rdtgroup_mutex);
+
+	return ret;
+}
+
+/*
+ * Release a pseudo-lock device file: drop the waitcount taken at open.
+ */
+static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
+{
+	struct rdtgroup *rdtgrp;
+
+	mutex_lock(&rdtgroup_mutex);
+	rdtgrp = filp->private_data;
+	if (!WARN_ON(!rdtgrp)) {
+		filp->private_data = NULL;
+		atomic_dec(&rdtgrp->waitcount);
+	}
+	mutex_unlock(&rdtgroup_mutex);
+	return rdtgrp ? 0 : -ENODEV;
+}
+
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+{
+ /* Remapping of a pseudo-lock mapping is not supported */
+ return -EINVAL;
+}
+
+static const struct vm_operations_struct pseudo_mmap_ops = {
+ .mremap = pseudo_lock_dev_mremap, /* always fails with -EINVAL */
+};
+
+/*
+ * mmap handler of a pseudo-lock device: zero the requested part of the
+ * pseudo-locked kernel memory and map it shared into the calling task.
+ */
+static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	unsigned long vsize = vma->vm_end - vma->vm_start;
+	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+	struct pseudo_lock_region *plr;
+	unsigned long physical, psize;
+	struct rdtgroup *rdtgrp;
+
+	mutex_lock(&rdtgroup_mutex);
+	rdtgrp = filp->private_data;
+	if (WARN_ON(!rdtgrp)) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -ENODEV;
+	}
+
+	plr = rdtgrp->plr;
+
+	/*
+	 * Task is required to run with affinity to the cpus associated
+	 * with the pseudo-locked region. If this is not the case the task
+	 * may be scheduled elsewhere and invalidate entries in the
+	 * pseudo-locked region.
+	 */
+	if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -EINVAL;
+	}
+
+	/* Reject offsets past the region before computing what is left. */
+	if (off > plr->size) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -ENOSPC;
+	}
+	physical = __pa(plr->kmem) >> PAGE_SHIFT;
+	psize = plr->size - off;
+
+	/*
+	 * Ensure changes are carried directly to the memory being mapped,
+	 * do not allow copy-on-write mapping.
+	 */
+	if (!(vma->vm_flags & VM_SHARED)) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -EINVAL;
+	}
+
+	if (vsize > psize) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -ENOSPC;
+	}
+
+	memset(plr->kmem + off, 0, vsize);
+	if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
+			    vsize, vma->vm_page_prot)) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -EAGAIN;
+	}
+	vma->vm_ops = &pseudo_mmap_ops;
+	mutex_unlock(&rdtgroup_mutex);
+	return 0;
+}
+
+static const struct file_operations pseudo_lock_dev_fops = {
+	.owner =	THIS_MODULE,
+	.open =		pseudo_lock_dev_open,
+	.release =	pseudo_lock_dev_release,
+	.mmap =		pseudo_lock_dev_mmap,
+	.llseek =	no_llseek,
+	.read =		NULL,	/* no read()/write() handlers are provided */
+	.write =	NULL,
+};
+
+static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
+{
+	struct rdtgroup *rdtgrp = dev_get_drvdata(dev);
+
+	/* Request owner-only read/write permission for the node. */
+	if (mode)
+		*mode = 0600;
+	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
+}
+
+/* Register the pseudo_lock character device major and device class. */
+int rdt_pseudo_lock_init(void)
+{
+	int major = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
+
+	if (major < 0)
+		return major;
+
+	pseudo_lock_major = major;
+
+	pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
+	if (!IS_ERR(pseudo_lock_class)) {
+		pseudo_lock_class->devnode = pseudo_lock_devnode;
+		return 0;
+	}
+
+	/* Class creation failed: give the major number back. */
+	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+	return PTR_ERR(pseudo_lock_class);
+}
+
+void rdt_pseudo_lock_release(void)
+{
+	class_destroy(pseudo_lock_class);	/* reverse of rdt_pseudo_lock_init() */
+	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+	pseudo_lock_class = NULL;
+	pseudo_lock_major = 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
new file mode 100644
index 0000000..2c041e6
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PSEUDO_LOCK_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracepoint recording one u32 latency sample, printed as "latency=<n>".
+ * NOTE(review): the unit of the sample is not visible here - confirm at the
+ * call site.
+ */
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+/*
+ * Tracepoint recording a pair of u64 counters (l2_hits, l2_miss), printed as
+ * "hits=<n> miss=<n>".
+ */
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+/*
+ * Tracepoint recording a pair of u64 counters (l3_hits, l3_miss), printed as
+ * "hits=<n> miss=<n>".
+ */
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _TRACE_PSEUDO_LOCK_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 0000000..643670f
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,2925 @@
+/*
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
+
+/* Static branch keys, all initially false. */
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+/* kernfs root of the resctrl hierarchy */
+static struct kernfs_root *rdt_root;
+/* The default resource group; CLOSID 0 is reserved for it (see closid_init()) */
+struct rdtgroup rdtgroup_default;
+/* Resource groups, linked through rdtgroup::rdtgroup_list */
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
+/*
+ * Status text of the last command; modified only through the
+ * rdt_last_cmd_*() helpers with rdtgroup_mutex held.
+ * NOTE(review): last_cmd_status_buf is presumably the storage behind
+ * last_cmd_status (rdt_last_cmd_status_show() prints from it) - the
+ * seq_buf initialisation is not in this part of the file.
+ */
+static struct seq_buf last_cmd_status;
+static char last_cmd_status_buf[512];
+
+/* NOTE(review): presumably the resctrl debugfs directory - set elsewhere */
+struct dentry *debugfs_resctrl;
+
+/* Reset the last-command status buffer. Caller must hold rdtgroup_mutex. */
+void rdt_last_cmd_clear(void)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_clear(&last_cmd_status);
+}
+
+/*
+ * Append the string @s to the last-command status buffer.
+ * Caller must hold rdtgroup_mutex.
+ */
+void rdt_last_cmd_puts(const char *s)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_puts(&last_cmd_status, s);
+}
+
+/*
+ * Append printf-style formatted text to the last-command status buffer.
+ * Caller must hold rdtgroup_mutex.
+ */
+void rdt_last_cmd_printf(const char *fmt, ...)
+{
+	va_list args;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	va_start(args, fmt);
+	seq_buf_vprintf(&last_cmd_status, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+/* Bit n set means CLOSID n is free; bit n clear means it is in use */
+static int closid_free_map;
+/* Number of CLOSIDs covered by the map, as computed by closid_init() */
+static int closid_free_map_len;
+
+/*
+ * Return the number of CLOSIDs managed by the allocator, i.e. the smallest
+ * num_closid of the allocation-enabled resources as computed by
+ * closid_init().
+ */
+int closids_supported(void)
+{
+ return closid_free_map_len;
+}
+
+/*
+ * Initialise the CLOSID allocator: the usable CLOSID count is the smallest
+ * num_closid over all allocation-enabled resources (starting from 32).
+ */
+static void closid_init(void)
+{
+	struct rdt_resource *res;
+	int lowest = 32;
+
+	/* Find the smallest CLOSID count any enabled resource supports */
+	for_each_alloc_enabled_rdt_resource(res) {
+		if (res->num_closid < lowest)
+			lowest = res->num_closid;
+	}
+
+	/*
+	 * Set the low "lowest" bits to mark those CLOSIDs free, then clear
+	 * bit 0: CLOSID 0 is always reserved for the default group.
+	 */
+	closid_free_map = (BIT_MASK(lowest) - 1) & ~1;
+	closid_free_map_len = lowest;
+}
+
+/*
+ * Allocate the lowest-numbered free CLOSID.
+ *
+ * Return: the CLOSID, or -ENOSPC when none is free.
+ */
+static int closid_alloc(void)
+{
+	int pos = ffs(closid_free_map);
+
+	if (!pos)
+		return -ENOSPC;
+
+	/* ffs() is 1-based; convert to a bit index and mark it in use */
+	pos--;
+	closid_free_map &= ~(1 << pos);
+
+	return pos;
+}
+
+/* Return @closid to the allocator by setting its bit in the free map. */
+void closid_free(int closid)
+{
+ closid_free_map |= 1 << closid;
+}
+
+/**
+ * closid_allocated - test if provided closid is in use
+ * @closid: closid to be tested
+ *
+ * A closid is in use when its bit in the free map is clear.
+ *
+ * Return: true if @closid is currently associated with a resource group,
+ * false if @closid is free
+ */
+static bool closid_allocated(unsigned int closid)
+{
+	return !(closid_free_map & (1 << closid));
+}
+
+/**
+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
+ * @closid: closid of the resource group
+ *
+ * Each resource group is associated with a @closid. Here the mode
+ * of a resource group can be queried by searching for it using its closid.
+ *
+ * Return: mode as &enum rdtgrp_mode of resource group with closid @closid,
+ * RDT_NUM_MODES if no group on rdt_all_groups uses @closid
+ */
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+{
+	enum rdtgrp_mode found = RDT_NUM_MODES;
+	struct rdtgroup *grp;
+
+	list_for_each_entry(grp, &rdt_all_groups, rdtgroup_list) {
+		if (grp->closid != closid)
+			continue;
+		found = grp->mode;
+		break;
+	}
+
+	return found;
+}
+
+/*
+ * User-visible name of each resource group mode, indexed by
+ * &enum rdtgrp_mode. rdtgroup_mode_write() compares input against these
+ * same strings.
+ */
+static const char * const rdt_mode_str[] = {
+ [RDT_MODE_SHAREABLE] = "shareable",
+ [RDT_MODE_EXCLUSIVE] = "exclusive",
+ [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
+ [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
+};
+
+/**
+ * rdtgroup_mode_str - Return the string representation of mode
+ * @mode: the resource group mode as &enum rdtgrp_mode
+ *
+ * Return: string representation of valid mode, "unknown" otherwise
+ */
+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+{
+	if (mode >= RDT_MODE_SHAREABLE && mode < RDT_NUM_MODES)
+		return rdt_mode_str[mode];
+
+	return "unknown";
+}
+
+/*
+ * Set uid and gid of rdtgroup dirs and files to that of the creator.
+ * Nothing is changed when the creator's fsuid/fsgid are both global root.
+ *
+ * Return: 0, or the result of kernfs_setattr().
+ */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+	struct iattr attr = {
+		.ia_valid = ATTR_UID | ATTR_GID,
+		.ia_uid = current_fsuid(),
+		.ia_gid = current_fsgid(),
+	};
+	bool root_owned = uid_eq(attr.ia_uid, GLOBAL_ROOT_UID) &&
+			  gid_eq(attr.ia_gid, GLOBAL_ROOT_GID);
+
+	return root_owned ? 0 : kernfs_setattr(kn, &attr);
+}
+
+/*
+ * Create the kernfs file described by @rft under @parent_kn and hand its
+ * ownership to the creating user. The file is removed again if the
+ * ownership change fails.
+ *
+ * Return: 0 on success, negative errno otherwise.
+ */
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+	struct kernfs_node *file_kn;
+	int err;
+
+	file_kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+				       GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				       0, rft->kf_ops, rft, NULL, NULL);
+	if (IS_ERR(file_kn))
+		return PTR_ERR(file_kn);
+
+	err = rdtgroup_kn_set_ugid(file_kn);
+	if (err)
+		kernfs_remove(file_kn);
+
+	return err;
+}
+
+/*
+ * kernfs seq_show entry point: forward to the seq_show handler of the rftype
+ * attached to the file, or report success when the rftype has none.
+ */
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	struct rftype *type = of->kn->priv;
+
+	return type->seq_show ? type->seq_show(of, m, arg) : 0;
+}
+
+/*
+ * kernfs write entry point: forward to the write handler of the rftype
+ * attached to the file; files without a handler reject writes with -EINVAL.
+ */
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+				   size_t nbytes, loff_t off)
+{
+	struct rftype *type = of->kn->priv;
+
+	if (!type->write)
+		return -EINVAL;
+
+	return type->write(of, buf, nbytes, off);
+}
+
+/*
+ * kernfs operations for files described by a struct rftype: reads and
+ * writes are dispatched to the rftype's own handlers through
+ * rdtgroup_seqfile_show() and rdtgroup_file_write().
+ */
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = rdtgroup_file_write,
+ .seq_show = rdtgroup_seqfile_show,
+};
+
+/*
+ * kernfs operations for files shown through rdtgroup_mondata_show();
+ * no write handler is provided.
+ */
+static struct kernfs_ops kf_mondata_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .seq_show = rdtgroup_mondata_show,
+};
+
+/*
+ * Return true when the rftype behind @of has RFTYPE_FLAGS_CPUS_LIST set.
+ * The cpus show/write handlers use this to choose between CPU list and
+ * CPU bitmask notation.
+ */
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+ struct rftype *rft = of->kn->priv;
+
+ return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
+/*
+ * Show the CPUs of a resource group as a list or a bitmask, depending on
+ * which file is being read. For a pseudo-locked group the CPU mask of the
+ * pseudo-locked region's domain is shown instead of the group's own mask.
+ *
+ * Return: 0 on success, -ENOENT if the group no longer exists.
+ */
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct cpumask *mask;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+		mask = &rdtgrp->plr->d->cpu_mask;
+	else
+		mask = &rdtgrp->cpu_mask;
+
+	seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+		   cpumask_pr_args(mask));
+
+unlock:
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+/*
+ * Update the per-CPU default closid/rmid from @info (a struct rdtgroup, may
+ * be NULL to leave the defaults untouched) and refresh the hardware state.
+ *
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is protected against __switch_to() because
+ * preemption is disabled.
+ */
+static void update_cpu_closid_rmid(void *info)
+{
+	struct rdtgroup *grp = info;
+
+	if (grp) {
+		this_cpu_write(pqr_state.default_closid, grp->closid);
+		this_cpu_write(pqr_state.default_rmid, grp->mon.rmid);
+	}
+
+	/*
+	 * The MSR cannot be written unconditionally: the task currently
+	 * running might have its own closid selected. Reuse the context
+	 * switch code, which takes that into account.
+	 */
+	intel_rdt_sched_in();
+}
+
+/*
+ * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
+ *
+ * The current CPU, if part of @cpu_mask, is handled by a direct call; the
+ * remaining CPUs are handled through smp_call_function_many(), waiting for
+ * completion.
+ *
+ * Per task closids/rmids must have been set up before calling this function.
+ */
+static void
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
+{
+	int this_cpu = get_cpu();
+
+	if (cpumask_test_cpu(this_cpu, cpu_mask))
+		update_cpu_closid_rmid(r);
+
+	smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
+
+	put_cpu();
+}
+
+/*
+ * Apply a new CPU mask to a monitor group.
+ * @rdtgrp:  monitor group being changed
+ * @newmask: requested CPU mask
+ * @tmpmask: scratch mask provided by the caller
+ *
+ * CPUs may only come from the parent control group. CPUs dropped from
+ * @rdtgrp fall back to the parent; CPUs added are removed from the sibling
+ * monitor groups.
+ *
+ * Return: 0 on success, -EINVAL if @newmask has CPUs outside the parent.
+ */
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+			  cpumask_var_t tmpmask)
+{
+	struct rdtgroup *parent = rdtgrp->mon.parent;
+	struct rdtgroup *sibling;
+
+	/* All requested CPUs must belong to the parent ctrl group */
+	cpumask_andnot(tmpmask, newmask, &parent->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n");
+		return -EINVAL;
+	}
+
+	/* CPUs dropped from this group are handed back to the parent */
+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+	if (cpumask_weight(tmpmask)) {
+		cpumask_or(&parent->cpu_mask, &parent->cpu_mask, tmpmask);
+		update_closid_rmid(tmpmask, parent);
+	}
+
+	/*
+	 * CPUs newly added here are taken away from whichever sibling
+	 * monitor group owned them, then get this group's rmid.
+	 */
+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		list_for_each_entry(sibling, &parent->mon.crdtgrp_list,
+				    mon.crdtgrp_list) {
+			if (sibling != rdtgrp)
+				cpumask_andnot(&sibling->cpu_mask,
+					       &sibling->cpu_mask, tmpmask);
+		}
+		update_closid_rmid(tmpmask, rdtgrp);
+	}
+
+	/* Pushing/pulling is done - record the new mask */
+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+	return 0;
+}
+
+/*
+ * Remove the CPUs in @m from group @r and restrict every child monitor
+ * group of @r to the CPUs that @r still owns afterwards.
+ */
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+	struct rdtgroup *child;
+
+	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+
+	/* Keep the child mon group masks a subset of the parent's */
+	list_for_each_entry(child, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+		cpumask_and(&child->cpu_mask, &r->cpu_mask,
+			    &child->cpu_mask);
+	}
+}
+
+/*
+ * Apply a new CPU mask to a control group.
+ * @rdtgrp:   control group being changed
+ * @newmask:  requested CPU mask
+ * @tmpmask:  scratch mask provided by the caller
+ * @tmpmask1: second scratch mask provided by the caller
+ *
+ * CPUs dropped from @rdtgrp go to the default group (dropping CPUs from the
+ * default group itself is refused). CPUs added are removed from every other
+ * group and its child monitor groups. Finally the child monitor groups of
+ * @rdtgrp lose all their CPUs.
+ *
+ * Return: 0 on success, -EINVAL when trying to drop CPUs from the default
+ * group.
+ */
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+	struct rdtgroup *other, *child;
+
+	/* Which CPUs are being dropped from this group? */
+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+	if (cpumask_weight(tmpmask)) {
+		/* The default group has nowhere to drop its CPUs to */
+		if (rdtgrp == &rdtgroup_default) {
+			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
+			return -EINVAL;
+		}
+
+		/* Dropped CPUs fall back to rdtgroup_default */
+		cpumask_or(&rdtgroup_default.cpu_mask,
+			   &rdtgroup_default.cpu_mask, tmpmask);
+		update_closid_rmid(tmpmask, &rdtgroup_default);
+	}
+
+	/*
+	 * Newly added CPUs are removed from the group that previously owned
+	 * them (including that group's child groups) and get this group's
+	 * closid/rmid.
+	 */
+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		list_for_each_entry(other, &rdt_all_groups, rdtgroup_list) {
+			if (other == rdtgrp)
+				continue;
+			cpumask_and(tmpmask1, &other->cpu_mask, tmpmask);
+			if (cpumask_weight(tmpmask1))
+				cpumask_rdtgrp_clear(other, tmpmask1);
+		}
+		update_closid_rmid(tmpmask, rdtgrp);
+	}
+
+	/* Pushing/pulling is done - record the new mask */
+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+	/*
+	 * The parent mask is new, so clear the child mon group masks and
+	 * update the rmid for the CPUs each child lost.
+	 */
+	list_for_each_entry(child, &rdtgrp->mon.crdtgrp_list,
+			    mon.crdtgrp_list) {
+		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &child->cpu_mask);
+		update_closid_rmid(tmpmask, rdtgrp);
+		cpumask_clear(&child->cpu_mask);
+	}
+
+	return 0;
+}
+
+/*
+ * Write handler of the cpus files: parse a CPU list or bitmask from @buf
+ * and make it the CPU mask of the resource group.
+ *
+ * Rejected with -EINVAL: a NULL buffer, groups that are pseudo-locked or in
+ * pseudo-locksetup mode, offline CPUs in the request and unknown group
+ * types. Parse errors are passed through.
+ *
+ * Return: @nbytes on success, negative errno otherwise.
+ */
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	cpumask_var_t tmpmask, newmask, tmpmask1;
+	struct rdtgroup *rdtgrp;
+	int ret;
+
+	if (!buf)
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto free_tmpmask;
+	}
+	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto free_newmask;
+	}
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	rdt_last_cmd_clear();
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		rdt_last_cmd_puts("directory was removed\n");
+		goto unlock;
+	}
+
+	switch (rdtgrp->mode) {
+	case RDT_MODE_PSEUDO_LOCKED:
+	case RDT_MODE_PSEUDO_LOCKSETUP:
+		ret = -EINVAL;
+		rdt_last_cmd_puts("pseudo-locking in progress\n");
+		goto unlock;
+	default:
+		break;
+	}
+
+	ret = is_cpu_list(of) ? cpulist_parse(buf, newmask) :
+				cpumask_parse(buf, newmask);
+	if (ret) {
+		rdt_last_cmd_puts("bad cpu list/mask\n");
+		goto unlock;
+	}
+
+	/* The user must not have specified any offline cpus */
+	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+	if (cpumask_weight(tmpmask)) {
+		ret = -EINVAL;
+		rdt_last_cmd_puts("can only assign online cpus\n");
+		goto unlock;
+	}
+
+	switch (rdtgrp->type) {
+	case RDTCTRL_GROUP:
+		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+		break;
+	case RDTMON_GROUP:
+		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+unlock:
+	rdtgroup_kn_unlock(of->kn);
+	free_cpumask_var(tmpmask1);
+free_newmask:
+	free_cpumask_var(newmask);
+free_tmpmask:
+	free_cpumask_var(tmpmask);
+
+	return ret ?: nbytes;
+}
+
+/*
+ * Task work item queued by __rdtgroup_move_task() and consumed by
+ * move_myself().
+ */
+struct task_move_callback {
+ /* callback head handed to task_work_add() */
+ struct callback_head work;
+ /* group the task is moved to; a waitcount reference is held on it */
+ struct rdtgroup *rdtgrp;
+};
+
+/*
+ * Task work callback queued by __rdtgroup_move_task(). It operates on
+ * "current", drops the waitcount reference taken when the work was queued
+ * and frees the callback.
+ */
+static void move_myself(struct callback_head *head)
+{
+	struct task_move_callback *cb =
+		container_of(head, struct task_move_callback, work);
+	struct rdtgroup *grp = cb->rdtgrp;
+
+	/*
+	 * If the resource group was deleted before this callback got to
+	 * run and this was the last reference, put the task back into the
+	 * root group and free the resource group.
+	 */
+	if (atomic_dec_and_test(&grp->waitcount) &&
+	    (grp->flags & RDT_DELETED)) {
+		current->closid = 0;
+		current->rmid = 0;
+		kfree(grp);
+	}
+
+	/* Update PQR_ASSOC MSR to make the resource group go into effect */
+	preempt_disable();
+	intel_rdt_sched_in();
+	preempt_enable();
+
+	kfree(cb);
+}
+
+/*
+ * Move @tsk into resource group @rdtgrp.
+ *
+ * For control groups both closid and rmid of the task are changed. For
+ * monitor groups only the rmid is changed, and only if the task already
+ * belongs to the monitor group's parent control group.
+ *
+ * A task work item (move_myself()) is queued on @tsk so that the task
+ * updates its PQR_ASSOC MSR itself; @rdtgrp's waitcount is raised for as
+ * long as that work is pending.
+ *
+ * Return: 0 on success, -EINVAL if @tsk is not in the parent control group
+ * of a monitor group, -ENOMEM if the work item cannot be allocated, or the
+ * error from task_work_add() when the task is exiting.
+ */
+static int __rdtgroup_move_task(struct task_struct *tsk,
+				struct rdtgroup *rdtgrp)
+{
+	struct task_move_callback *callback;
+	int ret;
+
+	/*
+	 * Reject an invalid move before anything is allocated or queued, so
+	 * that a failed request leaves no pending task work and no waitcount
+	 * reference behind.
+	 */
+	if (rdtgrp->type == RDTMON_GROUP &&
+	    rdtgrp->mon.parent->closid != tsk->closid) {
+		rdt_last_cmd_puts("Can't move task to different control group\n");
+		return -EINVAL;
+	}
+
+	callback = kzalloc(sizeof(*callback), GFP_KERNEL);
+	if (!callback)
+		return -ENOMEM;
+	callback->work.func = move_myself;
+	callback->rdtgrp = rdtgrp;
+
+	/*
+	 * Take a refcount, so rdtgrp cannot be freed before the
+	 * callback has been invoked.
+	 */
+	atomic_inc(&rdtgrp->waitcount);
+	ret = task_work_add(tsk, &callback->work, true);
+	if (ret) {
+		/*
+		 * Task is exiting. Drop the refcount and free the callback.
+		 * No need to check the refcount as the group cannot be
+		 * deleted before the write function unlocks rdtgroup_mutex.
+		 */
+		atomic_dec(&rdtgrp->waitcount);
+		kfree(callback);
+		rdt_last_cmd_puts("task exited\n");
+		return ret;
+	}
+
+	/*
+	 * For ctrl_mon groups move both closid and rmid.
+	 * For monitor groups only the rmid changes; membership of the
+	 * parent CTRL group was verified above.
+	 */
+	if (rdtgrp->type == RDTCTRL_GROUP) {
+		tsk->closid = rdtgrp->closid;
+		tsk->rmid = rdtgrp->mon.rmid;
+	} else if (rdtgrp->type == RDTMON_GROUP) {
+		tsk->rmid = rdtgrp->mon.rmid;
+	}
+
+	return 0;
+}
+
+/**
+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
+ * @r: Resource group
+ *
+ * A task counts as assigned when its closid matches a control group's
+ * closid, or its rmid matches a monitor group's rmid. Must be called with
+ * rdtgroup_mutex held.
+ *
+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
+ */
+int rdtgroup_tasks_assigned(struct rdtgroup *r)
+{
+	struct task_struct *p, *t;
+	int ret = 0;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	rcu_read_lock();
+	for_each_process_thread(p, t) {
+		if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+		    (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
+			ret = 1;
+			/*
+			 * for_each_process_thread() is two nested loops, so
+			 * a "break" would only leave the inner one and the
+			 * scan would continue with the next process. Jump
+			 * out of both.
+			 */
+			goto unlock;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Check whether the current process may move @task.
+ *
+ * The move is allowed when the caller's euid is global root, or equals the
+ * uid or the saved uid of @task. @of is not used.
+ *
+ * Return: 0 if permitted, -EPERM otherwise.
+ */
+static int rdtgroup_task_write_permission(struct task_struct *task,
+					  struct kernfs_open_file *of)
+{
+	const struct cred *tcred = get_task_cred(task);
+	const struct cred *cred = current_cred();
+	bool allowed;
+	int ret = 0;
+
+	/*
+	 * Even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.
+	 */
+	allowed = uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+		  uid_eq(cred->euid, tcred->uid) ||
+		  uid_eq(cred->euid, tcred->suid);
+	if (!allowed) {
+		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
+		ret = -EPERM;
+	}
+
+	put_cred(tcred);
+	return ret;
+}
+
+/*
+ * Look up the task with pid @pid (the calling task when @pid is 0), check
+ * that the caller may move it and move it into @rdtgrp.
+ *
+ * Return: 0 on success, -ESRCH if no such task exists, otherwise the error
+ * of the permission check or of __rdtgroup_move_task().
+ */
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+			      struct kernfs_open_file *of)
+{
+	struct task_struct *task;
+	int ret;
+
+	rcu_read_lock();
+	task = pid ? find_task_by_vpid(pid) : current;
+	if (!task) {
+		rcu_read_unlock();
+		rdt_last_cmd_printf("No task %d\n", pid);
+		return -ESRCH;
+	}
+	/* Pin the task before leaving the RCU read-side section */
+	get_task_struct(task);
+	rcu_read_unlock();
+
+	ret = rdtgroup_task_write_permission(task, of);
+	if (!ret)
+		ret = __rdtgroup_move_task(task, rdtgrp);
+
+	put_task_struct(task);
+	return ret;
+}
+
+/*
+ * Write handler of the tasks file: parse a non-negative pid from @buf and
+ * move that task into the resource group. Groups that are pseudo-locked or
+ * in pseudo-locksetup mode do not accept tasks.
+ *
+ * Return: @nbytes on success, negative errno otherwise.
+ */
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+	pid_t pid;
+
+	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+		return -EINVAL;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	rdt_last_cmd_clear();
+
+	switch (rdtgrp->mode) {
+	case RDT_MODE_PSEUDO_LOCKED:
+	case RDT_MODE_PSEUDO_LOCKSETUP:
+		ret = -EINVAL;
+		rdt_last_cmd_puts("pseudo-locking in progress\n");
+		break;
+	default:
+		ret = rdtgroup_move_task(pid, rdtgrp, of);
+		break;
+	}
+
+unlock:
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
+/*
+ * Print the pid of every thread belonging to group @r, one per line.
+ * Membership is decided by closid for control groups and by rmid for
+ * monitor groups.
+ */
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+	struct task_struct *proc, *thread;
+	bool member;
+
+	rcu_read_lock();
+	for_each_process_thread(proc, thread) {
+		member = (r->type == RDTCTRL_GROUP &&
+			  thread->closid == r->closid) ||
+			 (r->type == RDTMON_GROUP &&
+			  thread->rmid == r->mon.rmid);
+		if (member)
+			seq_printf(s, "%d\n", thread->pid);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Show handler of the tasks file: list the pids of all tasks in the group.
+ *
+ * Return: 0 on success, -ENOENT if the group no longer exists.
+ */
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+			       struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	int ret = -ENOENT;
+
+	if (rdtgrp) {
+		show_rdt_tasks(rdtgrp, s);
+		ret = 0;
+	}
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+/*
+ * Show handler of "last_cmd_status": print the text accumulated in the
+ * last-command status buffer, or "ok" when it is empty.
+ */
+static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
+				    struct seq_file *seq, void *v)
+{
+	int used;
+
+	mutex_lock(&rdtgroup_mutex);
+	used = seq_buf_used(&last_cmd_status);
+	if (!used)
+		seq_puts(seq, "ok\n");
+	else
+		seq_printf(seq, "%.*s", used, last_cmd_status_buf);
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+/*
+ * Show handler: print num_closid of the resource attached to the parent
+ * kernfs node, in decimal.
+ */
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_closid);
+ return 0;
+}
+
+/*
+ * Show handler: print default_ctrl of the resource attached to the parent
+ * kernfs node, in hexadecimal.
+ */
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->default_ctrl);
+ return 0;
+}
+
+/*
+ * Show handler: print cache.min_cbm_bits of the resource attached to the
+ * parent kernfs node, in decimal.
+ */
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
+ return 0;
+}
+
+/*
+ * Show handler: print cache.shareable_bits of the resource attached to the
+ * parent kernfs node, in hexadecimal.
+ */
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->cache.shareable_bits);
+ return 0;
+}
+
+/**
+ * rdt_bit_usage_show - Display current usage of resources
+ *
+ * A domain is a shared resource that can now be allocated differently. Here
+ * we display the current regions of the domain as an annotated bitmask.
+ * For each domain of this resource its allocation bitmask
+ * is annotated as below to indicate the current usage of the corresponding bit:
+ *   0 - currently unused
+ *   X - currently available for sharing and used by software and hardware
+ *   H - currently used by hardware only but available for software use
+ *   S - currently used and shareable by software only
+ *   E - currently used exclusively by one resource group
+ *   P - currently pseudo-locked by one resource group
+ *
+ * Domains are printed as "<id>=<annotation>", separated by ';', most
+ * significant bit first.
+ *
+ * Return: always 0.
+ */
+static int rdt_bit_usage_show(struct kernfs_open_file *of,
+			      struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+	/*
+	 * These masks are examined with test_bit(), which accesses memory
+	 * in units of unsigned long. They therefore must be unsigned long
+	 * themselves: a u32 cast to "unsigned long *" would make test_bit()
+	 * read beyond the variable on 64-bit.
+	 */
+	unsigned long sw_shareable = 0, hw_shareable = 0;
+	unsigned long exclusive = 0, pseudo_locked = 0;
+	struct rdt_domain *dom;
+	int i, hwb, swb, excl, psl;
+	enum rdtgrp_mode mode;
+	bool sep = false;
+	u32 *ctrl;
+
+	mutex_lock(&rdtgroup_mutex);
+	hw_shareable = r->cache.shareable_bits;
+	list_for_each_entry(dom, &r->domains, list) {
+		if (sep)
+			seq_putc(seq, ';');
+		ctrl = dom->ctrl_val;
+		sw_shareable = 0;
+		exclusive = 0;
+		seq_printf(seq, "%d=", dom->id);
+
+		/* Collect the bitmasks of all allocated CLOSIDs by mode */
+		for (i = 0; i < closids_supported(); i++, ctrl++) {
+			if (!closid_allocated(i))
+				continue;
+			mode = rdtgroup_mode_by_closid(i);
+			switch (mode) {
+			case RDT_MODE_SHAREABLE:
+				sw_shareable |= *ctrl;
+				break;
+			case RDT_MODE_EXCLUSIVE:
+				exclusive |= *ctrl;
+				break;
+			case RDT_MODE_PSEUDO_LOCKSETUP:
+			/*
+			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
+			 * here but not included since the CBM
+			 * associated with this CLOSID in this mode
+			 * is not initialized and no task or cpu can be
+			 * assigned this CLOSID.
+			 */
+				break;
+			case RDT_MODE_PSEUDO_LOCKED:
+			case RDT_NUM_MODES:
+				WARN(1,
+				     "invalid mode for closid %d\n", i);
+				break;
+			}
+		}
+
+		/* Per-domain value, does not change while printing the bits */
+		pseudo_locked = dom->plr ? dom->plr->cbm : 0;
+
+		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
+			hwb = test_bit(i, &hw_shareable);
+			swb = test_bit(i, &sw_shareable);
+			excl = test_bit(i, &exclusive);
+			psl = test_bit(i, &pseudo_locked);
+			if (hwb && swb)
+				seq_putc(seq, 'X');
+			else if (hwb && !swb)
+				seq_putc(seq, 'H');
+			else if (!hwb && swb)
+				seq_putc(seq, 'S');
+			else if (excl)
+				seq_putc(seq, 'E');
+			else if (psl)
+				seq_putc(seq, 'P');
+			else /* Unused bits remain */
+				seq_putc(seq, '0');
+		}
+		sep = true;
+	}
+	seq_putc(seq, '\n');
+	mutex_unlock(&rdtgroup_mutex);
+	return 0;
+}
+
+/*
+ * Show handler: print membw.min_bw of the resource attached to the parent
+ * kernfs node, in decimal.
+ */
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->membw.min_bw);
+ return 0;
+}
+
+/*
+ * Show handler: print num_rmid of the resource attached to the parent
+ * kernfs node, in decimal.
+ */
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_rmid);
+
+ return 0;
+}
+
+/*
+ * Show handler: print the name of every monitoring event on the evt_list
+ * of the resource attached to the parent kernfs node, one per line.
+ */
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ struct mon_evt *mevt;
+
+ list_for_each_entry(mevt, &r->evt_list, list)
+ seq_printf(seq, "%s\n", mevt->name);
+
+ return 0;
+}
+
+/*
+ * Show handler: print membw.bw_gran of the resource attached to the parent
+ * kernfs node, in decimal.
+ */
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->membw.bw_gran);
+ return 0;
+}
+
+/*
+ * Show handler: print membw.delay_linear of the resource attached to the
+ * parent kernfs node, in decimal.
+ */
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->membw.delay_linear);
+ return 0;
+}
+
+/*
+ * Show handler: print intel_cqm_threshold multiplied by the mon_scale of
+ * the resource attached to the parent kernfs node. This is the inverse of
+ * the division done in max_threshold_occ_write(), which takes a byte count.
+ */
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
+
+ return 0;
+}
+
+/*
+ * Write handler: parse a byte count from @buf and store it, divided by the
+ * resource's mon_scale, in intel_cqm_threshold.
+ *
+ * Return: @nbytes on success, the kstrtouint() error on malformed input,
+ * -EINVAL if the value exceeds boot_cpu_data.x86_cache_size * 1024.
+ */
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+				       char *buf, size_t nbytes, loff_t off)
+{
+	struct rdt_resource *res = of->kn->parent->priv;
+	unsigned int limit;
+	int err;
+
+	err = kstrtouint(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit > (boot_cpu_data.x86_cache_size * 1024))
+		return -EINVAL;
+
+	intel_cqm_threshold = limit / res->mon_scale;
+
+	return nbytes;
+}
+
+/*
+ * rdtgroup_mode_show - Display mode of this resource group
+ *
+ * Return: 0 on success, -ENOENT if the group no longer exists.
+ */
+static int rdtgroup_mode_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	int ret = 0;
+
+	if (rdtgrp)
+		seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
+	else
+		ret = -ENOENT;
+
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
+
+/**
+ * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Checks if provided @cbm intended to be used for @closid on domain
+ * @d overlaps with any other closids or other hardware usage associated
+ * with this domain. If @exclusive is true then only overlaps with
+ * resource groups in exclusive mode will be considered. If @exclusive
+ * is false then overlaps with any resource group or hardware entities
+ * will be considered. Groups in pseudo-locksetup mode are never considered.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: false if CBM does not overlap, true if it does.
+ */
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+			   unsigned long cbm, int closid, bool exclusive)
+{
+	unsigned long other_cbm;
+	enum rdtgrp_mode mode;
+	int i;
+
+	/* Regions used by hardware directly only matter for the full check */
+	if (!exclusive) {
+		other_cbm = r->cache.shareable_bits;
+		if (bitmap_intersects(&cbm, &other_cbm, r->cache.cbm_len))
+			return true;
+	}
+
+	/* Compare against the bitmask of every other allocated closid */
+	for (i = 0; i < closids_supported(); i++) {
+		if (i == closid || !closid_allocated(i))
+			continue;
+
+		mode = rdtgroup_mode_by_closid(i);
+		if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
+			continue;
+
+		other_cbm = d->ctrl_val[i];
+		if (!bitmap_intersects(&cbm, &other_cbm, r->cache.cbm_len))
+			continue;
+
+		/* Overlap found; it counts unless restricted to exclusive */
+		if (!exclusive || mode == RDT_MODE_EXCLUSIVE)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ * @rdtgrp: resource group to be tested
+ *
+ * An exclusive resource group implies that there should be no sharing of
+ * its allocated resources. At the time this group is considered to be
+ * exclusive this test can determine if its current schemata supports this
+ * setting by testing for overlap with all other resource groups. The MBA
+ * resource is skipped; at least one other allocation-enabled resource must
+ * exist.
+ *
+ * Return: true if resource group can be exclusive, false if there is overlap
+ * with allocations of other resource groups and thus this resource group
+ * cannot be exclusive.
+ */
+static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
+{
+	struct rdt_resource *res;
+	struct rdt_domain *dom;
+	bool cache_seen = false;
+
+	for_each_alloc_enabled_rdt_resource(res) {
+		if (res->rid == RDT_RESOURCE_MBA)
+			continue;
+		cache_seen = true;
+		list_for_each_entry(dom, &res->domains, list) {
+			if (!rdtgroup_cbm_overlaps(res, dom,
+						   dom->ctrl_val[rdtgrp->closid],
+						   rdtgrp->closid, false))
+				continue;
+			rdt_last_cmd_puts("schemata overlaps\n");
+			return false;
+		}
+	}
+
+	if (cache_seen)
+		return true;
+
+	rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
+	return false;
+}
+
+/**
+ * rdtgroup_mode_write - Modify the resource group's mode
+ *
+ * Accepts "shareable", "exclusive" or "pseudo-locksetup", terminated by a
+ * newline. Writing the mode the group already has succeeds without doing
+ * anything. A pseudo-locked group cannot be changed.
+ *
+ * Return: @nbytes on success, negative errno otherwise.
+ */
+static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ enum rdtgrp_mode mode;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ /* Strip the newline so buf can be compared with strcmp() */
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ rdt_last_cmd_clear();
+
+ mode = rdtgrp->mode;
+
+ /* Requested mode equals current mode: nothing to do, report success */
+ if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
+ (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
+ (!strcmp(buf, "pseudo-locksetup") &&
+ mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
+ (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
+ goto out;
+
+ /* Checked before parsing: any other input fails for a locked group */
+ if (mode == RDT_MODE_PSEUDO_LOCKED) {
+ rdt_last_cmd_printf("cannot change pseudo-locked group\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!strcmp(buf, "shareable")) {
+ /* Leaving pseudo-locksetup requires undoing its setup first */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+ } else if (!strcmp(buf, "exclusive")) {
+ /* The overlap test runs before pseudo-locksetup is left */
+ if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_EXCLUSIVE;
+ } else if (!strcmp(buf, "pseudo-locksetup")) {
+ ret = rdtgroup_locksetup_enter(rdtgrp);
+ if (ret)
+ goto out;
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
+ } else {
+ /* Includes "pseudo-locked", which cannot be requested directly */
+ rdt_last_cmd_printf("unknown/unsupported mode\n");
+ ret = -EINVAL;
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_cbm_to_size - Translate CBM to size in bytes
+ * @r: RDT resource to which @d belongs.
+ * @d: RDT domain instance.
+ * @cbm: bitmask for which the size should be computed.
+ *
+ * The bitmask provided associated with the RDT domain instance @d will be
+ * translated into how many bytes it represents. The size in bytes is
+ * computed by first dividing the total cache size by the CBM length to
+ * determine how many bytes each bit in the bitmask represents. The result
+ * is multiplied with the number of bits set in the bitmask.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used to make the
+ * bitmap functions work correctly.
+ *
+ * Return: size in bytes, 0 if no cache leaf of the resource's cache level
+ * is found for the domain.
+ */
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
+				  struct rdt_domain *d, unsigned long cbm)
+{
+	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
+	int bits_set = bitmap_weight(&cbm, r->cache.cbm_len);
+	int leaf;
+
+	/* Use the first cache leaf at the level this resource controls */
+	for (leaf = 0; leaf < ci->num_leaves; leaf++) {
+		if (ci->info_list[leaf].level != r->cache_level)
+			continue;
+		return ci->info_list[leaf].size / r->cache.cbm_len * bits_set;
+	}
+
+	return 0;
+}
+
+/**
+ * rdtgroup_size_show - Display size in bytes of allocated regions
+ *
+ * The "size" file mirrors the layout of the "schemata" file, printing the
+ * size in bytes of each region instead of the capacity bitmask.
+ *
+ * A pseudo-locked group shows only its pseudo-locked region. A group in
+ * pseudo-locksetup mode shows 0 for every domain. For the MBA resource the
+ * control value itself is printed instead of a size.
+ *
+ * Return: 0 on success, -ENOENT if the group no longer exists.
+ */
+static int rdtgroup_size_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *res;
+	struct rdt_domain *dom;
+	unsigned int size;
+	bool need_sep;
+	u32 ctrl;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		rdtgroup_kn_unlock(of->kn);
+		return -ENOENT;
+	}
+
+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+		/* Single line describing the pseudo-locked region */
+		seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name);
+		size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
+					    rdtgrp->plr->d,
+					    rdtgrp->plr->cbm);
+		seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+		goto out;
+	}
+
+	for_each_alloc_enabled_rdt_resource(res) {
+		need_sep = false;
+		seq_printf(s, "%*s:", max_name_width, res->name);
+		list_for_each_entry(dom, &res->domains, list) {
+			if (need_sep)
+				seq_putc(s, ';');
+
+			size = 0;
+			if (rdtgrp->mode != RDT_MODE_PSEUDO_LOCKSETUP) {
+				if (is_mba_sc(res))
+					ctrl = dom->mbps_val[rdtgrp->closid];
+				else
+					ctrl = dom->ctrl_val[rdtgrp->closid];
+
+				size = (res->rid == RDT_RESOURCE_MBA) ?
+				       ctrl :
+				       rdtgroup_cbm_to_size(res, dom, ctrl);
+			}
+
+			seq_printf(s, "%d=%u", dom->id, size);
+			need_sep = true;
+		}
+		seq_putc(s, '\n');
+	}
+
+out:
+	rdtgroup_kn_unlock(of->kn);
+
+	return 0;
+}
+
+/*
+ * Table of the resctrl files handled by this code.
+ *
+ * rdtgroup_add_files() walks this table in order and creates an entry in
+ * a directory only when every bit of the entry's .fflags is present in the
+ * flags requested for that directory. rdtgroup_kn_mode_restore() also
+ * looks entries up by .name to recover their original .mode.
+ *
+ * Entries that provide no .write handler use mode 0444; the writable ones
+ * use 0644.
+ */
+static struct rftype res_common_files[] = {
+ /* Requested with RF_TOP_INFO, i.e. for the "info" directory itself. */
+ {
+ .name = "last_cmd_status",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_last_cmd_status_show,
+ .fflags = RF_TOP_INFO,
+ },
+ /* RF_CTRL_INFO / RF_MON_INFO entries: per-resource "info" sub-directories. */
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ .fflags = RF_CTRL_INFO,
+ },
+ {
+ .name = "mon_features",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_mon_features_show,
+ .fflags = RF_MON_INFO,
+ },
+ {
+ .name = "num_rmids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_rmids_show,
+ .fflags = RF_MON_INFO,
+ },
+ /* Control info that additionally requires RFTYPE_RES_CACHE. */
+ {
+ .name = "cbm_mask",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_default_ctrl_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "min_cbm_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_cbm_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "shareable_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_shareable_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "bit_usage",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bit_usage_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ /* Control info that additionally requires RFTYPE_RES_MB. */
+ {
+ .name = "min_bandwidth",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_bw_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "bandwidth_gran",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bw_gran_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "delay_linear",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_delay_linear_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ /* Monitoring info that additionally requires RFTYPE_RES_CACHE; writable. */
+ {
+ .name = "max_threshold_occupancy",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = max_threshold_occ_write,
+ .seq_show = max_threshold_occ_show,
+ .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ /*
+ * RFTYPE_BASE entries: mkdir_rdt_prepare() always requests RFTYPE_BASE,
+ * so these are created for every resource group directory it builds.
+ */
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .fflags = RFTYPE_BASE,
+ },
+ /* Same handlers as "cpus"; only .flags differs. */
+ {
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ .fflags = RFTYPE_BASE,
+ },
+ /*
+ * RF_CTRL_BASE entries.
+ * NOTE(review): presumably limited to control groups - confirm against
+ * the RF_CTRL_BASE definition, which is not visible here.
+ */
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ .fflags = RF_CTRL_BASE,
+ },
+ {
+ .name = "mode",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mode_write,
+ .seq_show = rdtgroup_mode_show,
+ .fflags = RF_CTRL_BASE,
+ },
+ {
+ .name = "size",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_size_show,
+ .fflags = RF_CTRL_BASE,
+ },
+
+};
+
+/*
+ * Populate directory @kn with every res_common_files[] entry whose .fflags
+ * bits are all contained in @fflags. Must be called with rdtgroup_mutex
+ * held.
+ *
+ * If adding one file fails, the files added by this call so far are
+ * removed again and the error is returned.
+ */
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
+{
+	const int count = ARRAY_SIZE(res_common_files);
+	struct rftype *entry;
+	int idx, err = 0;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	for (idx = 0; idx < count; idx++) {
+		entry = &res_common_files[idx];
+		if ((fflags & entry->fflags) != entry->fflags)
+			continue;
+
+		err = rdtgroup_add_file(kn, entry);
+		if (err)
+			break;
+	}
+
+	/* The loop ran to completion: everything was added. */
+	if (idx == count)
+		return 0;
+
+	pr_warn("Failed to add %s, err=%d\n", entry->name, err);
+
+	/* Walk back over the earlier entries and undo the matching ones. */
+	while (--idx >= 0) {
+		entry = &res_common_files[idx];
+		if ((fflags & entry->fflags) == entry->fflags)
+			kernfs_remove_by_name(kn, entry->name);
+	}
+
+	return err;
+}
+
+/**
+ * rdtgroup_kn_mode_restrict - Strip all permissions from a named resctrl node
+ * @r: The resource group with which the node is associated.
+ * @name: Name of the file, directory or link below @r's directory.
+ *
+ * The mode of the named node is replaced by its bare file-type bits, so no
+ * user keeps read, write or execute permission.
+ *
+ * WARNING: This only signals to the user that the node is not relevant in
+ * the current state of the system. It is not a protection mechanism: the
+ * permissions can be changed back with chmod from the command line.
+ *
+ * Return: 0 on success, <0 on failure (-ENOENT if @name does not exist).
+ */
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
+{
+	struct iattr attrs = { .ia_valid = ATTR_MODE, };
+	struct kernfs_node *target;
+	umode_t type_bits = 0;
+	int err;
+
+	target = kernfs_find_and_get_ns(r->kn, name, NULL);
+	if (!target)
+		return -ENOENT;
+
+	/* Pick the file-type bits; any other node type leaves the mode 0. */
+	switch (kernfs_type(target)) {
+	case KERNFS_DIR:
+		type_bits = S_IFDIR;
+		break;
+	case KERNFS_FILE:
+		type_bits = S_IFREG;
+		break;
+	case KERNFS_LINK:
+		type_bits = S_IFLNK;
+		break;
+	}
+	attrs.ia_mode = type_bits;
+
+	err = kernfs_setattr(target, &attrs);
+	/* Drop the reference obtained by the lookup above. */
+	kernfs_put(target);
+
+	return err;
+}
+
+/**
+ * rdtgroup_kn_mode_restore - Give a named resctrl node its permissions back
+ * @r: The resource group with which the node is associated.
+ * @name: Name of the file, directory or link below @r's directory.
+ * @mask: Mask applied to the permissions that are restored.
+ *
+ * The permissions come from the res_common_files[] entry called @name,
+ * filtered through @mask. For a directory the mode of its parent is OR-ed
+ * in as well.
+ *
+ * Return: 0 on success, <0 on failure (-ENOENT if @name does not exist).
+ */
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+			     umode_t mask)
+{
+	const int count = ARRAY_SIZE(res_common_files);
+	struct iattr attrs = { .ia_valid = ATTR_MODE, };
+	struct kernfs_node *target, *dir_parent;
+	int idx, err;
+
+	/*
+	 * The whole table is scanned; should several entries carry the same
+	 * name, the last one determines the mode.
+	 */
+	for (idx = 0; idx < count; idx++) {
+		if (strcmp(res_common_files[idx].name, name))
+			continue;
+		attrs.ia_mode = res_common_files[idx].mode & mask;
+	}
+
+	target = kernfs_find_and_get_ns(r->kn, name, NULL);
+	if (!target)
+		return -ENOENT;
+
+	switch (kernfs_type(target)) {
+	case KERNFS_DIR:
+		dir_parent = kernfs_get_parent(target);
+		if (dir_parent) {
+			attrs.ia_mode |= dir_parent->mode;
+			kernfs_put(dir_parent);
+		}
+		attrs.ia_mode |= S_IFDIR;
+		break;
+	case KERNFS_FILE:
+		attrs.ia_mode |= S_IFREG;
+		break;
+	case KERNFS_LINK:
+		attrs.ia_mode |= S_IFLNK;
+		break;
+	}
+
+	err = kernfs_setattr(target, &attrs);
+	/* Drop the reference obtained by the lookup above. */
+	kernfs_put(target);
+
+	return err;
+}
+
+/*
+ * Create the sub-directory @name below the "info" directory (kn_info) for
+ * resource @r, hand its ownership to the mounting user and populate it
+ * with the res_common_files[] entries selected by @fflags. The directory
+ * is activated only when all files were added.
+ *
+ * On failure the partially built directory is left in place; the caller,
+ * rdtgroup_create_info_dir(), tears down the whole "info" tree.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
+				      unsigned long fflags)
+{
+	struct kernfs_node *kn_subdir;
+	int ret;
+
+	kn_subdir = kernfs_create_dir(kn_info, name,
+				      kn_info->mode, r);
+	if (IS_ERR(kn_subdir))
+		return PTR_ERR(kn_subdir);
+
+	/*
+	 * No additional kernfs_get() is taken here. Nothing in this file
+	 * ever issues a matching kernfs_put() for this node, so an extra
+	 * reference would keep the kernfs_node allocated forever after the
+	 * directory has been removed.
+	 */
+	ret = rdtgroup_kn_set_ugid(kn_subdir);
+	if (ret)
+		return ret;
+
+	ret = rdtgroup_add_files(kn_subdir, fflags);
+	if (!ret)
+		kernfs_activate(kn_subdir);
+
+	return ret;
+}
+
+/*
+ * Build the "info" directory below @parent_kn and remember it in kn_info.
+ *
+ * The directory receives the RF_TOP_INFO files, one sub-directory named
+ * after each allocation-enabled resource (RF_CTRL_INFO files) and one
+ * sub-directory "<name>_MON" for each monitoring-enabled resource
+ * (RF_MON_INFO files). It is activated only after everything succeeded;
+ * on any failure the whole tree is removed again.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+	struct rdt_resource *r;
+	unsigned long fflags;
+	char name[32];
+	int ret;
+
+	/* create the directory */
+	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+	if (IS_ERR(kn_info))
+		return PTR_ERR(kn_info);
+
+	/*
+	 * No additional kernfs_get() on kn_info: the only release this file
+	 * performs for it is kernfs_remove(), and no kernfs_put() exists
+	 * that would balance an extra reference, so taking one would leak
+	 * the node on every unmount.
+	 */
+	ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
+	if (ret)
+		goto out_destroy;
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		fflags = r->fflags | RF_CTRL_INFO;
+		ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
+		if (ret)
+			goto out_destroy;
+	}
+
+	for_each_mon_enabled_rdt_resource(r) {
+		fflags = r->fflags | RF_MON_INFO;
+		sprintf(name, "%s_MON", r->name);
+		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
+		if (ret)
+			goto out_destroy;
+	}
+
+	ret = rdtgroup_kn_set_ugid(kn_info);
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn_info);
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn_info);
+	return ret;
+}
+
+/*
+ * Create directory @name below @parent_kn with @prgrp as its private data,
+ * hand its ownership to the mounting user and activate it.
+ *
+ * If @dest_kn is not NULL the new node is stored there right after it is
+ * created. On failure the node is removed again, so the value left in
+ * @dest_kn must not be used when an error is returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+		    char *name, struct kernfs_node **dest_kn)
+{
+	struct kernfs_node *kn;
+	int ret;
+
+	/* create the directory */
+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	if (dest_kn)
+		*dest_kn = kn;
+
+	/*
+	 * No additional kernfs_get() is taken: the callers only ever release
+	 * this node through kernfs_remove(), and nothing would put an extra
+	 * reference, so it would leak the kernfs_node.
+	 */
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn);
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
+
+/*
+ * Cross-call callback: write the L3 QOS_CFG MSR on the executing CPU.
+ * @arg points to a bool; true writes L3_QOS_CDP_ENABLE, false writes 0.
+ */
+static void l3_qos_cfg_update(void *arg)
+{
+	const bool cdp_on = *(bool *)arg;
+	u64 msr_val = 0ULL;
+
+	if (cdp_on)
+		msr_val = L3_QOS_CDP_ENABLE;
+
+	wrmsrl(IA32_L3_QOS_CFG, msr_val);
+}
+
+/*
+ * Cross-call callback: write the L2 QOS_CFG MSR on the executing CPU.
+ * @arg points to a bool; true writes L2_QOS_CDP_ENABLE, false writes 0.
+ */
+static void l2_qos_cfg_update(void *arg)
+{
+	const bool cdp_on = *(bool *)arg;
+	u64 msr_val = 0ULL;
+
+	if (cdp_on)
+		msr_val = L2_QOS_CDP_ENABLE;
+
+	wrmsrl(IA32_L2_QOS_CFG, msr_val);
+}
+
+/* Report the delay_linear setting of the MBA resource. */
+static inline bool is_mba_linear(void)
+{
+	struct rdt_resource *mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+
+	return mba->membw.delay_linear;
+}
+
+/*
+ * set_cache_qos_cfg - Run the QOS_CFG update of a cache level on every domain
+ * @level:  RDT_RESOURCE_L3 or RDT_RESOURCE_L2.
+ * @enable: value handed to the update callback (true enables CDP).
+ *
+ * One CPU is picked from each domain of the resource and the matching
+ * l3/l2_qos_cfg_update() callback is executed on all of them.
+ *
+ * Return: 0 on success, -EINVAL for any other @level, -ENOMEM if the
+ * temporary cpumask cannot be allocated.
+ */
+static int set_cache_qos_cfg(int level, bool enable)
+{
+	void (*update)(void *arg);
+	struct rdt_resource *r_l;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu;
+
+	/*
+	 * Validate @level before allocating anything, so that the -EINVAL
+	 * return cannot leak the cpumask.
+	 */
+	if (level == RDT_RESOURCE_L3)
+		update = l3_qos_cfg_update;
+	else if (level == RDT_RESOURCE_L2)
+		update = l2_qos_cfg_update;
+	else
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	r_l = &rdt_resources_all[level];
+	list_for_each_entry(d, &r_l->domains, list) {
+		/* Pick one CPU from each domain instance to update MSR */
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+	}
+	cpu = get_cpu();
+	/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		update(&enable);
+	/* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+	smp_call_function_many(cpu_mask, update, &enable, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+/*
+ * Switch the MBA software controller, which lets the user specify
+ * bandwidth in MBps, on or off.
+ *
+ * The controller is only available when MBM is enabled and MBA uses a
+ * linear scale. Asking for the state that is already in effect is
+ * rejected as well.
+ *
+ * Return: 0 on success, -EINVAL otherwise.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+	struct rdt_resource *mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	struct rdt_domain *dom;
+
+	if (!is_mbm_enabled())
+		return -EINVAL;
+	if (!is_mba_linear())
+		return -EINVAL;
+	if (mba_sc == is_mba_sc(mba))
+		return -EINVAL;
+
+	mba->membw.mba_sc = mba_sc;
+
+	/* Re-initialise the control values of every domain for the new mode. */
+	list_for_each_entry(dom, &mba->domains, list)
+		setup_default_ctrlval(mba, dom->ctrl_val, dom->mbps_val);
+
+	return 0;
+}
+
+/*
+ * Enable CDP for the cache resource @level. @data_type and @code_type are
+ * the indices of the data and code resources that replace it.
+ *
+ * All three resources must be allocation capable. Once the QOS_CFG MSRs
+ * were updated, allocation is switched from the plain resource to the
+ * data/code pair.
+ *
+ * Return: 0 on success, -EINVAL if a resource is not allocation capable,
+ * or the error returned by set_cache_qos_cfg().
+ */
+static int cdp_enable(int level, int data_type, int code_type)
+{
+	struct rdt_resource *plain = &rdt_resources_all[level];
+	struct rdt_resource *data = &rdt_resources_all[data_type];
+	struct rdt_resource *code = &rdt_resources_all[code_type];
+	int err;
+
+	if (!plain->alloc_capable)
+		return -EINVAL;
+	if (!data->alloc_capable)
+		return -EINVAL;
+	if (!code->alloc_capable)
+		return -EINVAL;
+
+	err = set_cache_qos_cfg(level, true);
+	if (err)
+		return err;
+
+	plain->alloc_enabled = false;
+	data->alloc_enabled = true;
+	code->alloc_enabled = true;
+
+	return 0;
+}
+
+/*
+ * Enable CDP on L3: cdp_enable() for RDT_RESOURCE_L3 with the
+ * RDT_RESOURCE_L3DATA / RDT_RESOURCE_L3CODE pair. Returns its result.
+ */
+static int cdpl3_enable(void)
+{
+ return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE);
+}
+
+/*
+ * Enable CDP on L2: cdp_enable() for RDT_RESOURCE_L2 with the
+ * RDT_RESOURCE_L2DATA / RDT_RESOURCE_L2CODE pair. Returns its result.
+ */
+static int cdpl2_enable(void)
+{
+ return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
+ RDT_RESOURCE_L2CODE);
+}
+
+/*
+ * Disable CDP for the cache resource @level.
+ *
+ * The plain resource becomes allocation enabled again if it is allocation
+ * capable. The data/code pair is switched off and the QOS_CFG MSRs are
+ * rewritten only when the data resource is currently enabled; the result
+ * of that MSR update is not checked.
+ */
+static void cdp_disable(int level, int data_type, int code_type)
+{
+	struct rdt_resource *plain = &rdt_resources_all[level];
+	struct rdt_resource *data = &rdt_resources_all[data_type];
+	struct rdt_resource *code = &rdt_resources_all[code_type];
+
+	plain->alloc_enabled = plain->alloc_capable;
+
+	if (!data->alloc_enabled)
+		return;
+
+	data->alloc_enabled = false;
+	code->alloc_enabled = false;
+	set_cache_qos_cfg(level, false);
+}
+
+/*
+ * Disable CDP on L3: cdp_disable() for RDT_RESOURCE_L3 with the
+ * RDT_RESOURCE_L3DATA / RDT_RESOURCE_L3CODE pair.
+ */
+static void cdpl3_disable(void)
+{
+ cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
+}
+
+/*
+ * Disable CDP on L2: cdp_disable() for RDT_RESOURCE_L2 with the
+ * RDT_RESOURCE_L2DATA / RDT_RESOURCE_L2CODE pair.
+ */
+static void cdpl2_disable(void)
+{
+ cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
+}
+
+/*
+ * Turn CDP off on every cache level whose data resource is currently
+ * allocation enabled. L3 is handled before L2.
+ */
+static void cdp_disable_all(void)
+{
+	struct rdt_resource *l3_data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+	struct rdt_resource *l2_data = &rdt_resources_all[RDT_RESOURCE_L2DATA];
+
+	if (l3_data->alloc_enabled)
+		cdpl3_disable();
+
+	if (l2_data->alloc_enabled)
+		cdpl2_disable();
+}
+
+/*
+ * Parse the comma separated mount options in @data and apply them as they
+ * are encountered:
+ *   "cdp"      - enable CDP on L3
+ *   "cdpl2"    - enable CDP on L2
+ *   "mba_MBps" - enable the MBA software controller
+ *
+ * Parsing stops at the first empty token, unknown token, or option that
+ * cannot be applied; the offending token is logged. Options applied
+ * before the failure are not rolled back here. @data is modified in place
+ * by strsep().
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int parse_rdtgroupfs_options(char *data)
+{
+	char *token, *rest = data;
+	int err = 0;
+
+	while ((token = strsep(&rest, ",")) != NULL) {
+		if (!*token)
+			err = -EINVAL;
+		else if (!strcmp(token, "cdp"))
+			err = cdpl3_enable();
+		else if (!strcmp(token, "cdpl2"))
+			err = cdpl2_enable();
+		else if (!strcmp(token, "mba_MBps"))
+			err = set_mba_sc(true);
+		else
+			err = -EINVAL;
+
+		if (err)
+			break;
+	}
+
+	/* All tokens were consumed without an error. */
+	if (!err)
+		return 0;
+
+	pr_err("Invalid mount option \"%s\"\n", token);
+
+	return err;
+}
+
+/*
+ * Map a kernfs node to the rdtgroup it belongs to.
+ *
+ * rdtgroup directories can only be created in the root directory. So a
+ * node is either a directory, whose "priv" field points at the rdtgroup,
+ * or a file, whose parent directory carries that pointer. The "info"
+ * directory and its sub-directories have no rdtgroup; NULL is returned
+ * for them.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+	/* A file: the group is found through the containing directory. */
+	if (kernfs_type(kn) != KERNFS_DIR)
+		return kn->parent->priv;
+
+	/* "info" and the directories directly below it have no group. */
+	if (kn == kn_info || kn->parent == kn_info)
+		return NULL;
+
+	/* Every other directory stores its rdtgroup in "priv". */
+	return kn->priv;
+}
+
+/*
+ * Resolve @kn to its rdtgroup and take rdtgroup_mutex.
+ *
+ * Returns NULL in two different situations:
+ *  - @kn has no rdtgroup: nothing was taken, rdtgroup_mutex is NOT held;
+ *  - the group is flagged RDT_DELETED: the waitcount was incremented,
+ *    active protection was broken and rdtgroup_mutex IS held.
+ * Callers in this file invoke rdtgroup_kn_unlock() with the same @kn in
+ * either case, which handles both.
+ */
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return NULL;
+
+ /* Announce a waiter before blocking on the mutex; the unlock side uses this count to decide whether it may free the group. */
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Was this group deleted while we waited? */
+ if (rdtgrp->flags & RDT_DELETED)
+ return NULL;
+
+ return rdtgrp;
+}
+
+/*
+ * Counterpart of rdtgroup_kn_lock_live(): release rdtgroup_mutex, drop
+ * the waiter count and restore active protection on @kn.
+ *
+ * If this was the last waiter of a group flagged RDT_DELETED, the group
+ * is finally torn down here: pseudo-lock state is removed when the group
+ * is in a pseudo-lock mode, the reference on rdtgrp->kn is put and the
+ * rdtgroup is freed.
+ */
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ /* No group for this node: the lock side returned before taking anything. */
+ if (!rdtgrp)
+ return;
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+ kernfs_unbreak_active_protection(kn);
+ /* Drops a reference on the group's directory node before the group itself is freed. */
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
+/*
+ * Forward declaration: rdt_mount() below calls mkdir_mondata_all() before
+ * its definition appears further down in this file.
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **mon_data_kn);
+
+/*
+ * Mount callback of the resctrl file system.
+ *
+ * Runs with the CPU hotplug read lock and rdtgroup_mutex held. The mount
+ * options in @data are applied first, then the "info" directory and, on
+ * monitoring capable systems, the "mon_groups" and "mon_data" directories
+ * of the default group are created, pseudo-locking is initialised and the
+ * kernfs mount is performed. Only after all of that succeeded are the
+ * static keys enabled and the MBM overflow handlers set up.
+ *
+ * Every failure after option parsing started undoes what the options
+ * enabled (MBA software controller and CDP), so a failed mount does not
+ * influence the next attempt.
+ *
+ * Return: the root dentry, ERR_PTR(-EBUSY) if resctrl is already mounted,
+ * or another ERR_PTR() on failure.
+ */
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+				int flags, const char *unused_dev_name,
+				void *data)
+{
+	struct rdt_domain *dom;
+	struct rdt_resource *r;
+	struct dentry *dentry;
+	int ret;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+	/*
+	 * resctrl file system can only be mounted once.
+	 */
+	if (static_branch_unlikely(&rdt_enable_key)) {
+		dentry = ERR_PTR(-EBUSY);
+		goto out;
+	}
+
+	ret = parse_rdtgroupfs_options(data);
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_cdp;
+	}
+
+	closid_init();
+
+	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_cdp;
+	}
+
+	if (rdt_mon_capable) {
+		/*
+		 * No additional kernfs_get() on kn_mongrp or kn_mondata:
+		 * both are only ever released through kernfs_remove(), so
+		 * an extra reference would never be put and would leak the
+		 * nodes on every unmount.
+		 */
+		ret = mongroup_create_dir(rdtgroup_default.kn,
+					  NULL, "mon_groups",
+					  &kn_mongrp);
+		if (ret) {
+			dentry = ERR_PTR(ret);
+			goto out_info;
+		}
+
+		ret = mkdir_mondata_all(rdtgroup_default.kn,
+					&rdtgroup_default, &kn_mondata);
+		if (ret) {
+			dentry = ERR_PTR(ret);
+			goto out_mongrp;
+		}
+		rdtgroup_default.mon.mon_data_kn = kn_mondata;
+	}
+
+	ret = rdt_pseudo_lock_init();
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_mondata;
+	}
+
+	dentry = kernfs_mount(fs_type, flags, rdt_root,
+			      RDTGROUP_SUPER_MAGIC, NULL);
+	if (IS_ERR(dentry))
+		goto out_psl;
+
+	if (rdt_alloc_capable)
+		static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
+	if (rdt_mon_capable)
+		static_branch_enable_cpuslocked(&rdt_mon_enable_key);
+
+	if (rdt_alloc_capable || rdt_mon_capable)
+		static_branch_enable_cpuslocked(&rdt_enable_key);
+
+	if (is_mbm_enabled()) {
+		r = &rdt_resources_all[RDT_RESOURCE_L3];
+		list_for_each_entry(dom, &r->domains, list)
+			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+	}
+
+	goto out;
+
+out_psl:
+	rdt_pseudo_lock_release();
+out_mondata:
+	if (rdt_mon_capable)
+		kernfs_remove(kn_mondata);
+out_mongrp:
+	if (rdt_mon_capable)
+		kernfs_remove(kn_mongrp);
+out_info:
+	kernfs_remove(kn_info);
+out_cdp:
+	/*
+	 * Undo a "mba_MBps" option that was applied before the failure.
+	 * Otherwise the software controller would stay enabled and every
+	 * later mount using that option would be rejected by set_mba_sc().
+	 * The call returns -EINVAL without side effects when the
+	 * controller is not enabled, hence the result is ignored.
+	 */
+	set_mba_sc(false);
+	cdp_disable_all();
+out:
+	rdt_last_cmd_clear();
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+
+	return dentry;
+}
+
+/*
+ * Put every control value of resource @r back to r->default_ctrl, in the
+ * per-domain ctrl_val arrays as well as in hardware.
+ *
+ * Return: 0 on success, -ENOMEM if the temporary cpumask cannot be
+ * allocated.
+ */
+static int reset_all_ctrls(struct rdt_resource *r)
+{
+	struct msr_param params;
+	cpumask_var_t targets;
+	struct rdt_domain *dom;
+	int closid, this_cpu;
+
+	if (!zalloc_cpumask_var(&targets, GFP_KERNEL))
+		return -ENOMEM;
+
+	/* The MSR update covers the closid range [0, num_closid). */
+	params.res = r;
+	params.low = 0;
+	params.high = r->num_closid;
+
+	/*
+	 * Resource control is disabled for this resource by resetting the
+	 * control value of every closid in every domain to the default.
+	 * One CPU per domain is collected to perform the MSR writes.
+	 */
+	list_for_each_entry(dom, &r->domains, list) {
+		cpumask_set_cpu(cpumask_any(&dom->cpu_mask), targets);
+
+		for (closid = 0; closid < r->num_closid; closid++)
+			dom->ctrl_val[closid] = r->default_ctrl;
+	}
+
+	this_cpu = get_cpu();
+	/* Handle the local CPU directly when it is one of the targets. */
+	if (cpumask_test_cpu(this_cpu, targets))
+		rdt_ctrl_update(&params);
+	/* Let all the other target CPUs do their update. */
+	smp_call_function_many(targets, rdt_ctrl_update, &params, 1);
+	put_cpu();
+
+	free_cpumask_var(targets);
+
+	return 0;
+}
+
+/*
+ * True when allocation is supported, @r is a control group and task @t
+ * carries the closid of @r.
+ */
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+	if (!rdt_alloc_capable)
+		return false;
+
+	if (r->type != RDTCTRL_GROUP)
+		return false;
+
+	return t->closid == r->closid;
+}
+
+/*
+ * True when monitoring is supported, @r is a monitor group and task @t
+ * carries the rmid of @r.
+ */
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+	if (!rdt_mon_capable)
+		return false;
+
+	if (r->type != RDTMON_GROUP)
+		return false;
+
+	return t->rmid == r->mon.rmid;
+}
+
+/*
+ * Reassign tasks to group @to.
+ *
+ * With @from set, only the tasks whose closid or rmid matches @from are
+ * moved. With @from == NULL every task in the system is moved, which is
+ * what teardown uses.
+ *
+ * When @mask is not NULL, the CPUs on which moved tasks are currently
+ * running are recorded in it, so that the caller can limit the follow-up
+ * smp function call to the affected CPUs.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+				 struct cpumask *mask)
+{
+	struct task_struct *proc, *thread;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(proc, thread) {
+		/* Leave tasks alone that do not belong to @from. */
+		if (from && !is_closid_match(thread, from) &&
+		    !is_rmid_match(thread, from))
+			continue;
+
+		thread->closid = to->closid;
+		thread->rmid = to->mon.rmid;
+
+#ifdef CONFIG_SMP
+		/*
+		 * No barriers are needed on x86: task_cpu() and ->on_cpu
+		 * are written in the reverse order of the reads done here.
+		 * The result is only a hint, since the task may migrate or
+		 * be scheduled out before the smp function call happens.
+		 * In that case the call is pointless but has no other
+		 * side effect.
+		 */
+		if (mask && thread->on_cpu)
+			cpumask_set_cpu(task_cpu(thread), mask);
+#endif
+	}
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Release every group linked on @rdtgrp's mon.crdtgrp_list: free its
+ * rmid, unlink it from the list and free the structure.
+ */
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+	struct list_head *children = &rdtgrp->mon.crdtgrp_list;
+	struct rdtgroup *child, *next;
+
+	/* The _safe iterator is required as entries are freed on the way. */
+	list_for_each_entry_safe(child, next, children, mon.crdtgrp_list) {
+		free_rmid(child->mon.rmid);
+		list_del(&child->mon.crdtgrp_list);
+		kfree(child);
+	}
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ *
+ * All tasks are moved to the default group first. Then, for every group
+ * on rdt_all_groups, the child groups are freed; every group other than
+ * the default one additionally has its pseudo-lock state removed, gives
+ * its CPUs back to the default group, releases its rmid and is unlinked
+ * and freed. Finally the online CPUs are told to pick up the default
+ * group's closid/rmid and the "info", "mon_groups" and "mon_data"
+ * directories are removed.
+ */
+static void rmdir_all_sub(void)
+{
+ struct rdtgroup *rdtgrp, *tmp;
+
+ /* Move all tasks to the default resource group */
+ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+ list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Free any child rmids */
+ free_all_child_rdtgrp(rdtgrp);
+
+ /* Remove each rdtgroup other than root */
+ if (rdtgrp == &rdtgroup_default)
+ continue;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+
+ /*
+ * Give any CPUs back to the default group. We cannot copy
+ * cpu_online_mask because a CPU might have executed the
+ * offline callback already, but is still marked online.
+ */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ free_rmid(rdtgrp->mon.rmid);
+
+ /*
+ * NOTE(review): the group is freed here without looking at
+ * rdtgrp->waitcount - confirm no waiter can still hold a
+ * pointer to it at this point.
+ */
+ kernfs_remove(rdtgrp->kn);
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+ }
+ /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+ update_closid_rmid(cpu_online_mask, &rdtgroup_default);
+
+ kernfs_remove(kn_info);
+ kernfs_remove(kn_mongrp);
+ kernfs_remove(kn_mondata);
+}
+
+/*
+ * kill_sb callback of the resctrl file system: undo everything rdt_mount()
+ * and later user activity set up.
+ *
+ * Runs with the CPU hotplug read lock and rdtgroup_mutex held. The MBA
+ * software controller is switched off, the controls of all allocation
+ * enabled resources are reset, CDP is disabled, all groups and helper
+ * directories are removed, pseudo-locking is released, the default group
+ * returns to shareable mode and the static keys are disabled before the
+ * kernfs super block is killed.
+ */
+static void rdt_kill_sb(struct super_block *sb)
+{
+ struct rdt_resource *r;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Result ignored: set_mba_sc() returns -EINVAL when the controller is already off. */
+ set_mba_sc(false);
+
+ /* Put everything back to default values. */
+ for_each_alloc_enabled_rdt_resource(r)
+ reset_all_ctrls(r);
+ cdp_disable_all();
+ rmdir_all_sub();
+ rdt_pseudo_lock_release();
+ rdtgroup_default.mode = RDT_MODE_SHAREABLE;
+ static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_disable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_disable_cpuslocked(&rdt_enable_key);
+ kernfs_kill_sb(sb);
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+}
+
+/*
+ * File system type "resctrl": mounting is handled by rdt_mount() and
+ * super block teardown by rdt_kill_sb().
+ */
+static struct file_system_type rdt_fs_type = {
+ .name = "resctrl",
+ .mount = rdt_mount,
+ .kill_sb = rdt_kill_sb,
+};
+
+/*
+ * Create the read-only (0444) monitoring file @name below @parent_kn with
+ * kf_mondata_ops and @priv as private data, then hand its ownership to
+ * the mounting user. The file is removed again if that last step fails.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+		       void *priv)
+{
+	struct kernfs_node *file_kn;
+	int err;
+
+	file_kn = __kernfs_create_file(parent_kn, name, 0444,
+				       GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+				       &kf_mondata_ops, priv, NULL, NULL);
+	if (IS_ERR(file_kn))
+		return PTR_ERR(file_kn);
+
+	err = rdtgroup_kn_set_ugid(file_kn);
+	if (err)
+		kernfs_remove(file_kn);
+
+	return err;
+}
+
+/*
+ * Remove the "mon_<resource>_<domain id>" sub-directory from the mon_data
+ * directory of every ctrl_mon group and of every monitor group below it.
+ * Nothing is done when monitoring is not enabled for @r.
+ */
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+{
+	struct rdtgroup *ctrl_grp, *mon_grp;
+	struct list_head *children;
+	char name[32];
+
+	if (!r->mon_enabled)
+		return;
+
+	/* The directory name is the same for every group. */
+	sprintf(name, "mon_%s_%02d", r->name, dom_id);
+
+	list_for_each_entry(ctrl_grp, &rdt_all_groups, rdtgroup_list) {
+		kernfs_remove_by_name(ctrl_grp->mon.mon_data_kn, name);
+
+		children = &ctrl_grp->mon.crdtgrp_list;
+		list_for_each_entry(mon_grp, children, mon.crdtgrp_list)
+			kernfs_remove_by_name(mon_grp->mon.mon_data_kn, name);
+	}
+}
+
+/*
+ * Create the directory "mon_<resource>_<domain id>" for domain @d of
+ * resource @r below @parent_kn and add one file per monitoring event of
+ * @r to it. For MBM events mon_event_read() is additionally invoked once
+ * for @prgrp with its last argument set to true.
+ *
+ * The directory is activated only after all files were added; on failure
+ * it is removed again.
+ *
+ * Return: 0 on success, -EPERM if @r has no events, or another negative
+ * error code.
+ */
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+				struct rdt_domain *d,
+				struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+	union mon_data_bits priv;
+	struct kernfs_node *kn;
+	struct mon_evt *mevt;
+	struct rmid_read rr;
+	char name[32];
+	int ret;
+
+	sprintf(name, "mon_%s_%02d", r->name, d->id);
+	/* create the directory */
+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	/*
+	 * No additional kernfs_get() is taken: this directory is only ever
+	 * released through kernfs_remove()/kernfs_remove_by_name(), so an
+	 * extra reference would never be put and would leak the node.
+	 */
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	if (WARN_ON(list_empty(&r->evt_list))) {
+		ret = -EPERM;
+		goto out_destroy;
+	}
+
+	/* The private data of each file encodes resource, domain and event. */
+	priv.u.rid = r->rid;
+	priv.u.domid = d->id;
+	list_for_each_entry(mevt, &r->evt_list, list) {
+		priv.u.evtid = mevt->evtid;
+		ret = mon_addfile(kn, mevt->name, priv.priv);
+		if (ret)
+			goto out_destroy;
+
+		if (is_mbm_event(mevt->evtid))
+			mon_event_read(&rr, d, prgrp, mevt->evtid, true);
+	}
+	kernfs_activate(kn);
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn);
+	return ret;
+}
+
+/*
+ * Add the mon_data sub-directory for domain @d of resource @r to every
+ * "ctrl_mon" group and to every "monitor" group below it. Nothing is done
+ * when monitoring is not enabled for @r. Failures of individual
+ * directories are not reported.
+ */
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+				    struct rdt_domain *d)
+{
+	struct rdtgroup *ctrl_grp, *mon_grp;
+
+	if (!r->mon_enabled)
+		return;
+
+	list_for_each_entry(ctrl_grp, &rdt_all_groups, rdtgroup_list) {
+		mkdir_mondata_subdir(ctrl_grp->mon.mon_data_kn, d, r, ctrl_grp);
+
+		list_for_each_entry(mon_grp, &ctrl_grp->mon.crdtgrp_list,
+				    mon.crdtgrp_list)
+			mkdir_mondata_subdir(mon_grp->mon.mon_data_kn, d, r,
+					     mon_grp);
+	}
+}
+
+/*
+ * Create the mon_data sub-directory of every domain of resource @r below
+ * @parent_kn for group @prgrp. Stops at the first failure; directories
+ * created before it are left for the caller to clean up.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+				       struct rdt_resource *r,
+				       struct rdtgroup *prgrp)
+{
+	struct rdt_domain *dom;
+	int err = 0;
+
+	list_for_each_entry(dom, &r->domains, list) {
+		err = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * Create the "mon_data" directory, which holds the monitored data.
+ *
+ * mon_data contains one directory per domain, named
+ * mon_<domain_name>_<domain_id>. For example, with L3 domains:
+ *	./mon_data:
+ *	mon_L3_00
+ *	mon_L3_01
+ *	mon_L3_02
+ *	...
+ *
+ * Every domain directory holds one file per event:
+ *	./mon_L3_00/:
+ *	llc_occupancy
+ *
+ * If @dest_kn is not NULL it receives the node of the new directory. On
+ * failure the directory is removed again, so @dest_kn must not be used
+ * when an error is returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+			     struct rdtgroup *prgrp,
+			     struct kernfs_node **dest_kn)
+{
+	struct kernfs_node *mon_data;
+	struct rdt_resource *res;
+	int err;
+
+	/* The mon_data directory itself comes first. */
+	err = mongroup_create_dir(parent_kn, NULL, "mon_data", &mon_data);
+	if (err)
+		return err;
+
+	if (dest_kn)
+		*dest_kn = mon_data;
+
+	/*
+	 * Then one sub-directory per domain. All events of a domain such as
+	 * L3 are grouped into the resource whose domain is L3.
+	 */
+	for_each_mon_enabled_rdt_resource(res) {
+		err = mkdir_mondata_subdir_alldom(mon_data, res, prgrp);
+		if (err) {
+			kernfs_remove(mon_data);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * cbm_ensure_valid - Enforce validity on provided CBM
+ * @_val:	Candidate CBM, updated in place
+ * @r:		RDT resource to which the CBM belongs
+ *
+ * The provided CBM represents all cache portions available for use. This
+ * may be represented by a bitmap that does not consist of contiguous ones
+ * and thus be an invalid CBM.
+ * Here the provided CBM is forced to be a valid CBM by only considering
+ * the first set of contiguous bits as valid and clearing all other bits.
+ * The intention here is to provide a valid default CBM with which a new
+ * resource group is initialized. The user can follow this with a
+ * modification to the CBM if the default does not satisfy the
+ * requirements.
+ *
+ * An all-zero CBM is left untouched.
+ */
+static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
+{
+	unsigned int cbm_len = r->cache.cbm_len;
+	unsigned long first_bit, zero_bit;
+	/*
+	 * The bitmap helpers operate on unsigned long words. Work on a
+	 * local copy instead of casting @_val: on 64-bit an unsigned long
+	 * is twice as wide as the u32 behind @_val, so accessing it through
+	 * an unsigned long pointer would read and write four bytes beyond
+	 * the object.
+	 */
+	unsigned long val = *_val;
+
+	if (!val)
+		return;
+
+	first_bit = find_first_bit(&val, cbm_len);
+	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+	/* Clear any remaining bits to ensure contiguous region */
+	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
+
+	*_val = (u32)val;
+}
+
+/**
+ * rdtgroup_init_alloc - Initialize the new RDT group's allocations
+ * @rdtgrp: the resource group that is being created
+ *
+ * A new RDT group is being created on an allocation capable (CAT)
+ * supporting system. Set this group up to start off with all usable
+ * allocations. That is, all shareable and unused bits.
+ *
+ * All-zero CBM is invalid. If there are no more shareable bits available
+ * on any domain then the entire allocation will fail.
+ *
+ * The MBA resource is skipped; only CBM based cache resources are
+ * initialized here.
+ *
+ * Return: 0 on success, -ENOSPC if a domain cannot provide at least
+ * min_cbm_bits bits, or the error returned by update_domains().
+ */
+static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
+{
+	u32 used_b = 0, unused_b = 0;
+	u32 closid = rdtgrp->closid;
+	struct rdt_resource *r;
+	unsigned long tmp_cbm;
+	enum rdtgrp_mode mode;
+	struct rdt_domain *d;
+	int i, ret;
+	u32 *ctrl;
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		/*
+		 * Only initialize default allocations for CBM cache
+		 * resources
+		 */
+		if (r->rid == RDT_RESOURCE_MBA)
+			continue;
+		list_for_each_entry(d, &r->domains, list) {
+			d->have_new_ctrl = false;
+			d->new_ctrl = r->cache.shareable_bits;
+			used_b = r->cache.shareable_bits;
+			ctrl = d->ctrl_val;
+			for (i = 0; i < closids_supported(); i++, ctrl++) {
+				if (closid_allocated(i) && i != closid) {
+					mode = rdtgroup_mode_by_closid(i);
+					/*
+					 * Skip a group that is in locksetup
+					 * mode, but keep scanning: stopping
+					 * here would ignore the bits used by
+					 * all higher closids and could hand
+					 * them to the new group.
+					 */
+					if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
+						continue;
+					used_b |= *ctrl;
+					if (mode == RDT_MODE_SHAREABLE)
+						d->new_ctrl |= *ctrl;
+				}
+			}
+			/* Bits of a pseudo-locked region count as used. */
+			if (d->plr && d->plr->cbm > 0)
+				used_b |= d->plr->cbm;
+			unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
+			unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
+			d->new_ctrl |= unused_b;
+			/*
+			 * Force the initial CBM to be valid, user can
+			 * modify the CBM based on system availability.
+			 */
+			cbm_ensure_valid(&d->new_ctrl, r);
+			/*
+			 * Assign the u32 CBM to an unsigned long to ensure
+			 * that bitmap_weight() does not access out-of-bound
+			 * memory.
+			 */
+			tmp_cbm = d->new_ctrl;
+			if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) <
+			    r->cache.min_cbm_bits) {
+				rdt_last_cmd_printf("no space on %s:%d\n",
+						    r->name, d->id);
+				return -ENOSPC;
+			}
+			d->have_new_ctrl = true;
+		}
+	}
+
+	for_each_alloc_enabled_rdt_resource(r) {
+		/*
+		 * Only initialize default allocations for CBM cache
+		 * resources
+		 */
+		if (r->rid == RDT_RESOURCE_MBA)
+			continue;
+		ret = update_domains(r, rdtgrp->closid);
+		if (ret < 0) {
+			rdt_last_cmd_puts("failed to initialize allocations\n");
+			return ret;
+		}
+		rdtgrp->mode = RDT_MODE_SHAREABLE;
+	}
+
+	return 0;
+}
+
+/*
+ * Common part of creating a resource group directory.
+ *
+ * Locks the parent group through @prgrp_kn, allocates a new rdtgroup of
+ * type @rtype, creates its directory @name below @parent_kn, populates it
+ * with the files selected by RFTYPE_BASE | BIT(RF_CTRLSHIFT + @rtype)
+ * and, on monitoring capable systems, allocates an RMID and creates the
+ * "mon_data" directory.
+ *
+ * On success 0 is returned, *@r points to the new group and @prgrp_kn is
+ * still locked; the caller must unlock it. On failure everything is
+ * undone, @prgrp_kn is unlocked and a negative error code is returned.
+ * Note that *@r is written as soon as the group is allocated, so after a
+ * failure it may point to freed memory and must not be used.
+ */
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode,
+ enum rdt_group_type rtype, struct rdtgroup **r)
+{
+ struct rdtgroup *prdtgrp, *rdtgrp;
+ struct kernfs_node *kn;
+ uint files = 0;
+ int ret;
+
+ prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+ rdt_last_cmd_clear();
+ if (!prdtgrp) {
+ ret = -ENODEV;
+ rdt_last_cmd_puts("directory was removed\n");
+ goto out_unlock;
+ }
+
+ /* Monitor groups cannot be created below a parent that is in a pseudo-lock mode. */
+ if (rtype == RDTMON_GROUP &&
+ (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto out_unlock;
+ }
+
+ /* allocate the rdtgroup. */
+ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+ if (!rdtgrp) {
+ ret = -ENOSPC;
+ rdt_last_cmd_puts("kernel out of memory\n");
+ goto out_unlock;
+ }
+ *r = rdtgrp;
+ rdtgrp->mon.parent = prdtgrp;
+ rdtgrp->type = rtype;
+ INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
+
+ /* kernfs creates the directory for rdtgrp */
+ kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ rdt_last_cmd_puts("kernfs create error\n");
+ goto out_free_rgrp;
+ }
+ rdtgrp->kn = kn;
+
+ /*
+ * kernfs_remove() will drop the reference count on "kn" which
+ * will free it. But we still need it to stick around for the
+ * rdtgroup_kn_unlock(kn) call below. Take one extra reference
+ * here, which will be dropped inside rdtgroup_kn_unlock().
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs perm error\n");
+ goto out_destroy;
+ }
+
+ /* Base files plus the flag bit that corresponds to the group type. */
+ files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+ ret = rdtgroup_add_files(kn, files);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs fill error\n");
+ goto out_destroy;
+ }
+
+ if (rdt_mon_capable) {
+ /* alloc_rmid() returns the new RMID, or a negative value on failure. */
+ ret = alloc_rmid();
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of RMIDs\n");
+ goto out_destroy;
+ }
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ goto out_idfree;
+ }
+ }
+ kernfs_activate(kn);
+
+ /*
+ * The caller unlocks the prgrp_kn upon success.
+ */
+ return 0;
+
+ /* Error unwinding: each label undoes one more step, in reverse order of setup. */
+out_idfree:
+ free_rmid(rdtgrp->mon.rmid);
+out_destroy:
+ kernfs_remove(rdtgrp->kn);
+out_free_rgrp:
+ kfree(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(prgrp_kn);
+ return ret;
+}
+
+/*
+ * Undo a successful mkdir_rdt_prepare() when a later step of group
+ * creation fails: remove the kernfs directory, return the RMID and
+ * free the rdtgroup.
+ */
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+	kernfs_remove(rgrp->kn);
+	free_rmid(rgrp->mon.rmid);
+	/*
+	 * mkdir_rdt_prepare() took an extra reference on the directory
+	 * with kernfs_get(). Nothing else drops it on this path, so
+	 * release it before the group is freed.
+	 */
+	kernfs_put(rgrp->kn);
+	kfree(rgrp);
+}
+
+/*
+ * Create a monitor-only group inside the "mon_groups" directory of a
+ * ctrl_mon group. The new group monitors a subset of the tasks and
+ * CPUs of its parent and uses the parent's CLOSID.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+			      struct kernfs_node *prgrp_kn,
+			      const char *name,
+			      umode_t mode)
+{
+	struct rdtgroup *child, *owner;
+	int err;
+
+	err = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
+				&child);
+	if (err != 0)
+		return err;
+
+	owner = child->mon.parent;
+	child->closid = owner->closid;
+
+	/* Let the parent ctrl_mon group keep track of its new child. */
+	list_add_tail(&child->mon.crdtgrp_list, &owner->mon.crdtgrp_list);
+
+	/* mkdir_rdt_prepare() left the parent locked on success. */
+	rdtgroup_kn_unlock(prgrp_kn);
+	return 0;
+}
+
+/*
+ * Create a ctrl_mon group below the root directory. Such a group gets
+ * its own CLOSID and can be used to allocate and to monitor resources.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+				   struct kernfs_node *prgrp_kn,
+				   const char *name, umode_t mode)
+{
+	struct kernfs_node *dir;
+	struct rdtgroup *grp;
+	u32 id;
+	int rc;
+
+	rc = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
+			       &grp);
+	if (rc)
+		return rc;
+
+	dir = grp->kn;
+
+	rc = closid_alloc();
+	if (rc < 0) {
+		rdt_last_cmd_puts("out of CLOSIDs\n");
+		goto undo_prepare;
+	}
+	id = rc;
+	grp->closid = id;
+
+	rc = rdtgroup_init_alloc(grp);
+	if (rc < 0)
+		goto undo_closid;
+
+	list_add(&grp->rdtgroup_list, &rdt_all_groups);
+
+	if (rdt_mon_capable) {
+		/*
+		 * Every ctrl_mon group carries an initially empty
+		 * "mon_groups" directory in which monitor groups can
+		 * be created later.
+		 */
+		rc = mongroup_create_dir(dir, NULL, "mon_groups", NULL);
+		if (rc) {
+			rdt_last_cmd_puts("kernfs subdir error\n");
+			goto undo_list;
+		}
+	}
+
+	goto unlock;
+
+undo_list:
+	list_del(&grp->rdtgroup_list);
+undo_closid:
+	closid_free(id);
+undo_prepare:
+	mkdir_rdt_prepare_clean(grp);
+unlock:
+	rdtgroup_kn_unlock(prgrp_kn);
+	return rc;
+}
+
+/*
+ * Monitor groups may only be created within a directory called
+ * "mon_groups", which is present in every ctrl_mon group. Check that
+ * @kn is such a directory and that @name is acceptable:
+ *
+ *   1. The directory must be named "mon_groups".
+ *   2. The monitor group itself must not be named "mon_groups", so
+ *      that a "mon_groups" directory always has a ctrl_mon group as
+ *      its parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+	if (strcmp(kn->name, "mon_groups") != 0)
+		return false;
+
+	return strcmp(name, "mon_groups") != 0;
+}
+
+/*
+ * kernfs mkdir callback: decide from the parent directory which kind
+ * of resource group is being created and dispatch accordingly.
+ */
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			  umode_t mode)
+{
+	/* Names containing '\n' are refused to avoid unparsable situations. */
+	if (strchr(name, '\n') != NULL)
+		return -EINVAL;
+
+	/*
+	 * A directory made directly in the root, with RDT allocation
+	 * supported, becomes a control and monitoring group.
+	 */
+	if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+		return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+
+	/*
+	 * Anything else is only allowed when RDT monitoring is supported
+	 * and the parent is a valid "mon_groups" directory.
+	 */
+	if (!rdt_mon_capable || !is_mon_groups(parent_kn, name))
+		return -EPERM;
+
+	return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+}
+
+/*
+ * Remove a monitor group: hand its tasks and CPUs back to the parent
+ * ctrl_mon group, release its RMID and remove its kernfs directory.
+ * Always returns 0.
+ */
+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+			      cpumask_var_t tmpmask)
+{
+	struct rdtgroup *owner = rdtgrp->mon.parent;
+	int i;
+
+	/* Tasks of the dying group go back to the parent group. */
+	rdt_move_group_tasks(rdtgrp, owner, tmpmask);
+
+	/* The group's CPUs get the parent's RMID as their default first. */
+	for_each_cpu(i, &rdtgrp->cpu_mask) {
+		per_cpu(pqr_state.default_rmid, i) = owner->mon.rmid;
+	}
+
+	/*
+	 * Then the MSR is updated on those CPUs and on the CPUs which
+	 * have a moved task running on them.
+	 */
+	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+	update_closid_rmid(tmpmask, NULL);
+
+	rdtgrp->flags = RDT_DELETED;
+	free_rmid(rdtgrp->mon.rmid);
+
+	/* Unlink the group from the list kept by its parent. */
+	WARN_ON(list_empty(&owner->mon.crdtgrp_list));
+	list_del(&rdtgrp->mon.crdtgrp_list);
+
+	/*
+	 * Hold one more reference across the removal; it is dropped when
+	 * rdtgroup_kn_unlock() frees the rdtgroup.
+	 */
+	kernfs_get(kn);
+	kernfs_remove(rdtgrp->kn);
+
+	return 0;
+}
+
+/*
+ * Mark a ctrl_mon group as deleted, unlink it from the list of all
+ * groups and remove its kernfs directory. Always returns 0.
+ */
+static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
+ struct rdtgroup *rdtgrp)
+{
+ rdtgrp->flags = RDT_DELETED;
+ list_del(&rdtgrp->rdtgroup_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+ return 0;
+}
+
+/*
+ * Remove a ctrl_mon group: return its tasks and CPUs to the default
+ * group, release its CLOSID and RMID together with the RMIDs of its
+ * child monitor groups, then remove the group itself. Always returns 0.
+ */
+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+			       cpumask_var_t tmpmask)
+{
+	struct rdtgroup *dflt = &rdtgroup_default;
+	int i;
+
+	/* Tasks go back to the default group ... */
+	rdt_move_group_tasks(rdtgrp, dflt, tmpmask);
+
+	/* ... and so do the CPUs. */
+	cpumask_or(&dflt->cpu_mask, &dflt->cpu_mask, &rdtgrp->cpu_mask);
+
+	/* Per cpu closid and rmid of the moved CPUs are updated first. */
+	for_each_cpu(i, &rdtgrp->cpu_mask) {
+		per_cpu(pqr_state.default_closid, i) = dflt->closid;
+		per_cpu(pqr_state.default_rmid, i) = dflt->mon.rmid;
+	}
+
+	/*
+	 * Then the MSR is updated on the moved CPUs and on the CPUs
+	 * which have a moved task running on them.
+	 */
+	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+	update_closid_rmid(tmpmask, NULL);
+
+	closid_free(rdtgrp->closid);
+	free_rmid(rdtgrp->mon.rmid);
+
+	/* The RMIDs of all child monitor groups are released as well. */
+	free_all_child_rdtgrp(rdtgrp);
+
+	rdtgroup_ctrl_remove(kn, rdtgrp);
+
+	return 0;
+}
+
+/*
+ * kernfs rmdir callback.
+ *
+ * A ctrl_mon group can be removed when its parent is the root directory,
+ * a monitor group when its parent is a valid "mon_groups" directory.
+ * Everything else is refused with -EPERM.
+ */
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+	struct kernfs_node *dir = kn->parent;
+	struct rdtgroup *grp;
+	cpumask_var_t scratch;
+	int rc = -EPERM;
+
+	if (!zalloc_cpumask_var(&scratch, GFP_KERNEL))
+		return -ENOMEM;
+
+	grp = rdtgroup_kn_lock_live(kn);
+	if (!grp)
+		goto unlock;
+
+	if (grp->type == RDTCTRL_GROUP && dir == rdtgroup_default.kn) {
+		/* Pseudo-lock related groups take the plain removal path. */
+		if (grp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+		    grp->mode == RDT_MODE_PSEUDO_LOCKED)
+			rc = rdtgroup_ctrl_remove(kn, grp);
+		else
+			rc = rdtgroup_rmdir_ctrl(kn, grp, scratch);
+	} else if (grp->type == RDTMON_GROUP &&
+		   is_mon_groups(dir, kn->name)) {
+		rc = rdtgroup_rmdir_mon(kn, grp, scratch);
+	}
+
+unlock:
+	rdtgroup_kn_unlock(kn);
+	free_cpumask_var(scratch);
+	return rc;
+}
+
+/*
+ * kernfs show_options callback: emit ",cdp", ",cdpl2" and ",mba_MBps"
+ * for the features that are currently in effect. Always returns 0.
+ */
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+	struct rdt_resource *l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+	struct rdt_resource *l2data = &rdt_resources_all[RDT_RESOURCE_L2DATA];
+	struct rdt_resource *mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+
+	if (l3data->alloc_enabled)
+		seq_puts(seq, ",cdp");
+
+	if (l2data->alloc_enabled)
+		seq_puts(seq, ",cdpl2");
+
+	if (is_mba_sc(mba))
+		seq_puts(seq, ",mba_MBps");
+
+	return 0;
+}
+
+/*
+ * kernfs syscall callbacks of the resctrl hierarchy; handed to
+ * kernfs_create_root() in rdtgroup_setup_root().
+ */
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+ .mkdir = rdtgroup_mkdir,
+ .rmdir = rdtgroup_rmdir,
+ .show_options = rdtgroup_show_options,
+};
+
+/*
+ * Create the kernfs root of the resctrl filesystem and set up the
+ * default group (CLOSID 0, RMID 0) that is backed by the root directory.
+ *
+ * Return: 0 on success or a negative errno.
+ */
+static int __init rdtgroup_setup_root(void)
+{
+	int err;
+
+	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+				      KERNFS_ROOT_CREATE_DEACTIVATED |
+				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+				      &rdtgroup_default);
+	if (IS_ERR(rdt_root))
+		return PTR_ERR(rdt_root);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	rdtgroup_default.closid = 0;
+	rdtgroup_default.mon.rmid = 0;
+	rdtgroup_default.type = RDTCTRL_GROUP;
+	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
+
+	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+	err = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
+	if (!err) {
+		/* The root was created deactivated; make it visible now. */
+		rdtgroup_default.kn = rdt_root->kn;
+		kernfs_activate(rdtgroup_default.kn);
+	} else {
+		kernfs_destroy_root(rdt_root);
+	}
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	return err;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Sets up the resctrl file system: creates the kernfs root and the files
+ * in the root directory, creates the sysfs mount point and registers
+ * the filesystem type. The resctrl debugfs directory is created too.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+	int err;
+
+	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
+		     sizeof(last_cmd_status_buf));
+
+	err = rdtgroup_setup_root();
+	if (err)
+		return err;
+
+	err = sysfs_create_mount_point(fs_kobj, "resctrl");
+	if (err) {
+		kernfs_destroy_root(rdt_root);
+		return err;
+	}
+
+	err = register_filesystem(&rdt_fs_type);
+	if (err) {
+		sysfs_remove_mount_point(fs_kobj, "resctrl");
+		kernfs_destroy_root(rdt_root);
+		return err;
+	}
+
+	/*
+	 * Creating the resctrl debugfs directory here makes it show up in
+	 * debugfs before the resctrl filesystem is mounted. That may not
+	 * be ideal, although it also allows debugging RDT before resctrl
+	 * is mounted.
+	 *
+	 * It is not done in rdt_mount() because of lock ordering:
+	 * rdt_mount() holds rdtgroup_mutex, and creating the debugfs
+	 * directory takes &sb->s_type->i_mutex_key (the lockdep class of
+	 * inode->i_rwsem). Other filesystem interactions (e.g.
+	 * SyS_getdents) order the locks as
+	 *	&sb->s_type->i_mutex_key --> &mm->mmap_sem
+	 * and mmap(), called with &mm->mmap_sem held, takes
+	 * rdtgroup_mutex, which adds
+	 *	&mm->mmap_sem --> rdtgroup_mutex
+	 * Together these dependencies may cause a deadlock. Creating the
+	 * directory here avoids that dependency (file operations cannot
+	 * actually happen before the filesystem is mounted, but there is
+	 * no known way to tell lockdep so).
+	 */
+	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
+
+	return 0;
+}
+
+/*
+ * Tear down what rdtgroup_init() set up: remove the resctrl debugfs
+ * directory, unregister the filesystem type, remove the sysfs mount
+ * point and destroy the kernfs root.
+ */
+void __exit rdtgroup_exit(void)
+{
+ debugfs_remove_recursive(debugfs_resctrl);
+ unregister_filesystem(&rdt_fs_type);
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+ kernfs_destroy_root(rdt_root);
+}
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 0000000..3fed388
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ *         {}.
+ *
+ * Return the first entry of the passed x86_cpu_id match table that the
+ * current CPU matches, otherwise NULL. A table entry holds vendor
+ * (X86_VENDOR_*), family, model and feature bits, or the respective
+ * wildcard.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
+ *
+ * The comparison is always done against the boot cpu, on the assumption
+ * that models and features are consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+	struct cpuinfo_x86 *cpu = &boot_cpu_data;
+	const struct x86_cpu_id *id = match;
+
+	/* An entry with all four fields zero terminates the table. */
+	while (id->vendor | id->family | id->model | id->feature) {
+		if ((id->vendor == X86_VENDOR_ANY ||
+		     cpu->x86_vendor == id->vendor) &&
+		    (id->family == X86_FAMILY_ANY ||
+		     cpu->x86 == id->family) &&
+		    (id->model == X86_MODEL_ANY ||
+		     cpu->x86_model == id->model) &&
+		    (id->feature == X86_FEATURE_ANY ||
+		     cpu_has(cpu, id->feature)))
+			return id;
+		id++;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
new file mode 100644
index 0000000..bcc7c54
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y = mce.o mce-severity.o mce-genpool.o
+
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI) += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 0000000..97685a0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,363 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char mce_helper[128];
+
+/*
+ * Notifier callback: append the MCE record passed in @data to the
+ * mcelog buffer and wake up readers waiting on /dev/mcelog.
+ * Always returns NOTIFY_OK.
+ */
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *src = data;
+	unsigned int slot;
+
+	mutex_lock(&mce_chrdev_read_mutex);
+
+	slot = mcelog.next;
+
+	if (slot < MCE_LOG_LEN) {
+		mcelog.next = slot + 1;
+
+		memcpy(&mcelog.entry[slot], src, sizeof(*src));
+		mcelog.entry[slot].finished = 1;
+
+		/* wake processes polling /dev/mcelog */
+		wake_up_interruptible(&mce_chrdev_wait);
+	} else {
+		/*
+		 * The buffer is full: the new entry is discarded, on the
+		 * assumption that the earlier errors are the more
+		 * interesting ones. Only the overflow is recorded.
+		 */
+		set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+	}
+
+	mutex_unlock(&mce_chrdev_read_mutex);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+/*
+ * Work handler: start the user mode helper named in mce_helper without
+ * waiting for it (UMH_NO_WAIT).
+ */
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+/* Work item that runs mce_do_trigger(); queued by mce_work_trigger(). */
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+/* Queue the trigger work, but only when a helper program is configured. */
+void mce_work_trigger(void)
+{
+	if (!mce_helper[0])
+		return;
+
+	schedule_work(&mce_trigger_work);
+}
+
+/*
+ * sysfs show handler of the "trigger" attribute: the configured helper
+ * path followed by a newline. Returns the number of bytes written.
+ */
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", mce_helper);
+}
+
+/*
+ * sysfs store handler of the "trigger" attribute: copy the helper path
+ * into mce_helper, truncated to the size of that buffer and cut at the
+ * first newline. Returns the length of the stored path, plus one when a
+ * newline was stripped.
+ */
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+				const char *buf, size_t siz)
+{
+	char *nl;
+
+	strncpy(mce_helper, buf, sizeof(mce_helper));
+	mce_helper[sizeof(mce_helper) - 1] = 0;
+
+	nl = strchr(mce_helper, '\n');
+	if (!nl)
+		return strlen(mce_helper);
+
+	*nl = 0;
+	return strlen(mce_helper) + 1;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+/*
+ * open() handler for /dev/mcelog.
+ *
+ * Fails with -EBUSY when the device is already open exclusively, or
+ * when O_EXCL is requested while the device is open at all.
+ */
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+	bool want_excl = file->f_flags & O_EXCL;
+	bool busy;
+
+	spin_lock(&mce_chrdev_state_lock);
+
+	busy = mce_chrdev_open_exclu ||
+	       (mce_chrdev_open_count && want_excl);
+	if (!busy) {
+		if (want_excl)
+			mce_chrdev_open_exclu = 1;
+		mce_chrdev_open_count++;
+	}
+
+	spin_unlock(&mce_chrdev_state_lock);
+
+	if (busy)
+		return -EBUSY;
+
+	return nonseekable_open(inode, file);
+}
+
+/*
+ * release() handler for /dev/mcelog: drop one open count and clear the
+ * exclusive-open flag, under mce_chrdev_state_lock. Always returns 0.
+ */
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static int mce_apei_read_done;
+
+/*
+ * Fetch one MCE record of a previous boot from persistent storage via
+ * APEI ERST and copy it to user space.
+ *
+ * On success *@ubuf is advanced past the record and 0 is returned. 0 is
+ * also returned, with *@ubuf unchanged, when there is no record to read.
+ */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+	struct mce rec;
+	u64 id;
+	int ret;
+
+	if (usize < sizeof(rec))
+		return -EINVAL;
+
+	ret = apei_read_mce(&rec, &id);
+	if (ret <= 0) {
+		/* Error or no more MCE record: stop looking at ERST. */
+		mce_apei_read_done = 1;
+		/*
+		 * With ERST disabled mce_chrdev_read() should report
+		 * "no record" rather than "no device".
+		 */
+		return ret == -ENODEV ? 0 : ret;
+	}
+
+	if (copy_to_user(*ubuf, &rec, sizeof(rec)))
+		return -EFAULT;
+
+	/*
+	 * Strictly speaking the record should only be cleared once
+	 * /sbin/mcelog has flushed it to disk or sent it over the network,
+	 * but no interface exists for that, so it is cleared right away
+	 * to avoid duplicates.
+	 */
+	ret = apei_clear_mce(id);
+	if (ret) {
+		mce_apei_read_done = 1;
+		return ret;
+	}
+
+	*ubuf += sizeof(rec);
+
+	return 0;
+}
+
+/*
+ * read() handler for /dev/mcelog.
+ *
+ * While ERST records are still pending, one of them is returned per
+ * call. Afterwards the whole mcelog buffer is copied out and cleared;
+ * this needs offset 0 and a buffer large enough for MCE_LOG_LEN records.
+ *
+ * Return: number of bytes copied, or a negative errno.
+ */
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	char __user *dst = ubuf;
+	unsigned int count, idx;
+	int err;
+
+	mutex_lock(&mce_chrdev_read_mutex);
+
+	if (!mce_apei_read_done) {
+		err = __mce_read_apei(&dst, usize);
+		/* Done on error, or when a record has been copied out. */
+		if (err || dst != ubuf)
+			goto unlock;
+	}
+
+	/* Only supports full reads right now */
+	if (*off != 0 || usize < MCE_LOG_LEN * sizeof(struct mce)) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	count = mcelog.next;
+	err = 0;
+
+	for (idx = 0; idx < count; idx++) {
+		err |= copy_to_user(dst, &mcelog.entry[idx],
+				    sizeof(struct mce));
+		dst += sizeof(struct mce);
+	}
+
+	/* The log is emptied whether or not every copy succeeded. */
+	memset(mcelog.entry, 0, count * sizeof(struct mce));
+	mcelog.next = 0;
+
+	if (err)
+		err = -EFAULT;
+
+unlock:
+	mutex_unlock(&mce_chrdev_read_mutex);
+
+	return err ? err : dst - ubuf;
+}
+
+/*
+ * poll() handler for /dev/mcelog: readable when the mcelog buffer holds
+ * entries, or when ERST has not been read to the end yet and reports
+ * records.
+ */
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &mce_chrdev_wait, wait);
+
+	if (READ_ONCE(mcelog.next) ||
+	    (!mce_apei_read_done && apei_check_mce()))
+		return EPOLLIN | EPOLLRDNORM;
+
+	return 0;
+}
+
+/*
+ * ioctl() handler for /dev/mcelog; requires CAP_SYS_ADMIN.
+ *
+ * MCE_GET_RECORD_LEN and MCE_GET_LOG_LEN report the record size and the
+ * log capacity, MCE_GETCLEAR_FLAGS returns the log flags and resets
+ * them to 0. Any other command yields -ENOTTY.
+ */
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
+{
+	int __user *p = (int __user *)arg;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (cmd == MCE_GET_RECORD_LEN)
+		return put_user(sizeof(struct mce), p);
+
+	if (cmd == MCE_GET_LOG_LEN)
+		return put_user(MCE_LOG_LEN, p);
+
+	if (cmd == MCE_GETCLEAR_FLAGS) {
+		unsigned flags;
+
+		/* Retry until the value read is the one that was cleared. */
+		for (;;) {
+			flags = mcelog.flags;
+			if (cmpxchg(&mcelog.flags, flags, 0) == flags)
+				break;
+		}
+
+		return put_user(flags, p);
+	}
+
+	return -ENOTTY;
+}
+
+/* Add @nb to mce_injector_chain, the chain called by mce_chrdev_write(). */
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
+
+/* Remove @nb from mce_injector_chain. */
+void mce_unregister_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
+
+/*
+ * write() handler for /dev/mcelog: inject a user supplied MCE record.
+ *
+ * At most sizeof(struct mce) bytes are taken from @ubuf and the record
+ * is handed to the notifiers registered on mce_injector_chain.
+ *
+ * Return: the number of bytes consumed, -EPERM without CAP_SYS_ADMIN,
+ * -EIO when the boot CPU lacks MCE or MCA, -EFAULT when the user buffer
+ * cannot be read and -EINVAL when the target CPU is not online.
+ */
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	struct mce m;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	/*
+	 * There are some cases where real MSR reads could slip
+	 * through.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+		return -EIO;
+
+	if ((unsigned long)usize > sizeof(struct mce))
+		usize = sizeof(struct mce);
+
+	/*
+	 * A short write only fills the head of the record. Start from
+	 * zeros so that the remainder, including fields checked below,
+	 * never holds stale stack contents.
+	 */
+	memset(&m, 0, sizeof(m));
+	if (copy_from_user(&m, ubuf, usize))
+		return -EFAULT;
+
+	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+		return -EINVAL;
+
+	/*
+	 * Need to give user space some time to set everything up,
+	 * so do it a jiffie or two later everywhere.
+	 *
+	 * NOTE(review): the task state is not changed before this call;
+	 * confirm that schedule_timeout() really delays here.
+	 */
+	schedule_timeout(2);
+
+	blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+	return usize;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+/*
+ * Register the /dev/mcelog misc device and, once that succeeded, hook
+ * dev_mcelog_nb into the MCE decode chain.
+ *
+ * Return: 0 on success or the error returned by misc_register().
+ */
+static __init int dev_mcelog_init_device(void)
+{
+	int err;
+
+	/* register character device /dev/mcelog */
+	err = misc_register(&mce_chrdev_device);
+	if (err) {
+		if (err == -EBUSY)
+			/* Xen dom0 might have registered the device already. */
+			pr_info("Unable to init device /dev/mcelog, already registered\n");
+		else
+			pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
+		return err;
+	}
+
+	mce_register_decode_chain(&dev_mcelog_nb);
+	return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 0000000..2eee853
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,157 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <acpi/ghes.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+/*
+ * Report a memory error described by a CPER memory error section as a
+ * faked machine check through mce_log(). Sections that carry no valid
+ * physical address are ignored.
+ */
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
+{
+	u64 status;
+	struct mce m;
+
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+		return;
+
+	mce_setup(&m);
+	m.bank = -1;
+
+	/* Fake a memory read error with unknown channel */
+	status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+	if (severity >= GHES_SEV_RECOVERABLE)
+		status |= MCI_STATUS_UC;
+
+	if (severity >= GHES_SEV_PANIC) {
+		status |= MCI_STATUS_PCC;
+		m.tsc = rdtsc();
+	}
+
+	m.status = status;
+	m.addr = mem_err->physical_addr;
+	mce_log(&m);
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE \
+ UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+/*
+ * Wrap the MCE record @m into a CPER record with a single fatal
+ * section and write it to ERST. Returns the result of erst_write().
+ */
+int apei_write_mce(struct mce *m)
+{
+	struct cper_mce_record rcd;
+	struct cper_record_header *hdr = &rcd.hdr;
+	struct cper_section_descriptor *sec = &rcd.sec_hdr;
+
+	memset(&rcd, 0, sizeof(rcd));
+
+	/* Record header */
+	memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	hdr->revision = CPER_RECORD_REV;
+	hdr->signature_end = CPER_SIG_END;
+	hdr->section_count = 1;
+	hdr->error_severity = CPER_SEV_FATAL;
+	/* timestamp, platform_id, partition_id are all invalid */
+	hdr->validation_bits = 0;
+	hdr->record_length = sizeof(rcd);
+	hdr->creator_id = CPER_CREATOR_MCE;
+	hdr->notification_type = CPER_NOTIFY_MCE;
+	hdr->record_id = cper_next_record_id();
+	hdr->flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+	/* Descriptor of the one section, which holds the MCE itself */
+	sec->section_offset = offsetof(struct cper_mce_record, mce);
+	sec->section_length = sizeof(rcd.mce);
+	sec->revision = CPER_SEC_REV;
+	/* fru_id and fru_text is invalid */
+	sec->validation_bits = 0;
+	sec->flags = CPER_SEC_PRIMARY;
+	sec->section_type = CPER_SECTION_TYPE_MCE;
+	sec->section_severity = CPER_SEV_FATAL;
+
+	memcpy(&rcd.mce, m, sizeof(*m));
+
+	return erst_write(hdr);
+}
+
+/*
+ * Read the next MCE record from ERST into @m and store its id in
+ * *@record_id.
+ *
+ * Return: sizeof(*m) when a record was read, 0 when there is no more
+ * record, or a negative errno.
+ */
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	struct cper_mce_record rcd;
+	int rc, pos;
+
+	rc = erst_get_record_id_begin(&pos);
+	if (rc)
+		return rc;
+
+	for (;;) {
+		rc = erst_get_record_id_next(&pos, record_id);
+		if (rc)
+			break;
+
+		/* no more record */
+		if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+			break;
+
+		rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+
+		/* someone else has cleared the record, try next one */
+		if (rc == -ENOENT)
+			continue;
+		if (rc < 0)
+			break;
+
+		/* try to skip other type records in storage */
+		if (rc != sizeof(rcd) ||
+		    uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+			continue;
+
+		memcpy(m, &rcd.mce, sizeof(*m));
+		rc = sizeof(*m);
+		break;
+	}
+
+	erst_get_record_id_end();
+
+	return rc;
+}
+
+/*
+ * Check whether there is record in ERST. Returns whatever
+ * erst_get_record_count() returns.
+ */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+/* Clear the ERST record @record_id; returns the result of erst_clear(). */
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
new file mode 100644
index 0000000..217cd44
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -0,0 +1,145 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "mce-internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ (2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+/*
+ * Walk the list starting at "l" and report whether it holds a record
+ * that mce_cmp() considers equivalent to the record "t".
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+	struct mce_evt_llist *cur;
+
+	llist_for_each_entry(cur, &l->llnode, llnode) {
+		if (!mce_cmp(&t->mce, &cur->mce))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * The system has panicked and the queued MCE records that nobody has
+ * seen yet are to be looked at. The queue is in reverse time order, so
+ * it is reversed here. Duplicate records, which get logged because some
+ * banks are shared between cores or by all threads on a socket, are
+ * dropped along the way.
+ *
+ * Returns the first node of the resulting list, or NULL if nothing was
+ * queued.
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+	struct mce_evt_llist *cur, *nxt;
+	struct llist_node *pending;
+	LLIST_HEAD(unique);
+
+	pending = llist_del_all(&mce_event_llist);
+	if (pending == NULL)
+		return NULL;
+
+	/* Adding at the head of "unique" is what reverses the order. */
+	llist_for_each_entry_safe(cur, nxt, pending, llnode) {
+		if (is_duplicate_mce_record(cur, nxt))
+			continue;
+		llist_add(&cur->llnode, &unique);
+	}
+
+	return unique.first;
+}
+
+/*
+ * Work handler: take all queued MCE records, pass each one to the
+ * x86_mce_decoder_chain notifiers in reversed queue order and give its
+ * memory back to the pool.
+ */
+void mce_gen_pool_process(struct work_struct *__unused)
+{
+	struct mce_evt_llist *ev, *next;
+	struct llist_node *batch;
+
+	batch = llist_del_all(&mce_event_llist);
+	if (batch == NULL)
+		return;
+
+	batch = llist_reverse_order(batch);
+	llist_for_each_entry_safe(ev, next, batch, llnode) {
+		blocking_notifier_call_chain(&x86_mce_decoder_chain, 0,
+					     &ev->mce);
+		gen_pool_free(mce_evt_pool, (unsigned long)ev, sizeof(*ev));
+	}
+}
+
+/* Returns true when no MCE record is queued on mce_event_llist. */
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+/*
+ * Copy @mce into a node allocated from the MCE pool and queue it on
+ * mce_event_llist.
+ *
+ * Return: 0 on success, -EINVAL when the pool does not exist yet,
+ * -ENOMEM when the pool is exhausted.
+ */
+int mce_gen_pool_add(struct mce *mce)
+{
+	struct mce_evt_llist *ev;
+
+	if (!mce_evt_pool)
+		return -EINVAL;
+
+	ev = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*ev));
+	if (ev) {
+		memcpy(&ev->mce, mce, sizeof(*mce));
+		llist_add(&ev->llnode, &mce_event_llist);
+		return 0;
+	}
+
+	pr_warn_ratelimited("MCE records pool full!\n");
+	return -ENOMEM;
+}
+
+/*
+ * Create the MCE event pool on top of the static gen_pool_buf and
+ * publish it in mce_evt_pool.
+ *
+ * Return: 0 on success, -ENOMEM when the pool cannot be created, or the
+ * error returned by gen_pool_add().
+ */
+static int mce_gen_pool_create(void)
+{
+	struct gen_pool *pool;
+	int err;
+
+	pool = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+	if (!pool)
+		return -ENOMEM;
+
+	err = gen_pool_add(pool, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+	if (err) {
+		gen_pool_destroy(pool);
+		return err;
+	}
+
+	mce_evt_pool = pool;
+
+	return 0;
+}
+
+/*
+ * Set up the MCE event pool. The pool is only created once; later
+ * calls return 0 without doing anything.
+ */
+int mce_gen_pool_init(void)
+{
+	return mce_evt_pool ? 0 : mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 0000000..c805a06
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,733 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd_nb.h>
+#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "mce-internal.h"
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+static u8 n_banks;
+
+#define MAX_FLAG_OPT_SIZE 4
+#define NBCFG 0x44
+
+/*
+ * Injection modes selectable through the "flags" debugfs file. The values
+ * index flags_options[]; N_INJ_TYPES is the number of modes, not a mode.
+ */
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+/*
+ * Two-letter names of the injection modes, indexed by enum injection_type
+ * and followed by a NULL terminator entry.
+ */
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+/*
+ * Generate inj_<reg>_set(): a setter that stores @val into the <reg> field
+ * of the struct mce passed as @data and always returns 0. The generated
+ * functions are used with DEFINE_SIMPLE_ATTRIBUTE() below.
+ */
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+/*
+ * Generate inj_<reg>_get(): a getter that copies the <reg> field of the
+ * struct mce passed as @data into *@val and always returns 0.
+ */
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+/* Reset @m to all zeroes, then record the boot CPU's vendor in it. */
+static void setup_inj_struct(struct mce *m)
+{
+	memset(m, 0, sizeof(*m));
+	m->cpuvendor = boot_cpu_data.x86_vendor;
+}
+
+/*
+ * Copy the record @m into the per-CPU injectm slot of the CPU named by
+ * m->extcpu (not necessarily the CPU this runs on).
+ *
+ * The slot's ->finished flag is cleared first and only set to 1 after the
+ * whole record has been copied, with a full barrier between each step. Note
+ * that @m->finished is cleared too, so the caller's record is modified.
+ */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure no one reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+/*
+ * Run machine_check_poll() over a bank mask with every bit set, with local
+ * interrupts disabled, then mark @m as consumed by clearing ->finished.
+ */
+static void raise_poll(struct mce *m)
+{
+	mce_banks_t all_banks;
+	unsigned long irqflags;
+
+	memset(&all_banks, 0xff, sizeof(mce_banks_t));
+
+	local_irq_save(irqflags);
+	machine_check_poll(0, &all_banks);
+	local_irq_restore(irqflags);
+
+	m->finished = 0;
+}
+
+/*
+ * Call do_machine_check() for the injected record @m with local interrupts
+ * disabled, then mark @m as consumed by clearing ->finished.
+ *
+ * @pregs: register state to hand to the handler. When NULL, a zeroed
+ *         pt_regs is built on the stack carrying only ip and cs from @m.
+ */
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
+{
+	struct pt_regs regs;
+	unsigned long flags;
+
+	if (!pregs) {
+		memset(&regs, 0, sizeof(struct pt_regs));
+		regs.ip = m->ip;
+		regs.cs = m->cs;
+		pregs = &regs;
+	}
+	/* in mcheck exception handler, irq will be disabled */
+	local_irq_save(flags);
+	do_machine_check(pregs, 0);
+	local_irq_restore(flags);
+	m->finished = 0;
+}
+
+static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
+
+/*
+ * NMI handler used for NMI-broadcast injection.
+ *
+ * If this CPU's bit is set in mce_inject_cpumask, clear it, raise the error
+ * staged in this CPU's injectm slot (as an exception if MCJ_EXCEPTION is
+ * set, otherwise by polling when a status is present) and return
+ * NMI_HANDLED. Otherwise the NMI is not ours: return NMI_DONE.
+ */
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
+{
+	struct mce *m = this_cpu_ptr(&injectm);
+	int cpu = smp_processor_id();
+
+	if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+		return NMI_DONE;
+
+	cpumask_clear_cpu(cpu, mce_inject_cpumask);
+
+	if (m->inject_flags & MCJ_EXCEPTION) {
+		raise_exception(m, regs);
+		return NMI_HANDLED;
+	}
+
+	if (m->status)
+		raise_poll(m);
+
+	return NMI_HANDLED;
+}
+
+/*
+ * IPI callback used for IRQ-broadcast injection. Acts only when this CPU is
+ * still marked in mce_inject_cpumask and its staged record asks for an
+ * exception; then it clears the mask bit and raises the exception with no
+ * register state. @info is unused.
+ */
+static void mce_irq_ipi(void *info)
+{
+	struct mce *m = this_cpu_ptr(&injectm);
+	int cpu = smp_processor_id();
+
+	if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+		return;
+
+	if (!(m->inject_flags & MCJ_EXCEPTION))
+		return;
+
+	cpumask_clear_cpu(cpu, mce_inject_cpumask);
+	raise_exception(m, NULL);
+}
+
+/*
+ * Inject mce on current CPU
+ *
+ * Works on this CPU's injectm record. With MCJ_EXCEPTION set the error is
+ * raised through raise_exception() for the IRQ and PROCESS contexts; any
+ * other context yields -EINVAL. Without MCJ_EXCEPTION a non-zero status is
+ * raised through raise_poll() followed by mce_notify_irq(); with no status
+ * the record is just marked consumed. Returns 0 or -EINVAL.
+ */
+static int raise_local(void)
+{
+ struct mce *m = this_cpu_ptr(&injectm);
+ int context = MCJ_CTX(m->inject_flags);
+ int ret = 0;
+ int cpu = m->extcpu;
+
+ if (m->inject_flags & MCJ_EXCEPTION) {
+ pr_info("Triggering MCE exception on CPU %d\n", cpu);
+ switch (context) {
+ case MCJ_CTX_IRQ:
+ /*
+ * Could do more to fake interrupts like
+ * calling irq_enter, but the necessary
+ * machinery isn't exported currently.
+ */
+ /*FALL THROUGH*/
+ case MCJ_CTX_PROCESS:
+ raise_exception(m, NULL);
+ break;
+ default:
+ pr_info("Invalid MCE context\n");
+ ret = -EINVAL;
+ }
+ /* Printed on the invalid-context path as well. */
+ pr_info("MCE exception done on CPU %d\n", cpu);
+ } else if (m->status) {
+ pr_info("Starting machine check poll CPU %d\n", cpu);
+ raise_poll(m);
+ mce_notify_irq();
+ pr_info("Machine check poll done on CPU %d\n", cpu);
+ } else
+ m->finished = 0;
+
+ return ret;
+}
+
+/*
+ * Stage @m via inject_mce() and, unless its context is MCJ_CTX_RANDOM, raise
+ * it.
+ *
+ * With MCJ_IRQ_BROADCAST or MCJ_NMI_BROADCAST set, every other online CPU
+ * whose injectm slot is finished and marked MCJ_CTX_RANDOM is signalled (by
+ * function-call IPI or NMI respectively) before the error is raised on the
+ * calling CPU. The wait for those CPUs to clear their bit in
+ * mce_inject_cpumask gives up after 2*HZ jiffies. Without a broadcast flag
+ * the error is only raised through raise_local() with preemption disabled.
+ */
+static void __maybe_unused raise_mce(struct mce *m)
+{
+ int context = MCJ_CTX(m->inject_flags);
+
+ inject_mce(m);
+
+ if (context == MCJ_CTX_RANDOM)
+ return;
+
+ if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
+ unsigned long start;
+ int cpu;
+
+ get_online_cpus();
+ cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+ /* get_cpu() also disables preemption until put_cpu() below. */
+ cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
+ /* Keep only CPUs that have a finished MCJ_CTX_RANDOM record. */
+ for_each_online_cpu(cpu) {
+ struct mce *mcpu = &per_cpu(injectm, cpu);
+ if (!mcpu->finished ||
+ MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ }
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BROADCAST) {
+ /*
+ * don't wait because mce_irq_ipi is necessary
+ * to be sync with following raise_local
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ apic->send_IPI_mask(mce_inject_cpumask,
+ NMI_VECTOR);
+ }
+ /* Spin until every signalled CPU has cleared its bit, or 2s pass. */
+ start = jiffies;
+ while (!cpumask_empty(mce_inject_cpumask)) {
+ if (!time_before(jiffies, start + 2*HZ)) {
+ pr_err("Timeout waiting for mce inject %lx\n",
+ *cpumask_bits(mce_inject_cpumask));
+ break;
+ }
+ cpu_relax();
+ }
+ raise_local();
+ put_cpu();
+ put_online_cpus();
+ } else {
+ preempt_disable();
+ raise_local();
+ preempt_enable();
+ }
+}
+
+/*
+ * Notifier callback: treat @data as a struct mce and raise it under
+ * mce_inject_mutex. A NULL @data is ignored. Always returns NOTIFY_DONE;
+ * @nb and @val are unused.
+ */
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+			    void *data)
+{
+	struct mce *m = data;
+
+	if (m) {
+		mutex_lock(&mce_inject_mutex);
+		raise_mce(m);
+		mutex_unlock(&mce_inject_mutex);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that inject_init() registers on the MCE injector chain. */
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Set (@enable) or clear (!@enable) bit 18 of MSR_K7_HWCR on @cpu via a
+ * read-modify-write of the MSR.
+ *
+ * Returns 0 on success or the error from the remote MSR read/write.
+ *
+ * The caller needs to make sure this cpu doesn't disappear from under us,
+ * i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+	u32 lo, hi;
+	int ret;
+
+	ret = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+	if (ret) {
+		pr_err("%s: error reading HWCR\n", __func__);
+		return ret;
+	}
+
+	if (enable)
+		lo |= BIT(18);
+	else
+		lo &= ~BIT(18);
+
+	ret = wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+	if (ret)
+		pr_err("%s: error writing HWCR\n", __func__);
+
+	return ret;
+}
+
+/*
+ * Select the injection type named by @buf.
+ *
+ * Only the first strlen(option) characters are compared, so @buf merely has
+ * to start with one of the flags_options[] names. Returns 0 and updates
+ * inj_type on a match, -EINVAL otherwise.
+ */
+static int __set_inj(const char *buf)
+{
+	int idx;
+
+	for (idx = 0; idx < N_INJ_TYPES; idx++) {
+		const char *opt = flags_options[idx];
+
+		if (strncmp(opt, buf, strlen(opt)))
+			continue;
+
+		inj_type = idx;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * read() handler for the "flags" file: returns the name of the currently
+ * selected injection type followed by a newline.
+ */
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ /*
+ * NOTE(review): every flags_options[] name is two characters, so the
+ * name, '\n' and the terminating NUL fill MAX_FLAG_OPT_SIZE (4) exactly;
+ * a longer option name would overflow this buffer.
+ */
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+/*
+ * write() handler for the "flags" file: select the injection type.
+ *
+ * Accepts between 1 and MAX_FLAG_OPT_SIZE bytes. The last byte written is
+ * replaced by a NUL (normally overwriting the trailing newline), surrounding
+ * whitespace is stripped and the result is matched by __set_inj().
+ *
+ * Returns @cnt on success, -EINVAL for an empty, oversized or unrecognised
+ * write, or -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+			   size_t cnt, loff_t *ppos)
+{
+	char buf[MAX_FLAG_OPT_SIZE], *__buf;
+	int err;
+
+	/*
+	 * A zero-length write must be rejected: buf[cnt - 1] below would
+	 * otherwise index outside the buffer.
+	 */
+	if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
+		return -EINVAL;
+
+	if (copy_from_user(buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt - 1] = 0;
+
+	/* strip whitespace */
+	__buf = strstrip(buf);
+
+	err = __set_inj(__buf);
+	if (err) {
+		pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+		return err;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+/* File operations behind the "flags" entry of dfs_fls[]. */
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+/*
+ * Setter for the "cpu" file: store @val as the target CPU in the struct mce
+ * passed as @data. Values that are not below nr_cpu_ids, or that name a CPU
+ * which is not online, are rejected with -EINVAL.
+ */
+static int inj_extcpu_set(void *data, u64 val)
+{
+	struct mce *m = data;
+
+	if (val < nr_cpu_ids && cpu_online(val)) {
+		m->extcpu = val;
+		return 0;
+	}
+
+	pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+	return -EINVAL;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+/*
+ * Cross-CPU callback: issue a software interrupt on vector 18. do_inject()
+ * runs this on the target CPU for the default (hardware) injection case.
+ * @info is unused.
+ */
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+/*
+ * Cross-CPU callback: issue a software interrupt on DEFERRED_ERROR_VECTOR.
+ * Used by do_inject() for DFR_INT_INJ. @info is unused.
+ */
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+/*
+ * Cross-CPU callback: issue a software interrupt on THRESHOLD_APIC_VECTOR.
+ * Used by do_inject() for THR_INT_INJ. @info is unused.
+ */
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+/*
+ * Compute the CPU number used for node @node_id as
+ * node_id * (max cores * SMT siblings / nodes per socket), all taken from
+ * the boot CPU's data. do_inject() uses the result as the node base core.
+ */
+static u32 get_nbc_for_node(int node_id)
+{
+	u32 cores_per_node;
+
+	cores_per_node = boot_cpu_data.x86_max_cores * smp_num_siblings;
+	cores_per_node /= amd_get_nodes_per_socket();
+
+	return cores_per_node * node_id;
+}
+
+/*
+ * Make sure bit 27 of the northbridge register at offset NBCFG (0x44) of
+ * the node's misc (F3) PCI function is set for node @nid. The log message
+ * below names that bit D18F3x44[NbMcaToMstCpuEn].
+ *
+ * Nothing is done if the node has no northbridge/F3 device, if the register
+ * cannot be read, or if the bit is already set. Failures are only logged.
+ */
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ /* Already enabled: nothing to do. */
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+/*
+ * Cross-CPU callback: write the injected record into this CPU's machine
+ * check MSRs. @info points to the struct mce to load; it is copied by value
+ * on entry.
+ *
+ * MCG_STATUS is always written. With X86_FEATURE_SMCA the SMCA register set
+ * of bank m.bank is used (the DESTAT/DEADDR pair when inject_flags is
+ * DFR_INT_INJ, STATUS/ADDR otherwise, plus MISC and SYND); without it the
+ * legacy IA32 MCx STATUS/ADDR/MISC registers are written.
+ */
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+ } else {
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+/*
+ * Perform the injection described by the global i_mce record and inj_type.
+ *
+ * The record is timestamped and MISCV/SYNDV are set in its status when misc
+ * or synd are non-zero. For SW_INJ the record is only handed to
+ * mce_inject_log(). For every other type the target CPU's MSRs are loaded
+ * through prepare_msrs() (bracketed by toggle_hw_mce_inject() on/off) and
+ * the matching interrupt is raised on that CPU. Nothing is done if the
+ * target CPU is not online.
+ */
+static void do_inject(void)
+{
+	u64 mcg_status = 0;
+	unsigned int cpu = i_mce.extcpu;
+	u8 b = i_mce.bank;
+
+	i_mce.tsc = rdtsc_ordered();
+
+	if (i_mce.misc)
+		i_mce.status |= MCI_STATUS_MISCV;
+
+	if (i_mce.synd)
+		i_mce.status |= MCI_STATUS_SYNDV;
+
+	if (inj_type == SW_INJ) {
+		mce_inject_log(&i_mce);
+		return;
+	}
+
+	/* prep MCE global settings for the injection */
+	mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+	if (!(i_mce.status & MCI_STATUS_PCC))
+		mcg_status |= MCG_STATUS_RIPV;
+
+	/*
+	 * Ensure necessary status bits for deferred errors:
+	 * - MCx_STATUS[Deferred]: make sure it is a deferred error
+	 * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+	 */
+	if (inj_type == DFR_INT_INJ) {
+		i_mce.status |= MCI_STATUS_DEFERRED;
+		/* Must be a mask-and-assign; OR-ing the masked value is a no-op. */
+		i_mce.status &= ~MCI_STATUS_UC;
+	}
+
+	/*
+	 * For multi node CPUs, logging and reporting of bank 4 errors happens
+	 * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+	 * Fam10h and later BKDGs.
+	 */
+	if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+	    b == 4 &&
+	    boot_cpu_data.x86 < 0x17) {
+		toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+		cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+	}
+
+	get_online_cpus();
+	if (!cpu_online(cpu))
+		goto err;
+
+	toggle_hw_mce_inject(cpu, true);
+
+	i_mce.mcgstatus = mcg_status;
+	i_mce.inject_flags = inj_type;
+	smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+	toggle_hw_mce_inject(cpu, false);
+
+	switch (inj_type) {
+	case DFR_INT_INJ:
+		smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+		break;
+	case THR_INT_INJ:
+		smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+		break;
+	default:
+		smp_call_function_single(cpu, trigger_mce, NULL, 0);
+	}
+
+err:
+	put_online_cpus();
+
+}
+
+/*
+ * Setter for the "bank" file. It both records the bank to inject into and
+ * fires the injection: after storing @val in the struct mce passed as @data
+ * it calls do_inject(). Bank numbers not below n_banks give -EINVAL.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+	struct mce *m = data;
+
+	if (val < n_banks) {
+		m->bank = val;
+		do_inject();
+		return 0;
+	}
+
+	pr_err("Non-existent MCE bank: %llu\n", val);
+	return -EINVAL;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n";
+
+/* read() handler for the "README" file: serves the static readme_msg text. */
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+/* File operations behind the read-only "README" entry of dfs_fls[]. */
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+/*
+ * Table of the debugfs files created by debugfs_init(): file name, the
+ * dentry filled in at creation time, the file operations and the mode bits.
+ */
+static struct dfs_node {
+ char *name;
+ struct dentry *d;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+/*
+ * Read the bank count from MSR_IA32_MCG_CAP into n_banks, then create the
+ * "mce-inject" debugfs directory and one file per dfs_fls[] entry, each
+ * bound to the global i_mce record.
+ *
+ * Returns 0 on success, -EINVAL if the directory cannot be created, or
+ * -ENODEV if a file cannot be created (files created so far and the
+ * directory are removed again).
+ */
+static int __init debugfs_init(void)
+{
+ unsigned int i;
+ u64 cap;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+ if (!dfs_inj)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+ dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+ dfs_fls[i].perm,
+ dfs_inj,
+ &i_mce,
+ dfs_fls[i].fops);
+
+ if (!dfs_fls[i].d)
+ goto err_dfs_add;
+ }
+
+ return 0;
+
+err_dfs_add:
+ /* Unwind only the entries created before the failing index. */
+ while (i-- > 0)
+ debugfs_remove(dfs_fls[i].d);
+
+ debugfs_remove(dfs_inj);
+ dfs_inj = NULL;
+
+ return -ENODEV;
+}
+
+/*
+ * Module init: allocate mce_inject_cpumask, create the debugfs files,
+ * register the NMI handler and the injector-chain notifier, and reset the
+ * global i_mce record.
+ *
+ * Returns 0 on success, -ENOMEM if the cpumask cannot be allocated, or the
+ * error from debugfs_init() (the cpumask is freed in that case).
+ */
+static int __init inject_init(void)
+{
+ int err;
+
+ if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = debugfs_init();
+ if (err) {
+ free_cpumask_var(mce_inject_cpumask);
+ return err;
+ }
+
+ /* NOTE(review): the return value of register_nmi_handler() is not checked. */
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
+ pr_info("Machine check injector initialized\n");
+
+ return 0;
+}
+
+/*
+ * Module exit: undo inject_init() by unregistering the notifier and the NMI
+ * handler, removing the debugfs directory with everything in it, clearing
+ * dfs_fls[] and freeing mce_inject_cpumask.
+ */
+static void __exit inject_exit(void)
+{
+
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
+module_init(inject_init);
+module_exit(inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 0000000..ceb67cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,173 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
+#include <linux/device.h>
+#include <asm/mce.h>
+
+/*
+ * Severity grades assigned to machine check records, in ascending numeric
+ * order. MCE_UCNA_SEVERITY is an alias for MCE_DEFERRED_SEVERITY. The Intel
+ * grading code compares grades numerically (sev >= MCE_UC_SEVERITY), so the
+ * declaration order matters.
+ */
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+extern struct blocking_notifier_head x86_mce_decoder_chain;
+
+#define ATTR_LEN 16
+/* Parenthesized so the product stays intact inside larger expressions. */
+#define INITIAL_CHECK_INTERVAL (5 * 60) /* 5 minutes */
+
+/*
+ * One object for each MCE bank, shared by all CPUs.
+ * attrname is a fixed ATTR_LEN-byte buffer for the attribute's name.
+ */
+struct mce_bank {
+ u64 ctl; /* subevents to enable */
+ unsigned char init; /* initialise bank? */
+ struct device_attribute attr; /* device attribute */
+ char attrname[ATTR_LEN]; /* attribute name */
+};
+
+/*
+ * One queued MCE record: a lock-less list link plus an embedded copy of the
+ * struct mce. This is the element type of the MCE record pool.
+ */
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce mce;
+};
+
+void mce_gen_pool_process(struct work_struct *__unused);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
+
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
+struct dentry *mce_get_debugfs_dir(void);
+
+extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
+#else
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+/*
+ * Stubs used when CONFIG_ACPI_APEI is not set: the write and clear calls
+ * fail with -EINVAL, the read and check calls return 0.
+ */
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
+
+void mce_inject_log(struct mce *m);
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
+ *
+ * Note the memcmp()-like polarity: the result is true when the two records
+ * DIFFER in at least one of those fields, and false when they are
+ * equivalent.
+ */
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+ return m1->bank != m2->bank ||
+ m1->status != m2->status ||
+ m1->addr != m2->addr ||
+ m1->misc != m2->misc;
+}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
+#else
+static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
+#endif
+
+/*
+ * Machine check configuration. A single instance is exported as mca_cfg.
+ * The one-bit flags share a single 64-bit word; the remaining 59 bits are
+ * reserved.
+ */
+struct mca_config {
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+
+ __u64 lmce_disabled : 1,
+ disabled : 1,
+ ser : 1,
+ recovery : 1,
+ bios_cmci_threshold : 1,
+ __reserved : 59;
+
+ u8 banks;
+ s8 bootlog;
+ int tolerant;
+ int monarch_timeout;
+ int panic_timeout;
+ u32 rip_msr;
+};
+
+extern struct mca_config mca_cfg;
+
+/*
+ * Vendor-specific MCA capability bits, packed into one 64-bit word. A single
+ * instance is exported as mce_flags.
+ */
+struct mce_vendor_flags {
+ /*
+ * Indicates that overflow conditions are not fatal, when set.
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+ * Recovery. It indicates support for data poisoning in HW and deferred
+ * error interrupts.
+ */
+ succor : 1,
+
+ /*
+ * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+ * the register space for each MCA bank and also increases number of
+ * banks. Also, to accommodate the new banks and registers, the MCA
+ * register space is moved to a new MSR range.
+ */
+ smca : 1,
+
+ __reserved_0 : 61;
+};
+
+extern struct mce_vendor_flags mce_flags;
+
+/*
+ * Per-bank register lookup callbacks: each takes a bank number and returns
+ * a u32 (presumably the MSR address of that bank's CTL/STATUS/ADDR/MISC
+ * register -- confirm against the implementations). Exported as msr_ops.
+ */
+struct mca_msr_regs {
+ u32 (*ctl) (int bank);
+ u32 (*status) (int bank);
+ u32 (*addr) (int bank);
+ u32 (*misc) (int bank);
+};
+
+extern struct mca_msr_regs msr_ops;
+
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 0000000..f34d89c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,418 @@
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <asm/mce.h>
+#include <linux/uaccess.h>
+
+#include "mce-internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
+ */
+
+/*
+ * Selector values for the .context, .ser and .excp fields of a severity
+ * rule. All start at 1 so that a zero field in a rule means "don't care".
+ */
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned char mcgmask;
+ unsigned char mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char excp;
+ unsigned char covered;
+ char *msg;
+} severities[] = {
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define KERNEL_RECOV .context = IN_KERNEL_RECOV
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define EXCP .excp = EXCP_CONTEXT
+#define NOEXCP .excp = NO_EXCP
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
+
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ EXCP, BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
+ /* When MCIP is not set something is very confused */
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
+ /* Neither return nor error IP -- no chance to recover -> PANIC */
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ DEFERRED, "Deferred error",
+ NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
+
+ /*
+ * known AO MCACODs reported via MCE or CMC:
+ *
+ * SRAO could be signaled either via a machine check exception or
+ * CMCI with the corresponding bit S 1 or 0. So we don't need to
+ * check bit S for SRAO.
+ */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ ),
+
+ /* ignore OVER for UCNA */
+ MCESEV(
+ UCNA, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signalled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
+
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "Action required but unaffected thread is continuable",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+ ),
+ MCESEV(
+ AR, "Action required: data load in error recoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL_RECOV
+ ),
+ MCESEV(
+ AR, "Action required: data load error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
+ MCESEV(
+ PANIC, "Data load in unrecoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL
+ ),
+#endif
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
+
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
+};
+
+/* True when both MCG_STATUS_RIPV and MCG_STATUS_EIPV are set in @mcg. */
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+ (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+/*
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user mode from one
+ * taken in the kernel.
+ *
+ * A kernel-mode error is further classified as IN_KERNEL_RECOV when both
+ * RIPV and EIPV are set and m->ip has a fault handler.
+ */
+static int error_context(struct mce *m)
+{
+ /* The low two bits of cs equal to 3 mean the error hit user mode. */
+ if ((m->cs & 3) == 3)
+ return IN_USER;
+ if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+ return IN_KERNEL_RECOV;
+ return IN_KERNEL;
+}
+
+/*
+ * Grade an uncorrected error on an SMCA system.
+ *
+ * Panics unless data poisoning ("succor") is supported and the bank's
+ * MCA_CONFIG MSR can be read. Also panics when the bank has MCAX set, the
+ * status carries TCC (Task Context Corrupt) and the error hit the kernel.
+ * Everything else is graded action-required so the hwpoison handler runs.
+ */
+static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
+{
+	u32 low, high;
+
+	if (!mce_flags.succor)
+		return MCE_PANIC_SEVERITY;
+
+	if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(m->bank), &low, &high))
+		return MCE_PANIC_SEVERITY;
+
+	/* TCC only matters for errors taken in the kernel. */
+	if (err_ctx != IN_KERNEL)
+		return MCE_AR_SEVERITY;
+
+	if ((low & MCI_CONFIG_MCAX) && (m->status & MCI_STATUS_TCC))
+		return MCE_PANIC_SEVERITY;
+
+	/* ...otherwise invoke hwpoison handler. */
+	return MCE_AR_SEVERITY;
+}
+
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ *
+ * AMD grading routine. Only @m is examined here; @tolerant, @msg and
+ * @is_excp are not used by this implementation. Returns one of the
+ * MCE_*_SEVERITY grades.
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum context ctx = error_context(m);
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC)
+ return MCE_PANIC_SEVERITY;
+
+ if (m->status & MCI_STATUS_UC) {
+
+ if (ctx == IN_KERNEL)
+ return MCE_PANIC_SEVERITY;
+
+ /*
+ * On older systems where overflow_recov flag is not present, we
+ * should simply panic if an error overflow occurs. If
+ * overflow_recov flag is present and set, then software can try
+ * to at least kill process to prolong system operation.
+ */
+ if (mce_flags.overflow_recov) {
+ if (mce_flags.smca)
+ return mce_severity_amd_smca(m, ctx);
+
+ /* kill current process */
+ return MCE_AR_SEVERITY;
+ } else {
+ /* at least one error was not logged */
+ if (m->status & MCI_STATUS_OVER)
+ return MCE_PANIC_SEVERITY;
+ }
+
+ /*
+ * For any other case, return MCE_UC_SEVERITY so that we log the
+ * error and exit #MC handler.
+ */
+ return MCE_UC_SEVERITY;
+ }
+
+ /*
+ * deferred error: poll handler catches these and adds to mce_ring so
+ * memory-failure can take recovery actions.
+ */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return MCE_DEFERRED_SEVERITY;
+
+ /*
+ * corrected error: poll handler catches these and passes responsibility
+ * of decoding the error to EDAC
+ */
+ return MCE_KEEP_SEVERITY;
+}
+
+/*
+ * Table-driven grading routine: walk severities[] in order and return the
+ * grade of the first rule that matches @m.
+ *
+ * A rule matches when its status and mcgstatus mask/result pairs match and
+ * its optional ser, context and excp selectors (0 = don't care) agree with
+ * mca_cfg.ser, the error context and @is_excp. On a match *@msg (if @msg is
+ * non-NULL) is set to the rule's text and the rule is marked covered. A
+ * grade of MCE_UC_SEVERITY or higher taken in kernel context is escalated
+ * to MCE_PANIC_SEVERITY when @tolerant is below 1.
+ *
+ * The loop has no bound of its own; it relies on the last table entry
+ * matching everything.
+ */
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
+ enum context ctx = error_context(m);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((m->status & s->mask) != s->result)
+ continue;
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mca_cfg.ser)
+ continue;
+ if (s->ser == NO_SER && mca_cfg.ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (s->excp && excp != s->excp)
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+ if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
+ if (tolerant < 1)
+ return MCE_PANIC_SEVERITY;
+ }
+ return s->sev;
+ }
+}
+
+/*
+ * Grading hook used to classify a machine check record.
+ * Default to mce_severity_intel; mcheck_vendor_init_severity() switches it
+ * to the AMD routine on AMD CPUs.
+ */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+ mce_severity_intel;
+
+/*
+ * Pick the vendor-specific grading routine: on AMD boot CPUs point
+ * mce_severity at mce_severity_amd, otherwise leave the default in place.
+ */
+void __init mcheck_vendor_init_severity(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ mce_severity = mce_severity_amd;
+}
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * seq_file start: return the severities[] entry at *@pos, or NULL once the
+ * position is past the end of the table.
+ */
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+	return *pos < ARRAY_SIZE(severities) ? &severities[*pos] : NULL;
+}
+
+/*
+ * seq_file next: advance *@pos by one and return the severities[] entry
+ * there, or NULL when the end of the table has been reached.
+ */
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+	loff_t idx = ++(*pos);
+
+	if (idx >= ARRAY_SIZE(severities))
+		return NULL;
+
+	return &severities[idx];
+}
+
+/* seq_file stop: nothing to release, the table is static. */
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+/*
+ * seq_file show: print one rule as "<covered>\t<msg>", where covered is the
+ * flag set by mce_severity_intel() when the rule has matched.
+ */
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+/* seq_file iterator over the severities[] table. */
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+/* open() handler: attach the severities seq_file iterator to @file. */
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+/*
+ * write() handler: any write, whatever its content, resets the covered flag
+ * of every rule in severities[]. The data written is never read; @count is
+ * returned so the whole write is reported as consumed.
+ */
+static ssize_t severities_coverage_write(struct file *file,
+					 const char __user *ubuf,
+					 size_t count, loff_t *ppos)
+{
+	struct severity *rule;
+
+	for (rule = severities; rule < severities + ARRAY_SIZE(severities); rule++)
+		rule->covered = 0;
+
+	return count;
+}
+
+/*
+ * File operations for the "severities-coverage" debugfs file: reads go
+ * through seq_file, writes reset the coverage flags.
+ * NOTE(review): the file is created with mode 0444 although a write handler
+ * is provided -- confirm whether that is intended.
+ */
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+ .llseek = seq_lseek,
+};
+
+/*
+ * Create the "severities-coverage" file (mode 0444) in the MCE debugfs
+ * directory. Returns 0 on success, or -ENOMEM if either the directory is
+ * unavailable or the file cannot be created.
+ */
+static int __init severities_debugfs_init(void)
+{
+	struct dentry *dmce, *fsev;
+
+	dmce = mce_get_debugfs_dir();
+	if (!dmce)
+		return -ENOMEM;
+
+	fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+				   &severities_coverage_fops);
+	if (!fsev)
+		return -ENOMEM;
+
+	return 0;
+}
+late_initcall(severities_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 0000000..cdbedeb
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,2489 @@
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/ras.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+#include <linux/set_memory.h>
+
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/reboot.h>
+
+#include "mce-internal.h"
+
+/* Taken by mce_inject_log() around its call to mce_log(). */
+static DEFINE_MUTEX(mce_log_mutex);
+
+/* sysfs synchronization */
+static DEFINE_MUTEX(mce_sysfs_mutex);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
+/*
+ * Step used by the rendezvous spin loops: they ndelay(SPINUNIT) per
+ * iteration and mce_timed_out() deducts SPINUNIT from the remaining budget.
+ */
+#define SPINUNIT 100 /* 100ns */
+
+/* Incremented by do_machine_check() on each CPU that handles an exception. */
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+/* Per-bank state indexed by bank number; the scan loops skip banks whose ->ctl is zero. */
+struct mce_bank *mce_banks __read_mostly;
+/* Vendor feature flags; ->smca gates the SMCA-only fields (SYND, IPID). */
+struct mce_vendor_flags mce_flags __read_mostly;
+
+/*
+ * Global MCA configuration. Only the non-zero defaults are spelled out
+ * here: bootlog and monarch_timeout start at -1, tolerant at 1.
+ * NOTE(review): -1 presumably means "not configured yet" - confirm against
+ * the init code that consumes these fields.
+ */
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ /*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+ .tolerant = 1,
+ .monarch_timeout = -1
+};
+
+/* Per-CPU record filled in by do_machine_check() and graded by mce_reign(). */
+static DEFINE_PER_CPU(struct mce, mces_seen);
+/* Bit 0 is set by mce_first_notifier() right before it calls mce_notify_irq(). */
+static unsigned long mce_need_notify;
+/* Set by mce_timed_out() when a rendezvous wait expires; reported by mce_panic(). */
+static int cpu_missing;
+
+/*
+ * MCA banks polled by the periodic polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ * The initializer below starts with every bit set.
+ */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
+/*
+ * Deferred processing: mce_schedule_work() schedules mce_work when the
+ * record pool is not empty; mce_irq_work is queued by mce_log() and
+ * mce_report_event().
+ * NOTE(review): mce_irq_work is presumably bound to mce_irq_work_cb() by
+ * init code outside this view - confirm.
+ */
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
+
+/* Optional hook: when set, mce_no_way_out() calls it for every valid bank before grading. */
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
+/*
+ * Reset @m to all zeroes and fill in the fields that identify the current
+ * CPU (number, vendor, CPUID signature, socket, APIC id, microcode
+ * revision), the wall-clock time and the MCG_CAP MSR value. The PPIN is
+ * read only when the CPU advertises X86_FEATURE_INTEL_PPIN.
+ */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(*m));
+
+	m->extcpu = smp_processor_id();
+	m->cpu = m->extcpu;
+
+	/* need the internal __ version to avoid deadlocks */
+	m->time = __ktime_get_real_seconds();
+
+	m->cpuvendor = boot_cpu_data.x86_vendor;
+	m->cpuid = cpuid_eax(1);
+	m->socketid = cpu_data(m->extcpu).phys_proc_id;
+	m->apicid = cpu_data(m->extcpu).initial_apicid;
+
+	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+		rdmsrl(MSR_PPIN, m->ppin);
+
+	m->microcode = boot_cpu_data.microcode;
+}
+
+/*
+ * Per-CPU record used for software error injection: while its ->finished
+ * field is non-zero, mce_rdmsrl()/mce_wrmsrl() access this record instead
+ * of the hardware MSRs.
+ */
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+/*
+ * Add @m to the record pool; when mce_gen_pool_add() returns zero, queue
+ * mce_irq_work as well.
+ */
+void mce_log(struct mce *m)
+{
+	if (mce_gen_pool_add(m))
+		return;
+
+	irq_work_queue(&mce_irq_work);
+}
+
+/* Exported wrapper: calls mce_log() for @m while holding mce_log_mutex. */
+void mce_inject_log(struct mce *m)
+{
+ mutex_lock(&mce_log_mutex);
+ mce_log(m);
+ mutex_unlock(&mce_log_mutex);
+}
+EXPORT_SYMBOL_GPL(mce_inject_log);
+
+/* Forward declaration; the notifier block itself is defined further down. */
+static struct notifier_block mce_srao_nb;
+
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS 3
+/* Maintained by mce_register_decode_chain()/mce_unregister_decode_chain(). */
+static atomic_t num_notifiers;
+
+/*
+ * Register @nb on x86_mce_decoder_chain and count it in num_notifiers.
+ * Priorities strictly between MCE_PRIO_MCELOG and MCE_PRIO_EDAC are
+ * rejected with a warning and the notifier is not registered.
+ */
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+	bool bad_prio = nb->priority > MCE_PRIO_MCELOG &&
+			nb->priority < MCE_PRIO_EDAC;
+
+	if (WARN_ON(bad_prio))
+		return;
+
+	atomic_inc(&num_notifiers);
+	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+/*
+ * Drop @nb from x86_mce_decoder_chain and decrement num_notifiers.
+ * The counter is decremented unconditionally, before the unregister call.
+ */
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ atomic_dec(&num_notifiers);
+
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
+/*
+ * Legacy MCA register helpers: each maps a bank number to the MSR address
+ * of that bank's CTL/STATUS/ADDR/MISC register via the MSR_IA32_MCx_*
+ * macros. They are the initial callbacks of msr_ops.
+ */
+static inline u32 ctl_reg(int bank)
+{
+ return MSR_IA32_MCx_CTL(bank);
+}
+
+static inline u32 status_reg(int bank)
+{
+ return MSR_IA32_MCx_STATUS(bank);
+}
+
+static inline u32 addr_reg(int bank)
+{
+ return MSR_IA32_MCx_ADDR(bank);
+}
+
+static inline u32 misc_reg(int bank)
+{
+ return MSR_IA32_MCx_MISC(bank);
+}
+
+/*
+ * SMCA counterparts of the helpers above, built on the
+ * MSR_AMD64_SMCA_MCx_* macros.
+ * NOTE(review): presumably installed into msr_ops on SMCA systems by init
+ * code outside this view - confirm.
+ */
+static inline u32 smca_ctl_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_CTL(bank);
+}
+
+static inline u32 smca_status_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_STATUS(bank);
+}
+
+static inline u32 smca_addr_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_ADDR(bank);
+}
+
+static inline u32 smca_misc_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+}
+
+/*
+ * Bank-number to MSR-address callbacks used by the bank scanning code.
+ * Initialised to the legacy MCx helpers.
+ */
+struct mca_msr_regs msr_ops = {
+ .ctl = ctl_reg,
+ .status = status_reg,
+ .addr = addr_reg,
+ .misc = misc_reg
+};
+
+/*
+ * Dump one MCE record at emergency log level. ADDR and MISC are printed
+ * only when non-zero, SYND and IPID additionally only on SMCA systems.
+ */
+static void __print_mce(struct mce *m)
+{
+	const char *excp = (m->mcgstatus & MCG_STATUS_MCIP) ? " Exception" : "";
+
+	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+		 m->extcpu, excp, m->mcgstatus, m->bank, m->status);
+
+	if (m->ip) {
+		const char *inexact =
+			(m->mcgstatus & MCG_STATUS_EIPV) ? "" : " !INEXACT!";
+
+		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", inexact, m->cs, m->ip);
+
+		/* Symbolize the address only for kernel code segments. */
+		if (m->cs == __KERNEL_CS)
+			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
+		pr_cont("\n");
+	}
+
+	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
+	if (m->addr)
+		pr_cont("ADDR %llx ", m->addr);
+	if (m->misc)
+		pr_cont("MISC %llx ", m->misc);
+
+	if (mce_flags.smca && m->synd)
+		pr_cont("SYND %llx ", m->synd);
+	if (mce_flags.smca && m->ipid)
+		pr_cont("IPID %llx ", m->ipid);
+
+	pr_cont("\n");
+
+	/*
+	 * Note this output is parsed by external tools and old fields
+	 * should not be changed.
+	 */
+	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+		 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+		 m->microcode);
+}
+
+/*
+ * Dump @m and, for every vendor except AMD, append the (rate limited)
+ * hint to decode the output with mcelog.
+ */
+static void print_mce(struct mce *m)
+{
+	__print_mce(m);
+
+	if (m->cpuvendor == X86_VENDOR_AMD)
+		return;
+
+	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
+}
+
+/* How long wait_for_panic() spins before panicking on its own. */
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+/* Number of CPUs that entered mce_panic() for real; only the first one proceeds. */
+static atomic_t mce_panicked;
+
+/* When non-zero, mce_panic() only logs instead of calling panic(). */
+static int fake_panic;
+/* Same role as mce_panicked for the fake_panic path: only the first caller logs. */
+static atomic_t mce_fake_panicked;
+
+/*
+ * Another CPU is already panicking. Enable interrupts and wait for the
+ * final IPI; if nothing arrives within PANIC_TIMEOUT seconds, panic from
+ * this CPU instead.
+ */
+static void wait_for_panic(void)
+{
+	long remaining_us;
+
+	preempt_disable();
+	local_irq_enable();
+
+	for (remaining_us = PANIC_TIMEOUT * USEC_PER_SEC; remaining_us > 0;
+	     remaining_us--)
+		udelay(1);
+
+	if (!panic_timeout)
+		panic_timeout = mca_cfg.panic_timeout;
+
+	panic("Panicing machine check CPU died");
+}
+
+/*
+ * Print all pending records plus @final, then panic - or, when fake_panic
+ * is set, only log.
+ *
+ * @msg:   panic message. It is always handed to panic()/pr_emerg() as a
+ *         "%s" argument, never as the format string itself.
+ * @final: record to print last, or NULL.
+ * @exp:   optional explanation printed as "Machine check: <exp>", or NULL.
+ *
+ * Only the first CPU to get here does the printing: later callers end up
+ * in wait_for_panic() (real panic) or return immediately (fake panic).
+ * Each printed record is also passed to apei_write_mce() until the first
+ * non-zero return value.
+ */
+static void mce_panic(const char *msg, struct mce *final, char *exp)
+{
+	int apei_err = 0;
+	struct llist_node *pending;
+	struct mce_evt_llist *l;
+
+	if (!fake_panic) {
+		/*
+		 * Make sure only one CPU runs in machine check panic
+		 */
+		if (atomic_inc_return(&mce_panicked) > 1)
+			wait_for_panic();
+		barrier();
+
+		bust_spinlocks(1);
+		console_verbose();
+	} else {
+		/* Don't log too much for fake panic */
+		if (atomic_inc_return(&mce_fake_panicked) > 1)
+			return;
+	}
+	pending = mce_gen_pool_prepare_records();
+	/* First print corrected ones that are still unlogged */
+	llist_for_each_entry(l, pending, llnode) {
+		struct mce *m = &l->mce;
+		if (!(m->status & MCI_STATUS_UC)) {
+			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
+	}
+	/* Now print uncorrected but with the final one last */
+	llist_for_each_entry(l, pending, llnode) {
+		struct mce *m = &l->mce;
+		if (!(m->status & MCI_STATUS_UC))
+			continue;
+		if (!final || mce_cmp(m, final)) {
+			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
+	}
+	if (final) {
+		print_mce(final);
+		if (!apei_err)
+			apei_err = apei_write_mce(final);
+	}
+	if (cpu_missing)
+		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
+	if (exp)
+		pr_emerg(HW_ERR "Machine check: %s\n", exp);
+	if (!fake_panic) {
+		if (panic_timeout == 0)
+			panic_timeout = mca_cfg.panic_timeout;
+		/* Never use a caller-supplied string as the format itself. */
+		panic("%s", msg);
+	} else
+		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+}
+
+/* Support code for software error injection */
+
+/*
+ * Map an MSR number to the byte offset of the struct mce field that
+ * shadows it during injection, using the bank of this CPU's injectm
+ * record. Returns -1 for MSRs that have no shadow field.
+ */
+static int msr_to_offset(u32 msr)
+{
+	unsigned bank = __this_cpu_read(injectm.bank);
+	int off;
+
+	if (msr == mca_cfg.rip_msr)
+		off = offsetof(struct mce, ip);
+	else if (msr == msr_ops.status(bank))
+		off = offsetof(struct mce, status);
+	else if (msr == msr_ops.addr(bank))
+		off = offsetof(struct mce, addr);
+	else if (msr == msr_ops.misc(bank))
+		off = offsetof(struct mce, misc);
+	else if (msr == MSR_IA32_MCG_STATUS)
+		off = offsetof(struct mce, mcgstatus);
+	else
+		off = -1;
+
+	return off;
+}
+
+/* MSR access wrappers used for error injection */
+/*
+ * Read @msr. While an injection is active on this CPU (injectm.finished
+ * set) the value comes from the injectm record, or is 0 for MSRs without
+ * a shadow field. Otherwise the hardware MSR is read; a faulting read
+ * warns once and yields 0.
+ */
+static u64 mce_rdmsrl(u32 msr)
+{
+	u64 val;
+
+	if (__this_cpu_read(injectm.finished)) {
+		int off = msr_to_offset(msr);
+
+		if (off < 0)
+			return 0;
+
+		return *(u64 *)((char *)this_cpu_ptr(&injectm) + off);
+	}
+
+	if (!rdmsrl_safe(msr, &val))
+		return val;
+
+	/*
+	 * The access faulted. This should not happen normally but can
+	 * happen if the CPU does something weird, or if the code is buggy.
+	 * Report it once and hand back zero.
+	 */
+	WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
+
+	return 0;
+}
+
+/*
+ * Write @v to @msr. While an injection is active on this CPU the write is
+ * redirected to the injectm record (and dropped for MSRs without a shadow
+ * field); the hardware MSR is only touched otherwise.
+ */
+static void mce_wrmsrl(u32 msr, u64 v)
+{
+	int off;
+
+	if (!__this_cpu_read(injectm.finished)) {
+		wrmsrl(msr, v);
+		return;
+	}
+
+	off = msr_to_offset(msr);
+	if (off >= 0)
+		*(u64 *)((char *)this_cpu_ptr(&injectm) + off) = v;
+}
+
+/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+	mce_setup(m);
+	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+	/* Callers without register state (regs == NULL) are done here. */
+	if (!regs)
+		return;
+
+	/*
+	 * Record where the CPU was executing at the time of the machine
+	 * check error, provided MCG_STATUS marks the IP as meaningful.
+	 */
+	if (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV)) {
+		m->ip = regs->ip;
+		m->cs = regs->cs;
+
+		/*
+		 * When in VM86 mode make the cs look like ring 3
+		 * always. This is a lie, but it's better than passing
+		 * the additional vm86 bit around everywhere.
+		 */
+		if (v8086_mode(regs))
+			m->cs |= 3;
+	}
+
+	/* Use accurate RIP reporting if available. */
+	if (mca_cfg.rip_msr)
+		m->ip = mce_rdmsrl(mca_cfg.rip_msr);
+}
+
+/*
+ * Non-zero when machine checks are usable on @c: MCA has not been
+ * disabled via mca_cfg and the CPU has both the MCE and the MCA feature.
+ */
+int mce_available(struct cpuinfo_x86 *c)
+{
+	return !mca_cfg.disabled &&
+	       cpu_has(c, X86_FEATURE_MCE) &&
+	       cpu_has(c, X86_FEATURE_MCA);
+}
+
+/* Schedule mce_work, but only when the record pool has something in it. */
+static void mce_schedule_work(void)
+{
+	if (mce_gen_pool_empty())
+		return;
+
+	schedule_work(&mce_work);
+}
+
+/* irq_work callback: forwards to mce_schedule_work(); @entry is unused. */
+static void mce_irq_work_cb(struct irq_work *entry)
+{
+ mce_schedule_work();
+}
+
+/*
+ * Kick off reporting of a logged event. When the interrupted context had
+ * X86_VM_MASK or X86_EFLAGS_IF set in its flags, notify and schedule the
+ * work directly; otherwise defer through mce_irq_work.
+ */
+static void mce_report_event(struct pt_regs *regs)
+{
+	if (!(regs->flags & (X86_VM_MASK | X86_EFLAGS_IF))) {
+		irq_work_queue(&mce_irq_work);
+		return;
+	}
+
+	mce_notify_irq();
+
+	/*
+	 * Triggering the work queue here is just an insurance
+	 * policy in case the syscall exit notify handler
+	 * doesn't run soon enough or ends up running on the
+	 * wrong CPU (can happen when audit sleeps)
+	 */
+	mce_schedule_work();
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ *
+ * Returns 1 when m->addr can be used, 0 otherwise. Without
+ * MCI_STATUS_ADDRV the answer is always 0; on non-Intel CPUs ADDRV alone
+ * is sufficient.
+ */
+int mce_usable_address(struct mce *m)
+{
+	if (!(m->status & MCI_STATUS_ADDRV))
+		return 0;
+
+	/* Checks after this one are Intel-specific: */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 1;
+
+	/* MISC must be valid and describe a physical address of at most page granularity. */
+	return (m->status & MCI_STATUS_MISCV) &&
+	       MCI_MISC_ADDR_LSB(m->misc) <= PAGE_SHIFT &&
+	       MCI_MISC_ADDR_MODE(m->misc) == MCI_MISC_ADDR_PHYS;
+}
+EXPORT_SYMBOL_GPL(mce_usable_address);
+
+/*
+ * Tell whether @m describes a memory error. AMD records are handed to
+ * amd_mce_is_memory_error(), Intel records are decoded from the MCACOD
+ * bits of the status word, every other vendor yields false.
+ */
+bool mce_is_memory_error(struct mce *m)
+{
+	switch (m->cpuvendor) {
+	case X86_VENDOR_AMD:
+		return amd_mce_is_memory_error(m);
+
+	case X86_VENDOR_INTEL:
+		/*
+		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+		 *
+		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+		 * indicating a memory error. Bit 8 is used for indicating a
+		 * cache hierarchy error. The combination of bit 2 and bit 3
+		 * is used for indicating a `generic' cache hierarchy error
+		 * But we can't just blindly check the above bits, because if
+		 * bit 11 is set, then it is a bus/interconnect error - and
+		 * either way the above bits just gives more detail on what
+		 * bus/interconnect error happened. Note that bit 12 can be
+		 * ignored, as it's the "filter" bit.
+		 */
+		if ((m->status & 0xef80) == BIT(7))
+			return true;
+		if ((m->status & 0xef00) == BIT(8))
+			return true;
+		return (m->status & 0xeffc) == 0xc;
+
+	default:
+		return false;
+	}
+}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
+
+/*
+ * A record counts as correctable unless it has MCI_STATUS_UC set or is an
+ * AMD record with MCI_STATUS_DEFERRED set.
+ */
+bool mce_is_correctable(struct mce *m)
+{
+	bool amd_deferred = m->cpuvendor == X86_VENDOR_AMD &&
+			    (m->status & MCI_STATUS_DEFERRED);
+
+	return !amd_deferred && !(m->status & MCI_STATUS_UC);
+}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
+
+/*
+ * Offer @m to the correctable-errors collector. Returns true only when
+ * the record qualified and cec_add_elem() returned zero for its page.
+ */
+static bool cec_add_mce(struct mce *m)
+{
+	if (!m)
+		return false;
+
+	/* We eat only correctable DRAM errors with usable addresses. */
+	if (!mce_is_memory_error(m) ||
+	    !mce_is_correctable(m) ||
+	    !mce_usable_address(m))
+		return false;
+
+	return !cec_add_elem(m->addr >> PAGE_SHIFT);
+}
+
+/*
+ * First notifier on the chain. Records accepted by cec_add_mce() stop the
+ * chain (NOTIFY_STOP); everything else is traced and flagged for
+ * notification. A NULL record is ignored.
+ */
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+			      void *data)
+{
+	struct mce *rec = data;
+
+	if (rec) {
+		if (cec_add_mce(rec))
+			return NOTIFY_STOP;
+
+		/* Emit the trace record: */
+		trace_mce_record(rec);
+
+		set_bit(0, &mce_need_notify);
+		mce_notify_irq();
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block for mce_first_notifier(), registered at MCE_PRIO_FIRST. */
+static struct notifier_block first_nb = {
+ .notifier_call = mce_first_notifier,
+ .priority = MCE_PRIO_FIRST,
+};
+
+/*
+ * Notifier handling MCE_AO_SEVERITY records with a usable address: the
+ * affected page is passed to memory_failure() and, when that returns
+ * zero, to set_mce_nospec(). Returns NOTIFY_DONE for a NULL record and
+ * NOTIFY_OK otherwise.
+ */
+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *rec = data;
+	unsigned long pfn;
+
+	if (!rec)
+		return NOTIFY_DONE;
+
+	if (!mce_usable_address(rec) || rec->severity != MCE_AO_SEVERITY)
+		return NOTIFY_OK;
+
+	pfn = rec->addr >> PAGE_SHIFT;
+	if (memory_failure(pfn, 0) == 0)
+		set_mce_nospec(pfn);
+
+	return NOTIFY_OK;
+}
+/* Notifier block for srao_decode_notifier(), registered at MCE_PRIO_SRAO. */
+static struct notifier_block mce_srao_nb = {
+ .notifier_call = srao_decode_notifier,
+ .priority = MCE_PRIO_SRAO,
+};
+
+/*
+ * Fallback notifier: print the record, but only while no more than
+ * NUM_DEFAULT_NOTIFIERS notifiers are registered. Always returns
+ * NOTIFY_DONE.
+ */
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *rec = data;
+
+	if (rec && atomic_read(&num_notifiers) <= NUM_DEFAULT_NOTIFIERS)
+		__print_mce(rec);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block for mce_default_notifier(). */
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = MCE_PRIO_LOWEST,
+};
+
+/*
+ * Read the auxiliary registers of bank @i into @m, guided by the valid
+ * bits in m->status: MISC when MISCV is set, ADDR when ADDRV is set and,
+ * on SMCA systems, IPID (always) and SYND (when SYNDV is set).
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	bool have_misc = m->status & MCI_STATUS_MISCV;
+
+	if (have_misc)
+		m->misc = mce_rdmsrl(msr_ops.misc(i));
+
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(msr_ops.addr(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mca_cfg.ser && have_misc) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+
+			m->addr = (m->addr >> shift) << shift;
+		}
+
+		/*
+		 * Extract [55:<lsb>] where lsb is the least significant
+		 * *valid* bit of the address bits.
+		 */
+		if (mce_flags.smca) {
+			u8 lsb = (m->addr >> 56) & 0x3f;
+
+			m->addr &= GENMASK_ULL(55, lsb);
+		}
+	}
+
+	if (!mce_flags.smca)
+		return;
+
+	m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+	if (m->status & MCI_STATUS_SYNDV)
+		m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+}
+
+/* Incremented on every machine_check_poll() call on the calling CPU. */
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll handler -- so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ *
+ * @flags: MCP_TIMESTAMP records the TSC in the logged records, MCP_UC also
+ *         processes uncorrected/signalled events, MCP_DONTLOG skips
+ *         mce_log().
+ * @b:     bitmap of the banks to look at; banks whose bit is clear, or
+ *         whose mce_banks[].ctl is zero, are skipped.
+ *
+ * Returns true when at least one bank held a valid event that was
+ * processed here.
+ */
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ bool error_seen = false;
+ struct mce m;
+ int i;
+
+ this_cpu_inc(mce_poll_count);
+
+ /* No pt_regs here: only the CPU-global fields and MCG_STATUS are filled in. */
+ mce_gather_info(&m, NULL);
+
+ if (flags & MCP_TIMESTAMP)
+ m.tsc = rdtsc();
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ if (!mce_banks[i].ctl || !test_bit(i, *b))
+ continue;
+
+ /* Reset the per-bank fields left over from the previous iteration. */
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+
+ barrier();
+ m.status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Uncorrected or signalled events are handled by the exception
+ * handler when it is enabled, so don't process those here.
+ *
+ * TBD do the same check for MCI_STATUS_EN here?
+ */
+ if (!(flags & MCP_UC) &&
+ (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+ continue;
+
+ error_seen = true;
+
+ mce_read_aux(&m, i);
+
+ m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
+
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+ mce_log(&m);
+ else if (mce_usable_address(&m)) {
+ /*
+ * Although we skipped logging this, we still want
+ * to take action. Add to the pool so the registered
+ * notifiers will see it.
+ */
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
+ }
+
+ /*
+ * Clear state for this bank.
+ */
+ mce_wrmsrl(msr_ops.status(i), 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+
+ return error_seen;
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ *
+ * Every bank with a valid status gets its bit set in @validp. On the first
+ * bank graded MCE_PANIC_SEVERITY or worse, @m is completed (bank number
+ * and auxiliary registers), *@msg is pointed at the severity message and
+ * 1 is returned. Otherwise 0 is returned and *@msg is left untouched.
+ */
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+			  struct pt_regs *regs)
+{
+	char *tmp;
+	int i;
+
+	for (i = 0; i < mca_cfg.banks; i++) {
+		m->status = mce_rdmsrl(msr_ops.status(i));
+		if (!(m->status & MCI_STATUS_VAL))
+			continue;
+
+		__set_bit(i, validp);
+		if (quirk_no_way_out)
+			quirk_no_way_out(i, m, regs);
+
+		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+			/*
+			 * Record which bank is fatal: do_machine_check() may
+			 * hand @m straight to mce_panic(), which prints
+			 * m->bank. Without this the report always shows the
+			 * bank as 0.
+			 */
+			m->bank = i;
+			mce_read_aux(m, i);
+			*msg = tmp;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins in mce_start() until this counter has reached its own
+ * callin order; mce_end() increments it to let the next CPU run and
+ * resets it to 0 when the rendezvous is over.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ * Incremented once per CPU in mce_start(), reset in mce_end().
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ *
+ * @t:   remaining wait budget; reduced by SPINUNIT on every call that
+ *       does not time out (left alone when mca_cfg.monarch_timeout is 0).
+ * @msg: panic message used when the budget is exhausted and
+ *       mca_cfg.tolerant <= 1.
+ *
+ * Returns 1 when the budget is exhausted (cpu_missing gets set), 0
+ * otherwise. The NMI watchdog is touched on every 0 return.
+ */
+static int mce_timed_out(u64 *t, const char *msg)
+{
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ * NOTE(review): the value actually sampled below is mce_panicked,
+ * not system_state - the wording above looks stale.
+ */
+ rmb();
+ if (atomic_read(&mce_panicked))
+ wait_for_panic();
+ if (!mca_cfg.monarch_timeout)
+ goto out;
+ /* Less than one spin unit left: the wait has timed out. */
+ if ((s64)*t < SPINUNIT) {
+ if (mca_cfg.tolerant <= 1)
+ mce_panic(msg, NULL, NULL);
+ cpu_missing = 1;
+ return 1;
+ }
+ *t -= SPINUNIT;
+out:
+ touch_nmi_watchdog();
+ return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ *
+ * mce_reign() itself is called by mce_end() on the Monarch once all CPUs
+ * have gone through their scanning loops.
+ */
+static void mce_reign(void)
+{
+ int cpu;
+ /* Record with the highest severity seen so far; NULL while nothing graded above 0. */
+ struct mce *m = NULL;
+ int global_worst = 0;
+ /* Severity message belonging to @m. */
+ char *msg = NULL;
+ char *nmsg = NULL;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ int severity = mce_severity(&per_cpu(mces_seen, cpu),
+ mca_cfg.tolerant,
+ &nmsg, true);
+ if (severity > global_worst) {
+ msg = nmsg;
+ global_worst = severity;
+ m = &per_cpu(mces_seen, cpu);
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Fatal machine check", m, msg);
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the mces_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+/* Sum of the no_way_out values contributed by all CPUs in mce_start(). */
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ *
+ * @no_way_out: in: this CPU's local verdict; out: the global verdict
+ *              (value of global_nwo) once the rendezvous succeeded.
+ *
+ * Returns this CPU's callin order (1 for the Monarch), or -1 when
+ * mca_cfg.monarch_timeout is zero or one of the waits timed out. On a
+ * timeout global_nwo is reset to 0 and *@no_way_out is left untouched.
+ */
+static int mce_start(int *no_way_out)
+{
+ int order;
+ int cpus = num_online_cpus();
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ return -1;
+
+ atomic_add(*no_way_out, &global_nwo);
+ /*
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
+ */
+ order = atomic_inc_return(&mce_callin);
+
+ /*
+ * Wait for everyone.
+ */
+ while (atomic_read(&mce_callin) != cpus) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = atomic_read(&global_nwo);
+
+ return order;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ *
+ * @order: value returned by mce_start() on this CPU.
+ *
+ * Returns 0 when the rendezvous completed (Monarch: after mce_reign();
+ * Subject: after the Monarch reset mce_executing), -1 when no timeout is
+ * configured, @order is negative or a wait timed out. Every path except
+ * the successful Subject one resets the global rendezvous state.
+ */
+static int mce_end(int order)
+{
+ int ret = -1;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /* CHECKME: Can this race with a parallel hotplug? */
+ int cpus = num_online_cpus();
+
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= cpus) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ return 0;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+ return ret;
+}
+
+/* Write 0 to the status register of every bank whose bit is set in @toclear. */
+static void mce_clear_state(unsigned long *toclear)
+{
+	int bank;
+
+	for (bank = 0; bank < mca_cfg.banks; bank++) {
+		if (!test_bit(bank, toclear))
+			continue;
+
+		mce_wrmsrl(msr_ops.status(bank), 0);
+	}
+}
+
+/*
+ * Hand the page behind m->addr to memory_failure() as an action-required
+ * error, adding MF_MUST_KILL when MCG_STATUS has no valid restart IP.
+ * On success the page is additionally passed to set_mce_nospec().
+ *
+ * Returns the memory_failure() result (0 on success).
+ */
+static int do_memory_failure(struct mce *m)
+{
+	unsigned long pfn = m->addr >> PAGE_SHIFT;
+	int flags = MF_ACTION_REQUIRED;
+	int ret;
+
+	/* Log lines are newline-terminated so each one is emitted as its own record. */
+	pr_err("Uncorrected hardware memory error in user-access at %llx\n", m->addr);
+	if (!(m->mcgstatus & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+
+	ret = memory_failure(pfn, flags);
+	if (ret)
+		pr_err("Memory error not recovered\n");
+	else
+		set_mce_nospec(pfn);
+
+	return ret;
+}
+
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static bool __mc_check_crashing_cpu(int cpu)
+{
+	/* Neither offline nor a bystander of a crash on another CPU: handle normally. */
+	if (!cpu_is_offline(cpu) &&
+	    (crashing_cpu == -1 || crashing_cpu == cpu))
+		return false;
+
+	/* Only skip the handler when MCG_STATUS reports a valid restart IP. */
+	if (!(mce_rdmsrl(MSR_IA32_MCG_STATUS) & MCG_STATUS_RIPV))
+		return false;
+
+	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+
+	return true;
+}
+
+/*
+ * Scan all banks flagged in @valid_banks, log every event that has to be
+ * handled by the exception handler and remember the worst one.
+ *
+ * @m:           scratch record, reused for every bank; on return it holds
+ *               a copy of *@final.
+ * @final:       receives the record with the highest severity seen.
+ * @toclear:     out: banks whose status register should be cleared.
+ * @valid_banks: banks that had a valid status in mce_no_way_out().
+ * @no_way_out:  non-zero when a panic is already unavoidable; then no
+ *               event is left behind for the poller.
+ * @worst:       in/out: highest severity seen so far.
+ */
+static void __mc_scan_banks(struct mce *m, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks,
+ int no_way_out, int *worst)
+{
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i;
+
+ for (i = 0; i < cfg->banks; i++) {
+ __clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ /* Reset the per-bank fields left over from the previous iteration. */
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ severity = mce_severity(m, cfg->tolerant, NULL, true);
+
+ /*
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ __set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(m, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ mce_log(m);
+
+ if (severity > *worst) {
+ *final = *m;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *m = *final;
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ *
+ * @regs:       register state of the interrupted context.
+ * @error_code: not referenced by the body.
+ */
+void do_machine_check(struct pt_regs *regs, long error_code)
+{
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ struct mca_config *cfg = &mca_cfg;
+ int cpu = smp_processor_id();
+ char *msg = "Unknown";
+ struct mce m, *final;
+ int worst = 0;
+
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ int order = -1;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
+ */
+ int no_way_out = 0;
+
+ /*
+ * If kill_it gets set, there might be a way to recover from this
+ * error.
+ */
+ int kill_it = 0;
+
+ /*
+ * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
+ * on Intel.
+ */
+ int lmce = 1;
+
+ /* Offline CPUs and bystanders of a crash bail out before any bookkeeping. */
+ if (__mc_check_crashing_cpu(cpu))
+ return;
+
+ ist_enter(regs);
+
+ this_cpu_inc(mce_exception_count);
+
+ mce_gather_info(&m, regs);
+ m.tsc = rdtsc();
+
+ /* Publish the initial record in this CPU's mces_seen slot for mce_reign(). */
+ final = this_cpu_ptr(&mces_seen);
+ *final = m;
+
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+
+ barrier();
+
+ /*
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
+ */
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_it = 1;
+
+ /*
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel only.
+ */
+ if (m.cpuvendor == X86_VENDOR_INTEL)
+ lmce = m.mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start()
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it.
+ */
+ if (lmce) {
+ if (no_way_out)
+ mce_panic("Fatal local machine check", &m, msg);
+ } else {
+ order = mce_start(&no_way_out);
+ }
+
+ __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
+
+ /* Keep the bank state around when a panic is coming. */
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (!lmce) {
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ } else {
+ /*
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
+ */
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ mce_severity(&m, cfg->tolerant, &msg, true);
+ mce_panic("Local fatal machine check!", &m, msg);
+ }
+ }
+
+ /*
+ * If tolerant is at an insane level we drop requests to kill
+ * processes and continue even when there is no way out.
+ */
+ if (cfg->tolerant == 3)
+ kill_it = 0;
+ else if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+
+ if (worst > 0)
+ mce_report_event(regs);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+
+ sync_core();
+
+ /* Nothing action-required and no need to kill: we are done. */
+ if (worst != MCE_AR_SEVERITY && !kill_it)
+ goto out_ist;
+
+ /* Fault was in user mode and we need to take some action */
+ if ((m.cs & 3) == 3) {
+ ist_begin_non_atomic(regs);
+ local_irq_enable();
+
+ if (kill_it || do_memory_failure(&m))
+ force_sig(SIGBUS, current);
+ local_irq_disable();
+ ist_end_non_atomic();
+ } else {
+ if (!fixup_exception(regs, X86_TRAP_MC))
+ mce_panic("Failed kernel mode recovery", &m, NULL);
+ }
+
+out_ist:
+ ist_exit(regs);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+#ifndef CONFIG_MEMORY_FAILURE
+/*
+ * Stub used when the kernel is built without CONFIG_MEMORY_FAILURE: log
+ * that the error in page @pfn is being ignored and return 0.
+ */
+int memory_failure(unsigned long pfn, int flags)
+{
+	/* mce_severity() must never hand an ACTION_REQUIRED error to this stub */
+	BUG_ON((flags & MF_ACTION_REQUIRED) != 0);
+
+	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+	       pfn);
+	return 0;
+}
+#endif
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+/*
+ * Polling period limit in seconds. mce_start_timer() does not arm the timer
+ * when check_interval * HZ is zero.
+ */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+/* Per-CPU polling timer; set up with mce_timer_fn() as its callback. */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+/* Default interval adjustment: hand @interval back unchanged. */
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+/*
+ * Hook used by mce_timer_fn() to pick the next interval when
+ * mce_intel_cmci_poll() returns true. __mcheck_cpu_init_vendor() points it
+ * at cmci_intel_adjust_timer on Intel CPUs.
+ */
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
+
+/*
+ * Make sure @t fires no later than @interval jiffies from now. A timer that
+ * is already pending with an expiry at or before that point is left alone.
+ * The check and the re-arm run with local interrupts disabled.
+ */
+static void __start_timer(struct timer_list *t, unsigned long interval)
+{
+	unsigned long deadline = jiffies + interval;
+	unsigned long irq_flags;
+
+	local_irq_save(irq_flags);
+
+	if (timer_pending(t) && !time_before(deadline, t->expires))
+		goto out;
+
+	mod_timer(t, round_jiffies(deadline));
+out:
+	local_irq_restore(irq_flags);
+}
+
+/*
+ * Polling timer callback: poll the banks in this CPU's mce_poll_banks,
+ * choose the next polling interval and re-arm the timer.
+ */
+static void mce_timer_fn(struct timer_list *t)
+{
+	struct timer_list *local_timer = this_cpu_ptr(&mce_timer);
+	bool cmci_polling = false;
+	unsigned long next;
+
+	WARN_ON(local_timer != t);
+
+	next = __this_cpu_read(mce_next_interval);
+
+	if (mce_available(this_cpu_ptr(&cpu_info))) {
+		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+		cmci_polling = mce_intel_cmci_poll();
+	}
+
+	if (cmci_polling) {
+		/* Let the adjust hook decide the interval. */
+		next = mce_adjust_timer(next);
+	} else if (mce_notify_irq()) {
+		/* Userspace was alerted about a logged MCE: poll 2x faster. */
+		next = max(next / 2, (unsigned long) HZ/100);
+	} else {
+		/* Nothing to report: poll 2x slower, up to check_interval. */
+		next = min(next * 2, round_jiffies_relative(check_interval * HZ));
+	}
+
+	__this_cpu_write(mce_next_interval, next);
+	__start_timer(t, next);
+}
+
+/*
+ * Make sure this CPU's polling timer fires within @interval jiffies, and
+ * shorten the recorded next interval if @interval is smaller than it.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+	unsigned long current_iv = __this_cpu_read(mce_next_interval);
+
+	__start_timer(this_cpu_ptr(&mce_timer), interval);
+
+	if (current_iv > interval)
+		__this_cpu_write(mce_next_interval, interval);
+}
+
+/*
+ * Delete the polling timer of every online CPU, waiting for running
+ * callbacks. Must not be called in IRQ context where del_timer_sync()
+ * can deadlock.
+ */
+static void mce_timer_delete_all(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct timer_list *timer = &per_cpu(mce_timer, cpu);
+
+		del_timer_sync(timer);
+	}
+}
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ *
+ * Returns 1 if a notification was pending (bit 0 of mce_need_notify was
+ * set and has now been cleared), 0 otherwise.
+ */
+int mce_notify_irq(void)
+{
+	/* Not more than two messages every minute */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+	if (!test_and_clear_bit(0, &mce_need_notify))
+		return 0;
+
+	mce_work_trigger();
+
+	if (__ratelimit(&ratelimit))
+		pr_info(HW_ERR "Machine check events logged\n");
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(mce_notify_irq);
+
+/*
+ * Allocate mce_banks with mca_cfg.banks entries and give every bank a
+ * control value of all ones with ->init set.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int __mcheck_cpu_mce_banks_init(void)
+{
+	u8 nr_banks = mca_cfg.banks;
+	struct mce_bank *bank, *end;
+
+	mce_banks = kcalloc(nr_banks, sizeof(*mce_banks), GFP_KERNEL);
+	if (!mce_banks)
+		return -ENOMEM;
+
+	end = mce_banks + nr_banks;
+	for (bank = mce_banks; bank < end; bank++) {
+		bank->ctl = -1ULL;
+		bank->init = 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Initialize Machine Checks for a CPU.
+ *
+ * Reads MSR_IA32_MCG_CAP, stores the bank count (clamped to MAX_NR_BANKS)
+ * in mca_cfg.banks, allocates mce_banks if that has not happened yet and
+ * records the rip_msr and ser settings advertised by the capability bits.
+ *
+ * Returns 0 on success, or the error from __mcheck_cpu_mce_banks_init().
+ */
+static int __mcheck_cpu_cap_init(void)
+{
+	unsigned b;
+	u64 cap;
+
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+
+	b = cap & MCG_BANKCNT_MASK;
+	/* Only announce the count while no bank count has been recorded yet. */
+	if (!mca_cfg.banks)
+		pr_info("CPU supports %u MCE banks\n", b);
+
+	if (b > MAX_NR_BANKS) {
+		pr_warn("Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
+	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+	mca_cfg.banks = b;
+
+	if (!mce_banks) {
+		int err = __mcheck_cpu_mce_banks_init();
+
+		if (err)
+			return err;
+	}
+
+	/* Use accurate RIP reporting if available. */
+	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+	if (cap & MCG_SER_P)
+		mca_cfg.ser = 1;
+
+	return 0;
+}
+
+/*
+ * Vendor-independent per-CPU setup: poll all banks once, set CR4.MCE and,
+ * if MCG_CAP advertises MCG_CTL_P, write all ones to MCG_CTL.
+ */
+static void __mcheck_cpu_init_generic(void)
+{
+	enum mcp_flags log_flag = mca_cfg.bootlog ? 0 : MCP_DONTLOG;
+	mce_banks_t every_bank;
+	u64 mcg_cap;
+
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	bitmap_fill(every_bank, MAX_NR_BANKS);
+	machine_check_poll(MCP_UC | log_flag, &every_bank);
+
+	cr4_set_bits(X86_CR4_MCE);
+
+	rdmsrl(MSR_IA32_MCG_CAP, mcg_cap);
+	if (mcg_cap & MCG_CTL_P)
+		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
+/*
+ * For every bank with ->init set, program its control register from ->ctl
+ * and zero its status register.
+ */
+static void __mcheck_cpu_init_clear_banks(void)
+{
+	int bank;
+
+	for (bank = 0; bank < mca_cfg.banks; bank++) {
+		if (mce_banks[bank].init) {
+			wrmsrl(msr_ops.ctl(bank), mce_banks[bank].ctl);
+			wrmsrl(msr_ops.status(bank), 0);
+		}
+	}
+}
+
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ *
+ * Only applies to bank 0, with both EIPV and RIPV clear and a status that
+ * matches the instruction-fetch signature below.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+	const u64 status_bits = MCI_STATUS_OVER|MCI_STATUS_UC|
+				MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+				MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+				MCACOD;
+	const u64 ifu_signature = MCI_STATUS_UC|MCI_STATUS_EN|
+				  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+				  MCI_STATUS_AR|MCACOD_INSTR;
+
+	if (bank != 0 ||
+	    (m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0 ||
+	    (m->status & status_bits) != ifu_signature)
+		return;
+
+	m->mcgstatus |= MCG_STATUS_EIPV;
+	m->ip = regs->ip;
+	m->cs = regs->cs;
+}
+
+/*
+ * Add per CPU specific workarounds here.
+ *
+ * Adjusts mce_banks[], mce_flags and mca_cfg according to the vendor,
+ * family and model found in @c.
+ *
+ * Returns -EOPNOTSUPP when the vendor is X86_VENDOR_UNKNOWN, 0 otherwise.
+ */
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86 == 15 && cfg->banks > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+ if (c->x86 < 0x11 && cfg->bootlog < 0) {
+ /*
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
+ */
+ cfg->bootlog = 0;
+ }
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && cfg->banks > 0)
+ mce_banks[0].ctl = 0;
+
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ /* Set HWCR bit 18 temporarily if it was clear. */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
+ }
+
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+
+ if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
+ mce_banks[0].init = 0;
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+ cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+ cfg->bootlog = 0;
+
+ /* Family 6 model 45: install the IFU recovery quirk. */
+ if (c->x86 == 6 && c->x86_model == 45)
+ quirk_no_way_out = quirk_sandybridge_ifu;
+ }
+ /* A timeout that is still negative here was never chosen: use 0. */
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = 0;
+ /* Runs after the vendor quirks above may have forced bootlog to 0. */
+ if (cfg->bootlog != 0)
+ cfg->panic_timeout = 30;
+
+ return 0;
+}
+
+/*
+ * Family 5 CPUs use dedicated init routines instead of the generic path.
+ *
+ * Returns 1 if @c is a family 5 Intel or Centaur CPU and its specific
+ * init routine was called, 0 otherwise.
+ */
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+{
+	if (c->x86 != 5)
+		return 0;
+
+	if (c->x86_vendor == X86_VENDOR_INTEL) {
+		intel_p5_mcheck_init(c);
+		return 1;
+	}
+
+	if (c->x86_vendor == X86_VENDOR_CENTAUR) {
+		winchip_mcheck_init(c);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ *
+ * Only AMD CPUs are handled: the feature flags are copied into mce_flags
+ * and, with SMCA, msr_ops is switched to the smca_*_reg accessors.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		return;
+
+	mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
+	mce_flags.succor	 = !!cpu_has(c, X86_FEATURE_SUCCOR);
+	mce_flags.smca		 = !!cpu_has(c, X86_FEATURE_SMCA);
+
+	if (!mce_flags.smca)
+		return;
+
+	msr_ops.ctl	= smca_ctl_reg;
+	msr_ops.status	= smca_status_reg;
+	msr_ops.addr	= smca_addr_reg;
+	msr_ops.misc	= smca_misc_reg;
+}
+
+/*
+ * Centaur setup: pick a one second monarch timeout on CPUs that support
+ * MCE broadcasting, unless a timeout has already been chosen.
+ */
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+	struct mca_config *cfg = &mca_cfg;
+	bool broadcast;
+
+	/*
+	 * All newer Centaur CPUs support MCE broadcasting. Enable
+	 * synchronization with a one second timeout.
+	 */
+	broadcast = c->x86 > 6 ||
+		    (c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe);
+
+	if (broadcast && cfg->monarch_timeout < 0)
+		cfg->monarch_timeout = USEC_PER_SEC;
+}
+
+/*
+ * Run the vendor specific feature init for @c. On Intel this also installs
+ * cmci_intel_adjust_timer as the mce_adjust_timer hook. Other vendors than
+ * Intel, AMD and Centaur are left untouched.
+ */
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor == X86_VENDOR_INTEL) {
+		mce_intel_feature_init(c);
+		mce_adjust_timer = cmci_intel_adjust_timer;
+	} else if (c->x86_vendor == X86_VENDOR_AMD) {
+		mce_amd_feature_init(c);
+	} else if (c->x86_vendor == X86_VENDOR_CENTAUR) {
+		mce_centaur_feature_init(c);
+	}
+}
+
+/* Vendor specific teardown; only Intel has anything to clear. */
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor == X86_VENDOR_INTEL)
+		mce_intel_feature_clear(c);
+}
+
+/*
+ * Start polling on this CPU with the full check_interval period. Does
+ * nothing when corrected errors are ignored or the period is zero.
+ */
+static void mce_start_timer(struct timer_list *t)
+{
+	unsigned long period = check_interval * HZ;
+
+	if (!mca_cfg.ignore_ce && period) {
+		this_cpu_write(mce_next_interval, period);
+		__start_timer(t, period);
+	}
+}
+
+/*
+ * Initialize this CPU's polling timer with mce_timer_fn() as callback and
+ * TIMER_PINNED set. The timer is only set up here, not started.
+ */
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+}
+
+/*
+ * Initialize this CPU's polling timer like __mcheck_cpu_setup_timer() and
+ * then start it through mce_start_timer().
+ */
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+ mce_start_timer(t);
+}
+
+/*
+ * Default machine check handler for an unconfigured int18 (should never
+ * happen): only logs which CPU took the exception.
+ */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+	int cpu = smp_processor_id();
+
+	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", cpu);
+}
+
+/*
+ * Call the installed machine check handler for this CPU setup.
+ * Starts out as unexpected_machine_check(); mcheck_cpu_init() switches it
+ * to do_machine_check().
+ */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+ unexpected_machine_check;
+
+/* Trap entry point: forward to whatever machine_check_vector points at. */
+dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+{
+ machine_check_vector(regs, error_code);
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ *
+ * Sets mca_cfg.disabled and bails out if capability init, the quirks or
+ * the MCE record pool allocation fail.
+ */
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
+{
+	if (mca_cfg.disabled)
+		return;
+
+	/* Family 5 CPUs are fully handled by their own init routine. */
+	if (__mcheck_cpu_ancient_init(c))
+		return;
+
+	if (!mce_available(c))
+		return;
+
+	if (__mcheck_cpu_cap_init() < 0)
+		goto disable;
+
+	if (__mcheck_cpu_apply_quirks(c) < 0)
+		goto disable;
+
+	if (mce_gen_pool_init()) {
+		mca_cfg.disabled = 1;
+		pr_emerg("Couldn't allocate MCE records pool!\n");
+		return;
+	}
+
+	machine_check_vector = do_machine_check;
+
+	__mcheck_cpu_init_early(c);
+	__mcheck_cpu_init_generic();
+	__mcheck_cpu_init_vendor(c);
+	__mcheck_cpu_init_clear_banks();
+	__mcheck_cpu_setup_timer();
+	return;
+
+disable:
+	mca_cfg.disabled = 1;
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+	if (mca_cfg.disabled || !mce_available(c))
+		return;
+
+	/*
+	 * Possibly to clear general settings generic to x86
+	 * __mcheck_cpu_clear_generic(c);
+	 */
+	__mcheck_cpu_clear_vendor(c);
+}
+
+/*
+ * Per-CPU half of mce_disable_bank(): @arg points at the bank number.
+ * Removes the bank from this CPU's poll set and disables CMCI for it.
+ */
+static void __mce_disable_bank(void *arg)
+{
+	const int *bank_nr = arg;
+	int bank = *bank_nr;
+
+	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+	cmci_disable_bank(bank);
+}
+
+/*
+ * Mark @bank in mce_banks_ce_disabled and run __mce_disable_bank() for it
+ * on every CPU. Bank numbers at or beyond mca_cfg.banks are rejected with
+ * a firmware-bug warning.
+ */
+void mce_disable_bank(int bank)
+{
+	if (bank < mca_cfg.banks) {
+		set_bit(bank, mce_banks_ce_disabled);
+		on_each_cpu(__mce_disable_bank, &bank, 1);
+		return;
+	}
+
+	pr_warn(FW_BUG
+		"Ignoring request to disable invalid MCA bank %d.\n",
+		bank);
+}
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ *	monarchtimeout is how long to wait for other CPUs on machine
+ *	check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ *	and older.
+ * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
+ *
+ * Returns 1 when the option was consumed, 0 for an unrecognized argument.
+ */
+static int __init mcheck_enable(char *str)
+{
+	struct mca_config *cfg = &mca_cfg;
+
+	/* Bare "mce" with nothing after it. */
+	if (!*str) {
+		enable_p5_mce();
+		return 1;
+	}
+
+	if (*str == '=')
+		str++;
+
+	if (!strcmp(str, "off")) {
+		cfg->disabled = 1;
+	} else if (!strcmp(str, "no_cmci")) {
+		cfg->cmci_disabled = true;
+	} else if (!strcmp(str, "no_lmce")) {
+		cfg->lmce_disabled = 1;
+	} else if (!strcmp(str, "dont_log_ce")) {
+		cfg->dont_log_ce = true;
+	} else if (!strcmp(str, "ignore_ce")) {
+		cfg->ignore_ce = true;
+	} else if (!strcmp(str, "bootlog")) {
+		cfg->bootlog = 1;
+	} else if (!strcmp(str, "nobootlog")) {
+		cfg->bootlog = 0;
+	} else if (!strcmp(str, "bios_cmci_threshold")) {
+		cfg->bios_cmci_threshold = 1;
+	} else if (!strcmp(str, "recovery")) {
+		cfg->recovery = 1;
+	} else if (isdigit(str[0])) {
+		/* TOLERANCELEVEL, optionally followed by ",monarchtimeout" */
+		if (get_option(&str, &cfg->tolerant) == 2)
+			get_option(&str, &(cfg->monarch_timeout));
+	} else {
+		pr_info("mce argument %s ignored. Please use /sys\n", str);
+		return 0;
+	}
+
+	return 1;
+}
+__setup("mce", mcheck_enable);
+
+/*
+ * One-time MCE setup: thermal init, registration of the three built-in
+ * decode-chain notifiers, vendor severity setup and initialization of the
+ * work items used for deferred MCE processing. Always returns 0.
+ */
+int __init mcheck_init(void)
+{
+ mcheck_intel_therm_init();
+ /* Notifiers are registered in this order: first_nb, srao, default. */
+ mce_register_decode_chain(&first_nb);
+ mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_default_nb);
+ mcheck_vendor_init_severity();
+
+ INIT_WORK(&mce_work, mce_gen_pool_process);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
+ return 0;
+}
+
+/*
+ * mce_syscore: PM support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ *
+ * Writes 0 to the control register of every bank that has ->init set.
+ */
+static void mce_disable_error_reporting(void)
+{
+	int bank;
+
+	for (bank = 0; bank < mca_cfg.banks; bank++) {
+		if (!mce_banks[bank].init)
+			continue;
+
+		wrmsrl(msr_ops.ctl(bank), 0);
+	}
+}
+
+/*
+ * Disable error reporting on this CPU unless the boot CPU is Intel or AMD.
+ */
+static void vendor_disable_error_reporting(void)
+{
+	/*
+	 * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
+	 * Disabling them for just a single offlined CPU is bad, since it will
+	 * inhibit reporting for all shared resources on the socket like the
+	 * last level cache (LLC), the integrated memory controller (iMC), etc.
+	 */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+	case X86_VENDOR_AMD:
+		return;
+	default:
+		break;
+	}
+
+	mce_disable_error_reporting();
+}
+
+/* syscore suspend hook: disable error reporting where the vendor allows it. Always returns 0. */
+static int mce_syscore_suspend(void)
+{
+ vendor_disable_error_reporting();
+ return 0;
+}
+
+/* syscore shutdown hook: same action as mce_syscore_suspend(). */
+static void mce_syscore_shutdown(void)
+{
+ vendor_disable_error_reporting();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static void mce_syscore_resume(void)
+{
+	struct cpuinfo_x86 *this_cpu = raw_cpu_ptr(&cpu_info);
+
+	__mcheck_cpu_init_generic();
+	__mcheck_cpu_init_vendor(this_cpu);
+	__mcheck_cpu_init_clear_banks();
+}
+
+/* PM callbacks; registered by mcheck_init_device() via register_syscore_ops(). */
+static struct syscore_ops mce_syscore_ops = {
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
+/*
+ * Per-CPU part of mce_restart(): redo the generic init, reprogram the
+ * banks and restart the polling timer. @data is unused.
+ */
+static void mce_cpu_restart(void *data)
+{
+	if (mce_available(raw_cpu_ptr(&cpu_info))) {
+		__mcheck_cpu_init_generic();
+		__mcheck_cpu_init_clear_banks();
+		__mcheck_cpu_init_timer();
+	}
+}
+
+/*
+ * Reinit MCEs after user configuration changes: delete every online CPU's
+ * polling timer, then run mce_cpu_restart() on each CPU and wait for it.
+ */
+static void mce_restart(void)
+{
+ mce_timer_delete_all();
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+}
+
+/*
+ * Toggle features for corrected errors.
+ *
+ * Per-CPU callback that clears CMCI on CPUs with MCE support. @data is
+ * unused.
+ */
+static void mce_disable_cmci(void *data)
+{
+	if (mce_available(raw_cpu_ptr(&cpu_info)))
+		cmci_clear();
+}
+
+/*
+ * Per-CPU callback that re-enables and rechecks CMCI. A non-NULL @all
+ * additionally re-initializes and starts the polling timer.
+ */
+static void mce_enable_ce(void *all)
+{
+	struct cpuinfo_x86 *this_cpu = raw_cpu_ptr(&cpu_info);
+
+	if (!mce_available(this_cpu))
+		return;
+
+	cmci_reenable();
+	cmci_recheck();
+
+	if (!all)
+		return;
+
+	__mcheck_cpu_init_timer();
+}
+
+/* Bus type that the per-CPU "machinecheck" devices are attached to. */
+static struct bus_type mce_subsys = {
+ .name = "machinecheck",
+ .dev_name = "machinecheck",
+};
+
+/* Per-CPU pointer to the device made by mce_device_create(); NULL when absent. */
+DEFINE_PER_CPU(struct device *, mce_device);
+
+/* Map a bank's embedded device_attribute back to its struct mce_bank. */
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
+{
+ return container_of(attr, struct mce_bank, attr);
+}
+
+/* sysfs show handler for a "bankN" file: print the bank's ->ctl in hex. */
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
+			 char *buf)
+{
+	struct mce_bank *bank = attr_to_bank(attr);
+
+	return sprintf(buf, "%llx\n", bank->ctl);
+}
+
+/*
+ * sysfs store handler for a "bankN" file: parse the new control value,
+ * save it in the bank's ->ctl and reinitialize MCE on all CPUs.
+ *
+ * Returns @size on success or -EINVAL if @buf does not parse as a u64.
+ */
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
+			const char *buf, size_t size)
+{
+	u64 new;
+
+	if (kstrtou64(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	/*
+	 * mce_restart() deletes and re-initializes the polling timers on all
+	 * CPUs. Hold the same mutex as the other sysfs writers so that two
+	 * reconfigurations cannot run concurrently.
+	 */
+	mutex_lock(&mce_sysfs_mutex);
+	attr_to_bank(attr)->ctl = new;
+	mce_restart();
+	mutex_unlock(&mce_sysfs_mutex);
+
+	return size;
+}
+
+/*
+ * sysfs store handler for "ignore_ce". A non-zero value stops polling and
+ * CMCI on all CPUs; zero re-enables both. Writing the current state is a
+ * no-op.
+ *
+ * Returns @size on success or -EINVAL if @buf does not parse as a u64.
+ */
+static ssize_t set_ignore_ce(struct device *s,
+			     struct device_attribute *attr,
+			     const char *buf, size_t size)
+{
+	bool ignore;
+	u64 new;
+
+	if (kstrtou64(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	ignore = !!new;
+
+	mutex_lock(&mce_sysfs_mutex);
+
+	if (mca_cfg.ignore_ce == ignore)
+		goto unlock;
+
+	if (ignore) {
+		/* disable ce features */
+		mce_timer_delete_all();
+		on_each_cpu(mce_disable_cmci, NULL, 1);
+		mca_cfg.ignore_ce = true;
+	} else {
+		/* enable ce features */
+		mca_cfg.ignore_ce = false;
+		on_each_cpu(mce_enable_ce, (void *)1, 1);
+	}
+
+unlock:
+	mutex_unlock(&mce_sysfs_mutex);
+
+	return size;
+}
+
+/*
+ * sysfs store handler for "cmci_disabled". A non-zero value clears CMCI on
+ * all CPUs; zero re-enables it without touching the polling timers.
+ * Writing the current state is a no-op.
+ *
+ * Returns @size on success or -EINVAL if @buf does not parse as a u64.
+ */
+static ssize_t set_cmci_disabled(struct device *s,
+				 struct device_attribute *attr,
+				 const char *buf, size_t size)
+{
+	bool disable;
+	u64 new;
+
+	if (kstrtou64(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	disable = !!new;
+
+	mutex_lock(&mce_sysfs_mutex);
+
+	if (mca_cfg.cmci_disabled == disable)
+		goto unlock;
+
+	if (disable) {
+		/* disable cmci */
+		on_each_cpu(mce_disable_cmci, NULL, 1);
+		mca_cfg.cmci_disabled = true;
+	} else {
+		/* enable cmci */
+		mca_cfg.cmci_disabled = false;
+		on_each_cpu(mce_enable_ce, NULL, 1);
+	}
+
+unlock:
+	mutex_unlock(&mce_sysfs_mutex);
+
+	return size;
+}
+
+/*
+ * sysfs store handler for "check_interval": store the value through
+ * device_store_ulong() and, if check_interval changed, reinitialize MCE on
+ * all CPUs under mce_sysfs_mutex.
+ *
+ * Returns whatever device_store_ulong() returned.
+ */
+static ssize_t store_int_with_restart(struct device *s,
+				      struct device_attribute *attr,
+				      const char *buf, size_t size)
+{
+	unsigned long previous = check_interval;
+	ssize_t ret = device_store_ulong(s, attr, buf, size);
+
+	if (check_interval != previous) {
+		mutex_lock(&mce_sysfs_mutex);
+		mce_restart();
+		mutex_unlock(&mce_sysfs_mutex);
+	}
+
+	return ret;
+}
+
+/* Plain attributes backed directly by mca_cfg fields. */
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+
+/*
+ * Attributes below need a custom store handler because a change has to be
+ * propagated to the CPUs.
+ *
+ * NOTE(review): check_interval is an unsigned long written through
+ * device_store_ulong() but shown through device_show_int() - confirm this
+ * pairing is intended.
+ */
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
+ &check_interval
+};
+
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
+};
+
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
+};
+
+/* NULL-terminated list of files created on every per-CPU MCE device. */
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
+ &dev_attr_trigger,
+#endif
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
+ NULL
+};
+
+/* CPUs for which mce_device_create() completed; cleared by mce_device_remove(). */
+static cpumask_var_t mce_device_initialized;
+
+/* Device release callback: free the struct device allocated in mce_device_create(). */
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/*
+ * Per cpu device init. All of the cpus still share the same ctrl bank:
+ *
+ * Allocates and registers the MCE device for @cpu and creates its common
+ * and per-bank sysfs files.
+ *
+ * Returns 0 on success or if the device already exists, -EIO if the boot
+ * CPU has no MCE support, -ENOMEM on allocation failure, or the error from
+ * device_register()/device_create_file().
+ */
+static int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
+ int err;
+ int i, j;
+
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+
+ /* Nothing to do if this CPU already has its device. */
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
+ dev = kzalloc(sizeof *dev, GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
+
+ err = device_register(dev);
+ if (err) {
+ /* Drop the reference instead of calling kfree() directly. */
+ put_device(dev);
+ return err;
+ }
+
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (j = 0; j < mca_cfg.banks; j++) {
+ err = device_create_file(dev, &mce_banks[j].attr);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = dev;
+
+ return 0;
+ /* Unwind in reverse: bank files first, then the common files. */
+error2:
+ while (--j >= 0)
+ device_remove_file(dev, &mce_banks[j].attr);
+error:
+ while (--i >= 0)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ device_unregister(dev);
+
+ return err;
+}
+
+/*
+ * Undo mce_device_create() for @cpu: remove the common and per-bank sysfs
+ * files, unregister the device and forget it. Does nothing for a CPU that
+ * is not marked in mce_device_initialized.
+ */
+static void mce_device_remove(unsigned int cpu)
+{
+	struct device *dev = per_cpu(mce_device, cpu);
+	struct device_attribute **attr;
+	int bank;
+
+	if (!cpumask_test_cpu(cpu, mce_device_initialized))
+		return;
+
+	for (attr = mce_device_attrs; *attr; attr++)
+		device_remove_file(dev, *attr);
+
+	for (bank = 0; bank < mca_cfg.banks; bank++)
+		device_remove_file(dev, &mce_banks[bank].attr);
+
+	device_unregister(dev);
+	cpumask_clear_cpu(cpu, mce_device_initialized);
+	per_cpu(mce_device, cpu) = NULL;
+}
+
+/*
+ * Make sure there are no machine checks on offlined CPUs.
+ *
+ * CMCI is only cleared when the hotplug operation is not part of a
+ * suspend/resume cycle (cpuhp_tasks_frozen is clear).
+ */
+static void mce_disable_cpu(void)
+{
+	struct cpuinfo_x86 *this_cpu = raw_cpu_ptr(&cpu_info);
+
+	if (!mce_available(this_cpu))
+		return;
+
+	if (!cpuhp_tasks_frozen)
+		cmci_clear();
+
+	vendor_disable_error_reporting();
+}
+
+/*
+ * Counterpart of mce_disable_cpu(): re-enable CMCI (unless tasks are
+ * frozen) and restore the control register of every initialized bank.
+ */
+static void mce_reenable_cpu(void)
+{
+	int bank;
+
+	if (!mce_available(raw_cpu_ptr(&cpu_info)))
+		return;
+
+	if (!cpuhp_tasks_frozen)
+		cmci_reenable();
+
+	for (bank = 0; bank < mca_cfg.banks; bank++) {
+		if (!mce_banks[bank].init)
+			continue;
+
+		wrmsrl(msr_ops.ctl(bank), mce_banks[bank].ctl);
+	}
+}
+
+/*
+ * CPU hotplug "dead" callback: update the Intel per-CPU state for @cpu and
+ * rediscover CMCI banks unless tasks are frozen. Always returns 0.
+ */
+static int mce_cpu_dead(unsigned int cpu)
+{
+	mce_intel_hcpu_update(cpu);
+
+	/* intentionally ignoring frozen here */
+	if (cpuhp_tasks_frozen)
+		return 0;
+
+	cmci_rediscover();
+	return 0;
+}
+
+/*
+ * CPU hotplug "online" callback: create the sysfs device and the threshold
+ * device for @cpu, re-enable error reporting and start the polling timer.
+ *
+ * The result of mce_device_create() is not checked. Returns 0 on success
+ * or the error from mce_threshold_create_device(), in which case the MCE
+ * device is removed again.
+ */
+static int mce_cpu_online(unsigned int cpu)
+{
+	int ret;
+
+	mce_device_create(cpu);
+
+	ret = mce_threshold_create_device(cpu);
+	if (!ret) {
+		mce_reenable_cpu();
+		mce_start_timer(this_cpu_ptr(&mce_timer));
+		return 0;
+	}
+
+	mce_device_remove(cpu);
+	return ret;
+}
+
+/*
+ * CPU hotplug teardown callback: disable error reporting, stop the polling
+ * timer and remove the threshold and MCE devices. Always returns 0.
+ */
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+	mce_disable_cpu();
+	del_timer_sync(this_cpu_ptr(&mce_timer));
+
+	mce_threshold_remove_device(cpu);
+	mce_device_remove(cpu);
+
+	return 0;
+}
+
+/*
+ * Prepare the sysfs attribute embedded in every bank: name it "bankN",
+ * make it mode 0644 and hook up show_bank()/set_bank().
+ */
+static __init void mce_init_banks(void)
+{
+	int nr;
+
+	for (nr = 0; nr < mca_cfg.banks; nr++) {
+		struct mce_bank *bank = &mce_banks[nr];
+
+		sysfs_attr_init(&bank->attr.attr);
+
+		snprintf(bank->attrname, ATTR_LEN, "bank%d", nr);
+		bank->attr.attr.name = bank->attrname;
+		bank->attr.attr.mode = 0644;
+
+		bank->attr.show = show_bank;
+		bank->attr.store = set_bank;
+	}
+}
+
+/*
+ * Register the "machinecheck" subsystem, the CPU hotplug callbacks that
+ * create/remove the per-CPU devices, and the syscore PM hooks.
+ *
+ * Returns 0 on success, -EIO if the boot CPU has no MCE support, -ENOMEM
+ * if the cpumask cannot be allocated, or the error from the failing
+ * registration call.
+ */
+static __init int mcheck_init_device(void)
+{
+ int err;
+
+ /*
+ * Check if we have a spare virtual bit. This will only become
+ * a problem if/when we move beyond 5-level page tables.
+ */
+ MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mce_init_banks();
+
+ err = subsys_system_register(&mce_subsys, NULL);
+ if (err)
+ goto err_out_mem;
+
+ /*
+ * NOTE(review): the error paths taken from here on free the cpumask
+ * but do not undo subsys_system_register() - confirm whether the
+ * subsystem should be unregistered as well.
+ */
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
+
+ /* Only a negative value is treated as failure for the dynamic state. */
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+
+ register_syscore_ops(&mce_syscore_ops);
+
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
+
+ return err;
+}
+device_initcall_sync(mcheck_init_device);
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ *
+ * "nomce" sets mca_cfg.disabled, the same flag that "mce=off" sets in
+ * mcheck_enable(). @str is unused; always returns 1.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mca_cfg.disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Return the "mce" debugfs directory, creating it on first use. The result
+ * of debugfs_create_dir() is cached in a function-local static and
+ * returned as is.
+ */
+struct dentry *mce_get_debugfs_dir(void)
+{
+	static struct dentry *dmce;
+
+	if (dmce)
+		return dmce;
+
+	dmce = debugfs_create_dir("mce", NULL);
+	return dmce;
+}
+
+/* Zero cpu_missing and the fake-panic, executing, callin and global_nwo counters. */
+static void mce_reset(void)
+{
+ cpu_missing = 0;
+ atomic_set(&mce_fake_panicked, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+}
+
+/* debugfs getter: report the current fake_panic value. @data is unused. */
+static int fake_panic_get(void *data, u64 *val)
+{
+ *val = fake_panic;
+ return 0;
+}
+
+/* debugfs setter: reset the state via mce_reset(), then store @val in fake_panic. */
+static int fake_panic_set(void *data, u64 val)
+{
+ mce_reset();
+ fake_panic = val;
+ return 0;
+}
+
+/* File operations for the "fake_panic" debugfs file created in mcheck_debugfs_init(). */
+DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
+ fake_panic_set, "%llu\n");
+
+/*
+ * Create the "fake_panic" file in the "mce" debugfs directory.
+ *
+ * Returns 0 on success or -ENOMEM if the directory or the file pointer
+ * comes back NULL.
+ */
+static int __init mcheck_debugfs_init(void)
+{
+	struct dentry *dir, *file;
+
+	dir = mce_get_debugfs_dir();
+	if (!dir)
+		return -ENOMEM;
+
+	file = debugfs_create_file("fake_panic", 0444, dir, NULL,
+				   &fake_panic_fops);
+
+	return file ? 0 : -ENOMEM;
+}
+#else
+/* Stub for kernels built without CONFIG_DEBUG_FS: always returns -EINVAL. */
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
+#endif
+
+/* Static key, false by default; mcheck_late_init() increments it when mca_cfg.recovery is set. */
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
+/*
+ * Late initcall: enable mcsafe_key if recovery was requested, set up
+ * debugfs and the CEC, then schedule processing of anything logged during
+ * early boot. The return values of the helpers are not checked; always
+ * returns 0.
+ */
+static int __init mcheck_late_init(void)
+{
+ if (mca_cfg.recovery)
+ static_branch_inc(&mcsafe_key);
+
+ mcheck_debugfs_init();
+ cec_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
new file mode 100644
index 0000000..e12454e
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -0,0 +1,1437 @@
+/*
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
+ * Your use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ * Maintained by: Borislav Petkov <bp@alien8.de>
+ *
+ * All MC4_MISCi registers are shared between cores on a node.
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/amd_nb.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "mce-internal.h"
+
+#define NR_BLOCKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_BLKPTR_LO 0xFF000000
+#define MCG_XBLK_ADDR 0xC0000400
+
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+#define MASK_DEF_INT_TYPE 0x00000006
+#define DEF_LVT_OFF 0x2
+#define DEF_INT_TYPE_APIC 0x2
+
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF 0xF000
+
+static bool thresholding_irq_en;
+
+static const char * const th_names[] = {
+ "load_store",
+ "insn_fetch",
+ "combined_unit",
+ "decode_unit",
+ "northbridge",
+ "execution_unit",
+};
+
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
+};
+
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
+ [SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_RESERVED] = { "reserved", "Reserved" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU] = { "smu", "System Management Unit" },
+};
+
+/*
+ * Cache of MCx_MISC block addresses, indexed by [bank][block] and filled in
+ * by smca_get_block_address(). Every entry starts out as -1 (all ones in a
+ * u32), meaning "not looked up yet"; a cached 0 means the lookup found no
+ * such block.
+ */
+static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
+{
+ [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
+};
+
+/*
+ * Short (sysfs) name of SMCA bank type @t, taken from smca_names[].
+ * Returns NULL when @t is N_SMCA_BANK_TYPES or larger.
+ */
+const char *smca_get_name(enum smca_bank_types t)
+{
+	return (t < N_SMCA_BANK_TYPES) ? smca_names[t].name : NULL;
+}
+
+/*
+ * Long (pretty-printing) name of SMCA bank type @t, taken from smca_names[].
+ * Returns NULL when @t is N_SMCA_BANK_TYPES or larger.
+ */
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+	return (t < N_SMCA_BANK_TYPES) ? smca_names[t].long_name : NULL;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+/*
+ * Look up the SMCA type recorded for @bank in smca_banks[].
+ *
+ * N_SMCA_BANK_TYPES serves as the "unknown" answer: it is returned when
+ * @bank is out of range or no hwid has been recorded for the bank yet.
+ */
+static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+{
+	if (bank < MAX_NR_BANKS && smca_banks[bank].hwid)
+		return smca_banks[bank].hwid->bank_type;
+
+	return N_SMCA_BANK_TYPES;
+}
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+ /* Reserved type */
+ { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
+};
+
+struct smca_bank smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
+static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+/*
+ * Fallback deferred-error handler: only complains that the vector fired.
+ * It stays installed in deferred_error_int_vector until
+ * deferred_error_interrupt_enable() swaps in amd_deferred_error_interrupt().
+ */
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+/* Handler invoked by smp_deferred_error_interrupt(). */
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
+
+/*
+ * Set up Scalable MCA state for @bank.
+ *
+ * First updates MCA_CONFIG (read with rdmsr_safe(), written with wrmsr()):
+ * sets MCAX and, if supported and not yet configured, the deferred error
+ * interrupt type. Then, only on the first call for a given bank, reads
+ * MCA_IPID on @cpu and records the matching smca_hwid_mcatypes[] entry, the
+ * low IPID word and a per-type running index in smca_banks[bank].
+ */
+static void smca_configure(unsigned int bank, unsigned int cpu)
+{
+ unsigned int i, hwid_mcatype;
+ struct smca_hwid *s_hwid;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ high |= BIT(5);
+
+ wrmsr(smca_config, low, high);
+ }
+
+ /* Return early if this bank was already initialized. */
+ if (smca_banks[bank].hwid)
+ return;
+
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ /* Both fields live in the high half of MCA_IPID. */
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
+
+ /*
+ * Find the table entry for this HWID/McaType pair. If nothing matches,
+ * smca_banks[bank].hwid stays NULL and the bank type remains unknown.
+ */
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+ smca_banks[bank].hwid = s_hwid;
+ smca_banks[bank].id = low;
+ /* count doubles as the next free sysfs index for this type */
+ smca_banks[bank].sysfs_id = s_hwid->count++;
+ break;
+ }
+ }
+}
+
+/* Argument block for threshold_restart_bank(). */
+struct thresh_restart {
+ struct threshold_block *b; /* block whose MISC MSR is reprogrammed */
+ int reset; /* non-zero: reset error count and overflow bit */
+ int set_lvt_off; /* non-zero: try to program lvt_off into the MSR */
+ int lvt_off; /* LVT offset to program when set_lvt_off is set */
+ u16 old_limit; /* previous threshold_limit; non-zero adjusts the count w/o reset */
+};
+
+/*
+ * Tell whether @bank is shared between cores.
+ *
+ * On Scalable MCA systems only one core has access to the MSRs of a shared
+ * bank, so no bank is treated as shared there. Otherwise bank 4, which is
+ * for northbridge reporting, is the shared one.
+ */
+static inline bool is_shared_bank(int bank)
+{
+	return !mce_flags.smca && bank == 4;
+}
+
+/*
+ * Map a bank 4 threshold block to its sysfs name, keyed on the block's MSR
+ * address. An address not listed here triggers a WARN and yields "".
+ */
+static const char *bank4_names(const struct threshold_block *b)
+{
+	switch (b->address) {
+	/* MSR4_MISC0 */
+	case 0x00000413:
+		return "dram";
+
+	case 0xc0000408:
+		return "ht_links";
+
+	case 0xc0000409:
+		return "l3_cache";
+
+	default:
+		WARN(1, "Funny MSR: 0x%08x\n", b->address);
+		return "";
+	}
+}
+
+
+/*
+ * Can the thresholding block behind @msr_high_bits (the upper 32 bits of its
+ * MISC MSR) raise APIC LVT interrupts?
+ *
+ * Bank 4 supports them implicitly since forever. For every other bank the
+ * IntP ("interrupt present") bit, bit 28 of the high word, decides.
+ */
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+	return bank == 4 || (msr_high_bits & BIT(28));
+}
+
+/*
+ * Check that LVT offset @apic may be programmed for block @b.
+ *
+ * @apic: offset the caller reserved; negative means no offset was reserved
+ * @lo, @hi: current contents of the block's MISC MSR
+ *
+ * Returns 1 when @apic is non-negative and equals the LVT offset field of
+ * @hi, 0 otherwise. Failures are reported as firmware bugs, except for the
+ * mismatch case on SMCA systems, which is silent.
+ */
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+	int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+	if (apic < 0) {
+		pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+		       "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+		       b->bank, b->block, b->address, hi, lo);
+		return 0;
+	}
+
+	if (apic != msr) {
+		/*
+		 * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+		 * the BIOS provides the value. The original field where LVT offset
+		 * was set is reserved. Return early here:
+		 */
+		if (mce_flags.smca)
+			return 0;
+
+		pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+		       "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+		       b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Reprogram MCx_MISC MSR behind this threshold bank.
+ *
+ * @_tr points to a struct thresh_restart. The function takes a void pointer
+ * because it is also used as an smp_call_function_single() callback. It may
+ * set tr->reset in the caller's structure.
+ */
+static void threshold_restart_bank(void *_tr)
+{
+ struct thresh_restart *tr = _tr;
+ u32 hi, lo;
+
+ rdmsr(tr->b->address, lo, hi);
+
+ if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
+ tr->reset = 1; /* limit cannot be lower than err count */
+
+ if (tr->reset) { /* reset err count and overflow bit */
+ /* the count field restarts at THRESHOLD_MAX - threshold_limit */
+ hi =
+ (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+ (THRESHOLD_MAX - tr->b->threshold_limit);
+ } else if (tr->old_limit) { /* change limit w/o reset */
+ /* shift the count by the difference between old and new limit */
+ int new_count = (hi & THRESHOLD_MAX) +
+ (tr->old_limit - tr->b->threshold_limit);
+
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
+
+ /* the counter enable bit is always set before writing back */
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+/*
+ * Program block @b for first use: set its limit to THRESHOLD_MAX and write
+ * the MISC MSR through threshold_restart_bank(), asking it to install LVT
+ * offset @offset as well.
+ */
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+	struct thresh_restart tr = {
+		.b			= b,
+		.set_lvt_off		= 1,
+		.lvt_off		= offset,
+	};
+
+	b->threshold_limit		= THRESHOLD_MAX;
+	threshold_restart_bank(&tr);
+}
+
+/*
+ * Reserve extended LVT offset @new for the threshold interrupt, unless an
+ * offset is already held.
+ *
+ * @reserved: offset already reserved, or negative if none
+ *
+ * Returns @new when it was reserved by this call. In every other case
+ * (something was already reserved, or setup_APIC_eilvt() failed) @reserved
+ * is handed back unchanged.
+ */
+static int setup_APIC_mce_threshold(int reserved, int new)
+{
+	if (reserved >= 0)
+		return reserved;
+
+	if (setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+			     APIC_EILVT_MSG_FIX, 0))
+		return reserved;
+
+	return new;
+}
+
+/*
+ * Reserve extended LVT offset @new for the deferred error interrupt, unless
+ * an offset is already held.
+ *
+ * @reserved: offset already reserved, or negative if none
+ *
+ * Returns @new when it was reserved by this call. In every other case
+ * (something was already reserved, or setup_APIC_eilvt() failed) @reserved
+ * is handed back unchanged.
+ */
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+	if (reserved >= 0)
+		return reserved;
+
+	if (setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+			     APIC_EILVT_MSG_FIX, 0))
+		return reserved;
+
+	return new;
+}
+
+/*
+ * Enable the deferred error interrupt through MSR_CU_DEF_ERR.
+ *
+ * Reads the LVT offset from the MSR (falling back to DEF_LVT_OFF when the
+ * field is zero), reserves it with the APIC, installs
+ * amd_deferred_error_interrupt() as the handler when the reservation worked,
+ * and writes the MSR back. @c is not used. Does nothing if the MSR cannot
+ * be read.
+ */
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0;
+ int def_offset = -1, def_new;
+
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+ return;
+
+ def_new = (low & MASK_DEF_LVTOFF) >> 4;
+ if (!(low & MASK_DEF_LVTOFF)) {
+ pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+ def_new = DEF_LVT_OFF;
+ low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+ }
+
+ /* def_offset is still -1 here, so this always attempts the reservation */
+ def_offset = setup_APIC_deferred_error(def_offset, def_new);
+ if ((def_offset == def_new) &&
+ (deferred_error_int_vector != amd_deferred_error_interrupt))
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+
+ /* the interrupt type field is only programmed here on non-SMCA systems */
+ if (!mce_flags.smca)
+ low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+
+ wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
+/*
+ * Return the MSR address of thresholding @block in SMCA @bank, or 0 if there
+ * is none.
+ *
+ * Banks of type SMCA_RESERVED have no blocks. Block 0 is always MCx_MISC.
+ * Answers for other blocks are cached in smca_bank_addrs[][], including the
+ * negative (0) ones. The caller (get_block_address()) has already checked
+ * that @bank and @block are within mca_cfg.banks and NR_BLOCKS.
+ */
+static u32 smca_get_block_address(unsigned int bank, unsigned int block)
+{
+ u32 low, high;
+ u32 addr = 0;
+
+ if (smca_get_bank_type(bank) == SMCA_RESERVED)
+ return addr;
+
+ if (!block)
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+ /* Check our cache first: */
+ if (smca_bank_addrs[bank][block] != -1)
+ return smca_bank_addrs[bank][block];
+
+ /*
+ * For SMCA enabled processors, BLKPTR field of the first MISC register
+ * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
+ */
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ goto out;
+
+ if (!(low & MCI_CONFIG_MCAX))
+ goto out;
+
+ if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ (low & MASK_BLKPTR_LO))
+ addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+
+out:
+ /* remember the result, whether an address or 0 */
+ smca_bank_addrs[bank][block] = addr;
+ return addr;
+}
+
+/*
+ * Return the MSR address of thresholding @block in @bank, or 0 if there is
+ * none or the indices are out of range.
+ *
+ * @current_addr: address of the previous block; used for blocks 2 and up on
+ * non-SMCA systems, where addresses are consecutive
+ * @low: low word of the previously read MISC MSR; supplies BLKPTR for block 1
+ * @high: not used in this function
+ *
+ * SMCA systems are handled by smca_get_block_address().
+ */
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block)
+{
+ u32 addr = 0, offset = 0;
+
+ if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+ return addr;
+
+ if (mce_flags.smca)
+ return smca_get_block_address(bank, block);
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = msr_ops.misc(bank);
+ break;
+ case 1:
+ /* a zero BLKPTR leaves addr at 0, i.e. "no further blocks" */
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
+/*
+ * Set up one thresholding block on the current CPU during feature init.
+ *
+ * @bank, @block: position of the block
+ * @addr: MSR address of the block
+ * @offset: LVT offset reserved so far, or negative if none
+ * @misc_high: high word of the block's MISC MSR
+ *
+ * Marks the bank in this CPU's bank_map (for block 0), reserves the
+ * threshold LVT offset if the block can interrupt, and programs the MSR via
+ * mce_threshold_block_init() using a temporary on-stack descriptor.
+ *
+ * Returns the (possibly newly reserved) LVT offset for the next call.
+ */
+static int
+prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
+{
+ unsigned int cpu = smp_processor_id();
+ u32 smca_low, smca_high;
+ struct threshold_block b;
+ int new;
+
+ if (!block)
+ per_cpu(bank_map, cpu) |= (1 << bank);
+
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = addr;
+ b.interrupt_capable = lvt_interrupt_supported(bank, misc_high);
+
+ if (!b.interrupt_capable)
+ goto done;
+
+ b.interrupt_enable = 1;
+
+ /* non-SMCA: the LVT offset comes from the block's own MISC MSR */
+ if (!mce_flags.smca) {
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ goto set_offset;
+ }
+
+ /* Gather LVT offset for thresholding: */
+ /* on failure the block is left unprogrammed (init below is skipped) */
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+ goto out;
+
+ new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+
+set_offset:
+ offset = setup_APIC_mce_threshold(offset, new);
+ if (offset == new)
+ thresholding_irq_en = true;
+
+done:
+ mce_threshold_block_init(&b, offset);
+
+out:
+ return offset;
+}
+
+/*
+ * cpu init entry point, called from mce.c with preempt off
+ *
+ * Walks every bank and each of its (up to NR_BLOCKS) thresholding blocks,
+ * programming the blocks that are valid, have a counter present and are not
+ * locked. Finally enables the deferred error interrupt when mce_flags.succor
+ * is set.
+ */
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0, address = 0;
+ unsigned int bank, block, cpu = smp_processor_id();
+ int offset = -1;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (mce_flags.smca)
+ smca_configure(bank, cpu);
+
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ /* address/low/high carry over from the previous block */
+ address = get_block_address(address, low, high, bank, block);
+ if (!address)
+ break;
+
+ if (rdmsr_safe(address, &low, &high))
+ break;
+
+ if (!(high & MASK_VALID_HI))
+ continue;
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ continue;
+
+ offset = prepare_threshold_block(bank, block, address, offset, high);
+ }
+ }
+
+ if (mce_flags.succor)
+ deferred_error_interrupt_enable(c);
+}
+
+/*
+ * Translate a normalized UMC address into a system address.
+ *
+ * @norm_addr: normalized address to translate
+ * @nid: node id handed to amd_df_indirect_read()
+ * @umc: UMC/channel number, used as the instance id for the indirect reads
+ * @sys_addr: receives the result; written on success only
+ *
+ * Returns 0 on success. Returns -EINVAL when a register read fails, when a
+ * register holds a value this code does not handle, or when the computed
+ * address lies above the DRAM limit address.
+ */
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+ u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+ /* We start from the normalized address */
+ u64 ret_addr = norm_addr;
+
+ u32 tmp;
+
+ u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+ u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+ u8 intlv_addr_sel, intlv_addr_bit;
+ u8 num_intlv_bits, hashed_bit;
+ u8 lgcy_mmio_hole_en, base = 0;
+ u8 cs_mask, cs_id = 0;
+ bool hash_enabled = false;
+
+ /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+ if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+ goto out_err;
+
+ /* Remove HiAddrOffset from normalized address, if enabled: */
+ if (tmp & BIT(0)) {
+ u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+ if (norm_addr >= hi_addr_offset) {
+ ret_addr -= hi_addr_offset;
+ base = 1;
+ }
+ }
+
+ /* Read D18F0x110 (DramBaseAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ /* Check if address range is valid. */
+ if (!(tmp & BIT(0))) {
+ pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+ __func__, tmp);
+ goto out_err;
+ }
+
+ lgcy_mmio_hole_en = tmp & BIT(1);
+ intlv_num_chan = (tmp >> 4) & 0xF;
+ intlv_addr_sel = (tmp >> 8) & 0x7;
+ dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+ /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+ if (intlv_addr_sel > 3) {
+ pr_err("%s: Invalid interleave address select %d.\n",
+ __func__, intlv_addr_sel);
+ goto out_err;
+ }
+
+ /* Read D18F0x114 (DramLimitAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ intlv_num_sockets = (tmp >> 8) & 0x1;
+ intlv_num_dies = (tmp >> 10) & 0x3;
+ dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+ intlv_addr_bit = intlv_addr_sel + 8;
+
+ /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+ switch (intlv_num_chan) {
+ case 0: intlv_num_chan = 0; break;
+ case 1: intlv_num_chan = 1; break;
+ case 3: intlv_num_chan = 2; break;
+ case 5: intlv_num_chan = 3; break;
+ case 7: intlv_num_chan = 4; break;
+
+ case 8: intlv_num_chan = 1;
+ hash_enabled = true;
+ break;
+ default:
+ pr_err("%s: Invalid number of interleaved channels %d.\n",
+ __func__, intlv_num_chan);
+ goto out_err;
+ }
+
+ num_intlv_bits = intlv_num_chan;
+
+ if (intlv_num_dies > 2) {
+ pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+ __func__, intlv_num_dies);
+ goto out_err;
+ }
+
+ num_intlv_bits += intlv_num_dies;
+
+ /* Add a bit if sockets are interleaved. */
+ num_intlv_bits += intlv_num_sockets;
+
+ /* Assert num_intlv_bits <= 4 */
+ if (num_intlv_bits > 4) {
+ pr_err("%s: Invalid interleave bits %d.\n",
+ __func__, num_intlv_bits);
+ goto out_err;
+ }
+
+ if (num_intlv_bits > 0) {
+ u64 temp_addr_x, temp_addr_i, temp_addr_y;
+ u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+ /*
+ * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+ * This is the fabric id for this coherent slave. Use
+ * umc/channel# as instance id of the coherent slave
+ * for FICAA.
+ */
+ if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+ goto out_err;
+
+ cs_fabric_id = (tmp >> 8) & 0xFF;
+ die_id_bit = 0;
+
+ /* If interleaved over more than 1 channel: */
+ if (intlv_num_chan) {
+ die_id_bit = intlv_num_chan;
+ cs_mask = (1 << die_id_bit) - 1;
+ cs_id = cs_fabric_id & cs_mask;
+ }
+
+ sock_id_bit = die_id_bit;
+
+ /* Read D18F1x208 (SystemFabricIdMask). */
+ if (intlv_num_dies || intlv_num_sockets)
+ if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+ goto out_err;
+
+ /* If interleaved over more than 1 die. */
+ if (intlv_num_dies) {
+ sock_id_bit = die_id_bit + intlv_num_dies;
+ die_id_shift = (tmp >> 24) & 0xF;
+ die_id_mask = (tmp >> 8) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+ }
+
+ /* If interleaved over more than 1 socket. */
+ if (intlv_num_sockets) {
+ socket_id_shift = (tmp >> 28) & 0xF;
+ socket_id_mask = (tmp >> 16) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+ }
+
+ /*
+ * The pre-interleaved address consists of XXXXXXIIIYYYYY
+ * where III is the ID for this CS, and XXXXXXYYYYY are the
+ * address bits from the post-interleaved address.
+ * "num_intlv_bits" has been calculated to tell us how many "I"
+ * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+ * there are (where "I" starts).
+ */
+ temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+ temp_addr_i = (cs_id << intlv_addr_bit);
+ temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+ ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
+ }
+
+ /* Add dram base address */
+ ret_addr += dram_base_addr;
+
+ /* If legacy MMIO hole enabled */
+ if (lgcy_mmio_hole_en) {
+ if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+ goto out_err;
+
+ /* addresses at or above the hole base are moved up past 4G */
+ dram_hole_base = tmp & GENMASK(31, 24);
+ if (ret_addr >= dram_hole_base)
+ ret_addr += (BIT_ULL(32) - dram_hole_base);
+ }
+
+ if (hash_enabled) {
+ /* Save some parentheses and grab ls-bit at the end. */
+ hashed_bit = (ret_addr >> 12) ^
+ (ret_addr >> 18) ^
+ (ret_addr >> 21) ^
+ (ret_addr >> 30) ^
+ cs_id;
+
+ hashed_bit &= BIT(0);
+
+ /* flip the interleave bit when it disagrees with the hash */
+ if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+ ret_addr ^= BIT(intlv_addr_bit);
+ }
+
+ /* Is the calculated system address above the DRAM limit address? */
+ if (ret_addr > dram_limit_addr)
+ goto out_err;
+
+ *sys_addr = ret_addr;
+ return 0;
+
+out_err:
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
+/*
+ * Classify the error record @m as a memory error or not, based on the
+ * extended error code in status bits [20:16]:
+ *
+ *  - SMCA systems: the bank must be of type SMCA_UMC and the code 0x0.
+ *  - otherwise:    the bank must be bank 4 and the code 0x8.
+ */
+bool amd_mce_is_memory_error(struct mce *m)
+{
+	/* ErrCodeExt[20:16] */
+	u8 ext_err_code = (m->status >> 16) & 0x1f;
+
+	if (!mce_flags.smca)
+		return m->bank == 4 && ext_err_code == 0x8;
+
+	return smca_get_bank_type(m->bank) == SMCA_UMC && ext_err_code == 0x0;
+}
+
+/*
+ * Build a struct mce record for @bank from the given register values and
+ * hand it to mce_log().
+ *
+ * @status, @addr, @misc: register contents read by the caller. @addr is
+ * only used when @status has MCI_STATUS_ADDRV set. On SMCA systems the
+ * bank's IPID (and SYND, if MCI_STATUS_SYNDV is set) are read here as well.
+ */
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
+{
+ struct mce m;
+
+ mce_setup(&m);
+
+ m.status = status;
+ m.misc = misc;
+ m.bank = bank;
+ m.tsc = rdtsc();
+
+ if (m.status & MCI_STATUS_ADDRV) {
+ m.addr = addr;
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ /* the lsb position is taken from bits [61:56] of the address value */
+ u8 lsb = (m.addr >> 56) & 0x3f;
+
+ m.addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+ if (m.status & MCI_STATUS_SYNDV)
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ }
+
+ mce_log(&m);
+}
+
+/*
+ * Entry point for the deferred error interrupt: does the IRQ entry/exit
+ * bookkeeping, emits the entry/exit tracepoints, bumps the per-CPU
+ * irq_deferred_error_count statistic and calls whatever handler is currently
+ * installed in deferred_error_int_vector.
+ */
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
+{
+ entering_irq();
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ exiting_ack_irq();
+}
+
+/*
+ * Log the error held in the status register @msr_stat of @bank, if any, and
+ * clear that register afterwards.
+ *
+ * @msr_addr is only read when the status has MCI_STATUS_ADDRV set. @misc is
+ * passed through to the log record.
+ *
+ * Returns true if the logged error is deferred. False, otherwise (this
+ * includes the case where the status was not valid and nothing was logged).
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
+{
+ u64 status, addr = 0;
+
+ rdmsrl(msr_stat, status);
+ if (!(status & MCI_STATUS_VAL))
+ return false;
+
+ if (status & MCI_STATUS_ADDRV)
+ rdmsrl(msr_addr, addr);
+
+ __log_error(bank, status, addr, misc);
+
+ wrmsrl(msr_stat, 0);
+
+ return status & MCI_STATUS_DEFERRED;
+}
+
+/*
+ * Deferred errors are looked for in up to two places:
+ *
+ *  - MCA_STATUS is checked on every system and an error found there is
+ *    logged. Non-SMCA systems stop here.
+ *  - On SMCA systems, if the error just logged was a deferred one,
+ *    MCA_DESTAT is cleared as well.
+ *  - On SMCA systems, if MCA_STATUS did not yield a deferred error,
+ *    MCA_DESTAT is checked and a valid error there is logged.
+ */
+static void log_error_deferred(unsigned int bank)
+{
+	bool logged_deferred = _log_error_bank(bank, msr_ops.status(bank),
+					       msr_ops.addr(bank), 0);
+
+	if (!mce_flags.smca)
+		return;
+
+	if (logged_deferred) {
+		/* The deferred error came from MCA_STATUS: clear MCA_DESTAT too. */
+		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+	} else {
+		/*
+		 * MCA_DE{STAT,ADDR} only ever hold deferred errors, so a valid
+		 * entry is all that needs checking.
+		 */
+		_log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+				MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+	}
+}
+
+/*
+ * APIC interrupt handler for deferred errors: checks each of the
+ * mca_cfg.banks banks in ascending order via log_error_deferred().
+ */
+static void amd_deferred_error_interrupt(void)
+{
+	unsigned int bank = 0;
+
+	while (bank < mca_cfg.banks) {
+		log_error_deferred(bank);
+		++bank;
+	}
+}
+
+/*
+ * Log the error currently held in @bank's status register on behalf of a
+ * threshold event. @misc is the content of the MISC register that triggered
+ * it. The "was it deferred" result of _log_error_bank() is not needed here.
+ */
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+	u32 stat_msr = msr_ops.status(bank);
+	u32 addr_msr = msr_ops.addr(bank);
+
+	_log_error_bank(bank, stat_msr, addr_msr, misc);
+}
+
+/*
+ * If @block's error counter has overflowed, log the error that caused the
+ * threshold event and reprogram the block. Does nothing when @block is NULL,
+ * its MSR cannot be read, or the overflow bit is clear.
+ */
+static void log_and_reset_block(struct threshold_block *block)
+{
+	struct thresh_restart restart;
+	u32 misc_lo = 0, misc_hi = 0;
+
+	if (!block || rdmsr_safe(block->address, &misc_lo, &misc_hi))
+		return;
+
+	if (!(misc_hi & MASK_OVERFLOW_HI))
+		return;
+
+	/* Log the MCE behind the threshold event, passing the full MISC value. */
+	log_error_thresholding(block->bank, ((u64)misc_hi << 32) | misc_lo);
+
+	/* With the error logged, reprogram the threshold block. */
+	memset(&restart, 0, sizeof(restart));
+	restart.b = block;
+	threshold_restart_bank(&restart);
+}
+
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ *
+ * For every bank marked in this CPU's bank_map, each threshold block is
+ * checked and, if it overflowed, logged and reset.
+ */
+static void amd_threshold_interrupt(void)
+{
+	struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+	unsigned int bank, cpu = smp_processor_id();
+	struct threshold_bank **bp = per_cpu(threshold_banks, cpu);
+
+	/*
+	 * bank_map is filled in during CPU feature init, while the per-CPU
+	 * bank array is allocated later and is set back to NULL by
+	 * mce_threshold_remove_device(). Do not touch it if it is not there.
+	 */
+	if (!bp)
+		return;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+			continue;
+
+		/* A single bank may be missing too, e.g. after its removal. */
+		if (!bp[bank])
+			continue;
+
+		first_block = bp[bank]->blocks;
+		if (!first_block)
+			continue;
+
+		/*
+		 * The first block is also the head of the list. Check it first
+		 * before iterating over the rest.
+		 */
+		log_and_reset_block(first_block);
+		list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+			log_and_reset_block(block);
+	}
+}
+
+/*
+ * Sysfs Interface
+ */
+
+/*
+ * A sysfs attribute of a threshold block, together with the accessors that
+ * the generic show()/store() sysfs_ops dispatch to. Either accessor may be
+ * NULL, in which case the corresponding operation fails with -EIO.
+ */
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
+};
+
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", (unsigned long) b->name); \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+/*
+ * sysfs store handler for "interrupt_enable".
+ *
+ * Parses @buf as an unsigned number, records whether it is non-zero in
+ * b->interrupt_enable and reprograms the block's MSR on the CPU owning it.
+ *
+ * Returns @size on success, -EINVAL if the block cannot raise interrupts or
+ * @buf does not parse, and -ENODEV if the reprogramming call could not be
+ * run on b->cpu (e.g. because that CPU is not online).
+ */
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
+{
+	struct thresh_restart tr;
+	unsigned long new;
+
+	if (!b->interrupt_capable)
+		return -EINVAL;
+
+	if (kstrtoul(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	b->interrupt_enable = !!new;
+
+	memset(&tr, 0, sizeof(tr));
+	tr.b		= b;
+
+	/* Do not report success if the MSR was never touched. */
+	if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+		return -ENODEV;
+
+	return size;
+}
+
+/*
+ * sysfs store handler for "threshold_limit".
+ *
+ * Parses @buf as an unsigned number, clamps it to [1, THRESHOLD_MAX], stores
+ * it as the block's new limit and reprograms the block's MSR on the CPU
+ * owning it. The previous limit is passed along so that the error count can
+ * be adjusted without a reset.
+ *
+ * Returns @size on success, -EINVAL if @buf does not parse, and -ENODEV if
+ * the reprogramming call could not be run on b->cpu (e.g. because that CPU
+ * is not online).
+ */
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
+{
+	struct thresh_restart tr;
+	unsigned long new;
+
+	if (kstrtoul(buf, 0, &new) < 0)
+		return -EINVAL;
+
+	if (new > THRESHOLD_MAX)
+		new = THRESHOLD_MAX;
+	if (new < 1)
+		new = 1;
+
+	memset(&tr, 0, sizeof(tr));
+	tr.old_limit = b->threshold_limit;
+	b->threshold_limit = new;
+	tr.b = b;
+
+	/* Do not report success if the MSR was never touched. */
+	if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1))
+		return -ENODEV;
+
+	return size;
+}
+
+/*
+ * sysfs show handler for "error_count".
+ *
+ * Reads the block's MISC MSR on the owning CPU and prints the number of
+ * errors counted so far: the raw counter field minus its starting value of
+ * THRESHOLD_MAX - threshold_limit.
+ *
+ * Returns the number of bytes written to @buf, or -ENODEV if the MSR could
+ * not be read on b->cpu.
+ */
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+	u32 lo, hi;
+
+	/* CPU might be offline by now; lo/hi are only valid on success. */
+	if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+		return -ENODEV;
+
+	return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+				     (THRESHOLD_MAX - b->threshold_limit)));
+}
+
+static struct threshold_attr error_count = {
+ .attr = {.name = __stringify(error_count), .mode = 0444 },
+ .show = show_error_count,
+};
+
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
+};
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+
+static struct attribute *default_attrs[] = {
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
+};
+
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
+
+/*
+ * sysfs_ops show hook for threshold blocks: forwards to the attribute's own
+ * show method, or fails with -EIO if the attribute has none.
+ */
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct threshold_attr *tattr = to_attr(attr);
+
+	if (!tattr->show)
+		return -EIO;
+
+	return tattr->show(to_block(kobj), buf);
+}
+
+/*
+ * sysfs_ops store hook for threshold blocks: forwards to the attribute's own
+ * store method, or fails with -EIO if the attribute has none.
+ */
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+		     const char *buf, size_t count)
+{
+	struct threshold_attr *tattr = to_attr(attr);
+
+	if (!tattr->store)
+		return -EIO;
+
+	return tattr->store(to_block(kobj), buf, count);
+}
+
+static const struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_attrs = default_attrs,
+};
+
+/*
+ * Pick the sysfs name for a threshold bank (@b == NULL) or for one of its
+ * blocks (@b != NULL).
+ *
+ * Non-SMCA: blocks of bank 4 get their name from bank4_names(), everything
+ * else is named after the bank via th_names[].
+ * SMCA: blocks of UMC banks use smca_umc_block_names[]; otherwise the bank
+ * type's short name is used, with "_<sysfs_id>" appended when more than one
+ * bank of that type has been counted.
+ *
+ * May return NULL. The suffixed name is built in the single static buffer
+ * buf_mcatype, so it is overwritten by the next call that builds one.
+ */
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+ enum smca_bank_types bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ /* NOTE(review): bank is not range-checked against th_names[] here */
+ return th_names[bank];
+ }
+
+ bank_type = smca_get_bank_type(bank);
+ if (bank_type >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ if (b && bank_type == SMCA_UMC) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ return NULL;
+ }
+
+ if (smca_banks[bank].hwid->count == 1)
+ return smca_get_name(bank_type);
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%x", smca_get_name(bank_type),
+ smca_banks[bank].sysfs_id);
+ return buf_mcatype;
+}
+
+/*
+ * Allocate and register the descriptor for @block of @bank on @cpu, then
+ * recurse to do the same for the following blocks.
+ *
+ * @address is the MSR address of @block. Blocks that are invalid, have no
+ * counter or are locked get no descriptor, but the walk continues behind
+ * them (except for an invalid block 0, which ends it). The first descriptor
+ * created becomes the bank's ->blocks head; later ones are linked to it via
+ * ->miscj.
+ *
+ * Returns 0 on success or when there is nothing (more) to allocate, a
+ * negative error code otherwise. On error, the descriptor created at this
+ * recursion level is released again.
+ */
+static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
+ unsigned int block, u32 address)
+{
+ struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
+
+ if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+ return 0;
+
+ if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+ return 0;
+
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ goto recurse;
+ else
+ return 0;
+ }
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ goto recurse;
+
+ b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
+ b->threshold_limit = THRESHOLD_MAX;
+
+ /*
+ * default_attrs[] is shared by all blocks: slot 2 is switched between
+ * the interrupt_enable attribute and NULL right before this block's
+ * kobject is added below.
+ */
+ if (b->interrupt_capable) {
+ threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+ b->interrupt_enable = 1;
+ } else {
+ threshold_ktype.default_attrs[2] = NULL;
+ }
+
+ INIT_LIST_HEAD(&b->miscj);
+
+ if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
+ list_add(&b->miscj,
+ &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
+ } else {
+ per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+ }
+
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype,
+ per_cpu(threshold_banks, cpu)[bank]->kobj,
+ get_name(bank, b));
+ if (err)
+ goto out_free;
+recurse:
+ /* low/high of this block feed the address lookup of the next one */
+ address = get_block_address(address, low, high, bank, ++block);
+ if (!address)
+ return 0;
+
+ err = allocate_threshold_blocks(cpu, bank, block, address);
+ if (err)
+ goto out_free;
+
+ /* announce this block only once all following blocks are set up */
+ if (b)
+ kobject_uevent(&b->kobj, KOBJ_ADD);
+
+ return err;
+
+out_free:
+ /*
+ * NOTE(review): threshold_ktype as defined in this file has no
+ * .release callback; confirm that kobject_put() followed by kfree()
+ * is the intended pairing. The bank's ->blocks pointer is not reset
+ * here even if b was the head.
+ */
+ if (b) {
+ kobject_put(&b->kobj);
+ list_del(&b->miscj);
+ kfree(b);
+ }
+ return err;
+}
+
+/*
+ * Add the kobjects of all blocks of the already populated bank @b below the
+ * bank's kobject, reusing each block's existing kobject name. Used by
+ * threshold_create_bank() when a CPU attaches to an existing shared bank.
+ *
+ * Returns 0 on success or the kobject_add() error.
+ */
+static int __threshold_add_blocks(struct threshold_bank *b)
+{
+ struct list_head *head = &b->blocks->miscj;
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+ int err = 0;
+
+ /* the head block is not visited by the list walk below */
+ err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
+ if (err)
+ return err;
+
+ list_for_each_entry_safe(pos, tmp, head, miscj) {
+
+ err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
+ if (err) {
+ /*
+ * NOTE(review): this walks the whole list in reverse (not
+ * just the entries added so far) and leaves the head
+ * block's kobject added; confirm that is intended.
+ */
+ list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
+ kobject_del(&pos->kobj);
+
+ return err;
+ }
+ }
+ return err;
+}
+
+/*
+ * Create the sysfs representation of threshold @bank for @cpu.
+ *
+ * For a shared bank whose descriptor already exists on this CPU's node, the
+ * existing descriptor is reused: its kobjects are added below this CPU's MCE
+ * device and its CPU refcount is raised. Otherwise a new descriptor is
+ * allocated, published in the per-CPU threshold_banks array (and in
+ * nb->bank4 for a shared bank) and populated with its blocks.
+ *
+ * Returns 0 on success, -ENODEV if @cpu has no MCE device, -ENOMEM or
+ * -EINVAL if the descriptor or its kobject cannot be created, or the error
+ * from adding/allocating the blocks.
+ */
+static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+{
+	struct device *dev = per_cpu(mce_device, cpu);
+	struct amd_northbridge *nb = NULL;
+	struct threshold_bank *b = NULL;
+	const char *name = get_name(bank, NULL);
+	int err = 0;
+
+	if (!dev)
+		return -ENODEV;
+
+	if (is_shared_bank(bank)) {
+		nb = node_to_amd_nb(amd_get_nb_id(cpu));
+
+		/* threshold descriptor already initialized on this node? */
+		if (nb && nb->bank4) {
+			/* yes, use it */
+			b = nb->bank4;
+			err = kobject_add(b->kobj, &dev->kobj, name);
+			if (err)
+				goto out;
+
+			per_cpu(threshold_banks, cpu)[bank] = b;
+			refcount_inc(&b->cpus);
+
+			err = __threshold_add_blocks(b);
+
+			goto out;
+		}
+	}
+
+	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+	if (!b) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	b->kobj = kobject_create_and_add(name, &dev->kobj);
+	if (!b->kobj) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	/* allocate_threshold_blocks() finds the bank through this pointer */
+	per_cpu(threshold_banks, cpu)[bank] = b;
+
+	if (is_shared_bank(bank)) {
+		refcount_set(&b->cpus, 1);
+
+		/* nb is already initialized, see above */
+		if (nb) {
+			WARN_ON(nb->bank4);
+			nb->bank4 = b;
+		}
+	}
+
+	err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
+	if (!err)
+		goto out;
+
+	/*
+	 * The bank is about to be freed: withdraw every pointer to it that
+	 * was published above, so that later cleanup (for instance
+	 * mce_threshold_remove_device() on the caller's error path) does not
+	 * touch freed memory, and drop the bank's kobject.
+	 */
+	per_cpu(threshold_banks, cpu)[bank] = NULL;
+	if (nb && nb->bank4 == b)
+		nb->bank4 = NULL;
+	kobject_put(b->kobj);
+
+ out_free:
+	kfree(b);
+
+ out:
+	return err;
+}
+
+/*
+ * Release all block descriptors of @bank on @cpu: every block linked to the
+ * head block first, then the head block itself, after which the bank's
+ * ->blocks pointer is cleared. Does nothing if the bank has no descriptor.
+ * The caller (threshold_remove_bank()) only calls this when ->blocks is set.
+ */
+static void deallocate_threshold_block(unsigned int cpu,
+ unsigned int bank)
+{
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+ struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+
+ if (!head)
+ return;
+
+ list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ kobject_put(&pos->kobj);
+ list_del(&pos->miscj);
+ kfree(pos);
+ }
+
+ /* NOTE(review): the head block is kfree()d without a kobject_put() */
+ kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
+ per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+}
+
+/*
+ * Remove bank @b from sysfs without freeing anything: deletes the bank's
+ * kobject and the kobjects of the blocks linked to the head block. Used when
+ * a CPU detaches from a shared bank that other CPUs still reference.
+ */
+static void __threshold_remove_blocks(struct threshold_bank *b)
+{
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+
+ kobject_del(b->kobj);
+
+ /* the walk starts behind the head block, which is not visited */
+ list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
+ kobject_del(&pos->kobj);
+}
+
+/*
+ * Tear down threshold @bank for @cpu.
+ *
+ * For a shared bank that other CPUs still use, only the sysfs entries are
+ * removed and this CPU's pointer to the bank is cleared. Otherwise the
+ * blocks are deallocated, the bank's kobject is deleted and released, and
+ * the bank descriptor is freed. Does nothing if the CPU has no descriptor
+ * for the bank.
+ */
+static void threshold_remove_bank(unsigned int cpu, int bank)
+{
+ struct amd_northbridge *nb;
+ struct threshold_bank *b;
+
+ b = per_cpu(threshold_banks, cpu)[bank];
+ if (!b)
+ return;
+
+ /* no blocks: only the bank itself needs to go */
+ if (!b->blocks)
+ goto free_out;
+
+ if (is_shared_bank(bank)) {
+ if (!refcount_dec_and_test(&b->cpus)) {
+ __threshold_remove_blocks(b);
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
+ return;
+ } else {
+ /*
+ * the last CPU on this node using the shared bank is
+ * going away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ nb->bank4 = NULL;
+ }
+ }
+
+ deallocate_threshold_block(cpu, bank);
+
+free_out:
+ kobject_del(b->kobj);
+ kobject_put(b->kobj);
+ kfree(b);
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
+}
+
+/*
+ * Remove the threshold sysfs hierarchy of @cpu: tears down every bank that
+ * is marked in the CPU's bank_map, then frees the per-CPU array of bank
+ * pointers and clears the per-CPU pointer to it. Always returns 0.
+ */
+int mce_threshold_remove_device(unsigned int cpu)
+{
+	unsigned int bank;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		if (per_cpu(bank_map, cpu) & (1 << bank))
+			threshold_remove_bank(cpu, bank);
+	}
+
+	kfree(per_cpu(threshold_banks, cpu));
+	per_cpu(threshold_banks, cpu) = NULL;
+
+	return 0;
+}
+
+/*
+ * Create the sysfs directories and files for all valid threshold banks of
+ * @cpu, i.e. those marked in the CPU's bank_map.
+ *
+ * Returns 0 on success or if the CPU has been set up before, -ENOMEM if the
+ * array of bank pointers cannot be allocated, or the error of the first bank
+ * that failed; in the last case everything created so far is removed again.
+ */
+int mce_threshold_create_device(unsigned int cpu)
+{
+	struct threshold_bank **banks;
+	unsigned int bank;
+	int err;
+
+	/* Nothing to do if this CPU already has its bank array. */
+	if (per_cpu(threshold_banks, cpu))
+		return 0;
+
+	banks = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
+			GFP_KERNEL);
+	if (!banks)
+		return -ENOMEM;
+
+	per_cpu(threshold_banks, cpu) = banks;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+			continue;
+
+		err = threshold_create_bank(cpu, bank);
+		if (err) {
+			mce_threshold_remove_device(cpu);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Create the threshold devices for all CPUs that are online by now and, if
+ * a threshold LVT offset was set up during feature init, install
+ * amd_threshold_interrupt() as the threshold vector handler.
+ *
+ * Returns 0 on success or the error of the first CPU whose device creation
+ * failed; the handler is not installed in that case.
+ */
+static __init int threshold_init_device(void)
+{
+	unsigned int lcpu = 0;
+	int err;
+
+	/* to hit CPUs online before the notifier is up */
+	for_each_online_cpu(lcpu) {
+		err = mce_threshold_create_device(lcpu);
+		if (err)
+			return err;
+	}
+
+	if (thresholding_irq_en)
+		mce_threshold_vector = amd_threshold_interrupt;
+
+	return 0;
+}
+/*
+ * There are 3 functions which need to be _initcalled in a logical sequence:
+ * 1. xen_late_init_mcelog
+ * 2. mcheck_init_device
+ * 3. threshold_init_device
+ *
+ * xen_late_init_mcelog must register xen_mce_chrdev_device before the
+ * native mce_chrdev_device registration when running under the xen platform;
+ *
+ * mcheck_init_device must run before threshold_init_device so that
+ * mce_device is initialized, otherwise a NULL ptr dereference will cause a
+ * panic.
+ *
+ * So the following _initcalls are used:
+ * 1. device_initcall(xen_late_init_mcelog);
+ * 2. device_initcall_sync(mcheck_init_device);
+ * 3. late_initcall(threshold_init_device);
+ *
+ * When running under xen, the initcall order is 1,2,3;
+ * on baremetal, 1 is skipped and only 2 and 3 are done.
+ */
+late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 0000000..d05be30
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,518 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1	/* threshold value OR-ed into MCi_CTL2 by cmci_discover() */
+#define CMCI_POLL_INTERVAL (30 * HZ)	/* timer interval returned once a storm subsides */
+#define CMCI_STORM_INTERVAL (HZ)	/* storm detection window and storm poll interval */
+#define CMCI_STORM_THRESHOLD 15	/* more CMCIs than this per window => storm */
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+ CMCI_STORM_NONE,
+ CMCI_STORM_ACTIVE,
+ CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
+
+static int cmci_supported(int *banks)
+{
+	u64 mcg_cap;
+
+	/* Honour the cmci_disabled and ignore_ce configuration flags. */
+	if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
+		return 0;
+
+	/*
+	 * The vendor test is not strictly needed, but initialization is
+	 * vendor keyed and this keeps every other backdoor shut.
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+	    !boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
+		return 0;
+
+	rdmsrl(MSR_IA32_MCG_CAP, mcg_cap);
+	*banks = min_t(unsigned, MAX_NR_BANKS, mcg_cap & 0xff);
+	return (mcg_cap & MCG_CMCI_P) ? 1 : 0;
+}
+
+/*
+ * Report whether LMCE may be used: not disabled via mca_cfg.lmce_disabled,
+ * advertised in MCG_CAP and opted in through IA32_FEATURE_CONTROL.
+ */
+static bool lmce_supported(void)
+{
+	const u64 cap_bits = MCG_SER_P | MCG_LMCE_P;
+	const u64 fc_bits = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE;
+	u64 msr;
+
+	if (mca_cfg.lmce_disabled)
+		return false;
+
+	/*
+	 * LMCE depends on recovery support in the processor, so MCG_CAP has
+	 * to advertise both MCG_SER_P and MCG_LMCE_P.
+	 */
+	rdmsrl(MSR_IA32_MCG_CAP, msr);
+	if ((msr & cap_bits) != cap_bits)
+		return false;
+
+	/*
+	 * BIOS should indicate support for LMCE by setting bit 20 in
+	 * IA32_FEATURE_CONTROL, without which touching MCG_EXT_CTL will
+	 * generate a #GP fault. The LOCKED bit is required as well.
+	 */
+	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+	return (msr & fc_bits) == fc_bits;
+}
+
+bool mce_intel_cmci_poll(void)
+{
+	bool logged;
+
+	/* Nothing to poll for unless this CPU is in a CMCI storm state. */
+	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+		return false;
+
+	logged = machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+	/* An error logged by this poll rearms the backoff counter. */
+	if (!logged)
+		this_cpu_dec(cmci_backoff_cnt);
+	else
+		this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+	return true;
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+	/* Take @cpu out of the storm count if it was counted, then reset it. */
+	if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+		atomic_dec(&cmci_storm_on_cpus);
+	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+/* Set or clear the CMCI enable bit in MCi_CTL2 of every bank this CPU owns. */
+static void cmci_toggle_interrupt_mode(bool on)
+{
+	unsigned long *banks_owned;
+	unsigned long irqflags;
+	u64 ctl2;
+	int b;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, irqflags);
+
+	banks_owned = this_cpu_ptr(mce_banks_owned);
+	for_each_set_bit(b, banks_owned, MAX_NR_BANKS) {
+		rdmsrl(MSR_IA32_MCx_CTL2(b), ctl2);
+		ctl2 = on ? (ctl2 | MCI_CTL2_CMCI_EN) :
+			    (ctl2 & ~MCI_CTL2_CMCI_EN);
+		wrmsrl(MSR_IA32_MCx_CTL2(b), ctl2);
+	}
+
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, irqflags);
+}
+
+/* Choose the next poll interval from this CPU's CMCI storm state. */
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
+{
+	/* Storm active and still backing off: keep polling at the storm rate. */
+	if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+	    (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+		mce_notify_irq();
+		return CMCI_STORM_INTERVAL;
+	}
+
+	switch (__this_cpu_read(cmci_storm_state)) {
+	case CMCI_STORM_ACTIVE:
+		/*
+		 * We switch back to interrupt mode once the poll timer has
+		 * silenced itself. That means no events recorded and the timer
+		 * interval is back to our poll interval.
+		 */
+		__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+		if (!atomic_sub_return(1, &cmci_storm_on_cpus))
+			pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
+		/* FALLTHROUGH */
+
+	case CMCI_STORM_SUBSIDED:
+		/*
+		 * We wait for all CPUs to go back to SUBSIDED state. When that
+		 * happens we switch back to interrupt mode.
+		 */
+		if (!atomic_read(&cmci_storm_on_cpus)) {
+			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+			cmci_toggle_interrupt_mode(true);
+			cmci_recheck();
+		}
+		return CMCI_POLL_INTERVAL;
+	default:
+		/* We have shiny weather. Let the poll do whatever it thinks. */
+		return interval;
+	}
+}
+
+static bool cmci_storm_detect(void)
+{
+	unsigned long stamp, now;
+	unsigned int nr;
+	int on_cpus;
+
+	if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+		return true;
+	nr = __this_cpu_read(cmci_storm_cnt);
+	stamp = __this_cpu_read(cmci_time_stamp);
+	now = jiffies;
+	/* Past the current window: start counting afresh from @now. */
+	if (!time_before_eq(now, stamp + CMCI_STORM_INTERVAL)) {
+		nr = 0;
+		__this_cpu_write(cmci_time_stamp, now);
+	}
+	nr++;
+	__this_cpu_write(cmci_storm_cnt, nr);
+	if (nr <= CMCI_STORM_THRESHOLD)
+		return false;
+
+	/* Too many CMCIs within one window: fall back to polling. */
+	cmci_toggle_interrupt_mode(false);
+	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+	on_cpus = atomic_add_return(1, &cmci_storm_on_cpus);
+	mce_timer_kick(CMCI_STORM_INTERVAL);
+	this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+	if (on_cpus == 1)
+		pr_notice("CMCI storm detected: switching to poll mode\n");
+	return true;
+}
+
+/*
+ * The CMCI handler, invoked for every event. As long as storm detection
+ * does not report a storm, call the poller directly to log whatever is
+ * pending.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+	bool storm = cmci_storm_detect();
+
+	if (!storm)
+		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for the first @banks MCE
+ * banks on this CPU, under cmci_discover_lock. Use the algorithm recommended
+ * in the SDM to discover shared banks.
+ */
+static void cmci_discover(int banks)
+{
+	unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+	unsigned long flags;
+	int i;
+	int bios_wrong_thresh = 0;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	for (i = 0; i < banks; i++) {
+		u64 val;
+		int bios_zero_thresh = 0;
+
+		if (test_bit(i, owned))
+			continue;
+
+		/* Skip banks in firmware first mode */
+		if (test_bit(i, mce_banks_ce_disabled))
+			continue;
+
+		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
+		/* Already owned by someone else? */
+		if (val & MCI_CTL2_CMCI_EN) {
+			clear_bit(i, owned);
+			__clear_bit(i, this_cpu_ptr(mce_poll_banks));
+			continue;
+		}
+
+		/* Unless told to keep the BIOS value, program our own threshold. */
+		if (!mca_cfg.bios_cmci_threshold) {
+			val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+			val |= CMCI_THRESHOLD;
+		} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+			/*
+			 * If bios_cmci_threshold boot option was specified
+			 * but the threshold is zero, we'll try to initialize
+			 * it to 1.
+			 */
+			bios_zero_thresh = 1;
+			val |= CMCI_THRESHOLD;
+		}
+
+		val |= MCI_CTL2_CMCI_EN;
+		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
+		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
+		/* Did the enable bit stick? -- the bank supports CMCI */
+		if (val & MCI_CTL2_CMCI_EN) {
+			set_bit(i, owned);
+			__clear_bit(i, this_cpu_ptr(mce_poll_banks));
+			/*
+			 * We are able to set thresholds for some banks that
+			 * had a threshold of 0. This means the BIOS has not
+			 * set the thresholds properly or does not work with
+			 * this boot option. Note down now and report later.
+			 */
+			if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+			    (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+				bios_wrong_thresh = 1;
+		} else {
+			/* No CMCI for this bank: warn unless it is still polled. */
+			WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
+		}
+	}
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+	if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+		pr_info_once(
+			"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+		pr_info_once(
+			"bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+	}
+}
+
+/*
+ * Poll all CMCI owned banks once, in case an event was missed during init.
+ */
+void cmci_recheck(void)
+{
+	unsigned long flags;
+	int banks;
+
+	if (!mce_available(raw_cpu_ptr(&cpu_info)))
+		return;
+	if (!cmci_supported(&banks))
+		return;
+	local_irq_save(flags);
+	machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+	local_irq_restore(flags);
+}
+
+/* Caller must hold cmci_discover_lock. Gives up @bank if this CPU owns it. */
+static void __cmci_disable_bank(int bank)
+{
+	unsigned long *owned = this_cpu_ptr(mce_banks_owned);
+	u64 ctl2;
+
+	if (test_bit(bank, owned)) {
+		rdmsrl(MSR_IA32_MCx_CTL2(bank), ctl2);
+		wrmsrl(MSR_IA32_MCx_CTL2(bank), ctl2 & ~MCI_CTL2_CMCI_EN);
+		__clear_bit(bank, owned);
+	}
+}
+
+/*
+ * When this CPU goes down, turn CMCI off for each bank it owns so that
+ * other CPUs can claim those banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+	unsigned long flags;
+	int nr_banks, bank;
+
+	if (!cmci_supported(&nr_banks))
+		return;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	for (bank = 0; bank < nr_banks; bank++)
+		__cmci_disable_bank(bank);
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_rediscover_work_func(void *arg)
+{
+	int nr_banks;
+
+	/* Support and bank count are rechecked here: CPUs may differ. */
+	if (cmci_supported(&nr_banks))
+		cmci_discover(nr_banks);
+}
+
+/*
+ * After a CPU went down: run bank discovery again via on_each_cpu(),
+ * waiting for the calls to finish.
+ */
+void cmci_rediscover(void)
+{
+	int nr_banks;
+
+	if (cmci_supported(&nr_banks))
+		on_each_cpu(cmci_rediscover_work_func, NULL, 1);
+}
+
+/* A CPU down attempt failed: rediscover (re-enable) CMCI on this CPU. */
+void cmci_reenable(void)
+{
+	int nr_banks;
+
+	if (!cmci_supported(&nr_banks))
+		return;
+	cmci_discover(nr_banks);
+}
+
+/* Locked wrapper around __cmci_disable_bank() for a single @bank. */
+void cmci_disable_bank(int bank)
+{
+	unsigned long irqflags;
+	int nr_banks;
+
+	if (cmci_supported(&nr_banks)) {
+		raw_spin_lock_irqsave(&cmci_discover_lock, irqflags);
+		__cmci_disable_bank(bank);
+		raw_spin_unlock_irqrestore(&cmci_discover_lock, irqflags);
+	}
+}
+
+/* Install the CMCI handler, claim banks and program the CMCI LVT entry. */
+static void intel_init_cmci(void)
+{
+	int banks;
+
+	if (!cmci_supported(&banks))
+		return;
+	mce_threshold_vector = intel_threshold_interrupt;
+	cmci_discover(banks);
+	/*
+	 * For CPU #0 this runs with still disabled APIC, but that's
+	 * ok because only the vector is set up. We still do another
+	 * check for the banks later for CPU #0 just to make sure
+	 * to not miss any events.
+	 */
+	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+	cmci_recheck();
+}
+
+/* Turn on LMCE in MCG_EXT_CTL when supported and not already enabled. */
+static void intel_init_lmce(void)
+{
+	u64 ext_ctl;
+
+	if (!lmce_supported())
+		return;
+	rdmsrl(MSR_IA32_MCG_EXT_CTL, ext_ctl);
+	if (ext_ctl & MCG_EXT_CTL_LMCE_EN)
+		return;
+	wrmsrl(MSR_IA32_MCG_EXT_CTL, ext_ctl | MCG_EXT_CTL_LMCE_EN);
+}
+
+/* Clear the LMCE enable bit in MCG_EXT_CTL, if LMCE is supported. */
+static void intel_clear_lmce(void)
+{
+	u64 ext_ctl;
+
+	if (lmce_supported()) {
+		rdmsrl(MSR_IA32_MCG_EXT_CTL, ext_ctl);
+		ext_ctl &= ~MCG_EXT_CTL_LMCE_EN;
+		wrmsrl(MSR_IA32_MCG_EXT_CTL, ext_ctl);
+	}
+}
+
+/* On the listed models, try to enable PPIN and set X86_FEATURE_INTEL_PPIN. */
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+	unsigned long long val;
+
+	/*
+	 * Even if testing the presence of the MSR would be enough, we don't
+	 * want to risk the situation where other models reuse this MSR for
+	 * other purposes.
+	 */
+	switch (c->x86_model) {
+	case INTEL_FAM6_IVYBRIDGE_X:
+	case INTEL_FAM6_HASWELL_X:
+	case INTEL_FAM6_BROADWELL_XEON_D:
+	case INTEL_FAM6_BROADWELL_X:
+	case INTEL_FAM6_SKYLAKE_X:
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
+		if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+			return;
+
+		if ((val & 3UL) == 1UL) {
+			/* Bit 0 set without bit 1: disabled and, going by the check below, locked */
+			return;
+		}
+
+		/* If PPIN is disabled, but not locked, try to enable: */
+		if (!(val & 3UL)) {
+			wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
+			rdmsrl_safe(MSR_PPIN_CTL, &val);
+		}
+
+		if ((val & 3UL) == 2UL)
+			set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+	}
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+	intel_init_thermal(c);	/* thermal LVT entry and thermal interrupt MSRs */
+	intel_init_cmci();	/* CMCI handler, bank discovery, CMCI LVT entry */
+	intel_init_lmce();	/* set the LMCE enable bit if supported */
+	intel_ppin_init(c);	/* PPIN on the models it lists */
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+	intel_clear_lmce();	/* @c is unused here: only LMCE is undone */
+}
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
new file mode 100644
index 0000000..5cddf83
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * P5 specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/smp.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+/* By default disabled */
+int mce_p5_enabled __read_mostly;
+
+/* Machine check handler for Pentium class Intel CPUs: log and taint. */
+static void pentium_machine_check(struct pt_regs *regs, long error_code)
+{
+	u32 addr_lo, type_lo, hi;
+	int cpu;
+
+	ist_enter(regs);
+
+	rdmsr(MSR_IA32_P5_MC_ADDR, addr_lo, hi);
+	rdmsr(MSR_IA32_P5_MC_TYPE, type_lo, hi);
+	cpu = smp_processor_id();
+
+	pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+		 cpu, addr_lo, type_lo);
+	/* Bit 5 of the type register is reported as a possible thermal failure. */
+	if (type_lo & (1<<5))
+		pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", cpu);
+
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+	ist_exit(regs);
+}
+
+/* Set up machine check reporting for processors with Intel style MCE: */
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+{
+	u32 lo, hi;
+
+	/*
+	 * P5 reporting defaults to off as it is often misconnected, so it
+	 * has to be enabled explicitly (mce_p5_enabled), and the CPU has to
+	 * support MCE in the first place.
+	 */
+	if (!mce_p5_enabled || !cpu_has(c, X86_FEATURE_MCE))
+		return;
+
+	/* The vector pointer must be visible before we enable MCEs: */
+	machine_check_vector = pentium_machine_check;
+	wmb();
+
+	/* Read registers before enabling: */
+	rdmsr(MSR_IA32_P5_MC_ADDR, lo, hi);
+	rdmsr(MSR_IA32_P5_MC_TYPE, lo, hi);
+	pr_info("Intel old style machine check architecture supported.\n");
+
+	/* Enable MCE: */
+	cr4_set_bits(X86_CR4_MCE);
+	pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+		smp_processor_id());
+}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 0000000..2da67b7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,520 @@
+/*
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog is rate limited).
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ * Inspired by Ross Biro's and Al Borchers' counter code.
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/processor.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL (300 * HZ)
+
+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
+/*
+ * Current thermal event state:
+ */
+struct _thermal_state {
+	bool new_event;	/* condition as passed to the latest therm_throt_process() */
+	int event;
+	u64 next_check;	/* jiffies64 value compared against for rate limiting */
+	unsigned long count;	/* incremented on every call with the condition set */
+	unsigned long last_count;	/* @count as of the last pass through the rate limit */
+};
+
+struct thermal_state {
+	struct _thermal_state core_throttle;	/* CORE_LEVEL, THERMAL_THROTTLING_EVENT */
+	struct _thermal_state core_power_limit;	/* CORE_LEVEL, POWER_LIMIT_EVENT */
+	struct _thermal_state package_throttle;	/* PACKAGE_LEVEL, THERMAL_THROTTLING_EVENT */
+	struct _thermal_state package_power_limit;	/* PACKAGE_LEVEL, POWER_LIMIT_EVENT */
+	struct _thermal_state core_thresh0;	/* thresh_event_valid(): core, event 0 */
+	struct _thermal_state core_thresh1;	/* thresh_event_valid(): core, event 1 */
+	struct _thermal_state pkg_thresh0;	/* thresh_event_valid(): package, event 0 */
+	struct _thermal_state pkg_thresh1;	/* thresh_event_valid(): package, event 1 */
+};
+
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+EXPORT_SYMBOL(platform_thermal_notify);
+
+/* Callback to handle core package threshold_interrupts */
+int (*platform_thermal_package_notify)(__u64 msr_val);
+EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
+
+/* Callback reporting rate control support: returns true if the
+ * package notify callback implements its own rate control */
+bool (*platform_thermal_package_rate_control)(void);
+EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
+
+
+static DEFINE_PER_CPU(struct thermal_state, thermal_state);
+
+static atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+static u32 lvtthmr_init __read_mostly;
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_device_one_ro(_name) \
+ static DEVICE_ATTR(_name, 0444, \
+ therm_throt_device_show_##_name, \
+ NULL) \
+
+#define define_therm_throt_device_show_func(event, name) \
+ \
+static ssize_t therm_throt_device_show_##event##_##name( \
+ struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ unsigned int cpu = dev->id; \
+ ssize_t ret; \
+ \
+ preempt_disable(); /* CPU hotplug */ \
+ if (cpu_online(cpu)) { \
+ ret = sprintf(buf, "%lu\n", \
+ per_cpu(thermal_state, cpu).event.name); \
+ } else \
+ ret = 0; \
+ preempt_enable(); \
+ \
+ return ret; \
+}
+
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
+
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
+
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
+
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+	&dev_attr_core_throttle_count.attr,	/* the other counts are added in thermal_throttle_add_dev() */
+	NULL
+};
+
+static const struct attribute_group thermal_attr_group = {
+ .attrs = thermal_throttle_attrs,
+ .name = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
+
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
+/**
+ * therm_throt_process - Process thermal throttling event from interrupt
+ * @new_event: Whether the condition is current or not (boolean), since the
+ *	thermal interrupt normally gets called both when the thermal
+ *	event begins and once the event has ended.
+ * @event: THERMAL_THROTTLING_EVENT or POWER_LIMIT_EVENT
+ * @level: CORE_LEVEL or PACKAGE_LEVEL
+ *
+ * Called from the thermal interrupt handler. Counts the event and takes
+ * care of rate limiting and printing messages to the syslog.
+ */
+static void therm_throt_process(bool new_event, int event, int level)
+{
+	unsigned int this_cpu = smp_processor_id();
+	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+	const char *scope = (level == CORE_LEVEL) ? "Core" : "Package";
+	const bool throttle = (event == THERMAL_THROTTLING_EVENT);
+	struct _thermal_state *state;
+	bool old_event;
+	u64 now = get_jiffies_64();
+
+	/* Anything but the two known events is silently ignored. */
+	if (!throttle && event != POWER_LIMIT_EVENT)
+		return;
+
+	switch (level) {
+	case CORE_LEVEL:
+		state = throttle ? &pstate->core_throttle :
+				   &pstate->core_power_limit;
+		break;
+	case PACKAGE_LEVEL:
+		state = throttle ? &pstate->package_throttle :
+				   &pstate->package_power_limit;
+		break;
+	default:
+		return;
+	}
+
+	old_event = state->new_event;
+	state->new_event = new_event;
+
+	if (new_event)
+		state->count++;
+
+	/* Rate limit: inside the interval, stay quiet if the count moved. */
+	if (time_before64(now, state->next_check) &&
+	    state->count != state->last_count)
+		return;
+
+	state->next_check = now + CHECK_INTERVAL;
+	state->last_count = state->count;
+
+	/* Only throttling events are logged; power limit events just count. */
+	if (!throttle)
+		return;
+
+	if (new_event) {
+		/* we just entered the thermal event */
+		pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+			this_cpu, scope, state->count);
+	} else if (old_event) {
+		/* the condition was set on the previous call and is clear now */
+		pr_info("CPU%d: %s temperature/speed normal\n",
+			this_cpu, scope);
+	}
+}
+
+/* Rate limit threshold events: returns 1 at most once per CHECK_INTERVAL. */
+static int thresh_event_valid(int level, int event)
+{
+	unsigned int this_cpu = smp_processor_id();
+	struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+	struct _thermal_state *state;
+	u64 now = get_jiffies_64();
+
+	if (level == PACKAGE_LEVEL)
+		state = event ? &pstate->pkg_thresh1 : &pstate->pkg_thresh0;
+	else
+		state = event ? &pstate->core_thresh1 : &pstate->core_thresh0;
+
+	/* Still inside the interval opened by the previous valid event. */
+	if (time_before64(now, state->next_check))
+		return 0;
+
+	state->next_check = now + CHECK_INTERVAL;
+
+	return 1;
+}
+
+/* Set by the "int_pln_enable" boot parameter; gates the PLN enable bits. */
+static bool int_pln_enable;
+static int __init int_pln_enable_setup(char *s)
+{
+	int_pln_enable = true;
+	return 1;
+}
+__setup("int_pln_enable", int_pln_enable_setup);
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device: */
+static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
+{
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_power_limit_count.attr,
+ thermal_attr_group.name);
+ }
+
+ return err;
+}
+
+static void thermal_throttle_remove_dev(struct device *dev)
+{
+	sysfs_remove_group(&dev->kobj, &thermal_attr_group);	/* undoes thermal_throttle_add_dev() */
+}
+
+/*
+ * CPU hotplug online callback: add the thermal_throttle files of @cpu.
+ */
+static int thermal_throttle_online(unsigned int cpu)
+{
+	return thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
+}
+
+/* CPU hotplug offline callback: drop the thermal_throttle group of @cpu. */
+static int thermal_throttle_offline(unsigned int cpu)
+{
+	thermal_throttle_remove_dev(get_cpu_device(cpu));
+
+	return 0;
+}
+
+static __init int thermal_throttle_init_device(void)
+{
+	int state;
+
+	if (!atomic_read(&therm_throt_en))
+		return 0;
+	/* Any non-negative return value is folded to 0 for the initcall. */
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+				  thermal_throttle_online,
+				  thermal_throttle_offline);
+	return state < 0 ? state : 0;
+}
+device_initcall(thermal_throttle_init_device);
+
+#endif /* CONFIG_SYSFS */
+
+/*
+ * Forward package threshold log bits in @msr_val to the registered
+ * platform_thermal_package_notify callback, rate limited if needed.
+ */
+static void notify_package_thresholds(__u64 msr_val)
+{
+	bool thres0, thres1;
+
+	if (!platform_thermal_package_notify)
+		return;
+
+	/* lower (0) and higher (1) threshold check */
+	thres0 = !!(msr_val & THERM_LOG_THRESHOLD0);
+	thres1 = !!(msr_val & THERM_LOG_THRESHOLD1);
+
+	if (!(thres0 || thres1))
+		return;
+
+	/* Rate control is implemented in callback */
+	if (platform_thermal_package_rate_control &&
+	    platform_thermal_package_rate_control()) {
+		platform_thermal_package_notify(msr_val);
+		return;
+	}
+
+	/* lower threshold reached */
+	if (thres0 && thresh_event_valid(PACKAGE_LEVEL, 0))
+		platform_thermal_package_notify(msr_val);
+	/* higher threshold reached */
+	if (thres1 && thresh_event_valid(PACKAGE_LEVEL, 1))
+		platform_thermal_package_notify(msr_val);
+}
+
+static void notify_thresholds(__u64 msr_val)
+{
+	/* Without a registered core threshold handler there is nothing to do. */
+	if (!platform_thermal_notify)
+		return;
+
+	/* lower threshold reached */
+	if (msr_val & THERM_LOG_THRESHOLD0) {
+		if (thresh_event_valid(CORE_LEVEL, 0))
+			platform_thermal_notify(msr_val);
+	}
+	/* higher threshold reached */
+	if (msr_val & THERM_LOG_THRESHOLD1) {
+		if (thresh_event_valid(CORE_LEVEL, 1))
+			platform_thermal_notify(msr_val);
+	}
+}
+
+/* Thermal transition interrupt handler */
+static void intel_thermal_interrupt(void)
+{
+	__u64 status;
+	bool pln;
+
+	if (static_cpu_has(X86_FEATURE_HWP))
+		wrmsrl_safe(MSR_HWP_STATUS, 0);
+
+	rdmsrl(MSR_IA32_THERM_STATUS, status);
+
+	/* Power limit events are only processed with PLN and int_pln_enable. */
+	pln = this_cpu_has(X86_FEATURE_PLN) && int_pln_enable;
+
+	/* Check for violation of core thermal thresholds */
+	notify_thresholds(status);
+
+	therm_throt_process(status & THERM_STATUS_PROCHOT,
+			    THERMAL_THROTTLING_EVENT, CORE_LEVEL);
+	if (pln)
+		therm_throt_process(status & THERM_STATUS_POWER_LIMIT,
+				    POWER_LIMIT_EVENT, CORE_LEVEL);
+
+	if (!this_cpu_has(X86_FEATURE_PTS))
+		return;
+
+	rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, status);
+	/* check violations of package thermal thresholds */
+	notify_package_thresholds(status);
+
+	therm_throt_process(status & PACKAGE_THERM_STATUS_PROCHOT,
+			    THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL);
+	if (pln)
+		therm_throt_process(status & PACKAGE_THERM_STATUS_POWER_LIMIT,
+				    POWER_LIMIT_EVENT, PACKAGE_LEVEL);
+}
+
+/* Initial smp_thermal_vector target, used until a real handler is installed. */
+static void unexpected_thermal_interrupt(void)
+{
+	pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id());
+}
+
+static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
+
+asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r)
+{
+	entering_irq();
+	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+	inc_irq_stat(irq_thermal_count);
+	smp_thermal_vector();	/* intel_thermal_interrupt() once intel_init_thermal() ran */
+	trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+	exiting_ack_irq();	/* the ack happens only after the handler has run */
+}
+
+/*
+ * Thermal monitoring depends on APIC (checked on the boot CPU), ACPI and
+ * clock modulation (checked on @c). Returns 1 when all are present, else 0.
+ */
+static int intel_thermal_supported(struct cpuinfo_x86 *c)
+{
+	return boot_cpu_has(X86_FEATURE_APIC) &&
+	       cpu_has(c, X86_FEATURE_ACPI) && cpu_has(c, X86_FEATURE_ACC);
+}
+
+void __init mcheck_intel_therm_init(void)
+{
+	/*
+	 * Boot CPU only: remember the thermal LVT value found at init so the
+	 * entry the BIOS programmed can be restored on the APs later on.
+	 */
+	if (!intel_thermal_supported(&boot_cpu_data))
+		return;
+	lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
+/* Program the thermal LVT entry and thermal interrupt MSRs of this CPU. */
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+	unsigned int cpu = smp_processor_id();
+	int tm2 = 0;	/* only selects the TM1/TM2 string in the log message */
+	u32 l, h;
+
+	if (!intel_thermal_supported(c))
+		return;
+
+	/*
+	 * First check if its enabled already, in which case there might
+	 * be some SMM goo which handles it, so we can't even put a handler
+	 * since it might be delivered via SMI already:
+	 */
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	/* From here on h is the LVT value saved on the BSP, not the MSR high word. */
+	h = lvtthmr_init;
+	/*
+	 * The initial value of thermal LVT entries on all APs always reads
+	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
+	 * sequence to them and LVT registers are reset to 0s except for
+	 * the mask bits which are set to 1s when APs receive INIT IPI.
+	 * If BIOS takes over the thermal interrupt and sets its interrupt
+	 * delivery mode to SMI (not fixed), it restores the value that the
+	 * BIOS has programmed on AP based on BSP's info we saved since BIOS
+	 * is always setting the same value for all threads/cores.
+	 */
+	if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+		apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
+		if (system_state == SYSTEM_BOOTING)
+			pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
+		return;
+	}
+
+	/* early Pentium M models use different method for enabling TM2 */
+	if (cpu_has(c, X86_FEATURE_TM2)) {
+		if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
+			rdmsr(MSR_THERM2_CTL, l, h);
+			if (l & MSR_THERM2_CTL_TM_SELECT)
+				tm2 = 1;
+		} else if (l & MSR_IA32_MISC_ENABLE_TM2)
+			tm2 = 1;
+	}
+
+	/* We'll mask the thermal vector in the lapic till we're ready: */
+	h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+	apic_write(APIC_LVTTHMR, h);
+
+	/* Set the LOW/HIGH interrupt enable bits; the PLN bit follows int_pln_enable. */
+	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+	if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+		wrmsr(MSR_IA32_THERM_INTERRUPT,
+			(l | (THERM_INT_LOW_ENABLE
+			| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
+	else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+		wrmsr(MSR_IA32_THERM_INTERRUPT,
+			l | (THERM_INT_LOW_ENABLE
+			| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+	else
+		wrmsr(MSR_IA32_THERM_INTERRUPT,
+			l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+	if (cpu_has(c, X86_FEATURE_PTS)) {
+		rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+		if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+				(l | (PACKAGE_THERM_INT_LOW_ENABLE
+				| PACKAGE_THERM_INT_HIGH_ENABLE))
+				& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
+		else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+				l | (PACKAGE_THERM_INT_LOW_ENABLE
+				| PACKAGE_THERM_INT_HIGH_ENABLE
+				| PACKAGE_THERM_INT_PLN_ENABLE), h);
+		else
+			wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+				l | (PACKAGE_THERM_INT_LOW_ENABLE
+				| PACKAGE_THERM_INT_HIGH_ENABLE), h);
+	}
+
+	smp_thermal_vector = intel_thermal_interrupt;
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
+
+	/* Unmask the thermal vector: */
+	l = apic_read(APIC_LVTTHMR);
+	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+	pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+		     tm2 ? "TM2" : "TM1");
+
+	/* enable thermal throttle processing */
+	atomic_set(&therm_throt_en, 1);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 0000000..2b584b3
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+
+/* Initial mce_threshold_vector target: reports the interrupt as unexpected. */
+static void default_threshold_interrupt(void)
+{
+	pr_err("Unexpected threshold interrupt at vector %x\n", THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
+{
+	entering_irq();
+	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();	/* whichever handler is currently installed */
+	trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+	exiting_ack_irq();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
new file mode 100644
index 0000000..3b45b27
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IDT Winchip specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+/*
+ * Machine check handler for WinChip C6.
+ *
+ * Logs the exception and taints the kernel with TAINT_MACHINE_CHECK. @regs is
+ * only handed to ist_enter()/ist_exit(); @error_code is not looked at.
+ */
+static void winchip_machine_check(struct pt_regs *regs, long error_code)
+{
+	ist_enter(regs);
+
+	pr_emerg("CPU0: Machine Check Exception.\n");
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+	ist_exit(regs);
+}
+
+/*
+ * Set up machine check reporting on the Winchip C6 series: install the
+ * handler, reprogram MSR_IDT_FCR1 and set X86_CR4_MCE. @c is not used.
+ */
+void winchip_mcheck_init(struct cpuinfo_x86 *c)
+{
+	u32 fcr_lo, fcr_hi;
+
+	machine_check_vector = winchip_machine_check;
+	/* Make sure the vector pointer is visible before we enable MCEs: */
+	wmb();
+
+	rdmsr(MSR_IDT_FCR1, fcr_lo, fcr_hi);
+	/*
+	 * Set bit 2 to enable EIERRINT (int 18 MCE) and clear bit 4 to enable
+	 * MCE.
+	 */
+	fcr_lo = (fcr_lo | (1<<2)) & ~(1<<4);
+	wrmsr(MSR_IDT_FCR1, fcr_lo, fcr_hi);
+
+	cr4_set_bits(X86_CR4_MCE);
+
+	pr_info("Winchip machine check reporting enabled on CPU#0.\n");
+}
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 0000000..ba12e8a
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,4 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 0000000..07b5fc0
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,818 @@
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows upgrading the microcode on AMD family 10h
+ * and later CPUs.
+ *
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
+ *
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ *
+ * Licensed under the terms of the GNU General Public
+ * License version 2. See file COPYING for details.
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include <asm/microcode_amd.h>
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
+ *
+ * @cpuid_1_eax is the CPU signature that is looked up in the equivalence
+ * table, @psize is the size of the patch @mc points to, and @data/@size
+ * describe the container the patch was found in (its start address and the
+ * number of bytes scanned in it).
+ */
+struct cont_desc {
+ struct microcode_amd *mc;
+ u32 cpuid_1_eax;
+ u32 psize;
+ u8 *data;
+ size_t size;
+};
+
+static u32 ucode_new_rev;
+static u8 amd_ucode_patch[PATCH_MAX_SIZE];
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+
+/*
+ * Look up the CPU signature @sig in @equiv_table.
+ *
+ * The table is walked until an entry with a zero installed_cpu field is
+ * reached. Returns the equiv_cpu value of the matching entry, or 0 if
+ * @equiv_table is NULL or holds no entry for @sig.
+ */
+static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig)
+{
+	struct equiv_cpu_entry *entry = equiv_table;
+
+	if (!entry)
+		return 0;
+
+	while (entry->installed_cpu) {
+		if (entry->installed_cpu == sig)
+			return entry->equiv_cpu;
+		entry++;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse the container at @ucode, of which at most @size bytes are available.
+ * The ucode blob can consist of multiple containers glued together.
+ *
+ * If the container carries a patch whose equivalence ID matches the CPU
+ * signature in @desc->cpuid_1_eax, @desc is filled in (->mc, ->psize, ->data,
+ * ->size) and 0 is returned, meaning @ucode already points to the proper
+ * container. Otherwise the amount of bytes to skip in order to reach the next
+ * candidate container is returned; that is CONTAINER_HDR_SZ when @ucode does
+ * not start with a usable equivalence table header.
+ *
+ * The section lengths read from the blob are checked against the remaining
+ * buffer so that a truncated or corrupted section ends the scan instead of
+ * moving the read position past the end of the buffer.
+ */
+static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc)
+{
+	struct equiv_cpu_entry *eq;
+	ssize_t orig_size = size;
+	u32 *hdr = (u32 *)ucode;
+	u16 eq_id;
+	u8 *buf;
+
+	/* The three header words read below must lie inside the buffer. */
+	if (size < CONTAINER_HDR_SZ)
+		return CONTAINER_HDR_SZ;
+
+	/* Am I looking at an equivalence table header? */
+	if (hdr[0] != UCODE_MAGIC ||
+	    hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE ||
+	    hdr[2] == 0)
+		return CONTAINER_HDR_SZ;
+
+	/* The equivalence table must not extend past the end of the buffer. */
+	if (hdr[2] > (size_t)(size - CONTAINER_HDR_SZ))
+		return CONTAINER_HDR_SZ;
+
+	buf = ucode;
+
+	eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
+
+	/* Find the equivalence ID of our CPU in this table: */
+	eq_id = find_equiv_id(eq, desc->cpuid_1_eax);
+
+	buf  += hdr[2] + CONTAINER_HDR_SZ;
+	size -= hdr[2] + CONTAINER_HDR_SZ;
+
+	/*
+	 * Scan through the rest of the container to find where it ends. We do
+	 * some basic sanity-checking too.
+	 */
+	while (size > 0) {
+		struct microcode_amd *mc;
+		u32 patch_size;
+
+		/* A patch section header must fit into what is left. */
+		if (size < SECTION_HDR_SIZE)
+			break;
+
+		hdr = (u32 *)buf;
+
+		if (hdr[0] != UCODE_UCODE_TYPE)
+			break;
+
+		/*
+		 * Sanity-check patch size: it must not exceed the save buffer,
+		 * must cover the patch header that is inspected below and must
+		 * fit into the bytes remaining after the section header.
+		 */
+		patch_size = hdr[1];
+		if (patch_size > PATCH_MAX_SIZE ||
+		    patch_size < sizeof(mc->hdr) ||
+		    patch_size > (size_t)(size - SECTION_HDR_SIZE))
+			break;
+
+		/* Skip patch section header: */
+		buf  += SECTION_HDR_SIZE;
+		size -= SECTION_HDR_SIZE;
+
+		mc = (struct microcode_amd *)buf;
+		if (eq_id == mc->hdr.processor_rev_id) {
+			desc->psize = patch_size;
+			desc->mc = mc;
+		}
+
+		buf  += patch_size;
+		size -= patch_size;
+	}
+
+	/*
+	 * If we have found a patch (desc->mc), it means we're looking at the
+	 * container which has a patch for this CPU so return 0 to mean, @ucode
+	 * already points to the proper container. Otherwise, we return the size
+	 * we scanned so that we can advance to the next container in the
+	 * buffer.
+	 */
+	if (desc->mc) {
+		desc->data = ucode;
+		desc->size = orig_size - size;
+
+		return 0;
+	}
+
+	return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ *
+ * On return, @desc->mc is set if a container with a patch matching
+ * @desc->cpuid_1_eax was found and left untouched otherwise.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+	ssize_t rem = size;
+
+	/*
+	 * parse_container() starts by reading a container header, so stop as
+	 * soon as fewer than CONTAINER_HDR_SZ bytes are left instead of
+	 * handing it an exhausted buffer.
+	 */
+	while (rem >= CONTAINER_HDR_SZ) {
+		ssize_t s = parse_container(ucode, rem, desc);
+		if (!s)
+			return;
+
+		ucode += s;
+		rem   -= s;
+	}
+}
+
+/*
+ * Write the address of the patch @mc to MSR_AMD64_PATCH_LOADER and read back
+ * MSR_AMD64_PATCH_LEVEL to check the result.
+ *
+ * Returns 0 if the patch level read back equals the patch's ID, -1 otherwise.
+ */
+static int __apply_microcode_amd(struct microcode_amd *mc)
+{
+	u32 cur_rev, unused;
+
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
+
+	/* verify patch application was successful */
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, cur_rev, unused);
+
+	return (cur_rev == mc->hdr.patch_id) ? 0 : -1;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ *
+ * Returns true only if a patch matching @cpuid_1_eax with an ID above the
+ * current patch level was found in @ucode and applied successfully. In that
+ * case ucode_new_rev is updated and, if @save_patch is set, the patch is
+ * copied into amd_ucode_patch. Returns false in every other case.
+ */
+static bool
+apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
+{
+	struct cont_desc desc = { .cpuid_1_eax = cpuid_1_eax };
+	u8 (*patch)[PATCH_MAX_SIZE];
+	struct microcode_amd *mc;
+	u32 cur_rev, unused, *new_rev;
+
+	/* On 32-bit the globals are reached through their physical address. */
+#ifdef CONFIG_X86_32
+	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+	patch	= (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+	new_rev = &ucode_new_rev;
+	patch	= &amd_ucode_patch;
+#endif
+
+	scan_containers(ucode, size, &desc);
+
+	mc = desc.mc;
+	if (!mc)
+		return false;
+
+	/* Nothing to do if the CPU already runs this or a newer level. */
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, cur_rev, unused);
+	if (cur_rev >= mc->hdr.patch_id)
+		return false;
+
+	if (__apply_microcode_amd(mc))
+		return false;
+
+	*new_rev = mc->hdr.patch_id;
+
+	if (save_patch)
+		memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE));
+
+	return true;
+}
+
+/*
+ * Look for built-in AMD microcode for @family and describe it in @cp.
+ *
+ * Families 0x15 and later use a per-family file name, older ones the legacy
+ * name. Always returns false when CONFIG_X86_64 is not set; otherwise returns
+ * what get_builtin_firmware() returns.
+ */
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+{
+#ifndef CONFIG_X86_64
+	return false;
+#else
+	char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+	if (family >= 0x15)
+		snprintf(fw_name, sizeof(fw_name),
+			 "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+	return get_builtin_firmware(cp, fw_name);
+#endif
+}
+
+/*
+ * Locate the microcode container for the CPU with signature @cpuid_1_eax:
+ * built-in firmware is tried first, then the initrd. The result is stored in
+ * @ret and the signature is recorded in the first ucode_cpu_info entry.
+ *
+ * With CONFIG_X86_32 the globals are accessed through __pa_nodebug() and the
+ * initrd lookup is told to use physical addresses.
+ */
+static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+{
+	const bool use_pa = IS_ENABLED(CONFIG_X86_32);
+	struct ucode_cpu_info *uci = ucode_cpu_info;
+	const char *path = ucode_path;
+	struct cpio_data cp;
+
+	if (use_pa) {
+		uci  = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+		path = (const char *)__pa_nodebug(ucode_path);
+	}
+
+	if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+		cp = find_microcode_in_initrd(path, use_pa);
+
+	/* Needed in load_microcode_amd() */
+	uci->cpu_sig.sig = cpuid_1_eax;
+
+	*ret = cp;
+}
+
+/*
+ * Early microcode load on the boot CPU: find the container and, if there is
+ * one, apply a matching patch and have it saved for later use.
+ */
+void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
+{
+	struct cpio_data cp = { };
+
+	__load_ucode_amd(cpuid_1_eax, &cp);
+
+	if (cp.data && cp.size)
+		apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
+}
+
+/*
+ * Early microcode load on a secondary CPU.
+ *
+ * The patch saved in amd_ucode_patch is tried first if ucode_new_rev is set
+ * and the patch ID is above this CPU's patch level. If that patch is not used
+ * or fails to apply, the container is looked up again and scanned without
+ * saving the patch.
+ */
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+	struct microcode_amd *mc = (struct microcode_amd *)amd_ucode_patch;
+	u32 *new_rev = &ucode_new_rev;
+	struct cpio_data cp;
+	u32 cur_rev, unused;
+
+	/* 32-bit accesses the saved patch through physical addresses. */
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		mc	= (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
+		new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+	}
+
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, cur_rev, unused);
+
+	/* Check whether we have saved a new patch already: */
+	if (*new_rev && cur_rev < mc->hdr.patch_id &&
+	    !__apply_microcode_amd(mc)) {
+		*new_rev = mc->hdr.patch_id;
+		return;
+	}
+
+	__load_ucode_amd(cpuid_1_eax, &cp);
+
+	if (cp.data && cp.size)
+		apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
+}
+
+static enum ucode_state
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
+
+/*
+ * Find the container matching @cpuid_1_eax in the initrd and hand it to
+ * load_microcode_amd() with saving enabled.
+ *
+ * Returns 0 on success, or -EINVAL if the initrd has no microcode file, no
+ * container has a matching patch, or loading returns a state above
+ * UCODE_UPDATED.
+ */
+int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
+{
+	struct cont_desc desc = { .cpuid_1_eax = cpuid_1_eax };
+	enum ucode_state state;
+	struct cpio_data cp;
+
+	cp = find_microcode_in_initrd(ucode_path, false);
+	if (!cp.data || !cp.size)
+		return -EINVAL;
+
+	scan_containers(cp.data, cp.size, &desc);
+	if (!desc.mc)
+		return -EINVAL;
+
+	state = load_microcode_amd(true, x86_family(cpuid_1_eax),
+				   desc.data, desc.size);
+
+	return (state > UCODE_UPDATED) ? -EINVAL : 0;
+}
+
+/*
+ * Re-apply the patch saved in amd_ucode_patch if its ID is above the current
+ * patch level; on success record and log the new level.
+ */
+void reload_ucode_amd(void)
+{
+	struct microcode_amd *mc = (struct microcode_amd *)amd_ucode_patch;
+	u32 cur_rev, unused;
+
+	rdmsr(MSR_AMD64_PATCH_LEVEL, cur_rev, unused);
+
+	if (cur_rev >= mc->hdr.patch_id)
+		return;
+
+	if (__apply_microcode_amd(mc))
+		return;
+
+	ucode_new_rev = mc->hdr.patch_id;
+	pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
+}
+/*
+ * Return the equivalence ID for @cpu, using the signature stored in its
+ * ucode_cpu_info entry and the installed equivalence table; 0 if none.
+ */
+static u16 __find_equiv_id(unsigned int cpu)
+{
+	u32 sig = ucode_cpu_info[cpu].cpu_sig.sig;
+
+	return find_equiv_id(equiv_cpu_table, sig);
+}
+
+/*
+ * Reverse lookup in the installed equivalence table: return the installed_cpu
+ * value of the first entry whose equivalence ID is @equiv_cpu, or 0 if there
+ * is none. The table is walked until an entry with equiv_cpu == 0.
+ *
+ * The equivalence table must have been installed; this BUG()s otherwise.
+ */
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+	struct equiv_cpu_entry *entry;
+
+	BUG_ON(!equiv_cpu_table);
+
+	for (entry = equiv_cpu_table; entry->equiv_cpu != 0; entry++) {
+		if (entry->equiv_cpu == equiv_cpu)
+			return entry->installed_cpu;
+	}
+
+	return 0;
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ *
+ * Return the cached patch whose equivalence ID is @equiv_cpu, or NULL if the
+ * cache holds none.
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+	struct ucode_patch *p;
+
+	list_for_each_entry(p, &microcode_cache, plist)
+		if (p->equiv_cpu == equiv_cpu)
+			return p;
+	return NULL;
+}
+
+/*
+ * Add @new_patch to the patch cache. The cache takes ownership of it.
+ *
+ * If a patch with the same equivalence ID is already cached, only one of the
+ * two survives: @new_patch replaces the cached one when its patch ID is
+ * higher, otherwise @new_patch is dropped. The losing patch and its data are
+ * freed. If no such patch is cached, @new_patch is appended.
+ */
+static void update_cache(struct ucode_patch *new_patch)
+{
+	struct ucode_patch *p;
+
+	list_for_each_entry(p, &microcode_cache, plist) {
+		if (p->equiv_cpu == new_patch->equiv_cpu) {
+			if (p->patch_id >= new_patch->patch_id) {
+				/* we already have the latest patch */
+				kfree(new_patch->data);
+				kfree(new_patch);
+				return;
+			}
+
+			list_replace(&p->plist, &new_patch->plist);
+			kfree(p->data);
+			kfree(p);
+			return;
+		}
+	}
+	/* no patch found, add it */
+	list_add_tail(&new_patch->plist, &microcode_cache);
+}
+
+/* Unlink and free every patch in the cache together with its data. */
+static void free_cache(void)
+{
+	struct ucode_patch *p, *tmp;
+
+	/* The _safe iterator is needed because entries are freed while walking. */
+	list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
+		__list_del(p->plist.prev, p->plist.next);
+		kfree(p->data);
+		kfree(p);
+	}
+}
+
+/*
+ * Return the cached patch for @cpu, or NULL if the CPU has no equivalence ID
+ * or no patch for that ID is cached.
+ */
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+	u16 equiv_id = __find_equiv_id(cpu);
+
+	return equiv_id ? cache_find_patch(equiv_id) : NULL;
+}
+
+/*
+ * Fill @csig with the CPUID(1).EAX signature and the microcode revision
+ * recorded in cpu_data(@cpu), and log the patch level. Always returns 0.
+ */
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+	struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];
+	struct ucode_patch *cached;
+
+	csig->sig = cpuid_eax(0x00000001);
+	csig->rev = cpu_data(cpu).microcode;
+
+	/*
+	 * a patch could have been loaded early, set uci->mc so that
+	 * mc_bp_resume() can call apply_microcode()
+	 */
+	cached = find_patch(cpu);
+	if (cached && cached->patch_id == csig->rev)
+		uci->mc = cached->data;
+
+	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
+
+	return 0;
+}
+
+/*
+ * Check @patch_size against the per-family maximum and against @size, the
+ * number of bytes available.
+ *
+ * Returns @patch_size if it exceeds neither limit, otherwise logs an error
+ * and returns 0.
+ */
+static unsigned int verify_patch_size(u8 family, u32 patch_size,
+				      unsigned int size)
+{
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+#define F16H_MPB_MAX_SIZE 3458
+#define F17H_MPB_MAX_SIZE 3200
+
+	/* Families without an entry of their own use the F1XH limit. */
+	u32 limit = F1XH_MPB_MAX_SIZE;
+
+	switch (family) {
+	case 0x14:
+		limit = F14H_MPB_MAX_SIZE;
+		break;
+	case 0x15:
+		limit = F15H_MPB_MAX_SIZE;
+		break;
+	case 0x16:
+		limit = F16H_MPB_MAX_SIZE;
+		break;
+	case 0x17:
+		limit = F17H_MPB_MAX_SIZE;
+		break;
+	default:
+		break;
+	}
+
+	if (patch_size <= size && patch_size <= limit)
+		return patch_size;
+
+	pr_err("patch size mismatch\n");
+	return 0;
+}
+
+/*
+ * Apply the cached patch for @cpu; must be called on that CPU.
+ *
+ * Returns:
+ *   UCODE_NFOUND  - no patch is cached for this CPU
+ *   UCODE_OK      - the CPU already runs this or a newer patch level
+ *   UCODE_UPDATED - the patch was applied
+ *   UCODE_ERROR   - applying the patch failed
+ *
+ * For UCODE_OK and UCODE_UPDATED the revision is recorded in the CPU's
+ * ucode_cpu_info and cpuinfo_x86, and in boot_cpu_data when @cpu is the BSP.
+ */
+static enum ucode_state apply_microcode_amd(int cpu)
+{
+	struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	struct microcode_amd *mc_amd;
+	struct ucode_patch *p;
+	enum ucode_state ret;
+	u32 rev, dummy;
+
+	BUG_ON(raw_smp_processor_id() != cpu);
+
+	p = find_patch(cpu);
+	if (!p)
+		return UCODE_NFOUND;
+
+	mc_amd  = p->data;
+	uci->mc = p->data;
+
+	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+	if (rev >= mc_amd->hdr.patch_id) {
+		/* no need to apply the patch */
+		ret = UCODE_OK;
+	} else {
+		if (__apply_microcode_amd(mc_amd)) {
+			pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+				cpu, mc_amd->hdr.patch_id);
+			return UCODE_ERROR;
+		}
+
+		rev = mc_amd->hdr.patch_id;
+		ret = UCODE_UPDATED;
+
+		pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
+	}
+
+	uci->cpu_sig.rev = rev;
+	c->microcode	 = rev;
+
+	/* Update boot_cpu_data's revision too, if we're on the BSP: */
+	if (c->cpu_index == boot_cpu_data.cpu_index)
+		boot_cpu_data.microcode = rev;
+
+	return ret;
+}
+
+/*
+ * Copy the equivalence table that follows the container header at @buf into
+ * a freshly vmalloc()ed equiv_cpu_table.
+ *
+ * Returns the number of bytes taken up by header plus table, -EINVAL if the
+ * header's type field is wrong or its size field is 0, or -ENOMEM if the
+ * allocation fails.
+ */
+static int install_equiv_cpu_table(const u8 *buf)
+{
+	const unsigned int *hdr = (const unsigned int *)buf;
+	unsigned int table_type = hdr[1];
+	unsigned int table_size = hdr[2];
+
+	if (table_type != UCODE_EQUIV_CPU_TABLE_TYPE || !table_size) {
+		pr_err("empty section/"
+		       "invalid type field in container file section header\n");
+		return -EINVAL;
+	}
+
+	equiv_cpu_table = vmalloc(table_size);
+	if (!equiv_cpu_table) {
+		pr_err("failed to allocate equivalent CPU table\n");
+		return -ENOMEM;
+	}
+
+	memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, table_size);
+
+	/* add header length */
+	return table_size + CONTAINER_HDR_SZ;
+}
+
+/* Release the equivalence table and reset the pointer to NULL. */
+static void free_equiv_cpu_table(void)
+{
+	vfree(equiv_cpu_table);
+	equiv_cpu_table = NULL;
+}
+
+/* Drop everything the driver has loaded: equivalence table and patch cache. */
+static void cleanup(void)
+{
+	free_equiv_cpu_table();
+	free_cache();
+}
+
+/*
+ * Look at the patch section starting at @fw and, if it is meant for @family
+ * and passes the checks, copy it into the patch cache. @leftover is the
+ * number of bytes available and is only used for the patch size check.
+ *
+ * We return the size of the section (section header plus patch) even if some
+ * of the checks failed so that the caller can skip over to the next patch. If
+ * we return a negative value, we signal a grave error like a memory
+ * allocation has failed and the driver cannot continue functioning normally.
+ * In such cases, the caller tears down everything used up so far and exits.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
+{
+	struct microcode_header_amd *mc_hdr;
+	struct ucode_patch *patch;
+	unsigned int patch_size, crnt_size;
+	u32 proc_fam;
+	u16 proc_id;
+
+	/* The patch size is the second 32-bit word of the section header. */
+	patch_size = *(u32 *)(fw + 4);
+	crnt_size  = patch_size + SECTION_HDR_SIZE;
+	mc_hdr	   = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+	proc_id	   = mc_hdr->processor_rev_id;
+
+	proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+	if (!proc_fam) {
+		pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+		return crnt_size;
+	}
+
+	/* check if patch is for the current family */
+	proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+	if (proc_fam != family)
+		return crnt_size;
+
+	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+		pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+			mc_hdr->patch_id);
+		return crnt_size;
+	}
+
+	if (!verify_patch_size(family, patch_size, leftover)) {
+		pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+		return crnt_size;
+	}
+
+	patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+	if (!patch) {
+		pr_err("Patch allocation failure.\n");
+		return -EINVAL;
+	}
+
+	patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL);
+	if (!patch->data) {
+		pr_err("Patch data allocation failure.\n");
+		kfree(patch);
+		return -EINVAL;
+	}
+
+	INIT_LIST_HEAD(&patch->plist);
+	patch->patch_id	 = mc_hdr->patch_id;
+	patch->equiv_cpu = proc_id;
+
+	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+		 __func__, patch->patch_id, proc_id);
+
+	/* ... and add to cache. */
+	update_cache(patch);
+
+	return crnt_size;
+}
+
+/*
+ * Parse the container at @data (@size bytes): install its equivalence table,
+ * then walk the patch sections behind it and add the ones for @family to the
+ * patch cache.
+ *
+ * Returns UCODE_OK if the whole container was walked and UCODE_ERROR if it is
+ * malformed or truncated or a patch could not be added. When the error is
+ * detected inside the section walk, the equivalence table and the patches
+ * cached so far are left in place for the caller to tear down.
+ */
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
+					     size_t size)
+{
+	enum ucode_state ret = UCODE_ERROR;
+	unsigned int leftover;
+	u8 *fw = (u8 *)data;
+	int crnt_size = 0;
+	int offset;
+
+	offset = install_equiv_cpu_table(data);
+	if (offset < 0) {
+		pr_err("failed to create equivalent cpu table\n");
+		return ret;
+	}
+
+	/*
+	 * The first section header is inspected right below, so it has to lie
+	 * inside the buffer. This also keeps @leftover from wrapping around.
+	 */
+	if (size < (size_t)offset + SECTION_HDR_SIZE) {
+		pr_err("truncated container file\n");
+		free_equiv_cpu_table();
+		return ret;
+	}
+
+	fw += offset;
+	leftover = size - offset;
+
+	if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+		pr_err("invalid type field in container file section header\n");
+		free_equiv_cpu_table();
+		return ret;
+	}
+
+	while (leftover) {
+		/*
+		 * verify_and_add_patch() reads the section header and the
+		 * patch header behind it, so both have to fit into what is
+		 * left of the buffer.
+		 */
+		if (leftover < SECTION_HDR_SIZE +
+			       sizeof(struct microcode_header_amd))
+			return ret;
+
+		crnt_size = verify_and_add_patch(family, fw, leftover);
+
+		/*
+		 * A negative size signals a fatal error. A size of zero would
+		 * never advance the walk and one larger than @leftover would
+		 * wrap the unsigned counter and run past the buffer.
+		 */
+		if (crnt_size <= 0 || (unsigned int)crnt_size > leftover)
+			return ret;
+
+		fw	 += crnt_size;
+		leftover -= crnt_size;
+	}
+
+	return UCODE_OK;
+}
+
+/*
+ * Replace the equivalence table with the one from the container at @data and
+ * add the container's patches for @family to the cache.
+ *
+ * Returns the parser's error state (after tearing everything down) if parsing
+ * fails, UCODE_OK if no cached patch matches CPU 0 or the matching patch has
+ * the ID boot_cpu_data already reports, and UCODE_NEW otherwise. In the
+ * UCODE_NEW case the patch is also copied into amd_ucode_patch when @save is
+ * true.
+ */
+static enum ucode_state
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
+{
+	struct ucode_patch *p;
+	enum ucode_state ret;
+
+	/* free old equiv table */
+	free_equiv_cpu_table();
+
+	ret = __load_microcode_amd(family, data, size);
+	if (ret != UCODE_OK) {
+		cleanup();
+		return ret;
+	}
+
+	p = find_patch(0);
+	if (!p)
+		return ret;
+
+	if (boot_cpu_data.microcode == p->patch_id)
+		return ret;
+
+	ret = UCODE_NEW;
+
+	/* save BSP's matching patch for early load */
+	if (save) {
+		memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+		memcpy(amd_ucode_patch, p->data,
+		       min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+	}
+
+	return ret;
+}
+
+/*
+ * AMD microcode firmware naming convention, up to family 15h they are in
+ * the legacy file:
+ *
+ *    amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ *    amd-ucode/microcode_amd_fam15h.bin
+ *    amd-ucode/microcode_amd_fam16h.bin
+ *    ...
+ *
+ * These might be larger than 2K.
+ *
+ * Returns UCODE_OK without doing anything unless @refresh_fw is set and @cpu
+ * is the boot CPU, UCODE_NFOUND if the firmware file cannot be loaded,
+ * UCODE_ERROR if it does not start with UCODE_MAGIC, and otherwise whatever
+ * load_microcode_amd() returns.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+					      bool refresh_fw)
+{
+	char fw_name[36] = "amd-ucode/microcode_amd.bin";
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
+	const struct firmware *fw;
+	enum ucode_state ret;
+
+	/* reload ucode container only on the boot cpu */
+	if (!refresh_fw || !bsp)
+		return UCODE_OK;
+
+	if (c->x86 >= 0x15)
+		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+	if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+		pr_debug("failed to load file %s\n", fw_name);
+		return UCODE_NFOUND;
+	}
+
+	if (*(u32 *)fw->data != UCODE_MAGIC) {
+		pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
+		ret = UCODE_ERROR;
+	} else {
+		ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
+	}
+
+	release_firmware(fw);
+
+	return ret;
+}
+
+/*
+ * Loading microcode from a user buffer is not implemented for AMD; none of
+ * the arguments is looked at and UCODE_ERROR is always returned.
+ */
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+	return UCODE_ERROR;
+}
+
+/* Forget the patch pointer recorded for @cpu; the patch itself is not freed. */
+static void microcode_fini_cpu_amd(int cpu)
+{
+	ucode_cpu_info[cpu].mc = NULL;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_user = request_microcode_user, /* stub, always UCODE_ERROR */
+ .request_microcode_fw = request_microcode_amd, /* loads the container via request_firmware_direct() */
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd, /* clears uci->mc */
+};
+
+/*
+ * Return the AMD microcode callbacks, or NULL (with a warning) if the boot
+ * CPU is not an AMD CPU of family 0x10 or later. If a patch was applied
+ * early, i.e. ucode_new_rev is set, the new patch level is logged once.
+ */
+struct microcode_ops * __init init_amd_microcode(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+		pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
+		return NULL;
+	}
+
+	if (ucode_new_rev)
+		pr_info_once("microcode updated early to new patch_level=0x%08x\n",
+			     ucode_new_rev);
+
+	return &microcode_amd_ops;
+}
+
+/* Driver teardown: free the equivalence table and the patch cache. */
+void __exit exit_amd_microcode(void)
+{
+	cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
new file mode 100644
index 0000000..b9bc8a1
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -0,0 +1,901 @@
+/*
+ * CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * X86 CPU microcode early update for Linux:
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows upgrading the microcode on x86 processors.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/platform_device.h>
+#include <linux/stop_machine.h>
+#include <linux/syscore_ops.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/firmware.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/cpu_device_id.h>
+#include <asm/microcode_amd.h>
+#include <asm/perf_event.h>
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/setup.h>
+
+#define DRIVER_VERSION "2.2"
+
+static struct microcode_ops *microcode_ops; /* vendor-specific callbacks */
+static bool dis_ucode_ldr = true; /* cleared by check_loader_disabled_bsp() when loading is allowed */
+
+bool initrd_gone; /* set by save_microcode_in_initrd() */
+
+LIST_HEAD(microcode_cache); /* list of cached struct ucode_patch, linked through ->plist */
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+/*
+ * Serialize late loading so that CPUs get updated one-by-one.
+ */
+static DEFINE_RAW_SPINLOCK(update_lock);
+
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+
+struct cpu_info_ctx {
+ struct cpu_signature *cpu_sig;
+ int err;
+};
+
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ *
+ * The list is zero-terminated: amd_check_current_patch_level() walks it until
+ * it reaches the 0 entry.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+/*
+ * Compare this CPU's current patch level with the list of final levels.
+ *
+ * Returns:
+ *  - true: the level is in final_levels, i.e. the update should stop
+ *  - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+	u32 *level = final_levels;
+	u32 cur, unused;
+
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, cur, unused);
+
+	/* On 32-bit the table is read through its physical address. */
+	if (IS_ENABLED(CONFIG_X86_32))
+		level = (u32 *)__pa_nodebug(&final_levels);
+
+	for (; *level; level++) {
+		if (*level == cur)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Decide on the boot CPU whether the microcode loader stays disabled and
+ * return the resulting value of dis_ucode_ldr.
+ *
+ * The flag is left as it is when running under a hypervisor or, on AMD, when
+ * the current patch level is a final one. Otherwise it is cleared unless
+ * "dis_ucode_ldr" is found on the command line.
+ */
+static bool __init check_loader_disabled_bsp(void)
+{
+	static const char *__dis_opt_str = "dis_ucode_ldr";
+
+#ifdef CONFIG_X86_32
+	const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+	const char *option  = (const char *)__pa_nodebug(__dis_opt_str);
+	bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+#else /* CONFIG_X86_64 */
+	const char *cmdline = boot_command_line;
+	const char *option  = __dis_opt_str;
+	bool *res = &dis_ucode_ldr;
+#endif
+
+	/*
+	 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
+	 * completely accurate as xen pv guests don't see that CPUID bit set but
+	 * that's good enough as they don't land on the BSP path anyway.
+	 */
+	if (native_cpuid_ecx(1) & BIT(31))
+		return *res;
+
+	if (x86_cpuid_vendor() == X86_VENDOR_AMD &&
+	    amd_check_current_patch_level())
+		return *res;
+
+	if (cmdline_find_option_bool(cmdline, option) <= 0)
+		*res = false;
+
+	return *res;
+}
+
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+/*
+ * Search the built-in firmware table for an entry called @name and, if there
+ * is one, store its size and data pointer in @cd.
+ *
+ * Returns true if the entry was found. Always returns false when
+ * CONFIG_FW_LOADER is not set.
+ */
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+	struct builtin_fw *b_fw;
+
+	for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+		if (strcmp(name, b_fw->name))
+			continue;
+
+		cd->size = b_fw->size;
+		cd->data = b_fw->data;
+		return true;
+	}
+#endif
+	return false;
+}
+
+/*
+ * Early microcode load on the boot CPU.
+ *
+ * Nothing is done without CPUID, for vendors other than Intel and AMD, for
+ * Intel families below 6, for AMD families below 0x10, or when
+ * check_loader_disabled_bsp() reports the loader as disabled.
+ */
+void __init load_ucode_bsp(void)
+{
+	unsigned int cpuid_1_eax;
+	bool amd;
+
+	if (!have_cpuid_p())
+		return;
+
+	cpuid_1_eax = native_cpuid_eax(1);
+
+	switch (x86_cpuid_vendor()) {
+	case X86_VENDOR_INTEL:
+		if (x86_family(cpuid_1_eax) < 6)
+			return;
+		amd = false;
+		break;
+
+	case X86_VENDOR_AMD:
+		if (x86_family(cpuid_1_eax) < 0x10)
+			return;
+		amd = true;
+		break;
+
+	default:
+		return;
+	}
+
+	if (check_loader_disabled_bsp())
+		return;
+
+	if (amd)
+		load_ucode_amd_bsp(cpuid_1_eax);
+	else
+		load_ucode_intel_bsp();
+}
+
+/*
+ * Return the current value of dis_ucode_ldr; with CONFIG_X86_32 the flag is
+ * read through its physical address.
+ */
+static bool check_loader_disabled_ap(void)
+{
+	bool *flag = &dis_ucode_ldr;
+
+#ifdef CONFIG_X86_32
+	flag = (bool *)__pa_nodebug(&dis_ucode_ldr);
+#endif
+
+	return *flag;
+}
+
+/*
+ * Early microcode load on a secondary CPU: unless the loader is disabled,
+ * call the Intel (family >= 6) or AMD (family >= 0x10) AP loader. Other
+ * vendors and older families are left alone.
+ */
+void load_ucode_ap(void)
+{
+	unsigned int cpuid_1_eax;
+	int vendor;
+
+	if (check_loader_disabled_ap())
+		return;
+
+	cpuid_1_eax = native_cpuid_eax(1);
+	vendor = x86_cpuid_vendor();
+
+	if (vendor == X86_VENDOR_INTEL) {
+		if (x86_family(cpuid_1_eax) >= 6)
+			load_ucode_intel_ap();
+	} else if (vendor == X86_VENDOR_AMD) {
+		if (x86_family(cpuid_1_eax) >= 0x10)
+			load_ucode_amd_ap(cpuid_1_eax);
+	}
+}
+
+/*
+ * Call the vendor routine that saves the microcode found in the initrd and
+ * mark the initrd as gone afterwards, whatever the outcome.
+ *
+ * Returns the vendor routine's result, or -EINVAL if the boot CPU's vendor
+ * or family is not handled.
+ */
+static int __init save_microcode_in_initrd(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+	int ret = -EINVAL;
+
+	if (c->x86_vendor == X86_VENDOR_INTEL) {
+		if (c->x86 >= 6)
+			ret = save_microcode_in_initrd_intel();
+	} else if (c->x86_vendor == X86_VENDOR_AMD) {
+		if (c->x86 >= 0x10)
+			ret = save_microcode_in_initrd_amd(cpuid_eax(1));
+	}
+
+	initrd_gone = true;
+
+	return ret;
+}
+
+/*
+ * Look up the file @path in the initrd's cpio archive.
+ *
+ * @use_pa selects whether the initrd is addressed through physical addresses.
+ * An empty cpio_data is returned when CONFIG_BLK_DEV_INITRD is not set or
+ * when, for virtual-address lookups, the initrd is already gone. Otherwise
+ * the result of find_cpio_data() is returned.
+ */
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	unsigned long start = 0;
+	size_t size;
+
+#ifdef CONFIG_X86_32
+	struct boot_params *params = &boot_params;
+
+	if (use_pa)
+		params = (struct boot_params *)__pa_nodebug(&boot_params);
+
+	size = params->hdr.ramdisk_size;
+
+	/*
+	 * Set start only if we have an initrd image. We cannot use initrd_start
+	 * because it is not set that early yet.
+	 */
+	if (size)
+		start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+	/* Size and address are each split into a low and a high 32-bit half. */
+	size = ((unsigned long)boot_params.ext_ramdisk_size << 32) |
+	       boot_params.hdr.ramdisk_size;
+
+	if (size) {
+		start = ((unsigned long)boot_params.ext_ramdisk_image << 32) |
+			boot_params.hdr.ramdisk_image;
+
+		start += PAGE_OFFSET;
+	}
+# endif
+
+	/*
+	 * Fixup the start address: after reserve_initrd() runs, initrd_start
+	 * has the virtual address of the beginning of the initrd. It also
+	 * possibly relocates the ramdisk. In either case, initrd_start contains
+	 * the updated address so use that instead.
+	 *
+	 * initrd_gone is for the hotplug case where we've thrown out initrd
+	 * already.
+	 */
+	if (use_pa) {
+		/*
+		 * The picture with physical addresses is a bit different: we
+		 * need to get the *physical* address to which the ramdisk was
+		 * relocated, i.e., relocated_ramdisk (not initrd_start) and
+		 * since we're running from physical addresses, we need to access
+		 * relocated_ramdisk through its *physical* address too.
+		 */
+		u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
+		if (*rr)
+			start = *rr;
+	} else {
+		if (initrd_gone)
+			return (struct cpio_data){ NULL, 0, "" };
+		if (initrd_start)
+			start = initrd_start;
+	}
+
+	return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+	return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
+/*
+ * Call the vendor's reload routine for Intel (family >= 6) or AMD
+ * (family >= 0x10) CPUs; do nothing for anything else.
+ */
+void reload_early_microcode(void)
+{
+	const int vendor = x86_cpuid_vendor();
+	const int family = x86_cpuid_family();
+
+	if (vendor == X86_VENDOR_INTEL) {
+		if (family >= 6)
+			reload_ucode_intel();
+	} else if (vendor == X86_VENDOR_AMD) {
+		if (family >= 0x10)
+			reload_ucode_amd();
+	}
+}
+
+/*
+ * Cross-call helper: run the vendor's collect_cpu_info() for the CPU this
+ * executes on and store its result in the struct cpu_info_ctx passed as @arg.
+ */
+static void collect_cpu_info_local(void *arg)
+{
+	struct cpu_info_ctx *ctx = arg;
+	int this_cpu = smp_processor_id();
+
+	ctx->err = microcode_ops->collect_cpu_info(this_cpu, ctx->cpu_sig);
+}
+
+/*
+ * Collect the signature of @cpu into @cpu_sig by running the collection on
+ * that CPU and waiting for it to finish.
+ *
+ * Returns the cross-call's error if it failed, otherwise the result of the
+ * vendor's collect_cpu_info().
+ */
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+
+	return ret ? ret : ctx.err;
+}
+
+/*
+ * Reset the ucode_cpu_info entry of @cpu and refill its signature; the entry
+ * is marked valid only if the collection succeeded. Returns the collection's
+ * result.
+ */
+static int collect_cpu_info(int cpu)
+{
+	struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];
+	int ret;
+
+	memset(uci, 0, sizeof(*uci));
+
+	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+	if (ret)
+		return ret;
+
+	uci->valid = 1;
+
+	return 0;
+}
+
+/*
+ * Run the vendor's apply_microcode() for the CPU this executes on and store
+ * the resulting state in the enum ucode_state that @arg points to.
+ */
+static void apply_microcode_local(void *arg)
+{
+	enum ucode_state *state = arg;
+	int this_cpu = smp_processor_id();
+
+	*state = microcode_ops->apply_microcode(this_cpu);
+}
+
+/*
+ * Apply microcode on @cpu by running apply_microcode_local() there and
+ * waiting for it.
+ *
+ * Returns the cross-call's error if it failed, 1 if the update reported
+ * UCODE_ERROR and 0 otherwise.
+ */
+static int apply_microcode_on_target(int cpu)
+{
+	enum ucode_state state;
+	int ret;
+
+	ret = smp_call_function_single(cpu, apply_microcode_local, &state, 1);
+	if (ret)
+		return ret;
+
+	return (state == UCODE_ERROR) ? 1 : 0;
+}
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+/*
+ * Offer the user buffer @buf of @size bytes to every online CPU that has a
+ * valid ucode_cpu_info entry and apply the microcode where the request
+ * returns UCODE_OK.
+ *
+ * Returns 0, or -1 as soon as a request returns UCODE_ERROR; the remaining
+ * CPUs are then not visited.
+ */
+static int do_microcode_update(const void __user *buf, size_t size)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];
+		enum ucode_state ustate;
+
+		if (!uci->valid)
+			continue;
+
+		ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+		if (ustate == UCODE_ERROR)
+			return -1;
+
+		if (ustate == UCODE_OK)
+			apply_microcode_on_target(cpu);
+	}
+
+	return 0;
+}
+
+/*
+ * open() handler: callers without CAP_SYS_RAWIO get -EPERM, everybody else
+ * the result of nonseekable_open().
+ */
+static int microcode_open(struct inode *inode, struct file *file)
+{
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	return nonseekable_open(inode, file);
+}
+
+/*
+ * write() handler: pass the @len bytes at @buf to do_microcode_update().
+ *
+ * The update runs with microcode_mutex held, between get_online_cpus() and
+ * put_online_cpus(). Returns @len if the update succeeded and -EINVAL if it
+ * failed or if @len covers more pages than totalram_pages. After a
+ * successful update of a non-empty buffer perf_check_microcode() is called.
+ */
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+			       size_t len, loff_t *ppos)
+{
+	ssize_t ret = -EINVAL;
+
+	if ((len >> PAGE_SHIFT) > totalram_pages) {
+		pr_err("too much data (max %ld pages)\n", totalram_pages);
+		return ret;
+	}
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
+	if (do_microcode_update(buf, len) == 0)
+		ret = (ssize_t)len;
+
+	if (ret > 0)
+		perf_check_microcode();
+
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+	return ret;
+}
+
+static const struct file_operations microcode_fops = {
+ .owner = THIS_MODULE,
+ .write = microcode_write, /* feeds the written buffer to do_microcode_update() */
+ .open = microcode_open, /* requires CAP_SYS_RAWIO */
+ .llseek = no_llseek,
+};
+
+/* Misc device "microcode" (node name "cpu/microcode") backed by microcode_fops. */
+static struct miscdevice microcode_dev = {
+	.minor			= MICROCODE_MINOR,
+	.name			= "microcode",
+	.nodename		= "cpu/microcode",
+	.fops			= &microcode_fops,
+};
+
+/*
+ * Register the microcode misc device. Returns 0 on success or the error from
+ * misc_register(), which is also logged.
+ */
+static int __init microcode_dev_init(void)
+{
+	int error;
+
+	error = misc_register(&microcode_dev);
+	if (error) {
+		pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
+		return error;
+	}
+
+	return 0;
+}
+
+static void __exit microcode_dev_exit(void)
+{
+ misc_deregister(&microcode_dev);
+}
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+/*
+ * Late loading dance. Why the heavy-handed stomp_machine effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ * is loading microcode in order to avoid any negative interactions caused by
+ * the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ * requirement can be relaxed in the future. Right now, this is conservative
+ * and good.
+ */
+#define SPINUNIT 100 /* 100 nsec */
+
+static int check_online_cpus(void)
+{
+ unsigned int cpu;
+
+ /*
+ * Make sure all CPUs are online. It's fine for SMT to be disabled if
+ * all the primary threads are still online.
+ */
+ for_each_present_cpu(cpu) {
+ if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
+ pr_err("Not all CPUs online, aborting microcode update.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static atomic_t late_cpus_in;
+static atomic_t late_cpus_out;
+
+static int __wait_for_cpus(atomic_t *t, long long timeout)
+{
+ int all_cpus = num_online_cpus();
+
+ atomic_inc(t);
+
+ while (atomic_read(t) < all_cpus) {
+ if (timeout < SPINUNIT) {
+ pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+ all_cpus - atomic_read(t));
+ return 1;
+ }
+
+ ndelay(SPINUNIT);
+ timeout -= SPINUNIT;
+
+ touch_nmi_watchdog();
+ }
+ return 0;
+}
+
+/*
+ * Returns:
+ * < 0 - on error
+ * 0 - no update done
+ * 1 - microcode was updated
+ */
+static int __reload_late(void *info)
+{
+ int cpu = smp_processor_id();
+ enum ucode_state err;
+ int ret = 0;
+
+ /*
+ * Wait for all CPUs to arrive. A load will not be attempted unless all
+ * CPUs show up.
+ * */
+ if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
+ return -1;
+
+ raw_spin_lock(&update_lock);
+ apply_microcode_local(&err);
+ raw_spin_unlock(&update_lock);
+
+ /* siblings return UCODE_OK because their engine got updated already */
+ if (err > UCODE_NFOUND) {
+ pr_warn("Error reloading microcode on CPU %d\n", cpu);
+ ret = -1;
+ } else if (err == UCODE_UPDATED || err == UCODE_OK) {
+ ret = 1;
+ }
+
+ /*
+ * Increase the wait timeout to a safe value here since we're
+ * serializing the microcode update and that could take a while on a
+ * large number of CPUs. And that is fine as the *actual* timeout will
+ * be determined by the last CPU finished updating and thus cut short.
+ */
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+ panic("Timeout during microcode update!\n");
+
+ return ret;
+}
+
+/*
+ * Reload microcode late on all CPUs. Wait for a sec until they
+ * all gather together.
+ */
+static int microcode_reload_late(void)
+{
+ int ret;
+
+ atomic_set(&late_cpus_in, 0);
+ atomic_set(&late_cpus_out, 0);
+
+ ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
+ if (ret > 0)
+ microcode_check();
+
+ return ret;
+}
+
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ enum ucode_state tmp_ret = UCODE_OK;
+ int bsp = boot_cpu_data.cpu_index;
+ unsigned long val;
+ ssize_t ret = 0;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return size;
+
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ if (tmp_ret != UCODE_NEW)
+ return size;
+
+ get_online_cpus();
+
+ ret = check_online_cpus();
+ if (ret)
+ goto put;
+
+ mutex_lock(&microcode_mutex);
+ ret = microcode_reload_late();
+ mutex_unlock(&microcode_mutex);
+
+put:
+ put_online_cpus();
+
+ if (ret >= 0)
+ ret = size;
+
+ return ret;
+}
+
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t pf_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static DEVICE_ATTR_WO(reload);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
+ NULL
+};
+
+static const struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
+}
+
+static enum ucode_state microcode_resume_cpu(int cpu)
+{
+ if (apply_microcode_on_target(cpu))
+ return UCODE_ERROR;
+
+ pr_debug("CPU%d updated upon resume\n", cpu);
+
+ return UCODE_OK;
+}
+
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
+{
+ enum ucode_state ustate;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->valid)
+ return UCODE_OK;
+
+ if (collect_cpu_info(cpu))
+ return UCODE_ERROR;
+
+ /* --dimm. Trigger a delayed update? */
+ if (system_state != SYSTEM_RUNNING)
+ return UCODE_NFOUND;
+
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
+ if (ustate == UCODE_NEW) {
+ pr_debug("CPU%d updated upon init\n", cpu);
+ apply_microcode_on_target(cpu);
+ }
+
+ return ustate;
+}
+
+static enum ucode_state microcode_update_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ /* Refresh CPU microcode revision after resume. */
+ collect_cpu_info(cpu);
+
+ if (uci->valid)
+ return microcode_resume_cpu(cpu);
+
+ return microcode_init_cpu(cpu, false);
+}
+
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
+{
+ int err, cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ pr_debug("CPU%d added\n", cpu);
+
+ err = sysfs_create_group(&dev->kobj, &mc_attr_group);
+ if (err)
+ return err;
+
+ if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
+ return -EINVAL;
+
+ return err;
+}
+
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
+{
+ int cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return;
+
+ pr_debug("CPU%d removed\n", cpu);
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+}
+
+static struct subsys_interface mc_cpu_interface = {
+ .name = "microcode",
+ .subsys = &cpu_subsys,
+ .add_dev = mc_device_add,
+ .remove_dev = mc_device_remove,
+};
+
+/**
+ * mc_bp_resume - Update boot CPU microcode during resume.
+ */
+static void mc_bp_resume(void)
+{
+ int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->valid && uci->mc)
+ microcode_ops->apply_microcode(cpu);
+ else if (!uci->mc)
+ reload_early_microcode();
+}
+
+static struct syscore_ops mc_syscore_ops = {
+ .resume = mc_bp_resume,
+};
+
+static int mc_cpu_online(unsigned int cpu)
+{
+ struct device *dev;
+
+ dev = get_cpu_device(cpu);
+ microcode_update_cpu(cpu);
+ pr_debug("CPU%d added\n", cpu);
+
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev;
+
+ dev = get_cpu_device(cpu);
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ pr_debug("CPU%d removed\n", cpu);
+
+ return 0;
+}
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+ &dev_attr_reload.attr,
+ NULL
+};
+
+static const struct attribute_group cpu_root_microcode_group = {
+ .name = "microcode",
+ .attrs = cpu_root_microcode_attrs,
+};
+
+int __init microcode_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int error;
+
+ if (dis_ucode_ldr)
+ return -EINVAL;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+ else
+ pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
+ return -ENODEV;
+
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev))
+ return PTR_ERR(microcode_pdev);
+
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ error = subsys_interface_register(&mc_cpu_interface);
+ if (!error)
+ perf_check_microcode();
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+ if (error)
+ goto out_pdev;
+
+ error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_driver;
+ }
+
+ error = microcode_dev_init();
+ if (error)
+ goto out_ucode_group;
+
+ register_syscore_ops(&mc_syscore_ops);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
+
+ pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
+
+ return 0;
+
+ out_ucode_group:
+ sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
+ out_driver:
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ subsys_interface_unregister(&mc_cpu_interface);
+
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+ out_pdev:
+ platform_device_unregister(microcode_pdev);
+ return error;
+
+}
+fs_initcall(save_microcode_in_initrd);
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
new file mode 100644
index 0000000..16936a2
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -0,0 +1,1047 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/msr.h>
+
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+
+/* Current microcode patch used in early patching on the APs. */
+static struct microcode_intel *intel_ucode_patch;
+
+/* last level cache size per core */
+static int llc_size_per_core;
+
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
+{
+ if (s1 != s2)
+ return false;
+
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
+
+ /* ... or they intersect. */
+ return p1 & p2;
+}
+
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
+ int i;
+
+ if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+
+ if (mc_hdr->rev <= new_rev)
+ return 0;
+
+ return find_matching_signature(mc, csig, cpf);
+}
+
+/*
+ * Given CPU signature and a microcode patch, this function finds if the
+ * microcode patch has matching family and model with the CPU.
+ *
+ * %true - if there's a match
+ * %false - otherwise
+ */
+static bool microcode_matches(struct microcode_header_intel *mc_header,
+ unsigned long sig)
+{
+ unsigned long total_size = get_totalsize(mc_header);
+ unsigned long data_size = get_datasize(mc_header);
+ struct extended_sigtable *ext_header;
+ unsigned int fam_ucode, model_ucode;
+ struct extended_signature *ext_sig;
+ unsigned int fam, model;
+ int ext_sigcount, i;
+
+ fam = x86_family(sig);
+ model = x86_model(sig);
+
+ fam_ucode = x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
+
+ if (fam == fam_ucode && model == model_ucode)
+ return true;
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ return false;
+
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+
+ for (i = 0; i < ext_sigcount; i++) {
+ fam_ucode = x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
+
+ if (fam == fam_ucode && model == model_ucode)
+ return true;
+
+ ext_sig++;
+ }
+ return false;
+}
+
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
+{
+ struct ucode_patch *p;
+
+ p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ p->data = kmemdup(data, size, GFP_KERNEL);
+ if (!p->data) {
+ kfree(p);
+ return NULL;
+ }
+
+ return p;
+}
+
+static void save_microcode_patch(void *data, unsigned int size)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ struct ucode_patch *iter, *tmp, *p = NULL;
+ bool prev_found = false;
+ unsigned int sig, pf;
+
+ mc_hdr = (struct microcode_header_intel *)data;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+ mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (find_matching_signature(data, sig, pf)) {
+ prev_found = true;
+
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
+
+ p = memdup_patch(data, size);
+ if (!p)
+ pr_err("Error allocating buffer %p\n", data);
+ else {
+ list_replace(&iter->plist, &p->plist);
+ kfree(iter->data);
+ kfree(iter);
+ }
+ }
+ }
+
+ /*
+ * There weren't any previous patches found in the list cache; save the
+ * newly found.
+ */
+ if (!prev_found) {
+ p = memdup_patch(data, size);
+ if (!p)
+ pr_err("Error allocating buffer for %p\n", data);
+ else
+ list_add_tail(&p->plist, &microcode_cache);
+ }
+
+ if (!p)
+ return;
+
+ /*
+ * Save for early loading. On 32-bit, that needs to be a physical
+ * address as the APs are running from physical addresses, before
+ * paging has been enabled.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
+}
+
+static int microcode_sanity_check(void *mc, int print_err)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format.\n");
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
+{
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *patch = NULL;
+ unsigned int mc_size;
+
+ while (size) {
+ if (size < sizeof(struct microcode_header_intel))
+ break;
+
+ mc_header = (struct microcode_header_intel *)data;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size ||
+ mc_size > size ||
+ microcode_sanity_check(data, 0) < 0)
+ break;
+
+ size -= mc_size;
+
+ if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ data += mc_size;
+ continue;
+ }
+
+ if (save) {
+ save_microcode_patch(data, mc_size);
+ goto next;
+ }
+
+
+ if (!patch) {
+ if (!has_newer_microcode(data,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ uci->cpu_sig.rev))
+ goto next;
+
+ } else {
+ struct microcode_header_intel *phdr = &patch->hdr;
+
+ if (!has_newer_microcode(data,
+ phdr->sig,
+ phdr->pf,
+ phdr->rev))
+ goto next;
+ }
+
+ /* We have a newer patch, save it. */
+ patch = data;
+
+next:
+ data += mc_size;
+ }
+
+ if (size)
+ return NULL;
+
+ return patch;
+}
+
+static int collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig = { 0 };
+ unsigned int eax, ebx, ecx, edx;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(eax);
+ model = x86_model(eax);
+
+ if ((model >= 5) || (family > 6)) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig.rev = intel_get_microcode_revision();
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+
+static void show_saved_mc(void)
+{
+#ifdef DEBUG
+ int i = 0, j;
+ unsigned int sig, pf, rev, total_size, data_size, date;
+ struct ucode_cpu_info uci;
+ struct ucode_patch *p;
+
+ if (list_empty(&microcode_cache)) {
+ pr_debug("no microcode data saved.\n");
+ return;
+ }
+
+ collect_cpu_info_early(&uci);
+
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
+
+ list_for_each_entry(p, &microcode_cache, plist) {
+ struct microcode_header_intel *mc_saved_header;
+ struct extended_sigtable *ext_header;
+ struct extended_signature *ext_sig;
+ int ext_sigcount;
+
+ mc_saved_header = (struct microcode_header_intel *)p->data;
+
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ date = mc_saved_header->date;
+
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
+
+ pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
+ i++, sig, pf, rev, total_size,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ continue;
+
+ ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (j = 0; j < ext_sigcount; j++) {
+ sig = ext_sig->sig;
+ pf = ext_sig->pf;
+
+ pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+ j, sig, pf);
+
+ ext_sig++;
+ }
+ }
+#endif
+}
+
+/*
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
+ */
+static void save_mc_for_early(u8 *mc, unsigned int size)
+{
+ /* Synchronization during CPU hotplug. */
+ static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+
+ mutex_lock(&x86_cpu_microcode_mutex);
+
+ save_microcode_patch(mc, size);
+ show_saved_mc();
+
+ mutex_unlock(&x86_cpu_microcode_mutex);
+}
+
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
+{
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ char name[30];
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
+
+ return get_builtin_firmware(cp, name);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+ pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void show_ucode_info_early(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (delay_ucode_info) {
+ collect_cpu_info_early(&uci);
+ print_ucode_info(&uci, current_mc_date);
+ delay_ucode_info = 0;
+ }
+}
+
+/*
+ * At this point, we can not call printk() yet. Delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc;
+ int *delay_ucode_info_p;
+ int *current_mc_date_p;
+
+ mc = uci->mc;
+ if (!mc)
+ return;
+
+ delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
+ current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+
+ *delay_ucode_info_p = 1;
+ *current_mc_date_p = mc->hdr.date;
+}
+#else
+
+static inline void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc;
+
+ mc = uci->mc;
+ if (!mc)
+ return;
+
+ print_ucode_info(uci, mc->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
+{
+ struct microcode_intel *mc;
+ u32 rev;
+
+ mc = uci->mc;
+ if (!mc)
+ return 0;
+
+ /*
+ * Save us the MSR write below - which is a particular expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ rev = intel_get_microcode_revision();
+ if (rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = rev;
+ return UCODE_OK;
+ }
+
+ /*
+ * Writeback and invalidate caches before updating microcode to avoid
+ * internal issues depending on what the microcode is updating.
+ */
+ native_wbinvd();
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
+ return -1;
+
+ uci->cpu_sig.rev = rev;
+
+ if (early)
+ print_ucode(uci);
+ else
+ print_ucode_info(uci, mc->hdr.date);
+
+ return 0;
+}
+
+int __init save_microcode_in_initrd_intel(void)
+{
+ struct ucode_cpu_info uci;
+ struct cpio_data cp;
+
+ /*
+ * initrd is going away, clear patch ptr. We will scan the microcode one
+ * last time before jettisoning and save a patch, if found. Then we will
+ * update that pointer too, with a stable patch address to use when
+ * resuming the cores.
+ */
+ intel_ucode_patch = NULL;
+
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path, false);
+
+ if (!(cp.data && cp.size))
+ return 0;
+
+ collect_cpu_info_early(&uci);
+
+ scan_microcode(cp.data, cp.size, &uci, true);
+
+ show_saved_mc();
+
+ return 0;
+}
+
+/*
+ * @res_patch, output: a pointer to the patch we found.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+{
+ static const char *path;
+ struct cpio_data cp;
+ bool use_pa;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ path = ucode_path;
+ use_pa = false;
+ }
+
+ /* try built-in microcode first */
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(path, use_pa);
+
+ if (!(cp.data && cp.size))
+ return NULL;
+
+ collect_cpu_info_early(uci);
+
+ return scan_microcode(cp.data, cp.size, uci, false);
+}
+
+void __init load_ucode_intel_bsp(void)
+{
+ struct microcode_intel *patch;
+ struct ucode_cpu_info uci;
+
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ uci.mc = patch;
+
+ apply_microcode_early(&uci, true);
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct microcode_intel *patch, **iup;
+ struct ucode_cpu_info uci;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+ else
+ iup = &intel_ucode_patch;
+
+reget:
+ if (!*iup) {
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ *iup = patch;
+ }
+
+ uci.mc = *iup;
+
+ if (apply_microcode_early(&uci, true)) {
+ /* Mixed-silicon system? Try to refetch the proper patch: */
+ *iup = NULL;
+
+ goto reget;
+ }
+}
+
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
+{
+ struct microcode_header_intel *phdr;
+ struct ucode_patch *iter, *tmp;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+
+ phdr = (struct microcode_header_intel *)iter->data;
+
+ if (phdr->rev <= uci->cpu_sig.rev)
+ continue;
+
+ if (!find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
+ continue;
+
+ return iter->data;
+ }
+ return NULL;
+}
+
+void reload_ucode_intel(void)
+{
+ struct microcode_intel *p;
+ struct ucode_cpu_info uci;
+
+ collect_cpu_info_early(&uci);
+
+ p = find_patch(&uci);
+ if (!p)
+ return;
+
+ uci.mc = p;
+
+ apply_microcode_early(&uci, false);
+}
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ static struct cpu_signature prev;
+ struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+ unsigned int val[2];
+
+ memset(csig, 0, sizeof(*csig));
+
+ csig->sig = cpuid_eax(0x00000001);
+
+ if ((c->x86_model >= 5) || (c->x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig->rev = c->microcode;
+
+ /* No extra locking on prev, races are harmless. */
+ if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) {
+ pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n",
+ csig->sig, csig->pf, csig->rev);
+ prev = *csig;
+ }
+
+ return 0;
+}
+
+static enum ucode_state apply_microcode_intel(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_intel *mc;
+ enum ucode_state ret;
+ static int prev_rev;
+ u32 rev;
+
+ /* We should bind the task to the CPU */
+ if (WARN_ON(raw_smp_processor_id() != cpu))
+ return UCODE_ERROR;
+
+ /* Look for a newer patch in our cache: */
+ mc = find_patch(uci);
+ if (!mc) {
+ mc = uci->mc;
+ if (!mc)
+ return UCODE_NFOUND;
+ }
+
+ /*
+ * Save us the MSR write below - which is a particular expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ rev = intel_get_microcode_revision();
+ if (rev >= mc->hdr.rev) {
+ ret = UCODE_OK;
+ goto out;
+ }
+
+ /*
+ * Writeback and invalidate caches before updating microcode to avoid
+ * internal issues depending on what the microcode is updating.
+ */
+ native_wbinvd();
+
+ /* write microcode via MSR 0x79 */
+ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+
+ if (rev != mc->hdr.rev) {
+ pr_err("CPU%d update to revision 0x%x failed\n",
+ cpu, mc->hdr.rev);
+ return UCODE_ERROR;
+ }
+
+ if (rev != prev_rev) {
+ pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
+ rev,
+ mc->hdr.date & 0xffff,
+ mc->hdr.date >> 24,
+ (mc->hdr.date >> 16) & 0xff);
+ prev_rev = rev;
+ }
+
+ ret = UCODE_UPDATED;
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
+}
+
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
+ int new_rev = uci->cpu_sig.rev;
+ unsigned int leftover = size;
+ unsigned int curr_mc_size = 0, new_mc_size = 0;
+ unsigned int csig, cpf;
+ enum ucode_state ret = UCODE_OK;
+
+ while (leftover) {
+ struct microcode_header_intel mc_header;
+ unsigned int mc_size;
+
+ if (leftover < sizeof(mc_header)) {
+ pr_err("error! Truncated header in microcode data file\n");
+ break;
+ }
+
+ if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
+ break;
+
+ mc_size = get_totalsize(&mc_header);
+ if (!mc_size || mc_size > leftover) {
+ pr_err("error! Bad data in microcode data file\n");
+ break;
+ }
+
+ /* For performance reasons, reuse mc area when possible */
+ if (!mc || mc_size > curr_mc_size) {
+ vfree(mc);
+ mc = vmalloc(mc_size);
+ if (!mc)
+ break;
+ curr_mc_size = mc_size;
+ }
+
+ if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+ microcode_sanity_check(mc, 1) < 0) {
+ break;
+ }
+
+ csig = uci->cpu_sig.sig;
+ cpf = uci->cpu_sig.pf;
+ if (has_newer_microcode(mc, csig, cpf, new_rev)) {
+ vfree(new_mc);
+ new_rev = mc_header.rev;
+ new_mc = mc;
+ new_mc_size = mc_size;
+ mc = NULL; /* trigger new vmalloc */
+ ret = UCODE_NEW;
+ }
+
+ ucode_ptr += mc_size;
+ leftover -= mc_size;
+ }
+
+ vfree(mc);
+
+ if (leftover) {
+ vfree(new_mc);
+ return UCODE_ERROR;
+ }
+
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ vfree(uci->mc);
+ uci->mc = (struct microcode_intel *)new_mc;
+
+ /*
+ * If early loading microcode is supported, save this mc into
+ * permanent memory. So it will be loaded early when a CPU is hot added
+ * or resumes.
+ */
+ save_mc_for_early(new_mc, new_mc_size);
+
+ pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
+
+ return ret;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+ memcpy(to, from, n);
+ return 0;
+}
+
+static bool is_blacklisted(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Late loading on model 79 with microcode revision less than 0x0b000021
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
+ */
+ if (c->x86 == 6 &&
+ c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ c->x86_stepping == 0x01 &&
+ llc_size_per_core > 2621440 &&
+ c->microcode < 0x0b000021) {
+ pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+ return true;
+ }
+
+ return false;
+}
+
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+ bool refresh_fw)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ enum ucode_state ret;
+
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_stepping);
+
+ if (request_firmware_direct(&firmware, name, device)) {
+ pr_debug("data file %s load failed\n", name);
+ return UCODE_NFOUND;
+ }
+
+ ret = generic_load_microcode(cpu, (void *)firmware->data,
+ firmware->size, &get_ucode_fw);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static int get_ucode_user(void *to, const void *from, size_t n)
+{
+ return copy_from_user(to, from, n);
+}
+
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
+}
+
+static struct microcode_ops microcode_intel_ops = {
+ .request_microcode_user = request_microcode_user,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_intel,
+};
+
+static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024ULL;
+
+ do_div(llc_size, c->x86_max_cores);
+
+ return (int)llc_size;
+}
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ llc_size_per_core = calc_llc_size_per_core(c);
+
+ return &microcode_intel_ops;
+}
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
new file mode 100644
index 0000000..d0dfb89
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -0,0 +1,65 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
+#
+
+IN=$1
+OUT=$2
+
+dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
+
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
+
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
+ while read i
+ do
+ # Name is everything up to the first whitespace
+ NAME="$(echo "$i" | sed 's/ .*//')"
+
+ # If the /* comment */ starts with a quote string, grab that.
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
+ [ -z "$VALUE" ] && VALUE="\"$NAME\""
+ [ "$VALUE" = '""' ] && continue
+
+ # Name is uppercase, VALUE is all lowercase
+ VALUE="$(echo "$VALUE" | tr A-Z a-z)"
+
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
+ done
+ echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURES_H"
+ echo "#include <asm/cpufeatures.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+
+) > $OUT
+
+trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 0000000..852e74e
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,325 @@
+/*
+ * HyperV Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/efi.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kexec.h>
+#include <linux/i8253.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+#include <asm/desc.h>
+#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/reboot.h>
+#include <asm/nmi.h>
+
+struct ms_hyperv_info ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static void (*vmbus_handler)(void);
+static void (*hv_stimer0_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
+
+__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_irq();
+ inc_irq_stat(irq_hv_callback_count);
+ if (vmbus_handler)
+ vmbus_handler();
+
+ if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
+ ack_APIC_irq();
+
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_vmbus_irq(void (*handler)(void))
+{
+ vmbus_handler = handler;
+}
+
+void hv_remove_vmbus_irq(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+
+/*
+ * Routines to do per-architecture handling of stimer0
+ * interrupts when in Direct Mode
+ */
+
+__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_irq();
+ inc_irq_stat(hyperv_stimer0_count);
+ if (hv_stimer0_handler)
+ hv_stimer0_handler();
+ ack_APIC_irq();
+
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
+{
+ *vector = HYPERV_STIMER0_VECTOR;
+ *irq = 0; /* Unused on x86/x64 */
+ hv_stimer0_handler = handler;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq);
+
+void hv_remove_stimer0_irq(int irq)
+{
+ /* We have no way to deallocate the interrupt gate */
+ hv_stimer0_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq);
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+ native_machine_shutdown();
+}
+
+static void hv_machine_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+ native_machine_crash_shutdown(regs);
+}
+#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_HYPERV */
+
+static uint32_t __init ms_hyperv_platform(void)
+{
+ u32 eax;
+ u32 hyp_signature[3];
+
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+ &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
+
+ if (eax >= HYPERV_CPUID_MIN &&
+ eax <= HYPERV_CPUID_MAX &&
+ !memcmp("Microsoft Hv", hyp_signature, 12))
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+
+ return 0;
+}
+
+static unsigned char hv_get_nmi_reason(void)
+{
+ return 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+#endif
+
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
+static void __init ms_hyperv_init_platform(void)
+{
+ int hv_host_info_eax;
+ int hv_host_info_ebx;
+ int hv_host_info_ecx;
+ int hv_host_info_edx;
+
+ /*
+ * Extract the features and hints
+ */
+ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+
+ pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints);
+
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
+
+ pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
+ ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
+
+ /*
+ * Extract host information.
+ */
+ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
+ HYPERV_CPUID_VERSION) {
+ hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
+ hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
+ hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
+ hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
+
+ pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
+ hv_host_info_eax, hv_host_info_ebx >> 16,
+ hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
+ hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
+ }
+
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ }
+
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_frequency = hv_lapic_frequency;
+ pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_frequency);
+ }
+
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+ machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+#endif
+ mark_tsc_unstable("running on Hyper-V");
+
+ /*
+ * Generation 2 instances don't support reading the NMI status from
+ * 0x61 port.
+ */
+ if (efi_enabled(EFI_BOOT))
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+ /*
+ * Hyper-V VMs have a PIT emulation quirk such that zeroing the
+ * counter register during PIT shutdown restarts the PIT. So it
+ * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
+ * to false tells pit_shutdown() not to zero the counter so that
+ * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
+ * and setting this value has no effect.
+ */
+ i8253_clear_counter_on_shutdown = false;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Setup the hook to get control post apic initialization.
+ */
+ x86_platform.apic_post_init = hyperv_init;
+ hyperv_setup_mmu_ops();
+ /* Setup the IDT for hypervisor callback */
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+
+ /* Setup the IDT for reenlightenment notifications */
+ if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+ alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
+ hyperv_reenlightenment_vector);
+
+ /* Setup the IDT for stimer0 */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ alloc_intr_gate(HYPERV_STIMER0_VECTOR,
+ hv_stimer0_callback_vector);
+#endif
+}
+
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
+ .detect = ms_hyperv_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.init_platform = ms_hyperv_init_platform,
+};
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
new file mode 100644
index 0000000..2ad9107
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -0,0 +1,3 @@
+obj-y := mtrr.o if.o generic.o cleanup.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
new file mode 100644
index 0000000..a65a027
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+static void
+amd_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type)
+{
+ unsigned long low, high;
+
+ rdmsr(MSR_K6_UWCCR, low, high);
+ /* Upper dword is region 1, lower is region 0 */
+ if (reg == 1)
+ low = high;
+ /* The base masks off on the right alignment */
+ *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
+ *type = 0;
+ if (low & 1)
+ *type = MTRR_TYPE_UNCACHABLE;
+ if (low & 2)
+ *type = MTRR_TYPE_WRCOMB;
+ if (!(low & 3)) {
+ *size = 0;
+ return;
+ }
+ /*
+ * This needs a little explaining. The size is stored as an
+ * inverted mask of bits of 128K granularity 15 bits long offset
+ * 2 bits.
+ *
+ * So to get a size we do invert the mask and add 1 to the lowest
+ * mask bit (4 as its 2 bits in). This gives us a size we then shift
+ * to turn into 128K blocks.
+ *
+ * eg 111 1111 1111 1100 is 512K
+ *
+ * invert 000 0000 0000 0011
+ * +1 000 0000 0000 0100
+ * *128K ...
+ */
+ low = (~low) & 0x1FFFC;
+ *size = (low + 4) << (15 - PAGE_SHIFT);
+}
+
+/**
+ * amd_set_mtrr - Set variable MTRR register on the local CPU.
+ *
+ * @reg: The register to set; indexes the two 32-bit halves of MSR_K6_UWCCR.
+ * @base: The base of the region, in pages (shifted left by PAGE_SHIFT).
+ * @size: The size of the region, in pages. If this is 0 the region is disabled.
+ * @type: The type of the region; stored in the register as @type + 1.
+ *
+ * Returns nothing.
+ */
+static void
+amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
+{
+ u32 regs[2];
+
+ /*
+ * Low is MTRR 0, High is MTRR 1
+ */
+ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
+ /*
+ * Blank to disable
+ */
+ if (size == 0) {
+ regs[reg] = 0;
+ } else {
+ /*
+ * Set the register to the base, the type (off by one) and an
+ * inverted bitmask of the size. The size is the only odd
+ * bit. We are fed say 512K. We invert this and we get 111 1111
+ * 1111 1011, but if you subtract one and invert you get the
+ * desired 111 1111 1111 1100 mask.
+ *
+ * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
+ */
+ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
+ | (base << PAGE_SHIFT) | (type + 1);
+ }
+
+ /*
+ * The writeback rule is quite specific. See the manual. It is:
+ * disable local interrupts, write back the cache, set the MTRR.
+ */
+ wbinvd();
+ wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
+}
+
+static int
+amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+{
+ /*
+ * Apply the K6 block alignment and size rules
+ * In order
+ * o Uncached or gathering only
+ * o 128K or bigger block
+ * o Power of 2 block
+ * o base suitably aligned to the power
+ */
+ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
+ || (size & ~(size - 1)) - size || (base & (size - 1)))
+ return -EINVAL;
+ return 0;
+}
+
+static const struct mtrr_ops amd_mtrr_ops = {
+ .vendor = X86_VENDOR_AMD,
+ .set = amd_set_mtrr,
+ .get = amd_get_mtrr,
+ .get_free_region = generic_get_free_region,
+ .validate_add_page = amd_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
+};
+
+int __init amd_init_mtrr(void)
+{
+ set_mtrr_ops(&amd_mtrr_ops);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
new file mode 100644
index 0000000..f271778
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+static struct {
+ unsigned long high;
+ unsigned long low;
+} centaur_mcr[8];
+
+static u8 centaur_mcr_reserved;
+static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
+
+/**
+ * centaur_get_free_region - Get a free MTRR.
+ * @base: The starting (base) address of the region (not used here).
+ * @size: The size of the region (not used here).
+ * @replace_reg: Register to reuse when in [0, num_var_ranges); else search.
+ *
+ * Returns: the index of the region on success, else -ENOSPC.
+ */
+static int
+centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+{
+	unsigned long lbase, lsize;
+	mtrr_type ltype;
+	int i, max;
+
+	max = num_var_ranges;
+	if (replace_reg >= 0 && replace_reg < max)
+		return replace_reg;
+
+	for (i = 0; i < max; ++i) {
+		if (centaur_mcr_reserved & (1 << i))
+			continue;	/* bit i set: skip this register */
+		mtrr_if->get(i, &lbase, &lsize, &ltype);
+		if (lsize == 0)
+			return i;	/* zero size: treated as free */
+	}
+
+	return -ENOSPC;
+}
+
+/*
+ * Report boot time MCR setups
+ */
+void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
+{
+ centaur_mcr[mcr].low = lo;
+ centaur_mcr[mcr].high = hi;
+}
+
+static void
+centaur_get_mcr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type * type)
+{
+ *base = centaur_mcr[reg].high >> PAGE_SHIFT;
+ *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
+ *type = MTRR_TYPE_WRCOMB; /* write-combining */
+
+ if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
+ *type = MTRR_TYPE_UNCACHABLE;
+ if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
+ *type = MTRR_TYPE_WRBACK;
+ if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
+ *type = MTRR_TYPE_WRBACK;
+}
+
+static void
+centaur_set_mcr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ unsigned long low, high;
+
+ if (size == 0) {
+ /* Disable */
+ high = low = 0;
+ } else {
+ high = base << PAGE_SHIFT;
+ if (centaur_mcr_type == 0) {
+ /* Only support write-combining... */
+ low = -size << PAGE_SHIFT | 0x1f;
+ } else {
+ if (type == MTRR_TYPE_UNCACHABLE)
+ low = -size << PAGE_SHIFT | 0x02; /* NC */
+ else
+ low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
+ }
+ }
+ centaur_mcr[reg].high = high;
+ centaur_mcr[reg].low = low;
+ wrmsr(MSR_IDT_MCR0 + reg, low, high);
+}
+
+static int
+centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+{
+ /*
+ * FIXME: Winchip2 supports uncached
+ */
+ if (type != MTRR_TYPE_WRCOMB &&
+ (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
+ pr_warn("mtrr: only write-combining%s supported\n",
+ centaur_mcr_type ? " and uncacheable are" : " is");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static const struct mtrr_ops centaur_mtrr_ops = {
+ .vendor = X86_VENDOR_CENTAUR,
+ .set = centaur_set_mcr,
+ .get = centaur_get_mcr,
+ .get_free_region = centaur_get_free_region,
+ .validate_add_page = centaur_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
+};
+
+int __init centaur_init_mtrr(void)
+{
+	set_mtrr_ops(&centaur_mtrr_ops);
+	return 0;	/* the only return path; never reports failure */
+}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 0000000..765afd5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,987 @@
+/*
+ * MTRR (Memory Type Range Register) cleanup
+ *
+ * Copyright (C) 2009 Yinghai Lu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+#include <linux/kvm_para.h>
+#include <linux/range.h>
+
+#include <asm/processor.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
+};
+
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
+
+/* Should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+static struct range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+
+static int __initdata debug_print;
+#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
+
+#define BIOS_BUG_MSG \
+ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
+
+static int __init
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size)
+{
+ unsigned long base, size;
+ mtrr_type type;
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size);
+ }
+ if (debug_print) {
+ pr_debug("After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
+
+ /* Take out UC ranges: */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ continue;
+ base = range_state[i].base_pfn;
+ if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+ /* Var MTRR contains UC entry below 1M? Skip it: */
+ pr_warn(BIOS_BUG_MSG, i);
+ if (base + size <= (1<<(20-PAGE_SHIFT)))
+ continue;
+ size -= (1<<(20-PAGE_SHIFT)) - base;
+ base = 1<<(20-PAGE_SHIFT);
+ }
+ subtract_range(range, RANGE_NUM, base, base + size);
+ }
+ if (extra_remove_size)
+ subtract_range(range, RANGE_NUM, extra_remove_base,
+ extra_remove_base + extra_remove_size);
+
+ if (debug_print) {
+ pr_debug("After UC checking\n");
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
+ }
+
+ /* sort the ranges */
+ nr_range = clean_sort_range(range, RANGE_NUM);
+ if (debug_print) {
+ pr_debug("After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
+
+ return nr_range;
+}
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for (i = 0; i < nr_range; i++)
+ sum += range[i].end - range[i].start;
+
+ return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 0;
+ return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 1;
+ return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type, unsigned int address_bits)
+{
+ u32 base_lo, base_hi, mask_lo, mask_hi;
+ u64 base, mask;
+
+ if (!sizek) {
+ fill_mtrr_var_range(reg, 0, 0, 0, 0);
+ return;
+ }
+
+ mask = (1ULL << address_bits) - 1;
+ mask &= ~((((u64)sizek) << 10) - 1);
+
+ base = ((u64)basek) << 10;
+
+ base |= type;
+ mask |= 0x800;
+
+ base_lo = base & ((1ULL<<32) - 1);
+ base_hi = base >> 32;
+
+ mask_lo = mask & ((1ULL<<32) - 1);
+ mask_hi = mask >> 32;
+
+ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type)
+{
+ range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+ range_state[reg].type = type;
+}
+
+static void __init set_var_mtrr_all(unsigned int address_bits)
+{
+ unsigned long basek, sizek;
+ unsigned char type;
+ unsigned int reg;
+
+ for (reg = 0; reg < num_var_ranges; reg++) {
+ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+ type = range_state[reg].type;
+
+ set_var_mtrr(reg, basek, sizek, type, address_bits);
+ }
+}
+
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+	/* sizek is in KiB; pick the largest of K/M/G that divides it evenly. */
+	const unsigned long sub_mb = (1UL << 10) - 1;
+	const unsigned long sub_gb = (1UL << 20) - 1;
+
+	if (sizek & sub_mb) {
+		/* Not a whole number of MiB: report KiB unchanged. */
+		*factorp = 'K';
+		return sizek;
+	}
+	if (sizek & sub_gb) {
+		/* Whole MiB, but not a whole number of GiB. */
+		*factorp = 'M';
+		return sizek >> 10;
+	}
+
+	*factorp = 'G';
+	return sizek >> 20;
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+ unsigned long range_sizek, unsigned char type)
+{
+ if (!range_sizek || (reg >= num_var_ranges))
+ return reg;
+
+ while (range_sizek) {
+ unsigned long max_align, align;
+ unsigned long sizek;
+
+ /* Compute the maximum size with which we can make a range: */
+ if (range_startk)
+ max_align = __ffs(range_startk);
+ else
+ max_align = BITS_PER_LONG - 1;
+
+ align = __fls(range_sizek);
+ if (align > max_align)
+ align = max_align;
+
+ sizek = 1UL << align;
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk, &start_factor);
+ size_base = to_size_factor(sizek, &size_factor);
+
+ Dprintk("Setting variable MTRR %d, "
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+ );
+ }
+ save_var_mtrr(reg++, range_startk, sizek, type);
+ range_startk += sizek;
+ range_sizek -= sizek;
+ if (reg >= num_var_ranges)
+ break;
+ }
+ return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+ unsigned long sizek)
+{
+ unsigned long hole_basek, hole_sizek;
+ unsigned long second_basek, second_sizek;
+ unsigned long range0_basek, range0_sizek;
+ unsigned long range_basek, range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+
+ hole_basek = 0;
+ hole_sizek = 0;
+ second_basek = 0;
+ second_sizek = 0;
+ chunk_sizek = state->chunk_sizek;
+ gran_sizek = state->gran_sizek;
+
+ /* Align with gran size, prevent small block used up MTRRs: */
+ range_basek = ALIGN(state->range_startk, gran_sizek);
+ if ((range_basek > basek) && basek)
+ return second_sizek;
+
+ state->range_sizek -= (range_basek - state->range_startk);
+ range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+ while (range_sizek > state->range_sizek) {
+ range_sizek -= gran_sizek;
+ if (!range_sizek)
+ return 0;
+ }
+ state->range_sizek = range_sizek;
+
+ /* Try to append some small hole: */
+ range0_basek = state->range_startk;
+ range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* No increase: */
+ if (range0_sizek == state->range_sizek) {
+ Dprintk("rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ state->range_sizek, MTRR_TYPE_WRBACK);
+ return 0;
+ }
+
+ /* Only cut back when it is not the last: */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* One hole in the middle: */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* One hole in middle or at the end: */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* Hole size should be less than half of range0 size: */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
+ }
+
+ if (range0_sizek) {
+ Dprintk("range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ range0_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (range0_sizek < state->range_sizek) {
+ /* Need to handle left over range: */
+ range_sizek = state->range_sizek - range0_sizek;
+
+ Dprintk("range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
+ Dprintk("hole: %016lx - %016lx\n",
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
+ }
+
+ return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+ unsigned long size_pfn)
+{
+ unsigned long basek, sizek;
+ unsigned long second_sizek = 0;
+
+ if (state->reg >= num_var_ranges)
+ return;
+
+ basek = base_pfn << (PAGE_SHIFT - 10);
+ sizek = size_pfn << (PAGE_SHIFT - 10);
+
+ /* See if I can merge with the last range: */
+ if ((basek <= 1024) ||
+ (state->range_startk + state->range_sizek == basek)) {
+ unsigned long endk = basek + sizek;
+ state->range_sizek = endk - state->range_startk;
+ return;
+ }
+ /* Write the range mtrrs: */
+ if (state->range_sizek != 0)
+ second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+ /* Allocate an msr: */
+ state->range_startk = basek + second_sizek;
+ state->range_sizek = sizek - second_sizek;
+}
+
+/* Minimum size of mtrr block that can take hole: */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_chunk_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* Granularity of mtrr of block: */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_gran_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static unsigned long nr_mtrr_spare_reg __initdata =
+ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+ if (arg)
+ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct range *range, int nr_range,
+ u64 chunk_size, u64 gran_size)
+{
+ struct var_mtrr_state var_state;
+ int num_reg;
+ int i;
+
+ var_state.range_startk = 0;
+ var_state.range_sizek = 0;
+ var_state.reg = 0;
+ var_state.chunk_sizek = chunk_size >> 10;
+ var_state.gran_sizek = gran_size >> 10;
+
+ memset(range_state, 0, sizeof(range_state));
+
+ /* Write the range: */
+ for (i = 0; i < nr_range; i++) {
+ set_var_mtrr_range(&var_state, range[i].start,
+ range[i].end - range[i].start);
+ }
+
+ /* Write the last range: */
+ if (var_state.range_sizek != 0)
+ range_to_mtrr_with_hole(&var_state, 0, 0);
+
+ num_reg = var_state.reg;
+ /* Clear out the extra MTRR's: */
+ while (var_state.reg < num_var_ranges) {
+ save_var_mtrr(var_state.reg, 0, 0, 0);
+ var_state.reg++;
+ }
+
+ return num_reg;
+}
+
+struct mtrr_cleanup_result {
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
+ */
+#define NUM_RESULT 136
+#define PSHIFT (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+	char start_factor = 'K', size_factor = 'K';
+	unsigned long start_base, size_base;
+	mtrr_type type;
+	int i;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		/* Entries with a zero size are not printed. */
+		size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+		if (!size_base)
+			continue;
+
+		size_base = to_size_factor(size_base, &size_factor);
+		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+		start_base = to_size_factor(start_base, &start_factor);
+		type = range_state[i].type;
+
+		pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+			i, start_base, start_factor,
+			size_base, size_factor,
+			(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+			    ((type == MTRR_TYPE_WRPROT) ? "WP" :
+			     ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+			);
+	}
+}
+
+/*
+ * Decide whether the register set in range_state[] is one the cleanup code
+ * handles: at least one UC entry, and every non-empty entry is WB or UC.
+ * Returns 1 if so, 0 otherwise.
+ */
+static int __init mtrr_need_cleanup(void)
+{
+	/* One counter per type, plus a last slot for empty registers */
+	int count[MTRR_NUM_TYPES + 1] = { 0 };
+	int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		mtrr_type t = range_state[reg].type;
+
+		/* Unknown type values are not counted at all */
+		if (t >= MTRR_NUM_TYPES)
+			continue;
+
+		if (range_state[reg].size_pfn)
+			count[t]++;
+		else
+			count[MTRR_NUM_TYPES]++;
+	}
+
+	/* Nothing to do without a UC entry */
+	if (count[MTRR_TYPE_UNCACHABLE] == 0)
+		return 0;
+
+	/* WB + UC must account for every register that is not empty */
+	return count[MTRR_TYPE_WRBACK] + count[MTRR_TYPE_UNCACHABLE] ==
+	       num_var_ranges - count[MTRR_NUM_TYPES];
+}
+
+/* Sum of the original ranges, computed once in mtrr_cleanup() */
+static unsigned long __initdata range_sums;
+
+/*
+ * Build the variable-MTRR layout for one (chunk_size, gran_size) pair,
+ * read back the ranges that layout yields and record the comparison with
+ * the original ranges in result[i].
+ */
+static void __init
+mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+		      unsigned long x_remove_base,
+		      unsigned long x_remove_size, int i)
+{
+	/*
+	 * Scratch array kept static instead of on the stack because of its
+	 * size; fine here since this function runs only at init time and
+	 * never recurses.
+	 */
+	static struct range range_new[RANGE_NUM] __initdata;
+	struct mtrr_cleanup_result *res = &result[i];
+	unsigned long new_sum;
+	int new_cnt;
+	int regs;
+
+	/* Fill range_state[] from the original ranges */
+	regs = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+
+	/* Derive the ranges described by the freshly built range_state[] */
+	memset(range_new, 0, sizeof(range_new));
+	new_cnt = x86_get_mtrr_mem_range(range_new, 0,
+					 x_remove_base, x_remove_size);
+	new_sum = sum_ranges(range_new, new_cnt);
+
+	res->chunk_sizek = chunk_size >> 10;
+	res->gran_sizek = gran_size >> 10;
+	res->num_reg = regs;
+
+	if (new_sum > range_sums) {
+		/* The new layout covers more than the original did */
+		res->lose_cover_sizek = (new_sum - range_sums) << PSHIFT;
+		res->bad = 1;
+	} else {
+		res->lose_cover_sizek = (range_sums - new_sum) << PSHIFT;
+	}
+
+	/* Equal sums are not enough: the ranges themselves must match */
+	if (!res->bad && !res->lose_cover_sizek &&
+	    (new_cnt != nr_range || memcmp(range, range_new, sizeof(range))))
+		res->bad = 1;
+
+	if (res->bad)
+		return;
+
+	/* Track the smallest loss seen for this register count */
+	if (range_sums - new_sum < min_loss_pfn[regs])
+		min_loss_pfn[regs] = range_sums - new_sum;
+}
+
+/* Print result[i] as a single info-level line. */
+static void __init mtrr_print_out_one_result(int i)
+{
+	const struct mtrr_cleanup_result *res = &result[i];
+	char gran_unit, chunk_unit, lose_unit;
+	unsigned long gran, chunk, lose;
+	const char *tag, *sign;
+
+	gran = to_size_factor(res->gran_sizek, &gran_unit);
+	chunk = to_size_factor(res->chunk_sizek, &chunk_unit);
+	lose = to_size_factor(res->lose_cover_sizek, &lose_unit);
+
+	/* Bad results get a marker and a '-' in front of the size */
+	if (res->bad) {
+		tag = "*BAD*";
+		sign = "-";
+	} else {
+		tag = " ";
+		sign = "";
+	}
+
+	pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+		tag, gran, gran_unit, chunk, chunk_unit);
+	pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
+		res->num_reg, sign, lose, lose_unit);
+}
+
+/*
+ * Pick the result[] entry to apply: the first usable, loss-free entry that
+ * needs the smallest register count for which min_loss_pfn[] is zero, while
+ * leaving nr_mtrr_spare_reg registers unused. Returns its index or -1.
+ */
+static int __init mtrr_search_optimal_index(void)
+{
+	int best_regs = -1;
+	int limit;
+	int idx;
+
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+
+	/* Smallest register count in [1, limit] with no loss */
+	limit = num_var_ranges - nr_mtrr_spare_reg;
+	for (idx = 1; idx <= limit; idx++) {
+		if (min_loss_pfn[idx] == 0) {
+			best_regs = idx;
+			break;
+		}
+	}
+
+	if (best_regs == -1)
+		return -1;
+
+	for (idx = 0; idx < NUM_RESULT; idx++) {
+		if (result[idx].bad)
+			continue;
+		if (result[idx].num_reg != best_regs)
+			continue;
+		if (result[idx].lose_cover_sizek)
+			continue;
+		return idx;
+	}
+
+	return -1;
+}
+
+/*
+ * mtrr_cleanup - try to replace the variable MTRR layout with a rebuilt one
+ * @address_bits: handed unchanged to set_var_mtrr_all()
+ *
+ * Does nothing unless the MTRR interface is the Intel one,
+ * enable_mtrr_cleanup is at least 1, the default memory type is uncachable
+ * and mtrr_need_cleanup() accepts the current registers.
+ *
+ * Returns 1 after a new layout was written with set_var_mtrr_all(), and 0
+ * when no layout was applied.
+ */
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long x_remove_base, x_remove_size;
+ unsigned long base, size, def, dummy;
+ u64 chunk_size, gran_size;
+ mtrr_type type;
+ int index_good;
+ int i;
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+
+ rdmsr(MSR_MTRRdefType, def, dummy);
+ def &= 0xff; /* low byte holds the default type */
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* Get it and store it aside: */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* Check if we need handle it and can handle it: */
+ if (!mtrr_need_cleanup())
+ return 0;
+
+ /* Print original var MTRRs at first, for debugging: */
+ pr_debug("original variable MTRRs\n");
+ print_out_mtrr_range_state();
+
+ memset(range, 0, sizeof(range));
+ x_remove_size = 0;
+ x_remove_base = 1 << (32 - PAGE_SHIFT); /* page frame number of the 4G boundary */
+ if (mtrr_tom2)
+ x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
+
+ /*
+ * [0, 1M) should always be covered by var mtrr with WB
+ * and fixed mtrrs should take effect before var mtrr for it:
+ */
+ nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
+ 1ULL<<(20 - PAGE_SHIFT));
+ /* add from var mtrr at last */
+ nr_range = x86_get_mtrr_mem_range(range, nr_range,
+ x_remove_base, x_remove_size);
+
+ range_sums = sum_ranges(range, nr_range);
+ pr_info("total RAM covered: %ldM\n",
+ range_sums >> (20 - PAGE_SHIFT));
+
+ /* Explicitly configured sizes are tried first, using result[0] */
+ if (mtrr_chunk_size && mtrr_gran_size) {
+ i = 0;
+ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+ x_remove_base, x_remove_size, i);
+
+ mtrr_print_out_one_result(i);
+
+ if (!result[i].bad) {
+ set_var_mtrr_all(address_bits);
+ pr_debug("New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ }
+ pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
+ }
+
+ /* Full search: every gran_size from 64K, every chunk_size >= gran_size */
+ i = 0;
+ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+ memset(result, 0, sizeof(result));
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+ chunk_size <<= 1) {
+
+ /* result[] is full: skip the remaining combinations */
+ if (i >= NUM_RESULT)
+ continue;
+
+ mtrr_calc_range_state(chunk_size, gran_size,
+ x_remove_base, x_remove_size, i);
+ if (debug_print) {
+ mtrr_print_out_one_result(i);
+ pr_info("\n");
+ }
+
+ i++;
+ }
+ }
+
+ /* Try to find the optimal index: */
+ index_good = mtrr_search_optimal_index();
+
+ if (index_good != -1) {
+ pr_info("Found optimal setting for mtrr clean up\n");
+ i = index_good;
+ mtrr_print_out_one_result(i);
+
+ /* Convert ranges to var ranges state: */
+ chunk_size = result[i].chunk_sizek;
+ chunk_size <<= 10;
+ gran_size = result[i].gran_sizek;
+ gran_size <<= 10;
+ /* range_state[] holds the last candidate tried, so rebuild the chosen one */
+ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ set_var_mtrr_all(address_bits);
+ pr_debug("New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ } else {
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++)
+ mtrr_print_out_one_result(i);
+ }
+
+ pr_info("mtrr_cleanup: can not find optimal value\n");
+ pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+ return 0;
+}
+#else
+/* Stub for the #else branch: nothing is changed, so always report 0. */
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ return 0;
+}
+#endif
+
+/* Non-zero once "disable_mtrr_trim" was seen; mtrr_trim_uncached_memory() then does nothing. */
+static int disable_mtrr_trim;
+
+/* Early-parameter handler for "disable_mtrr_trim"; @str is not looked at. */
+static int __init disable_mtrr_trim_setup(char *str)
+{
+ disable_mtrr_trim = 1;
+ return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * AMD K8 and later (family 0xf and up) can force WB for memory above 4GB
+ * through two SYSCFG bits. Only those bits are examined here; the MTRRs
+ * below 4GB, to which the bits do not apply, are not validated.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+/* Returns 1 when both SYSCFG bits above are set on an AMD CPU, else 0. */
+int __init amd_special_default_mtrr(void)
+{
+	const u32 wb_bits = Tom2Enabled | Tom2ForceMemTypeWB;
+	u32 lo, hi;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+	    boot_cpu_data.x86 < 0xf)
+		return 0;
+
+	/* The read may fail, e.g. when a hypervisor hides SYSCFG */
+	if (rdmsr_safe(MSR_K8_SYSCFG, &lo, &hi) < 0)
+		return 0;
+
+	/*
+	 * With both bits set, memory from 4GB to the top of memory is
+	 * forced WB. The bits are reserved before K8RevF but should read
+	 * as zero there.
+	 */
+	return (lo & wb_bits) == wb_bits;
+}
+
+/*
+ * Re-type the RAM in page frames [start_pfn, limit_pfn) as reserved in the
+ * e820 table and return whatever e820__range_update() reports.
+ */
+static u64 __init
+real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
+{
+	/* Widen before shifting so the byte addresses are computed in 64 bits */
+	u64 first_byte = (u64)start_pfn << PAGE_SHIFT;
+	u64 limit_byte = (u64)limit_pfn << PAGE_SHIFT;
+
+	return e820__range_update(first_byte, limit_byte - first_byte,
+				  E820_TYPE_RAM, E820_TYPE_RESERVED);
+}
+
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations. This routine builds the list of ranges covered by
+ * the variable MTRRs (plus [4G, mtrr_tom2) when mtrr_tom2 is set) and passes
+ * every gap below @end_pfn to real_trim_memory(), which re-types it from
+ * E820_TYPE_RAM to E820_TYPE_RESERVED. @end_pfn itself is not modified.
+ * The user is warned with an obnoxious message when anything was trimmed.
+ *
+ * Returns 1 when memory was trimmed, 0 otherwise.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ unsigned long i, base, size, highest_pfn = 0, def, dummy;
+ mtrr_type type;
+ u64 total_trim_size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+
+ /*
+ * Make sure we only trim uncachable memory on machines that
+ * support the Intel MTRR architecture:
+ */
+ if (!is_cpu(INTEL) || disable_mtrr_trim)
+ return 0;
+
+ rdmsr(MSR_MTRRdefType, def, dummy);
+ def &= 0xff; /* low byte holds the default type */
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* Get it and store it aside: */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* Find highest cached pfn: */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ if (highest_pfn < base + size)
+ highest_pfn = base + size;
+ }
+
+ /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
+ if (!highest_pfn) {
+ pr_info("CPU MTRRs all blank - virtualized system.\n");
+ return 0;
+ }
+
+ /* Check entries number: */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ type = MTRR_NUM_TYPES; /* empty registers are counted in the extra slot */
+ num[type]++;
+ }
+
+ /* No entry for WB? */
+ if (!num[MTRR_TYPE_WRBACK])
+ return 0;
+
+ /* Check if we only had WB and UC: */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ nr_range = 0;
+ if (mtrr_tom2) {
+ /* Seed the list with [4G, mtrr_tom2) */
+ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
+ nr_range++;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+ /* Check the head: */
+ total_trim_size = 0;
+ if (range[0].start)
+ total_trim_size += real_trim_memory(0, range[0].start);
+
+ /*
+ * Check the holes:
+ * NOTE(review): i is unsigned long, so "nr_range - 1" is only safe
+ * while nr_range >= 1 - confirm x86_get_mtrr_mem_range() cannot
+ * return 0 on this path.
+ */
+ for (i = 0; i < nr_range - 1; i++) {
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
+ range[i+1].start);
+ }
+
+ /* Check the top: */
+ i = nr_range - 1;
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
+ end_pfn);
+
+ if (total_trim_size) {
+ pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
+
+ if (!changed_by_mtrr_cleanup)
+ WARN_ON(1);
+
+ pr_info("update e820 for mtrr\n");
+ e820__update_table_print();
+
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
new file mode 100644
index 0000000..4296c70
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+/*
+ * cyrix_get_arr - read back one Cyrix address region register (ARR)
+ * @reg: ARR index; index 7 uses a different size unit and RCR encoding
+ * @base: out: region base, shifted right by PAGE_SHIFT
+ * @size: out: region size, or 0 when the register's size field is 0
+ * @type: out: MTRR_TYPE_* value decoded from the matching RCR register
+ */
+static void
+cyrix_get_arr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type * type)
+{
+ unsigned char arr, ccr3, rcr, shift;
+ unsigned long flags;
+
+ arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
+
+ local_irq_save(flags);
+
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+ /* Three consecutive registers supply bytes 3, 2 and 1 of *base */
+ ((unsigned char *)base)[3] = getCx86(arr);
+ ((unsigned char *)base)[2] = getCx86(arr + 1);
+ ((unsigned char *)base)[1] = getCx86(arr + 2);
+ rcr = getCx86(CX86_RCR_BASE + reg);
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+
+ local_irq_restore(flags);
+
+ /* The low nibble of byte 1 is the size code; take it before shifting */
+ shift = ((unsigned char *) base)[1] & 0x0f;
+ *base >>= PAGE_SHIFT;
+
+ /*
+ * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
+ * Note: shift==0xf means 4G, this is unsupported.
+ */
+ if (shift)
+ *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
+ else
+ *size = 0;
+
+ /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
+ if (reg < 7) {
+ switch (rcr) {
+ case 1:
+ *type = MTRR_TYPE_UNCACHABLE;
+ break;
+ case 8:
+ *type = MTRR_TYPE_WRBACK;
+ break;
+ case 9:
+ *type = MTRR_TYPE_WRCOMB;
+ break;
+ case 24:
+ default: /* any other value is reported as write-through */
+ *type = MTRR_TYPE_WRTHROUGH;
+ break;
+ }
+ } else {
+ switch (rcr) {
+ case 0:
+ *type = MTRR_TYPE_UNCACHABLE;
+ break;
+ case 8:
+ *type = MTRR_TYPE_WRCOMB;
+ break;
+ case 9:
+ *type = MTRR_TYPE_WRBACK;
+ break;
+ case 25:
+ default: /* any other value is reported as write-through */
+ *type = MTRR_TYPE_WRTHROUGH;
+ break;
+ }
+ }
+}
+
+/*
+ * cyrix_get_free_region - get a free ARR.
+ *
+ * @base: the starting (base) address of the region (not used here).
+ * @size: the size of the region, in the unit used by the checks below
+ *        (0x40 is treated as 256K and 0x2000 as 32M).
+ * @replace_reg: ARR index the caller wants to reuse. 0-6 are handed back
+ *        as they are, 7 only when @size is at least 0x40; any other
+ *        value starts a search for an unused ARR.
+ *
+ * Returns: the index of the region on success, else -ENOSPC.
+ */
+static int
+cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+{
+	unsigned long lbase, lsize;
+	mtrr_type ltype;
+	int i;
+
+	switch (replace_reg) {
+	case 7:
+		/* ARR7 cannot hold a region smaller than 256K: search instead */
+		if (size < 0x40)
+			break;
+		/* fall through */
+	case 6:
+	case 5:
+	case 4:
+	case 3:
+	case 2:
+	case 1:
+	case 0:
+		return replace_reg;
+	}
+	/* If we are to set up a region >32M then look at ARR7 immediately */
+	if (size > 0x2000) {
+		cyrix_get_arr(7, &lbase, &lsize, &ltype);
+		if (lsize == 0)
+			return 7;
+		/* Else try ARR0-ARR6 first */
+	} else {
+		for (i = 0; i < 7; i++) {
+			cyrix_get_arr(i, &lbase, &lsize, &ltype);
+			if (lsize == 0)
+				return i;
+		}
+		/*
+		 * ARR0-ARR6 isn't free
+		 * try ARR7 but its size must be at least 256K
+		 * (i is 7 here, left over from the loop above)
+		 */
+		cyrix_get_arr(i, &lbase, &lsize, &ltype);
+		if ((lsize == 0) && (size >= 0x40))
+			return i;
+	}
+	return -ENOSPC;
+}
+
+/* CR4 and CCR3 values saved by prepare_set() and written back by post_set() */
+static u32 cr4, ccr3;
+
+/*
+ * Enter the state in which the ARRs are written: PGE cleared, caches
+ * disabled and flushed, and the CCR3 write that cyrix_get_arr() labels
+ * "enable MAPEN" applied. The previous CR4 and CCR3 go into the statics
+ * above, so a second call before post_set() overwrites the saved values.
+ */
+static void prepare_set(void)
+{
+ u32 cr0;
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
+ }
+
+ /*
+ * Disable and flush caches.
+ * Note that wbinvd flushes the TLBs as a side-effect
+ */
+ cr0 = read_cr0() | X86_CR0_CD;
+ wbinvd();
+ write_cr0(cr0);
+ wbinvd();
+
+ /* Remember the current CCR3 so post_set() can put it back */
+ ccr3 = getCx86(CX86_CCR3);
+
+ /* Keep the low nibble of CCR3 and set bit 4 (MAPEN) */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
+}
+
+/*
+ * Undo prepare_set(): flush, write back the saved CCR3, re-enable the
+ * caches and, when PGE is supported, write back the saved CR4.
+ */
+static void post_set(void)
+{
+ /* Flush caches and TLBs */
+ wbinvd();
+
+ /* Put back the CCR3 value saved by prepare_set() */
+ setCx86(CX86_CCR3, ccr3);
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
+}
+
+/*
+ * cyrix_set_arr - program one Cyrix address region register (ARR)
+ * @reg: ARR index; index 7 uses a different size unit and RCR encoding
+ * @base: region base; shifted left by PAGE_SHIFT before it is written
+ * @size: region size; reduced to a size code stored next to the base
+ * @type: MTRR_TYPE_* value, translated to the RCR value for @reg
+ */
+static void cyrix_set_arr(unsigned int reg, unsigned long base,
+			  unsigned long size, mtrr_type type)
+{
+	const int low_arr = reg < 7;	/* ARR0-ARR6 as opposed to ARR7 */
+	unsigned char *base_bytes = (unsigned char *)&base;
+	unsigned char first_reg, rcr_val, size_code = 0;
+
+	/* Each ARR occupies three consecutive registers: reg * 3 */
+	first_reg = CX86_ARR_BASE + (reg << 1) + reg;
+
+	/* ARR7 counts its size in units 64 times larger */
+	if (!low_arr)
+		size >>= 6;
+
+	/* size_code becomes the bit length of the size masked to 15 bits */
+	size &= 0x7fff;
+	while (size) {
+		size_code++;
+		size >>= 1;
+	}
+
+	/* Anything not listed explicitly is written as write-back */
+	switch (type) {
+	case MTRR_TYPE_UNCACHABLE:
+		rcr_val = low_arr ? 1 : 0;
+		break;
+	case MTRR_TYPE_WRCOMB:
+		rcr_val = low_arr ? 9 : 8;
+		break;
+	case MTRR_TYPE_WRTHROUGH:
+		rcr_val = low_arr ? 24 : 25;
+		break;
+	default:
+		rcr_val = low_arr ? 8 : 9;
+		break;
+	}
+
+	prepare_set();
+
+	/* Bytes 3, 2 and 1 of the byte address go out; byte 1 carries the size code */
+	base <<= PAGE_SHIFT;
+	setCx86(first_reg + 0, base_bytes[3]);
+	setCx86(first_reg + 1, base_bytes[2]);
+	setCx86(first_reg + 2, base_bytes[1] | size_code);
+	setCx86(CX86_RCR_BASE + reg, rcr_val);
+
+	post_set();
+}
+
+/* Saved settings of one ARR, in the form cyrix_set_arr() takes them */
+typedef struct {
+ unsigned long base;
+ unsigned long size;
+ mtrr_type type;
+} arr_state_t;
+
+/* One entry per ARR (0-7); written back by cyrix_set_all() */
+static arr_state_t arr_state[8] = {
+ {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
+ {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
+};
+
+/* Seven CCR values; written back by cyrix_set_all() */
+static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
+
+/*
+ * cyrix_set_all - write the saved CCR and ARR state back to the CPU
+ *
+ * ccr_state[0..3] are written starting at CX86_CCR0 and ccr_state[4..6]
+ * starting at CX86_CCR4; afterwards every arr_state[] entry is programmed
+ * through cyrix_set_arr().
+ */
+static void cyrix_set_all(void)
+{
+	int i;
+
+	prepare_set();
+
+	/* the CCRs are not contiguous */
+	for (i = 0; i < 4; i++)
+		setCx86(CX86_CCR0 + i, ccr_state[i]);
+	/* i carries on at 4, so rebase it before offsetting from CX86_CCR4 */
+	for (; i < 7; i++)
+		setCx86(CX86_CCR4 + (i - 4), ccr_state[i]);
+
+	/*
+	 * Leave the critical section before programming the ARRs:
+	 * cyrix_set_arr() runs its own prepare_set()/post_set() pair, and
+	 * calling it inside this one would overwrite the saved cr4/ccr3
+	 * with the already modified values, so the final post_set() could
+	 * no longer restore the original ones.
+	 */
+	post_set();
+
+	for (i = 0; i < 8; i++) {
+		cyrix_set_arr(i, arr_state[i].base,
+			      arr_state[i].size, arr_state[i].type);
+	}
+}
+
+/* Operations table for Cyrix CPUs; registered by cyrix_init_mtrr() */
+static const struct mtrr_ops cyrix_mtrr_ops = {
+ .vendor = X86_VENDOR_CYRIX,
+ .set_all = cyrix_set_all,
+ .set = cyrix_set_arr,
+ .get = cyrix_get_arr,
+ .get_free_region = cyrix_get_free_region,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
+};
+
+/* Register the Cyrix operations table with set_mtrr_ops(); always returns 0. */
+int __init cyrix_init_mtrr(void)
+{
+ set_mtrr_ops(&cyrix_mtrr_ops);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
new file mode 100644
index 0000000..e12ee86
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -0,0 +1,913 @@
+/*
+ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
+ * because MTRRs can span up to 40 bits (36bits on most modern x86)
+ */
+#define DEBUG
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/processor-flags.h>
+#include <asm/cpufeature.h>
+#include <asm/tlbflush.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+
+#include "mtrr.h"
+
+/* A run of consecutive fixed-range MTRR MSRs that share one granularity */
+struct fixed_range_block {
+ int base_msr; /* first MSR of the block */
+ int ranges; /* number of MTRRs in this block */
+};
+
+/* The three fixed-range blocks; the empty entry ends the table */
+static struct fixed_range_block fixed_range_blocks[] = {
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
+ {}
+};
+
+static unsigned long smp_changes_mask; /* MTRR_CHANGE_MASK_* bits reported by mtrr_state_warn() */
+static int mtrr_state_set; /* set to 1 at the end of get_mtrr_state() */
+u64 mtrr_tom2; /* masked MSR_K8_TOP_MEM2 value, or 0 when it was not read */
+
+/* MTRR state as captured by get_mtrr_state() */
+struct mtrr_state_type mtrr_state;
+EXPORT_SYMBOL_GPL(mtrr_state);
+
+/*
+ * The BIOS is supposed to leave SYSCFG[MtrrFixDramModEn] cleared. From the
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD Opteron
+ * Processors" (26094 Rev. 3.30 February 2006), section "13.2.1.2 SYSCFG
+ * Register": "The MtrrFixDramModEn bit should be set to 1 during BIOS
+ * initialization of the fixed MTRRs, then cleared to 0 for operation."
+ *
+ * On AMD family 0xf and later, report the bit if it is still set and
+ * clear it.
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
+{
+	u32 lo, hi;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+	    boot_cpu_data.x86 < 0x0f)
+		return;
+
+	rdmsr(MSR_K8_SYSCFG, lo, hi);
+	if (!(lo & K8_MTRRFIXRANGE_DRAM_MODIFY))
+		return;
+
+	pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+	       " not cleared by BIOS, clearing this bit\n",
+	       smp_processor_id());
+	lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+	mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+}
+
+/*
+ * Size in bytes of the contiguous range described by @mask: the mask is
+ * reduced to page granularity, combined with size_or_mask, negated and
+ * scaled back up to bytes.
+ */
+static u64 get_mtrr_size(u64 mask)
+{
+	u64 pfn_mask = (mask >> PAGE_SHIFT) | size_or_mask;
+
+	return -pfn_mask << PAGE_SHIFT;
+}
+
+/*
+ * Resolve the effective type where two MTRR types overlap and store it in
+ * both *prev and *curr:
+ *  - equal types are left untouched,
+ *  - WB combined with WT becomes WT,
+ *  - every other combination, including any with UC, becomes UC.
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0.
+ */
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+	const u8 a = *prev;
+	const u8 b = *curr;
+
+	if (a != MTRR_TYPE_UNCACHABLE && b != MTRR_TYPE_UNCACHABLE) {
+		if (a == b)
+			return 0;
+
+		if ((a == MTRR_TYPE_WRBACK && b == MTRR_TYPE_WRTHROUGH) ||
+		    (a == MTRR_TYPE_WRTHROUGH && b == MTRR_TYPE_WRBACK)) {
+			*prev = MTRR_TYPE_WRTHROUGH;
+			*curr = MTRR_TYPE_WRTHROUGH;
+			return 0;
+		}
+	}
+
+	/* One side is UC, or the two types cannot be combined */
+	*prev = MTRR_TYPE_UNCACHABLE;
+	*curr = MTRR_TYPE_UNCACHABLE;
+	return 1;
+}
+
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'; 'end' is not consulted.
+ *
+ * mtrr_state.fixed_ranges[] is indexed as follows:
+ * 0x00000 - 0x7FFFF : one entry per 64KB, starting at index 0
+ * 0x80000 - 0xBFFFF : one entry per 16KB, starting at index 8
+ * 0xC0000 - 0xFFFFF : one entry per 4KB, starting at index 24
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - Matched memory type
+ * MTRR_TYPE_INVALID - 'start' is at or above 1MB
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+	int slot;
+
+	if (start >= 0x100000)
+		return MTRR_TYPE_INVALID;
+
+	if (start < 0x80000)
+		slot = start >> 16;
+	else if (start < 0xC0000)
+		slot = 1 * 8 + ((start - 0x80000) >> 14);
+	else
+		slot = 3 * 8 + ((start - 0xC0000) >> 12);
+
+	return mtrr_state.fixed_ranges[slot];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Input Arguments:
+ * start - First address of the region
+ * end - End of the region, exclusive on entry (made inclusive internally)
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * partial_end - Only written when repeat is set to 1: start of the part
+ * that still has to be looked up.
+ *
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ * returned corresponds only to [start:*partial_end]. Caller has
+ * to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
+ */
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+ int *repeat, u8 *uniform)
+{
+ int i;
+ u64 base, mask;
+ u8 prev_match, curr_match;
+
+ *repeat = 0;
+ *uniform = 1;
+
+ /* Make end inclusive instead of exclusive */
+ end--;
+
+ prev_match = MTRR_TYPE_INVALID;
+ for (i = 0; i < num_var_ranges; ++i) {
+ unsigned short start_state, end_state, inclusive;
+
+ /* Bit 11 of mask_lo clear: this register is skipped */
+ if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
+ continue;
+
+ base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
+ (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
+ (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
+
+ /* Whether start / end fall inside this MTRR, and whether it lies strictly inside [start:end] */
+ start_state = ((start & mask) == (base & mask));
+ end_state = ((end & mask) == (base & mask));
+ inclusive = ((start < base) && (end > base));
+
+ if ((start_state != end_state) || inclusive) {
+ /*
+ * We have start:end spanning across an MTRR.
+ * We split the region into either
+ *
+ * - start_state:1
+ * (start:mtrr_end)(mtrr_end:end)
+ * - end_state:1
+ * (start:mtrr_start)(mtrr_start:end)
+ * - inclusive:1
+ * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+ *
+ * depending on kind of overlap.
+ *
+ * Return the type of the first region and a pointer
+ * to the start of next region so that caller will be
+ * advised to lookup again after having adjusted start
+ * and end.
+ *
+ * Note: This way we handle overlaps with multiple
+ * entries and the default type properly.
+ */
+ if (start_state)
+ *partial_end = base + get_mtrr_size(mask);
+ else
+ *partial_end = base;
+
+ /* Guarantee forward progress for the caller's repeat loop */
+ if (unlikely(*partial_end <= start)) {
+ WARN_ON(1);
+ *partial_end = start + PAGE_SIZE;
+ }
+
+ end = *partial_end - 1; /* end is inclusive */
+ *repeat = 1;
+ *uniform = 0;
+ }
+
+ if ((start & mask) != (base & mask))
+ continue;
+
+ /* The low byte of base_lo is the memory type */
+ curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
+ if (prev_match == MTRR_TYPE_INVALID) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ /* A second matching register: no longer a single uniform entry */
+ *uniform = 0;
+ if (check_type_overlap(&prev_match, &curr_match))
+ return curr_match;
+ }
+
+ if (prev_match != MTRR_TYPE_INVALID)
+ return prev_match;
+
+ return mtrr_state.def_type;
+}
+
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR state not captured yet, or MTRRs disabled;
+ *                     'uniform' is left unwritten in that case
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
+ */
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
+{
+	u8 type, last_type, single = 1, ignored;
+	u64 next_start;
+	int again;
+
+	if (!mtrr_state_set ||
+	    !(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+		return MTRR_TYPE_INVALID;
+
+	/*
+	 * Fixed ranges win over variable ranges below 1MB, and a fixed
+	 * lookup is never reported as uniform.
+	 */
+	if (start < 0x100000 && mtrr_state.have_fixed &&
+	    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+		type = mtrr_type_lookup_fixed(start, end);
+		*uniform = 0;
+		return type;
+	}
+
+	/*
+	 * Variable ranges: several may match, the helper applies the
+	 * MTRR precedence rules between them.
+	 */
+	type = mtrr_type_lookup_variable(start, end, &next_start,
+					 &again, &single);
+
+	/*
+	 * Usually one lookup is enough. When [start:end] crosses MTRR
+	 * boundaries (or the default type), keep looking up the remaining
+	 * part and merge the types until UC is reached or nothing is left.
+	 */
+	while (again) {
+		last_type = type;
+		start = next_start;
+		single = 0;
+		type = mtrr_type_lookup_variable(start, end, &next_start,
+						 &again, &ignored);
+
+		if (check_type_overlap(&last_type, &type)) {
+			*uniform = single;
+			return type;
+		}
+	}
+
+	/* Regions lying entirely in [4G, mtrr_tom2) are reported as WB */
+	if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+		type = MTRR_TYPE_WRBACK;
+
+	*uniform = single;
+	return type;
+}
+
+/*
+ * Get the MSR pair relating to a var range: read the base and mask MSRs of
+ * variable range @index into the lo/hi halves of *@vr.
+ */
+static void
+get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
+{
+ rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
+ rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
+}
+
+/*
+ * Store a base/mask pair in the cached copy of variable range @index
+ * (mtrr_state.var_ranges[]); no MSR is accessed here.
+ */
+void fill_mtrr_var_range(unsigned int index,
+			 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+	struct mtrr_var_range *entry = &mtrr_state.var_ranges[index];
+
+	entry->base_lo = base_lo;
+	entry->base_hi = base_hi;
+	entry->mask_lo = mask_lo;
+	entry->mask_hi = mask_hi;
+}
+
+/*
+ * Read all fixed-range MSRs into @frs, viewed as consecutive 32-bit words:
+ * two words for the 64K MSR, then two per 16K MSR, then two per 4K MSR.
+ */
+static void get_fixed_ranges(mtrr_type *frs)
+{
+	unsigned int *word = (unsigned int *)frs;
+	int msr;
+
+	k8_check_syscfg_dram_mod_en();
+
+	rdmsr(MSR_MTRRfix64K_00000, word[0], word[1]);
+	word += 2;
+
+	for (msr = 0; msr < 2; msr++, word += 2)
+		rdmsr(MSR_MTRRfix16K_80000 + msr, word[0], word[1]);
+
+	for (msr = 0; msr < 8; msr++, word += 2)
+		rdmsr(MSR_MTRRfix4K_C0000 + msr, word[0], word[1]);
+}
+
+/*
+ * Refresh mtrr_state.fixed_ranges from the MSRs when the CPU has MTRRs;
+ * @info is not used.
+ */
+void mtrr_save_fixed_ranges(void *info)
+{
+	if (!boot_cpu_has(X86_FEATURE_MTRR))
+		return;
+
+	get_fixed_ranges(mtrr_state.fixed_ranges);
+}
+
+/* Pending fixed-range segment; last_fixed_end == 0 means nothing is pending */
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
+/* Print the pending segment, if there is one, and mark it as consumed. */
+static void __init print_fixed_last(void)
+{
+	if (last_fixed_end) {
+		pr_debug(" %05X-%05X %s\n", last_fixed_start,
+			 last_fixed_end - 1,
+			 mtrr_attrib_to_str(last_fixed_type));
+
+		last_fixed_end = 0;
+	}
+}
+
+/* Start a new pending segment [base, end) of the given type. */
+static void __init update_fixed_last(unsigned base, unsigned end,
+ mtrr_type type)
+{
+ last_fixed_start = base;
+ last_fixed_end = end;
+ last_fixed_type = type;
+}
+
+/*
+ * Feed the eight sub-ranges of one fixed MTRR (each @step bytes long,
+ * starting at @base, types taken from @types) into the pending segment:
+ * a sub-range that continues the pending segment with the same type
+ * extends it, anything else prints the pending segment and starts a new
+ * one.
+ */
+static void __init
+print_fixed(unsigned base, unsigned step, const mtrr_type *types)
+{
+	unsigned n;
+
+	for (n = 0; n < 8; n++) {
+		unsigned lo = base + n * step;
+		unsigned hi = lo + step;
+		mtrr_type t = types[n];
+
+		if (last_fixed_end != 0 && last_fixed_end == lo &&
+		    last_fixed_type == t) {
+			last_fixed_end = hi;
+			continue;
+		}
+
+		/* gap or different type; prints nothing when nothing is pending */
+		print_fixed_last();
+		update_fixed_last(lo, hi, t);
+	}
+}
+
+static void prepare_set(void);
+static void post_set(void);
+
+/*
+ * Dump mtrr_state at debug level: the default type, the fixed ranges (when
+ * present), every variable range and, when set, mtrr_tom2.
+ */
+static void __init print_mtrr_state(void)
+{
+	const struct mtrr_var_range *vr;
+	const char *state;
+	unsigned int i;
+	int high_width;
+
+	pr_debug("MTRR default type: %s\n",
+		 mtrr_attrib_to_str(mtrr_state.def_type));
+
+	if (mtrr_state.have_fixed) {
+		if ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED))
+			state = "en";
+		else
+			state = "dis";
+		pr_debug("MTRR fixed ranges %sabled:\n", state);
+
+		/* One 64K register, then two 16K and eight 4K registers */
+		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+		for (i = 0; i < 2; ++i)
+			print_fixed(0x80000 + i * 0x20000, 0x04000,
+				    mtrr_state.fixed_ranges + (i + 1) * 8);
+		for (i = 0; i < 8; ++i)
+			print_fixed(0xC0000 + i * 0x08000, 0x01000,
+				    mtrr_state.fixed_ranges + (i + 3) * 8);
+
+		/* flush the segment that is still pending */
+		print_fixed_last();
+	}
+
+	state = (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) ? "en" : "dis";
+	pr_debug("MTRR variable ranges %sabled:\n", state);
+
+	/* Number of hex digits used for the high halves of base and mask */
+	high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
+
+	for (i = 0; i < num_var_ranges; ++i) {
+		vr = &mtrr_state.var_ranges[i];
+
+		if (!(vr->mask_lo & (1 << 11))) {
+			pr_debug(" %u disabled\n", i);
+			continue;
+		}
+
+		pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+			 i,
+			 high_width, vr->base_hi, vr->base_lo >> 12,
+			 high_width, vr->mask_hi, vr->mask_lo >> 12,
+			 mtrr_attrib_to_str(vr->base_lo & 0xff));
+	}
+
+	if (mtrr_tom2)
+		pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
+}
+
+/*
+ * PAT setup for BP. We need to go through sync steps here: pat_init() is
+ * called between prepare_set() and post_set(), with local interrupts
+ * disabled for the whole sequence.
+ */
+void __init mtrr_bp_pat_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ prepare_set();
+
+ pat_init();
+
+ post_set();
+ local_irq_restore(flags);
+}
+
+/*
+ * Capture this CPU's MTRR configuration in mtrr_state (and mtrr_tom2 when
+ * amd_special_default_mtrr() applies), print it and mark the state as set.
+ * Returns true when the captured state has MTRRs enabled.
+ */
+bool __init get_mtrr_state(void)
+{
+	struct mtrr_var_range *ranges = mtrr_state.var_ranges;
+	unsigned lo, dummy;
+	unsigned int i;
+
+	/* Bit 8 of MTRRcap says whether fixed ranges exist */
+	rdmsr(MSR_MTRRcap, lo, dummy);
+	mtrr_state.have_fixed = (lo >> 8) & 1;
+
+	for (i = 0; i < num_var_ranges; i++)
+		get_mtrr_var_range(i, ranges + i);
+	if (mtrr_state.have_fixed)
+		get_fixed_ranges(mtrr_state.fixed_ranges);
+
+	/* MTRRdefType: the low byte is the type, bits 10-11 the enable flags */
+	rdmsr(MSR_MTRRdefType, lo, dummy);
+	mtrr_state.def_type = (lo & 0xff);
+	mtrr_state.enabled = (lo & 0xc00) >> 10;
+
+	if (amd_special_default_mtrr()) {
+		unsigned low, high;
+
+		/* TOP_MEM2 */
+		rdmsr(MSR_K8_TOP_MEM2, low, high);
+		mtrr_tom2 = (((u64)high << 32) | low) & 0xffffff800000ULL;
+	}
+
+	print_mtrr_state();
+
+	mtrr_state_set = 1;
+
+	return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
+}
+
+/*
+ * Some BIOSes do not program every CPU identically. Report each kind of
+ * mismatch recorded in smp_changes_mask; stay silent when there is none.
+ */
+void __init mtrr_state_warn(void)
+{
+	const unsigned long changes = smp_changes_mask;
+
+	if (changes == 0)
+		return;
+
+	if (changes & MTRR_CHANGE_MASK_FIXED)
+		pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+
+	if (changes & MTRR_CHANGE_MASK_VARIABLE)
+		pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+
+	if (changes & MTRR_CHANGE_MASK_DEFTYPE)
+		pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+
+	pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+	pr_info("mtrr: corrected configuration.\n");
+}
+
+/*
+ * Write an MSR and only log a failure. The error is deliberately not
+ * handed back to MTRR users: doing so would be quite complicated in some
+ * cases, and ignoring it is considered the best handling anyway.
+ */
+void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
+{
+	if (wrmsr_safe(msr, a, b) >= 0)
+		return;
+
+	pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+	       smp_processor_id(), msr, a, b);
+}
+
+/**
+ * set_fixed_range - checks & updates a fixed-range MTRR if it
+ *		      differs from the value it should have
+ * @msr: MSR address of the MTRR which should be checked and updated
+ * @changed: set to true when the MSR had to be rewritten, otherwise
+ *	     left untouched
+ * @msrwords: the two 32-bit words (low, high) the MSR should hold
+ */
+static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
+{
+	unsigned cur_lo, cur_hi;
+
+	rdmsr(msr, cur_lo, cur_hi);
+
+	/* Already holds the wanted value: nothing to write */
+	if (cur_lo == msrwords[0] && cur_hi == msrwords[1])
+		return;
+
+	mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
+	*changed = true;
+}
+
+/**
+ * generic_get_free_region - Get a free MTRR.
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ * @replace_reg: mtrr index to be replaced; set to invalid value if none.
+ *
+ * @base and @size are not consulted here; the choice depends only on
+ * @replace_reg and on which variable ranges currently report a zero size.
+ *
+ * Returns: The index of the region on success, else negative on error
+ * (-ENOSPC when every variable range is in use).
+ */
+int
+generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+{
+	unsigned long lbase, lsize;
+	mtrr_type ltype;
+	int i, max;
+
+	max = num_var_ranges;
+
+	/* A valid register index to replace takes precedence over searching. */
+	if (replace_reg >= 0 && replace_reg < max)
+		return replace_reg;
+
+	/* Otherwise take the first register that reports a zero size. */
+	for (i = 0; i < max; ++i) {
+		mtrr_if->get(i, &lbase, &lsize, &ltype);
+		if (lsize == 0)
+			return i;
+	}
+
+	return -ENOSPC;
+}
+
+/*
+ * Read variable-range MTRR @reg from the MSRs and report it through
+ * @base, @size and @type.  @base and @size are expressed in pages (the
+ * raw MSR values are shifted down by PAGE_SHIFT).  A register whose
+ * mask MSR has bit 11 (0x800) clear is reported as all zeroes.
+ *
+ * If the mask read from the MSR is not a contiguous run of high one
+ * bits, it is replaced by one that is, a warning is logged and the
+ * kernel is tainted with TAINT_FIRMWARE_WORKAROUND.
+ */
+static void generic_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type)
+{
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ unsigned int hi;
+ u64 tmp, mask;
+
+ /*
+ * get_mtrr doesn't need to update mtrr_state, also it could be called
+ * from any cpu, so try to print it out directly.
+ */
+ get_cpu();
+
+ rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
+ if ((mask_lo & 0x800) == 0) {
+ /* Invalid (i.e. free) range */
+ *base = 0;
+ *size = 0;
+ *type = 0;
+ goto out_put_cpu;
+ }
+
+ rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
+
+ /* Work out the shifted address mask: */
+ tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
+ mask = size_or_mask | tmp;
+
+ /* Expand tmp with high bits to all 1s: */
+ hi = fls64(tmp);
+ if (hi > 0) {
+ tmp |= ~((1ULL<<(hi - 1)) - 1);
+
+ /* The expanded mask differs from what the MSR implied: use the expanded one. */
+ if (tmp != mask) {
+ pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ mask = tmp;
+ }
+ }
+
+ /*
+ * This works correctly if size is a power of two, i.e. a
+ * contiguous range:
+ */
+ *size = -mask;
+ *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ /* The memory type is the low byte of the base MSR. */
+ *type = base_lo & 0xff;
+
+out_put_cpu:
+ put_cpu();
+}
+
+/**
+ * set_fixed_ranges - checks & updates the fixed-range MTRRs if they
+ *		      differ from the saved set
+ * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ *
+ * Returns non-zero if at least one fixed-range MSR was rewritten.
+ */
+static int set_fixed_ranges(mtrr_type *frs)
+{
+	/* The saved values are consumed one 64-bit MSR image at a time. */
+	unsigned long long *saved = (unsigned long long *)frs;
+	bool changed = false;
+	int block, range;
+
+	k8_check_syscfg_dram_mod_en();
+
+	/* fixed_range_blocks[] ends with an entry whose .ranges is 0. */
+	for (block = 0; fixed_range_blocks[block].ranges; block++) {
+		for (range = 0; range < fixed_range_blocks[block].ranges; range++) {
+			set_fixed_range(fixed_range_blocks[block].base_msr + range,
+					&changed, (unsigned int *)saved);
+			saved++;
+		}
+	}
+
+	return changed;
+}
+
+/*
+ * Program the base/mask MSR pair of variable range @index from @vr.
+ * Each MSR is only written when the bits that are compared differ from
+ * what the hardware currently holds.
+ * Returns true if changes are made.
+ */
+static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
+{
+	/* Bits of the high MSR word that take part in the comparison. */
+	const u64 hi_bits = size_and_mask >> (32 - PAGE_SHIFT);
+	unsigned int cur_lo, cur_hi;
+	bool changed = false;
+
+	rdmsr(MTRRphysBase_MSR(index), cur_lo, cur_hi);
+	if ((vr->base_lo & 0xfffff0ffUL) != (cur_lo & 0xfffff0ffUL) ||
+	    (vr->base_hi & hi_bits) != (cur_hi & hi_bits)) {
+		mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
+		changed = true;
+	}
+
+	rdmsr(MTRRphysMask_MSR(index), cur_lo, cur_hi);
+	if ((vr->mask_lo & 0xfffff800UL) != (cur_lo & 0xfffff800UL) ||
+	    (vr->mask_hi & hi_bits) != (cur_hi & hi_bits)) {
+		mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
+		changed = true;
+	}
+
+	return changed;
+}
+
+/* MTRRdefType as read by prepare_set(); post_set() writes it back. */
+static u32 deftype_lo, deftype_hi;
+
+/**
+ * set_mtrr_state - Set the MTRR state for this CPU.
+ *
+ * Writes the variable and (if present) fixed ranges from mtrr_state and
+ * folds the wanted default type / enable bits into the saved
+ * MTRRdefType value.
+ *
+ * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * RETURNS: 0 if no changes made, else a mask indicating what was changed.
+ */
+static unsigned long set_mtrr_state(void)
+{
+	unsigned long change_mask = 0;
+	u32 saved_type, saved_enabled;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++)
+		if (set_mtrr_var_ranges(reg, &mtrr_state.var_ranges[reg]))
+			change_mask |= MTRR_CHANGE_MASK_VARIABLE;
+
+	if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
+		change_mask |= MTRR_CHANGE_MASK_FIXED;
+
+	/*
+	 * MTRRdefType itself is not written here: the saved copy is what
+	 * gets restored later, so adjust that copy instead.
+	 */
+	saved_type = deftype_lo & 0xff;
+	saved_enabled = (deftype_lo & 0xc00) >> 10;
+	if (saved_type != mtrr_state.def_type ||
+	    saved_enabled != mtrr_state.enabled) {
+		deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
+			     (mtrr_state.enabled << 10);
+		change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
+	}
+
+	return change_mask;
+}
+
+
+/* CR4 value saved by prepare_set() and written back by post_set(). */
+static unsigned long cr4;
+/* Taken in prepare_set() and released in post_set(). */
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
+
+/*
+ * Put this CPU into a state in which the MTRRs can be changed: take
+ * set_atomicity_lock, set CR0.CD and flush the caches, clear CR4.PGE
+ * (when the CPU has PGE), flush the TLB, save MTRRdefType into
+ * deftype_lo/deftype_hi and write it back with the low 0xcff bits
+ * cleared.
+ *
+ * Since we are disabling the cache don't allow any interrupts,
+ * they would run extremely slow and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after post_set() has been called.
+ */
+static void prepare_set(void) __acquires(set_atomicity_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * Note that this is not ideal
+ * since the cache is only flushed/disabled for this CPU while the
+ * MTRRs are changed, but changing this requires more invasive
+ * changes to the way the kernel boots
+ */
+
+ raw_spin_lock(&set_atomicity_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+ wbinvd();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+
+ /* Save MTRR state */
+ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
+
+ /* Disable MTRRs, and set the default type to uncached */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
+ wbinvd();
+}
+
+/*
+ * Counterpart of prepare_set(): flush the TLB, write the saved (and
+ * possibly edited, see set_mtrr_state()) MTRRdefType value back, clear
+ * CR0.CD again, restore CR4 when the CPU has PGE, and release
+ * set_atomicity_lock.
+ */
+static void post_set(void) __releases(set_atomicity_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+
+ /* Intel (P6) standard MTRRs */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
+ raw_spin_unlock(&set_atomicity_lock);
+}
+
+/*
+ * Load the whole saved MTRR state (and PAT) into this CPU with
+ * interrupts disabled, then record which classes of MTRR had to be
+ * changed in the global smp_changes_mask.
+ */
+static void generic_set_all(void)
+{
+	unsigned long changed, bit;
+	unsigned long irqflags;
+
+	local_irq_save(irqflags);
+	prepare_set();
+
+	/* Actually set the state */
+	changed = set_mtrr_state();
+
+	/* also set PAT */
+	pat_init();
+
+	post_set();
+	local_irq_restore(irqflags);
+
+	/* Use the atomic bitops to update the global mask */
+	for (bit = 0; bit < sizeof(changed) * 8; bit++) {
+		if ((changed >> bit) & 0x01)
+			set_bit(bit, &smp_changes_mask);
+	}
+}
+
+/**
+ * generic_set_mtrr - set variable MTRR register on the local CPU.
+ *
+ * @reg: The register to set.
+ * @base: The base address of the region.
+ * @size: The size of the region. If this is 0 the region is disabled.
+ * @type: The type of the region.
+ *
+ * @base and @size are shifted left by PAGE_SHIFT when the MSR values
+ * are built.  The cached copy in mtrr_state.var_ranges[@reg] is updated
+ * along with the hardware.
+ *
+ * Returns nothing.
+ */
+static void generic_set_mtrr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ unsigned long flags;
+ struct mtrr_var_range *vr;
+
+ vr = &mtrr_state.var_ranges[reg];
+
+ /* prepare_set()/post_set() must run with local interrupts disabled. */
+ local_irq_save(flags);
+ prepare_set();
+
+ if (size == 0) {
+ /*
+ * The invalid bit is kept in the mask, so we simply
+ * clear the relevant mask register to disable a range.
+ */
+ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
+ memset(vr, 0, sizeof(struct mtrr_var_range));
+ } else {
+ /* Low word: address bits plus the type in the low byte. */
+ vr->base_lo = base << PAGE_SHIFT | type;
+ vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
+ /* The mask is derived from -size; 0x800 marks the range as valid. */
+ vr->mask_lo = -size << PAGE_SHIFT | 0x800;
+ vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+
+ mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
+ mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
+ }
+
+ post_set();
+ local_irq_restore(flags);
+}
+
+/*
+ * Check that a region of @size pages starting at page @base can be
+ * described by one variable MTRR of memory type @type.
+ * Returns 0 if so, -EINVAL (after logging why) otherwise.
+ */
+int generic_validate_add_page(unsigned long base, unsigned long size,
+			      unsigned int type)
+{
+	unsigned long lo, hi;
+
+	/*
+	 * For Intel PPro stepping <= 7
+	 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
+	 */
+	if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
+	    boot_cpu_data.x86_model == 1 &&
+	    boot_cpu_data.x86_stepping <= 7) {
+		bool writable = type == MTRR_TYPE_WRCOMB ||
+				type == MTRR_TYPE_WRBACK;
+
+		if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
+			pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+			return -EINVAL;
+		}
+		if (base + size >= 0x70000 && base <= 0x7003F && writable) {
+			pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Strip matching low bits (0 in the first page, 1 in the last page)
+	 * from both ends of the range.  For a size-aligned power-of-two
+	 * region what remains of the two values is identical.
+	 */
+	lo = base;
+	hi = base + size - 1;
+	while (!(lo & 1) && (hi & 1)) {
+		lo >>= 1;
+		hi >>= 1;
+	}
+
+	if (lo != hi) {
+		pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Returns non-zero if bit 10 of the low word of MSR_MTRRcap is set. */
+static int generic_have_wrcomb(void)
+{
+	unsigned long cap_lo, cap_hi;
+
+	rdmsr(MSR_MTRRcap, cap_lo, cap_hi);
+
+	return cap_lo & (1 << 10);
+}
+
+/* have_wrcomb-style helper that unconditionally reports support (returns 1). */
+int positive_have_wrcomb(void)
+{
+ return 1;
+}
+
+/*
+ * Generic MTRR operations, built from the generic_* helpers in this
+ * file; use_intel_if is set.
+ */
+const struct mtrr_ops generic_mtrr_ops = {
+	.use_intel_if		= 1,
+	.get			= generic_get_mtrr,
+	.set			= generic_set_mtrr,
+	.set_all		= generic_set_all,
+	.get_free_region	= generic_get_free_region,
+	.validate_add_page	= generic_validate_add_page,
+	.have_wrcomb		= generic_have_wrcomb,
+};
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
new file mode 100644
index 0000000..254683b
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -0,0 +1,443 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#define LINE_SIZE 80
+
+#include <asm/mtrr.h>
+
+#include "mtrr.h"
+
+/*
+ * Per-open-file pointer kept in the seq_file's ->private field; the
+ * code below uses it as an array of per-register usage counts.
+ */
+#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
+
+/* Names of the MTRR memory types, indexed by type number. */
+static const char *const mtrr_strings[MTRR_NUM_TYPES] = {
+	[0] = "uncachable",
+	[1] = "write-combining",
+	[2] = "?",
+	[3] = "?",
+	[4] = "write-through",
+	[5] = "write-protect",
+	[6] = "write-back",
+};
+
+/*
+ * Map MTRR type number @x to its name.  Any value outside the table
+ * (negative, or above 6) yields "?".
+ */
+const char *mtrr_attrib_to_str(int x)
+{
+	/* @x is signed: reject negative values before indexing the table. */
+	if (x < 0 || x > 6)
+		return "?";
+
+	return mtrr_strings[x];
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Add a region on behalf of an open /proc/mtrr file and count it in
+ * that file's per-register usage array, which is allocated on first
+ * use.  When @page is zero, @base and @size are byte values that must
+ * be page aligned; otherwise they are already in pages.
+ *
+ * @increment is accepted but not consulted: the region is always added
+ * with usage counting enabled.
+ *
+ * Returns the register number from mtrr_add_page(), or a negative errno.
+ */
+static int
+mtrr_file_add(unsigned long base, unsigned long size,
+	      unsigned int type, bool increment, struct file *file, int page)
+{
+	unsigned int *counts = FILE_FCOUNT(file);
+	int idx;
+
+	if (!counts) {
+		counts = kcalloc(num_var_ranges, sizeof(*counts), GFP_KERNEL);
+		if (!counts)
+			return -ENOMEM;
+		FILE_FCOUNT(file) = counts;
+	}
+
+	if (!page) {
+		if ((base | size) & (PAGE_SIZE - 1))
+			return -EINVAL;
+		base >>= PAGE_SHIFT;
+		size >>= PAGE_SHIFT;
+	}
+
+	idx = mtrr_add_page(base, size, type, true);
+	if (idx >= 0)
+		counts[idx]++;
+
+	return idx;
+}
+
+/*
+ * Delete a region on behalf of an open /proc/mtrr file and drop one
+ * reference from that file's per-register usage array, if it has one.
+ * When @page is zero, @base and @size are byte values that must be page
+ * aligned; otherwise they are already in pages.
+ *
+ * Returns the register number from mtrr_del_page(), or a negative
+ * errno; -EINVAL if this file holds no reference on that register.
+ */
+static int
+mtrr_file_del(unsigned long base, unsigned long size,
+	      struct file *file, int page)
+{
+	unsigned int *counts = FILE_FCOUNT(file);
+	int idx;
+
+	if (!page) {
+		if ((base | size) & (PAGE_SIZE - 1))
+			return -EINVAL;
+		base >>= PAGE_SHIFT;
+		size >>= PAGE_SHIFT;
+	}
+
+	idx = mtrr_del_page(-1, base, size);
+
+	/* Failed, or this file never added anything: nothing to account. */
+	if (idx < 0 || !counts)
+		return idx;
+
+	if (counts[idx] == 0)
+		return -EINVAL;
+
+	counts[idx]--;
+	return idx;
+}
+
+/*
+ * Write handler for /proc/mtrr: parse one control line and add or
+ * disable an MTRR accordingly.  Requires CAP_SYS_ADMIN.
+ *
+ * seq_file can seek but we ignore it.
+ *
+ * Format of control line:
+ * "base=%Lx size=%Lx type=%s" or "disable=%d"
+ *
+ * At most LINE_SIZE - 1 bytes of the user buffer are looked at.
+ * Returns that (possibly clamped) length on success, a negative errno
+ * otherwise.
+ */
+static ssize_t
+mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
+{
+ int i, err;
+ unsigned long reg;
+ unsigned long long base, size;
+ char *ptr;
+ char line[LINE_SIZE];
+ int length;
+ size_t linelen;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Zero-fill so the copied text is always NUL terminated. */
+ memset(line, 0, LINE_SIZE);
+
+ len = min_t(size_t, len, LINE_SIZE - 1);
+ length = strncpy_from_user(line, buf, len);
+ if (length < 0)
+ return length;
+
+ /* Strip one trailing newline, if present. */
+ linelen = strlen(line);
+ ptr = line + linelen - 1;
+ if (linelen && *ptr == '\n')
+ *ptr = '\0';
+
+ /* "disable=<reg>": delete the given register. */
+ if (!strncmp(line, "disable=", 8)) {
+ reg = simple_strtoul(line + 8, &ptr, 0);
+ err = mtrr_del_page(reg, 0, 0);
+ if (err < 0)
+ return err;
+ return len;
+ }
+
+ /* Otherwise the line must be "base=... size=... type=...". */
+ if (strncmp(line, "base=", 5))
+ return -EINVAL;
+
+ base = simple_strtoull(line + 5, &ptr, 0);
+ ptr = skip_spaces(ptr);
+
+ if (strncmp(ptr, "size=", 5))
+ return -EINVAL;
+
+ size = simple_strtoull(ptr + 5, &ptr, 0);
+ /* Both values are in bytes and must have their low 12 bits clear. */
+ if ((base & 0xfff) || (size & 0xfff))
+ return -EINVAL;
+ ptr = skip_spaces(ptr);
+
+ if (strncmp(ptr, "type=", 5))
+ return -EINVAL;
+ ptr = skip_spaces(ptr + 5);
+
+ /* The type name must match an entry of mtrr_strings[] exactly. */
+ i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr);
+ if (i < 0)
+ return i;
+
+ /* mtrr_add_page() takes page units. */
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
+ if (err < 0)
+ return err;
+ return len;
+}
+
+/*
+ * ioctl handler for /proc/mtrr (also installed as the compat handler).
+ *
+ * Works in three stages, each a switch on @cmd:
+ *  1. copy the request (struct mtrr_sentry or struct mtrr_gentry) in
+ *     from user space, field by field for the MTRRIOC32_* commands;
+ *  2. perform the operation; every command except the GET ones
+ *     requires CAP_SYS_ADMIN, unknown commands return -ENOTTY;
+ *  3. for the GET commands, copy the filled-in gentry back out.
+ *
+ * Any non-zero result of stage 2 is returned as-is; this includes a
+ * positive register number returned by e.g. mtrr_file_add().
+ */
+static long
+mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
+{
+ int err = 0;
+ mtrr_type type;
+ unsigned long base;
+ unsigned long size;
+ struct mtrr_sentry sentry;
+ struct mtrr_gentry gentry;
+ void __user *arg = (void __user *) __arg;
+
+ /* gentry may be copied back to user space: start from all zeroes. */
+ memset(&gentry, 0, sizeof(gentry));
+
+ /* Stage 1: fetch the request from user space. */
+ switch (cmd) {
+ case MTRRIOC_ADD_ENTRY:
+ case MTRRIOC_SET_ENTRY:
+ case MTRRIOC_DEL_ENTRY:
+ case MTRRIOC_KILL_ENTRY:
+ case MTRRIOC_ADD_PAGE_ENTRY:
+ case MTRRIOC_SET_PAGE_ENTRY:
+ case MTRRIOC_DEL_PAGE_ENTRY:
+ case MTRRIOC_KILL_PAGE_ENTRY:
+ if (copy_from_user(&sentry, arg, sizeof sentry))
+ return -EFAULT;
+ break;
+ case MTRRIOC_GET_ENTRY:
+ case MTRRIOC_GET_PAGE_ENTRY:
+ if (copy_from_user(&gentry, arg, sizeof gentry))
+ return -EFAULT;
+ break;
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_ENTRY:
+ case MTRRIOC32_SET_ENTRY:
+ case MTRRIOC32_DEL_ENTRY:
+ case MTRRIOC32_KILL_ENTRY:
+ case MTRRIOC32_ADD_PAGE_ENTRY:
+ case MTRRIOC32_SET_PAGE_ENTRY:
+ case MTRRIOC32_DEL_PAGE_ENTRY:
+ case MTRRIOC32_KILL_PAGE_ENTRY: {
+ struct mtrr_sentry32 __user *s32;
+
+ s32 = (struct mtrr_sentry32 __user *)__arg;
+ err = get_user(sentry.base, &s32->base);
+ err |= get_user(sentry.size, &s32->size);
+ err |= get_user(sentry.type, &s32->type);
+ if (err)
+ return err;
+ break;
+ }
+ case MTRRIOC32_GET_ENTRY:
+ case MTRRIOC32_GET_PAGE_ENTRY: {
+ struct mtrr_gentry32 __user *g32;
+
+ g32 = (struct mtrr_gentry32 __user *)__arg;
+ err = get_user(gentry.regnum, &g32->regnum);
+ err |= get_user(gentry.base, &g32->base);
+ err |= get_user(gentry.size, &g32->size);
+ err |= get_user(gentry.type, &g32->type);
+ if (err)
+ return err;
+ break;
+ }
+#endif
+ }
+
+ /* Stage 2: carry out the request. */
+ switch (cmd) {
+ default:
+ return -ENOTTY;
+ case MTRRIOC_ADD_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err =
+ mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
+ file, 0);
+ break;
+ case MTRRIOC_SET_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_SET_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
+ break;
+ case MTRRIOC_DEL_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_DEL_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_file_del(sentry.base, sentry.size, file, 0);
+ break;
+ case MTRRIOC_KILL_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_KILL_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_del(-1, sentry.base, sentry.size);
+ break;
+ case MTRRIOC_GET_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_ENTRY:
+#endif
+ if (gentry.regnum >= num_var_ranges)
+ return -EINVAL;
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
+
+ /* Hide entries that go above 4GB */
+ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
+ gentry.base = gentry.size = gentry.type = 0;
+ else {
+ /* ->get() reports pages; this command reports bytes. */
+ gentry.base = base << PAGE_SHIFT;
+ gentry.size = size << PAGE_SHIFT;
+ gentry.type = type;
+ }
+
+ break;
+ case MTRRIOC_ADD_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err =
+ mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
+ file, 1);
+ break;
+ case MTRRIOC_SET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_SET_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err =
+ mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
+ break;
+ case MTRRIOC_DEL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_DEL_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_file_del(sentry.base, sentry.size, file, 1);
+ break;
+ case MTRRIOC_KILL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_KILL_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_del_page(-1, sentry.base, sentry.size);
+ break;
+ case MTRRIOC_GET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_PAGE_ENTRY:
+#endif
+ if (gentry.regnum >= num_var_ranges)
+ return -EINVAL;
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
+ /* Hide entries that would overflow */
+ if (size != (__typeof__(gentry.size))size)
+ gentry.base = gentry.size = gentry.type = 0;
+ else {
+ gentry.base = base;
+ gentry.size = size;
+ gentry.type = type;
+ }
+ break;
+ }
+
+ /* Any non-zero result (error or positive register number) ends here. */
+ if (err)
+ return err;
+
+ /* Stage 3: hand the result of a GET command back to user space. */
+ switch (cmd) {
+ case MTRRIOC_GET_ENTRY:
+ case MTRRIOC_GET_PAGE_ENTRY:
+ if (copy_to_user(arg, &gentry, sizeof gentry))
+ err = -EFAULT;
+ break;
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_ENTRY:
+ case MTRRIOC32_GET_PAGE_ENTRY: {
+ struct mtrr_gentry32 __user *g32;
+
+ g32 = (struct mtrr_gentry32 __user *)__arg;
+ err = put_user(gentry.base, &g32->base);
+ err |= put_user(gentry.size, &g32->size);
+ err |= put_user(gentry.regnum, &g32->regnum);
+ err |= put_user(gentry.type, &g32->type);
+ break;
+ }
+#endif
+ }
+ return err;
+}
+
+/*
+ * Release handler for /proc/mtrr: give back every reference this open
+ * file still holds (one mtrr_del() per counted use of each register),
+ * free the per-file count array, then finish with single_release().
+ */
+static int mtrr_close(struct inode *ino, struct file *file)
+{
+	unsigned int *counts = FILE_FCOUNT(file);
+	int reg, nregs;
+
+	if (counts) {
+		nregs = num_var_ranges;
+		for (reg = 0; reg < nregs; reg++) {
+			for (; counts[reg] > 0; --counts[reg])
+				mtrr_del(reg, 0, 0);
+		}
+		kfree(counts);
+		FILE_FCOUNT(file) = NULL;
+	}
+
+	return single_release(ino, file);
+}
+
+static int mtrr_seq_show(struct seq_file *seq, void *offset);
+
+/*
+ * Open handler for /proc/mtrr.  Fails with -EIO when no MTRR interface
+ * is installed and with -ENXIO when the interface cannot read registers;
+ * otherwise sets up a single-shot seq_file with no private data.
+ */
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+	if (mtrr_if == NULL)
+		return -EIO;
+
+	if (mtrr_if->get == NULL)
+		return -ENXIO;
+
+	return single_open(file, mtrr_seq_show, NULL);
+}
+
+/*
+ * File operations for /proc/mtrr.  Reading goes through seq_file;
+ * native and compat ioctls share one handler.
+ */
+static const struct file_operations mtrr_fops = {
+ .owner = THIS_MODULE,
+ .open = mtrr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = mtrr_write,
+ .unlocked_ioctl = mtrr_ioctl,
+ .compat_ioctl = mtrr_ioctl,
+ .release = mtrr_close,
+};
+
+/*
+ * Produce the text of /proc/mtrr: one line per variable range that is
+ * in use, with its base, size (in KB below 1MB, otherwise in MB), usage
+ * count and type name.  Registers found to be free also have their
+ * usage count reset to 0.
+ */
+static int mtrr_seq_show(struct seq_file *seq, void *offset)
+{
+	unsigned long start, npages;
+	mtrr_type mtype;
+	int reg, nregs;
+	char unit;
+
+	nregs = num_var_ranges;
+	for (reg = 0; reg < nregs; reg++) {
+		mtrr_if->get(reg, &start, &npages, &mtype);
+		if (!npages) {
+			mtrr_usage_table[reg] = 0;
+			continue;
+		}
+
+		if (npages >= (0x100000 >> PAGE_SHIFT)) {
+			unit = 'M';
+			npages >>= 20 - PAGE_SHIFT;
+		} else {
+			/* less than 1MB */
+			unit = 'K';
+			npages <<= PAGE_SHIFT - 10;
+		}
+
+		/* Base can be > 32bit */
+		seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+			   reg, start, start >> (20 - PAGE_SHIFT),
+			   npages, unit,
+			   mtrr_usage_table[reg], mtrr_attrib_to_str(mtype));
+	}
+
+	return 0;
+}
+
+/*
+ * Create /proc/mtrr, provided the boot CPU advertises at least one of
+ * the MTRR-style features; otherwise return -ENODEV.
+ */
+static int __init mtrr_if_init(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+	bool supported;
+
+	supported = cpu_has(c, X86_FEATURE_MTRR) ||
+		    cpu_has(c, X86_FEATURE_K6_MTRR) ||
+		    cpu_has(c, X86_FEATURE_CYRIX_ARR) ||
+		    cpu_has(c, X86_FEATURE_CENTAUR_MCR);
+	if (!supported)
+		return -ENODEV;
+
+	proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+	return 0;
+}
+arch_initcall(mtrr_if_init);
+#endif /* CONFIG_PROC_FS */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
new file mode 100644
index 0000000..9a19c80
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -0,0 +1,890 @@
+/* Generic MTRR (Memory Type Range Register) driver.
+
+ Copyright (C) 1997-2000 Richard Gooch
+ Copyright (c) 2002 Patrick Mochel
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+
+ Source: "Pentium Pro Family Developer's Manual, Volume 3:
+ Operating System Writer's Guide" (Intel document number 242692),
+ section 11.11.7
+
+ This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
+ on 6-7 March 2002.
+ Source: Intel Architecture Software Developers Manual, Volume 3:
+ System Programming Guide; Section 9.11. (1997 edition - PPro).
+*/
+
+#define DEBUG
+
+#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+
+#include <linux/stop_machine.h>
+#include <linux/kvm_para.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
+
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+
+#include "mtrr.h"
+
+/* arch_phys_wc_add returns an MTRR register index plus this offset. */
+#define MTRR_TO_PHYS_WC_OFFSET 1000
+
+/* Number of variable-range MTRRs; set by set_num_var_ranges(). */
+u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+/* Accessor for __mtrr_enabled. */
+static bool mtrr_enabled(void)
+{
+ return __mtrr_enabled;
+}
+
+/* Usage count of each variable-range register. */
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+/* Held by mtrr_add_page() while it searches for and updates registers. */
+static DEFINE_MUTEX(mtrr_mutex);
+
+u64 size_or_mask, size_and_mask;
+/* Checked by mtrr_rendezvous_handler() to decide whether to run ->set_all(). */
+static bool mtrr_aps_delayed_init;
+
+/* Per-vendor operations, indexed by vendor id; filled in by set_mtrr_ops(). */
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
+
+/* The operations used for all MTRR accesses (the mtrr_if->... calls). */
+const struct mtrr_ops *mtrr_if;
+
+static void set_mtrr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type);
+
+/*
+ * Register @ops in the per-vendor table.  A vendor id of 0, or one at
+ * or beyond X86_VENDOR_NUM, is silently ignored.
+ */
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
+{
+	if (!ops->vendor || ops->vendor >= X86_VENDOR_NUM)
+		return;
+
+	mtrr_ops[ops->vendor] = ops;
+}
+
+/*
+ * Returns non-zero if we have the write-combining memory type.
+ *
+ * Two host bridges with known write-combining problems veto it;
+ * otherwise the answer comes from the ->have_wrcomb() operation
+ * (0 when the interface does not provide one).
+ */
+static int have_wrcomb(void)
+{
+	struct pci_dev *host;
+	bool wc_broken = false;
+
+	host = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
+	if (host) {
+		if (host->vendor == PCI_VENDOR_ID_SERVERWORKS &&
+		    host->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+		    host->revision <= 5) {
+			/*
+			 * ServerWorks LE chipsets < rev 6 have problems with
+			 * write-combining. Don't allow it and leave room for
+			 * other chipsets to be tagged
+			 */
+			pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+			wc_broken = true;
+		} else if (host->vendor == PCI_VENDOR_ID_INTEL &&
+			   host->device == PCI_DEVICE_ID_INTEL_82451NX) {
+			/*
+			 * Intel 450NX errata # 23. Non ascending cacheline
+			 * evictions to write combining memory may result in
+			 * data corruption
+			 */
+			pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
+			wc_broken = true;
+		}
+
+		/* Drop the reference taken by pci_get_class(). */
+		pci_dev_put(host);
+		if (wc_broken)
+			return 0;
+	}
+
+	return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
+}
+
+/*
+ * Determine the number of variable MTRRs and store it in
+ * num_var_ranges: the low byte of MSR_MTRRcap on the Intel-style
+ * interface, 2 on AMD, 8 on Cyrix and Centaur, 0 otherwise.
+ */
+static void __init set_num_var_ranges(void)
+{
+	unsigned long cap = 0, unused;
+
+	if (use_intel()) {
+		rdmsr(MSR_MTRRcap, cap, unused);
+	} else if (is_cpu(AMD)) {
+		cap = 2;
+	} else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) {
+		cap = 8;
+	}
+
+	num_var_ranges = cap & 0xff;
+}
+
+/* Start every variable-range register with a usage count of 1. */
+static void __init init_table(void)
+{
+	const int nregs = num_var_ranges;
+	int reg;
+
+	for (reg = 0; reg < nregs; reg++)
+		mtrr_usage_table[reg] = 1;
+}
+
+/* Arguments handed to mtrr_rendezvous_handler() through stop_machine*(). */
+struct set_mtrr_data {
+ unsigned long smp_base; /* passed to mtrr_if->set() as the base */
+ unsigned long smp_size; /* passed to mtrr_if->set() as the size */
+ unsigned int smp_reg; /* register to set; ~0U selects the set_all() path */
+ mtrr_type smp_type; /* passed to mtrr_if->set() as the type */
+};
+
+/**
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
+ * @info: pointer to mtrr configuration data
+ *
+ * Always returns 0.
+ */
+static int mtrr_rendezvous_handler(void *info)
+{
+ struct set_mtrr_data *data = info;
+
+ /*
+ * We use this same function to initialize the mtrrs during boot,
+ * resume, runtime cpu online and on an explicit request to set a
+ * specific MTRR.
+ *
+ * During boot or suspend, the state of the boot cpu's mtrrs has been
+ * saved, and we want to replicate that across all the cpus that come
+ * online (either at the end of boot or resume or during a runtime cpu
+ * online). If we're doing that, @reg is set to something special and on
+ * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+ * started the boot/resume sequence, this might be a duplicate
+ * set_all()).
+ */
+ /* ~0U is the "something special" register value mentioned above. */
+ if (data->smp_reg != ~0U) {
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
+ } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
+ mtrr_if->set_all();
+ }
+ return 0;
+}
+
+/*
+ * Two memory types may overlap if either one is uncachable, or if one
+ * is write-through and the other write-back.  Returns 1 if compatible,
+ * 0 otherwise.
+ */
+static inline int types_compatible(mtrr_type type1, mtrr_type type2)
+{
+	if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
+		return 1;
+
+	if (type1 == MTRR_TYPE_WRTHROUGH)
+		return type2 == MTRR_TYPE_WRBACK;
+
+	if (type1 == MTRR_TYPE_WRBACK)
+		return type2 == MTRR_TYPE_WRTHROUGH;
+
+	return 0;
+}
+
+/**
+ * set_mtrr - update mtrrs on all processors
+ * @reg: mtrr in question
+ * @base: mtrr base
+ * @size: mtrr size
+ * @type: mtrr type
+ *
+ * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
+ *
+ * 1. Queue work to do the following on all processors:
+ * 2. Disable Interrupts
+ * 3. Wait for all procs to do so
+ * 4. Enter no-fill cache mode
+ * 5. Flush caches
+ * 6. Clear PGE bit
+ * 7. Flush all TLBs
+ * 8. Disable all range registers
+ * 9. Update the MTRRs
+ * 10. Enable all range registers
+ * 11. Flush all TLBs and caches again
+ * 12. Enter normal cache mode and reenable caching
+ * 13. Set PGE
+ * 14. Wait for buddies to catch up
+ * 15. Enable interrupts.
+ *
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
+ *
+ * Note that the mechanism is the same for UP systems, too; all the SMP stuff
+ * becomes nops.
+ */
+static void
+set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
+{
+	/* Bundle the request for the rendezvous handler. */
+	struct set_mtrr_data data = {
+		.smp_base = base,
+		.smp_size = size,
+		.smp_reg  = reg,
+		.smp_type = type,
+	};
+
+	stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
+/*
+ * Same request as set_mtrr(), but issued through
+ * stop_machine_cpuslocked(); mtrr_add_page() calls this after
+ * get_online_cpus().
+ */
+static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
+				unsigned long size, mtrr_type type)
+{
+	struct set_mtrr_data data = {
+		.smp_base = base,
+		.smp_size = size,
+		.smp_reg  = reg,
+		.smp_type = type,
+	};
+
+	stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
+/*
+ * Same request as set_mtrr(), but issued through
+ * stop_machine_from_inactive_cpu() over cpu_callout_mask instead of
+ * cpu_online_mask.
+ */
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+				       unsigned long size, mtrr_type type)
+{
+	struct set_mtrr_data data = {
+		.smp_base = base,
+		.smp_size = size,
+		.smp_reg  = reg,
+		.smp_type = type,
+	};
+
+	stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+				       cpu_callout_mask);
+}
+
+/**
+ * mtrr_add_page - Add a memory type region
+ * @base: Physical base address of region in pages (in units of 4 kB!)
+ * @size: Physical size of region in pages (4 kB)
+ * @type: Type of MTRR desired
+ * @increment: If this is true do usage counting on the region
+ *
+ * Memory type region registers control the caching on newer Intel and
+ * non Intel processors. This function allows drivers to request an
+ * MTRR is added. The details and hardware specifics of each processor's
+ * implementation are hidden from the caller, but nevertheless the
+ * caller should expect to need to provide a power of two size on an
+ * equivalent power of two boundary.
+ *
+ * If the region cannot be added either because all regions are in use
+ * or the CPU cannot support it a negative value is returned. On success
+ * the register number for this entry is returned, but should be treated
+ * as a cookie only.
+ *
+ * On a multiprocessor machine the changes are made to all processors.
+ * This is required on x86 by the Intel processors.
+ *
+ * The available types are
+ *
+ * %MTRR_TYPE_UNCACHABLE - No caching
+ *
+ * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
+ *
+ * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
+ *
+ * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
+ *
+ * BUGS: Needs a quiet flag for the cases where drivers do not mind
+ * failures and do not wish system log messages to be sent.
+ */
+int mtrr_add_page(unsigned long base, unsigned long size,
+		  unsigned int type, bool increment)
+{
+	unsigned long lbase, lsize;
+	int i, replace, error;
+	mtrr_type ltype;
+
+	if (!mtrr_enabled())
+		return -ENXIO;
+
+	/* Let the active interface veto regions the hardware cannot express. */
+	error = mtrr_if->validate_add_page(base, size, type);
+	if (error)
+		return error;
+
+	if (type >= MTRR_NUM_TYPES) {
+		pr_warn("type: %u invalid\n", type);
+		return -EINVAL;
+	}
+
+	/* If the type is WC, check that this processor supports it */
+	if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
+		pr_warn("your processor doesn't support write-combining\n");
+		return -ENOSYS;
+	}
+
+	if (!size) {
+		pr_warn("zero sized request\n");
+		return -EINVAL;
+	}
+
+	/* Neither the first nor the last page may need more than x86_phys_bits. */
+	if ((base | (base + size - 1)) >>
+	    (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
+		pr_warn("base or size exceeds the MTRR width\n");
+		return -EINVAL;
+	}
+
+	error = -EINVAL;
+	/*
+	 * replace: -1 = no same-type region enclosed by the new one seen yet,
+	 * >= 0 = index of the single such region, -2 = more than one.
+	 */
+	replace = -1;
+
+	/* No CPU hotplug when we change MTRR entries */
+	get_online_cpus();
+
+	/* Search for existing MTRR */
+	mutex_lock(&mtrr_mutex);
+	for (i = 0; i < num_var_ranges; ++i) {
+		mtrr_if->get(i, &lbase, &lsize, &ltype);
+		/* Skip free registers and regions that do not touch ours. */
+		if (!lsize || base > lbase + lsize - 1 ||
+		    base + size - 1 < lbase)
+			continue;
+		/*
+		 * At this point we know there is some kind of
+		 * overlap/enclosure
+		 */
+		if (base < lbase || base + size - 1 > lbase + lsize - 1) {
+			if (base <= lbase &&
+			    base + size - 1 >= lbase + lsize - 1) {
+				/* New region encloses an existing region */
+				if (type == ltype) {
+					replace = replace == -1 ? i : -2;
+					continue;
+				} else if (types_compatible(type, ltype))
+					continue;
+			}
+			pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
+				lsize);
+			goto out;
+		}
+		/* New region is enclosed by an existing region */
+		if (ltype != type) {
+			if (types_compatible(type, ltype))
+				continue;
+			pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
+				base, size, mtrr_attrib_to_str(ltype),
+				mtrr_attrib_to_str(type));
+			goto out;
+		}
+		/* Same type and already covered: just take another reference. */
+		if (increment)
+			++mtrr_usage_table[i];
+		error = i;
+		goto out;
+	}
+	/* Search for an empty MTRR */
+	i = mtrr_if->get_free_region(base, size, replace);
+	if (i >= 0) {
+		set_mtrr_cpuslocked(i, base, size, type);
+		if (likely(replace < 0)) {
+			mtrr_usage_table[i] = 1;
+		} else {
+			/* Inherit the usage count of the region being replaced. */
+			mtrr_usage_table[i] = mtrr_usage_table[replace];
+			if (increment)
+				mtrr_usage_table[i]++;
+			if (unlikely(replace != i)) {
+				set_mtrr_cpuslocked(replace, 0, 0, 0);
+				mtrr_usage_table[replace] = 0;
+			}
+		}
+	} else {
+		pr_info("no more MTRRs available\n");
+	}
+	error = i;
+ out:
+	mutex_unlock(&mtrr_mutex);
+	put_online_cpus();
+	return error;
+}
+
+static int mtrr_check(unsigned long base, unsigned long size)
+{
+	/* Both values must be page aligned; OR-ing them tests both at once. */
+	if (!((base | size) & (PAGE_SIZE - 1)))
+		return 0;
+	pr_warn("size and base must be multiples of 4 kiB\n");
+	pr_debug("size: 0x%lx base: 0x%lx\n", size, base);
+	dump_stack();
+	return -1;
+}
+
+/**
+ * mtrr_add - Add a memory type region
+ * @base: Physical base address of region
+ * @size: Physical size of region
+ * @type: Type of MTRR desired
+ * @increment: If this is true do usage counting on the region
+ *
+ * Memory type region registers control the caching on newer Intel and
+ * non Intel processors. This function allows drivers to request that
+ * an MTRR be added. The details and hardware specifics of each processor's
+ * implementation are hidden from the caller, but nevertheless the
+ * caller should expect to need to provide a power of two size on an
+ * equivalent power of two boundary.
+ *
+ * If the region cannot be added either because all regions are in use
+ * or the CPU cannot support it a negative value is returned. On success
+ * the register number for this entry is returned, but should be treated
+ * as a cookie only.
+ *
+ * On a multiprocessor machine the changes are made to all processors.
+ * This is required on x86 by the Intel processors.
+ *
+ * The available types are
+ *
+ * %MTRR_TYPE_UNCACHABLE - No caching
+ *
+ * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
+ *
+ * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
+ *
+ * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
+ *
+ * BUGS: Needs a quiet flag for the cases where drivers do not mind
+ * failures and do not wish system log messages to be sent.
+ */
+int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+	     bool increment)
+{
+	unsigned long pfn = base >> PAGE_SHIFT, npages = size >> PAGE_SHIFT;
+
+	if (!mtrr_enabled())
+		return -ENODEV;
+	return mtrr_check(base, size) ? -EINVAL :
+		mtrr_add_page(pfn, npages, type, increment);
+}
+
+/**
+ * mtrr_del_page - delete a memory type region
+ * @reg: Register returned by mtrr_add, or negative to look one up
+ * @base: Physical base address, in pages (only used if @reg < 0)
+ * @size: Size of region, in pages (only used if @reg < 0)
+ *
+ * If register is supplied then base and size are ignored. This is
+ * how drivers should call it.
+ *
+ * Releases an MTRR region. If the usage count drops to zero the
+ * register is freed and the region returns to default state.
+ * On success the register is returned, on failure -ENODEV (MTRRs
+ * disabled) or -EINVAL.
+ */
+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+	int i, max;
+	mtrr_type ltype;
+	unsigned long lbase, lsize;
+	int error = -EINVAL;
+
+	if (!mtrr_enabled())
+		return -ENODEV;
+
+	max = num_var_ranges;
+	/* No CPU hotplug when we change MTRR entries */
+	get_online_cpus();
+	mutex_lock(&mtrr_mutex);
+	if (reg < 0) {
+		/* Search for existing MTRR */
+		for (i = 0; i < max; ++i) {
+			mtrr_if->get(i, &lbase, &lsize, &ltype);
+			if (lbase == base && lsize == size) {
+				reg = i;
+				break;
+			}
+		}
+		if (reg < 0) {
+			pr_debug("no MTRR for %lx000,%lx000 found\n",
+				 base, size);
+			goto out;
+		}
+	}
+	if (reg >= max) {
+		pr_warn("register: %d too big\n", reg);
+		goto out;
+	}
+	mtrr_if->get(reg, &lbase, &lsize, &ltype);
+	if (lsize < 1) {
+		pr_warn("MTRR %d not used\n", reg);
+		goto out;
+	}
+	if (mtrr_usage_table[reg] < 1) {
+		pr_warn("reg: %d has count=0\n", reg);
+		goto out;
+	}
+	if (--mtrr_usage_table[reg] < 1)
+		set_mtrr_cpuslocked(reg, 0, 0, 0); /* last user: free it */
+	error = reg;
+ out:
+	mutex_unlock(&mtrr_mutex);
+	put_online_cpus();
+	return error;
+}
+
+/**
+ * mtrr_del - delete a memory type region
+ * @reg: Register returned by mtrr_add
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If register is supplied then base and size are ignored. This is
+ * how drivers should call it.
+ *
+ * Releases an MTRR region. If the usage count drops to zero the
+ * register is freed and the region returns to default state.
+ * On success the register is returned, on failure a negative error
+ * code.
+ */
+int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+	if (!mtrr_enabled())
+		return -ENODEV;
+	/* Byte values are validated first, then converted to page units. */
+	return mtrr_check(base, size) ? -EINVAL :
+		mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
+}
+
+/**
+ * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If PAT is available, this does nothing. If PAT is unavailable, it
+ * attempts to add a WC MTRR covering size bytes starting at base and
+ * logs an error if this fails.
+ *
+ * The caller should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
+ * Drivers must store the return value to pass to arch_phys_wc_del,
+ * but drivers should not try to interpret that return value.
+ */
+int arch_phys_wc_add(unsigned long base, unsigned long size)
+{
+	int ret;
+
+	if (pat_enabled() || !mtrr_enabled())
+		return 0; /* Success! (We don't need to do anything.) */
+
+	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
+	if (ret < 0) {
+		pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.\n",
+			(void *)base, (void *)(base + size - 1));
+		return ret;
+	}
+	return ret + MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL(arch_phys_wc_add);
+
+/*
+ * arch_phys_wc_del - undoes arch_phys_wc_add
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This releases the MTRR that arch_phys_wc_add may have set up.
+ *
+ * The API guarantees that arch_phys_wc_del(error code) and
+ * arch_phys_wc_del(0) do nothing.
+ */
+void arch_phys_wc_del(int handle)
+{
+	if (handle < 1)
+		return;
+	WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
+	mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
+}
+EXPORT_SYMBOL(arch_phys_wc_del);
+
+/*
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This will turn the return value from arch_phys_wc_add into an mtrr
+ * index suitable for debugging.
+ *
+ * Note: There is no legitimate use for this function, except possibly
+ * in printk line. Alas there is an illegitimate use in some ancient
+ * drm ioctls.
+ */
+int arch_phys_wc_index(int handle)
+{
+	/* Anything below the offset is not a handle that encodes an MTRR. */
+	if (handle >= MTRR_TO_PHYS_WC_OFFSET)
+		return handle - MTRR_TO_PHYS_WC_OFFSET;
+	return -1;
+}
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
+
+/*
+ * HACK ALERT!
+ * These should be called implicitly, but we can't yet until all the initcall
+ * stuff is done... So the vendor-specific init routines are called by hand.
+ */
+static void __init init_ifs(void)
+{
+#ifndef CONFIG_X86_64
+ amd_init_mtrr();
+ cyrix_init_mtrr();
+ centaur_init_mtrr();
+#endif /* !CONFIG_X86_64 */
+}
+
+/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
+ * MTRR driver doesn't require this
+ */
+struct mtrr_value {
+ mtrr_type ltype; /* type as reported by mtrr_if->get() */
+ unsigned long lbase; /* base as reported by mtrr_if->get() */
+ unsigned long lsize; /* size as reported by mtrr_if->get(); 0 is skipped on restore */
+};
+
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; /* filled by mtrr_save() */
+
+static int mtrr_save(void)
+{
+	struct mtrr_value *slot = mtrr_value;
+	unsigned int reg;
+
+	/* Snapshot every variable range so mtrr_restore() can replay it. */
+	for (reg = 0; reg < num_var_ranges; reg++, slot++)
+		mtrr_if->get(reg, &slot->lbase, &slot->lsize, &slot->ltype);
+
+	return 0;
+}
+
+static void mtrr_restore(void)
+{
+	const struct mtrr_value *slot;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		slot = &mtrr_value[reg];
+		/* Entries that were saved with a zero size are skipped. */
+		if (slot->lsize)
+			set_mtrr(reg, slot->lbase, slot->lsize, slot->ltype);
+	}
+}
+
+
+
+static struct syscore_ops mtrr_syscore_ops = { /* registered by mtrr_init_finialize() */
+ .suspend = mtrr_save, /* records each variable range in mtrr_value[] */
+ .resume = mtrr_restore, /* re-applies the recorded ranges via set_mtrr() */
+};
+
+int __initdata changed_by_mtrr_cleanup;
+
+#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
+/**
+ * mtrr_bp_init - initialize mtrrs on the boot CPU
+ *
+ * This needs to be called early; before any of the other CPUs are
+ * initialized (i.e. before smp_init()).
+ * It selects mtrr_if and derives size_or_mask/size_and_mask for it.
+ */
+void __init mtrr_bp_init(void)
+{
+ u32 phys_addr;
+
+ init_ifs();
+
+ phys_addr = 32; /* physical address width in bits; refined below */
+
+ if (boot_cpu_has(X86_FEATURE_MTRR)) {
+ mtrr_if = &generic_mtrr_ops;
+ size_or_mask = SIZE_OR_MASK_BITS(36);
+ size_and_mask = 0x00f00000;
+ phys_addr = 36;
+
+ /*
+ * This is an AMD specific CPUID leaf, but we assume(hope?) that
+ * Intel will implement it too when they extend the address
+ * bus of the Xeon.
+ */
+ if (cpuid_eax(0x80000000) >= 0x80000008) {
+ phys_addr = cpuid_eax(0x80000008) & 0xff; /* low byte of EAX */
+ /* CPUID workaround for Intel 0F33/0F34 CPU */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 0xF &&
+ boot_cpu_data.x86_model == 0x3 &&
+ (boot_cpu_data.x86_stepping == 0x3 ||
+ boot_cpu_data.x86_stepping == 0x4))
+ phys_addr = 36;
+
+ size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
+ size_and_mask = ~size_or_mask & 0xfffff00000ULL;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
+ boot_cpu_data.x86 == 6) {
+ /*
+ * VIA C* family have Intel style MTRRs,
+ * but don't support PAE
+ */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ phys_addr = 32;
+ }
+ } else {
+ /* No generic MTRR feature bit: try the vendor-specific ops tables. */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
+ /* Pre-Athlon (K6) AMD CPU MTRRs */
+ mtrr_if = mtrr_ops[X86_VENDOR_AMD];
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ }
+ break;
+ case X86_VENDOR_CENTAUR:
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
+ mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ }
+ break;
+ case X86_VENDOR_CYRIX:
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
+ mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (mtrr_if) {
+ __mtrr_enabled = true;
+ set_num_var_ranges();
+ init_table();
+ if (use_intel()) {
+ /* BIOS may override */
+ __mtrr_enabled = get_mtrr_state();
+
+ if (mtrr_enabled())
+ mtrr_bp_pat_init();
+
+ if (mtrr_cleanup(phys_addr)) {
+ changed_by_mtrr_cleanup = 1; /* read by mtrr_init_finialize() */
+ mtrr_if->set_all();
+ }
+ }
+ }
+
+ if (!mtrr_enabled()) {
+ pr_info("Disabled\n");
+
+ /*
+ * PAT initialization relies on MTRR's rendezvous handler.
+ * Skip PAT init until the handler can initialize both
+ * features independently.
+ */
+ pat_disable("MTRRs disabled, skipping PAT initialization too.");
+ }
+}
+
+void mtrr_ap_init(void)
+{
+	/*
+	 * Nothing to do when MTRRs are off, when the non-Intel interface is
+	 * in use, or when AP setup has been deferred to mtrr_aps_init().
+	 */
+	if (!mtrr_enabled() || !use_intel() || mtrr_aps_delayed_init)
+		return;
+
+	rcu_cpu_starting(smp_processor_id());
+
+	/*
+	 * Ideally mtrr_mutex would be held here so the entries cannot
+	 * change, but this runs during CPU bring-up where taking the
+	 * lock breaks things.
+	 *
+	 * This routine is called in two cases:
+	 *
+	 * 1. very early in software resume, when no MTRR entry can change;
+	 *
+	 * 2. CPU hot-add time, where mtrr_add/del_page hold the CPU hotplug
+	 *    lock to prevent MTRR entry changes.
+	 */
+	set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
+}
+
+/*
+ * mtrr_save_state - save the current fixed-range MTRR state of the first
+ * cpu in cpu_online_mask, by calling mtrr_save_fixed_ranges() on that cpu.
+ */
+void mtrr_save_state(void)
+{
+	/* Nothing to save when MTRRs are disabled. */
+	if (mtrr_enabled()) {
+		int cpu = cpumask_first(cpu_online_mask);
+
+		smp_call_function_single(cpu, mtrr_save_fixed_ranges, NULL, 1);
+	}
+}
+
+/*
+ * Request that AP MTRR setup be deferred; mtrr_aps_init() performs it later.
+ */
+void set_mtrr_aps_delayed_init(void)
+{
+	/* Deferral only applies with MTRRs enabled on the Intel interface. */
+	if (mtrr_enabled() && use_intel())
+		mtrr_aps_delayed_init = true;
+}
+
+/*
+ * Delayed MTRR initialization for all APs.
+ *
+ * Counterpart of set_mtrr_aps_delayed_init(): performs the deferred update
+ * and clears the request flag.
+ */
+void mtrr_aps_init(void)
+{
+	/*
+	 * Act only on the Intel interface with MTRRs enabled, and only if
+	 * someone requested the delay via set_mtrr_aps_delayed_init() prior
+	 * to this point. If not, then we are done.
+	 */
+	if (!use_intel() || !mtrr_enabled() || !mtrr_aps_delayed_init)
+		return;
+
+	set_mtrr(~0U, 0, 0, 0);
+	mtrr_aps_delayed_init = false;
+}
+
+/* mtrr_bp_restore - re-run mtrr_if->set_all() (Intel-style interface only). */
+void mtrr_bp_restore(void)
+{
+	/* Both conditions must hold, otherwise there is nothing to do. */
+	if (use_intel() && mtrr_enabled())
+		mtrr_if->set_all();
+}
+
+static int __init mtrr_init_finialize(void)
+{
+	if (!mtrr_enabled())
+		return 0;
+
+	if (!use_intel()) {
+		/*
+		 * The CPU has no MTRR and seems to not support SMP. Such
+		 * CPUs have specific drivers, so suspend/resume is handled
+		 * by saving and restoring the ranges through syscore ops.
+		 *
+		 * TBD: is there any system with such a CPU which supports
+		 * suspend/resume? If not, this code should be removed.
+		 */
+		register_syscore_ops(&mtrr_syscore_ops);
+		return 0;
+	}
+
+	/* Intel-style interface: warn unless mtrr_cleanup() changed things. */
+	if (!changed_by_mtrr_cleanup)
+		mtrr_state_warn();
+	return 0;
+}
+subsys_initcall(mtrr_init_finialize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
new file mode 100644
index 0000000..2ac99e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * local MTRR defines.
+ */
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+
+#define MTRR_CHANGE_MASK_FIXED 0x01
+#define MTRR_CHANGE_MASK_VARIABLE 0x02
+#define MTRR_CHANGE_MASK_DEFTYPE 0x04
+
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+
+struct mtrr_ops {
+ u32 vendor; /* compared against X86_VENDOR_* by is_cpu() */
+ u32 use_intel_if; /* use_intel() is true when this is 1 */
+ void (*set)(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type);
+ void (*set_all)(void);
+
+ void (*get)(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type);
+ int (*get_free_region)(unsigned long base, unsigned long size,
+ int replace_reg);
+ int (*validate_add_page)(unsigned long base, unsigned long size,
+ unsigned int type);
+ int (*have_wrcomb)(void);
+};
+
+extern int generic_get_free_region(unsigned long base, unsigned long size,
+ int replace_reg);
+extern int generic_validate_add_page(unsigned long base, unsigned long size,
+ unsigned int type);
+
+extern const struct mtrr_ops generic_mtrr_ops;
+
+extern int positive_have_wrcomb(void);
+
+/* library functions for processor-specific routines */
+struct set_mtrr_context { /* passed to the set_mtrr_*() helpers declared below */
+ unsigned long flags;
+ unsigned long cr4val; /* presumably a saved CR4 value -- TODO confirm */
+ u32 deftype_lo; /* NOTE(review): looks like the low/high halves of a */
+ u32 deftype_hi; /* saved default-type MSR -- confirm against users */
+ u32 ccr3;
+};
+
+void set_mtrr_done(struct set_mtrr_context *ctxt);
+void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
+void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
+
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
+bool get_mtrr_state(void);
+void mtrr_bp_pat_init(void);
+
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
+
+extern u64 size_or_mask, size_and_mask;
+extern const struct mtrr_ops *mtrr_if;
+
+#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
+#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
+
+extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
+
+void mtrr_state_warn(void);
+const char *mtrr_attrib_to_str(int x);
+void mtrr_wrmsr(unsigned, unsigned, unsigned);
+
+/* CPU specific mtrr init functions */
+int amd_init_mtrr(void);
+int cyrix_init_mtrr(void);
+int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 0000000..d389083
--- /dev/null
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * local apic based NMI watchdog for various CPUs.
+ *
+ * This file also handles reservation of performance counters for coordination
+ * with other users (like oprofile).
+ *
+ * Note that these events normally don't tick when the CPU idles. This means
+ * the frequency varies with CPU load.
+ *
+ * Original code for K7/P6 written by Keith Owens
+ *
+ */
+
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <asm/nmi.h>
+#include <linux/kprobes.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+
+/*
+ * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
+ * offset from MSR_P4_BSU_ESCR0.
+ *
+ * It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
+/*
+ * perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evntsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ * different subsystems this reservation system just tries to coordinate
+ * things a little
+ */
+static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
+static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
+
+/* Map a performance counter MSR to its bit in the reservation bitmap. */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (msr >= MSR_F15H_PERF_CTR)
+			return (msr - MSR_F15H_PERF_CTR) >> 1;
+		return msr - MSR_K7_PERFCTR0;
+	}
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return msr - MSR_ARCH_PERFMON_PERFCTR0;
+
+	/* Without architectural perfmon the base MSR depends on the family. */
+	switch (boot_cpu_data.x86) {
+	case 6:
+		return msr - MSR_P6_PERFCTR0;
+	case 11:
+		return msr - MSR_KNC_PERFCTR0;
+	case 15:
+		return msr - MSR_P4_BPU_PERFCTR0;
+	}
+	return 0;
+}
+
+/*
+ * Map an event selection MSR to its bit in the reservation bitmap,
+ * i.e. the bit offset of the event selection register.
+ */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		if (msr >= MSR_F15H_PERF_CTL)
+			return (msr - MSR_F15H_PERF_CTL) >> 1;
+		return msr - MSR_K7_EVNTSEL0;
+	}
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return msr - MSR_ARCH_PERFMON_EVENTSEL0;
+
+	/* Without architectural perfmon the base MSR depends on the family. */
+	switch (boot_cpu_data.x86) {
+	case 6:
+		return msr - MSR_P6_EVNTSEL0;
+	case 11:
+		return msr - MSR_KNC_EVNTSEL0;
+	case 15:
+		return msr - MSR_P4_BSU_ESCR0;
+	}
+	return 0;
+}
+
+/* Returns nonzero if perfctr bit @counter is unreserved (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+	BUG_ON(counter >= NMI_MAX_COUNTER_BITS); /* valid bits: 0..MAX-1 */
+
+	return !test_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+
+/* Reserve the perfctr bit of @msr: 1 on success, 0 if already reserved. */
+int reserve_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter = nmi_perfctr_msr_to_bit(msr);
+
+	/*
+	 * Register not managed by the allocator? The bitmap holds bits
+	 * 0..NMI_MAX_COUNTER_BITS-1, so the bound itself is out of range.
+	 */
+	if (counter >= NMI_MAX_COUNTER_BITS)
+		return 1;
+	return !test_and_set_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+
+/* Clear the perfctr reservation bit that corresponds to @msr. */
+void release_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter = nmi_perfctr_msr_to_bit(msr);
+
+	/* Not managed by the allocator? Valid bits are 0..MAX-1 only. */
+	if (counter >= NMI_MAX_COUNTER_BITS)
+		return;
+
+	clear_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(release_perfctr_nmi);
+
+/* Reserve the evntsel bit of @msr: 1 on success, 0 if already reserved. */
+int reserve_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter = nmi_evntsel_msr_to_bit(msr);
+
+	/*
+	 * Register not managed by the allocator? The bitmap holds bits
+	 * 0..NMI_MAX_COUNTER_BITS-1, so the bound itself is out of range.
+	 */
+	if (counter >= NMI_MAX_COUNTER_BITS)
+		return 1;
+	return !test_and_set_bit(counter, evntsel_nmi_owner);
+}
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+
+/* Clear the evntsel reservation bit that corresponds to @msr. */
+void release_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter = nmi_evntsel_msr_to_bit(msr);
+
+	/* Not managed by the allocator? Valid bits are 0..MAX-1 only. */
+	if (counter >= NMI_MAX_COUNTER_BITS)
+		return;
+
+	clear_bit(counter, evntsel_nmi_owner);
+}
+EXPORT_SYMBOL(release_evntsel_nmi);
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 0000000..fd6ec2a
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Strings for the various x86 power flags
+ *
+ * This file must not contain any executable code.
+ */
+
+#include <asm/cpufeature.h>
+
+const char *const x86_power_flags[32] = { /* index = bit number; unlisted entries are NULL */
+ "ts", /* temperature sensor */
+ "fid", /* frequency id control */
+ "vid", /* voltage id control */
+ "ttp", /* thermal trip */
+ "tm", /* hardware thermal control */
+ "stc", /* software thermal control */
+ "100mhzsteps", /* 100 MHz multiplier control */
+ "hwpstate", /* hardware P-state control */
+ "", /* tsc invariant mapped to constant_tsc */
+ "cpb", /* core performance boost */
+ "eff_freq_ro", /* Readonly aperf/mperf */
+ "proc_feedback", /* processor feedback interface */
+ "acc_power", /* accumulated power mechanism */
+};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
new file mode 100644
index 0000000..2c8522a
--- /dev/null
+++ b/arch/x86/kernel/cpu/proc.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/smp.h>
+#include <linux/timex.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/cpufreq.h>
+
+#include "cpu.h"
+
+/*
+ * Print the topology fields of @c for CPU @cpu to @m (CONFIG_SMP builds only).
+ */
+static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
+ unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpumask_weight(topology_core_cpumask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+#endif /* CONFIG_SMP */
+}
+
+#ifdef CONFIG_X86_32 /* 32-bit variant: bug and FPU lines depend on the CPU */
+static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
+{
+ seq_printf(m,
+ "fdiv_bug\t: %s\n"
+ "f00f_bug\t: %s\n"
+ "coma_bug\t: %s\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "wp\t\t: yes\n",
+ static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
+ static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", /* fpu_exception follows fpu */
+ c->cpuid_level);
+}
+#else /* !CONFIG_X86_32: fpu, fpu_exception and wp are always "yes" */
+static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
+{
+ seq_printf(m,
+ "fpu\t\t: yes\n"
+ "fpu_exception\t: yes\n"
+ "cpuid level\t: %d\n"
+ "wp\t\t: yes\n",
+ c->cpuid_level);
+}
+#endif /* CONFIG_X86_32 */
+
+/* seq_file show callback: print the record of the CPU whose cpuinfo_x86 is @v. */
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+	struct cpuinfo_x86 *c = v;
+	unsigned int cpu = c->cpu_index;
+	int i;
+
+	seq_printf(m, "processor\t: %u\n"
+		   "vendor_id\t: %s\n"
+		   "cpu family\t: %d\n"
+		   "model\t\t: %u\n"
+		   "model name\t: %s\n",
+		   cpu,
+		   c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+		   c->x86,
+		   c->x86_model,
+		   c->x86_model_id[0] ? c->x86_model_id : "unknown");
+
+	if (c->x86_stepping || c->cpuid_level >= 0)
+		seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
+	else
+		seq_puts(m, "stepping\t: unknown\n");
+	if (c->microcode)
+		seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
+
+	if (cpu_has(c, X86_FEATURE_TSC)) {
+		unsigned int freq = aperfmperf_get_khz(cpu);
+
+		/* Fall back step by step while the reported value is 0. */
+		if (!freq)
+			freq = cpufreq_quick_get(cpu);
+		if (!freq)
+			freq = cpu_khz;
+		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
+			   freq / 1000, (freq % 1000));
+	}
+
+	/* Cache size */
+	if (c->x86_cache_size)
+		seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
+
+	show_cpuinfo_core(m, c, cpu);
+	show_cpuinfo_misc(m, c);
+
+	seq_puts(m, "flags\t\t:");
+	for (i = 0; i < 32*NCAPINTS; i++)
+		if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+			seq_printf(m, " %s", x86_cap_flags[i]);
+
+	seq_puts(m, "\nbugs\t\t:");
+	for (i = 0; i < 32*NBUGINTS; i++) {
+		unsigned int bug_bit = 32*NCAPINTS + i;
+
+		if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+			seq_printf(m, " %s", x86_bug_flags[i]);
+	}
+
+	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+		   c->loops_per_jiffy/(500000/HZ),
+		   (c->loops_per_jiffy/(5000/HZ)) % 100);
+
+#ifdef CONFIG_X86_64
+	if (c->x86_tlbsize > 0)
+		seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+#endif
+	seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
+	seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+	seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
+		   c->x86_phys_bits, c->x86_virt_bits);
+
+	seq_puts(m, "power management:");
+	for (i = 0; i < 32; i++) {
+		if (c->x86_power & (1U << i)) { /* 1U: bit 31 must not overflow int */
+			if (i < ARRAY_SIZE(x86_power_flags) &&
+			    x86_power_flags[i])
+				seq_printf(m, "%s%s",
+					   x86_power_flags[i][0] ? " " : "",
+					   x86_power_flags[i]);
+			else
+				seq_printf(m, " [%d]", i);
+		}
+	}
+
+	seq_puts(m, "\n\n");
+
+	return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	/* Search from *pos - 1 so that CPU *pos itself is a candidate. */
+	*pos = cpumask_next(*pos - 1, cpu_online_mask);
+
+	return *pos < nr_cpu_ids ? &cpu_data(*pos) : NULL;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;		/* step to the following position, then look it up */
+	return c_start(m, pos);
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+
+const struct seq_operations cpuinfo_op = {
+ .start = c_start, /* maps *pos to the next online CPU, or NULL */
+ .next = c_next, /* increments *pos and reuses c_start() */
+ .stop = c_stop, /* empty body */
+ .show = show_cpuinfo, /* prints one CPU's record */
+};
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
new file mode 100644
index 0000000..cfa97ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -0,0 +1,59 @@
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/archrandom.h>
+#include <asm/sections.h>
+
+static int __init x86_rdrand_setup(char *s) /* handler for "nordrand"; @s is unused */
+{
+ setup_clear_cpu_cap(X86_FEATURE_RDRAND);
+ setup_clear_cpu_cap(X86_FEATURE_RDSEED); /* RDSEED is cleared along with RDRAND */
+ return 1;
+}
+__setup("nordrand", x86_rdrand_setup);
+
+/*
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check.
+ * If it fails, it is simple to disable RDRAND here.
+ */
+#define SANITY_CHECK_LOOPS 8
+
+#ifdef CONFIG_ARCH_RANDOM
+void x86_init_rdrand(struct cpuinfo_x86 *c)
+{
+	unsigned long scratch;
+	int passes = 0;
+
+	if (!cpu_has(c, X86_FEATURE_RDRAND))
+		return;
+
+	/* Count successful invocations, stopping at the first failure. */
+	while (passes < SANITY_CHECK_LOOPS && rdrand_long(&scratch))
+		passes++;
+	if (passes == SANITY_CHECK_LOOPS)
+		return;
+	clear_cpu_cap(c, X86_FEATURE_RDRAND);
+	pr_warn_once("rdrand: disabled\n");
+}
+#endif
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 0000000..772c219
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,82 @@
+/*
+ * Routines to identify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+#include <asm/apic.h>
+
+struct cpuid_bit {
+ u16 feature; /* X86_FEATURE_* to set; 0 terminates the table */
+ u8 reg; /* index into the regs[] array filled by cpuid_count() */
+ u8 bit; /* bit number tested within that register */
+ u32 level; /* first argument passed to cpuid_count() */
+ u32 sub_leaf; /* second argument passed to cpuid_count() */
+};
+
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
+ { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
+ { 0, 0, 0, 0, 0 } /* terminator: the loops stop at feature == 0 */
+};
+
+void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+	u32 max_level;
+	u32 regs[4];
+	const struct cpuid_bit *cb;
+
+	/* Set in @c each cpuid_bits[] feature whose CPUID bit is present. */
+	for (cb = cpuid_bits; cb->feature; cb++) {
+		/* Verify that the level is valid */
+		max_level = cpuid_eax(cb->level & 0xffff0000);
+		if (max_level < cb->level ||
+		    max_level > (cb->level | 0xffff))
+			continue;
+
+		cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+			    &regs[CPUID_EBX], &regs[CPUID_ECX],
+			    &regs[CPUID_EDX]);
+
+		if (regs[cb->reg] & (1U << cb->bit))
+			set_cpu_cap(c, cb->feature);
+	}
+}
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+			     enum cpuid_regs_idx reg)
+{
+	const struct cpuid_bit *entry;
+	u32 mask = 0;
+
+	for (entry = cpuid_bits; entry->feature; entry++) {
+		/* The table is sorted by level, so nothing can follow. */
+		if (entry->level > level)
+			break;
+
+		if (entry->level < level)
+			continue;
+		if (entry->reg != reg || entry->sub_leaf != sub_leaf)
+			continue;
+
+		if (cpu_has(&boot_cpu_data, entry->feature))
+			mask |= BIT(entry->bit);
+	}
+
+	return mask;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
new file mode 100644
index 0000000..71ca064
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+
+#include <linux/cpu.h>
+#include <asm/apic.h>
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+/* leaf 0xb SMT level */
+#define SMT_LEVEL 0
+
+/* leaf 0xb sub-leaf types */
+#define INVALID_TYPE 0
+#define SMT_TYPE 1
+#define CORE_TYPE 2
+
+#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
+#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
+
+int detect_extended_topology_early(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned int eax, ebx, ecx, edx;
+	bool leaf_valid;
+
+	if (c->cpuid_level < 0xb)
+		return -1;
+
+	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+	/*
+	 * Leaf 0xb counts as implemented only if sub-leaf 0 reports a
+	 * non-zero EBX and the SMT level type.
+	 */
+	leaf_valid = ebx != 0 && LEAFB_SUBTYPE(ecx) == SMT_TYPE;
+	if (!leaf_valid)
+		return -1;
+
+	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+	/* EDX holds the initial apic id (the 32-bit extended x2apic id). */
+	c->initial_apicid = edx;
+	smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+#endif
+	return 0;
+}
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection. Returns -1 if the leaf is unusable, 0 otherwise.
+ */
+int detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned int eax, ebx, ecx, edx, sub_index;
+	unsigned int ht_mask_width, core_plus_mask_width;
+	unsigned int core_select_mask, core_level_siblings;
+
+	if (detect_extended_topology_early(c) < 0)
+		return -1;
+
+	/*
+	 * Populate HT related information from sub-leaf level 0.
+	 */
+	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+	core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+	core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+	sub_index = 1;
+	do {
+		cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+		/*
+		 * Check for the Core type in the implemented sub leaves.
+		 */
+		if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+			core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+			core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+			break;
+		}
+
+		sub_index++;
+	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+	/* Unsigned all-ones: left-shifting -1 is undefined behaviour in C. */
+	core_select_mask = (~(~0U << core_plus_mask_width)) >> ht_mask_width;
+
+	c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
+						 & core_select_mask;
+	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+	/*
+	 * Reinit the apicid, now that we have extended initial_apicid.
+	 */
+	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+
+	c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+#endif
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
new file mode 100644
index 0000000..42c9398
--- /dev/null
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include "cpu.h"
+
+/*
+ * Load the Transmeta-defined feature flags (CPUID level 0x80860001, EDX)
+ * into the capability array of @c, provided the CPU reports that level.
+ */
+static void early_init_transmeta(struct cpuinfo_x86 *c)
+{
+	u32 max_level = cpuid_eax(0x80860000);
+
+	/* The reported maximum must itself lie in the 0x8086xxxx range. */
+	if ((max_level & 0xffff0000) != 0x80860000)
+		return;
+	if (max_level < 0x80860001)
+		return;
+
+	c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
+}
+
+/*
+ * Full init hook for Transmeta CPUs.
+ *
+ * Re-runs the early flag detection, detects cache sizes, logs the processor
+ * and Code Morphing Software revisions plus the CPU information string,
+ * re-reads the CPUID(1).EDX feature word while the capability mask MSR is
+ * opened up, forces X86_FEATURE_CONSTANT_TSC and, with CONFIG_SYSCTL,
+ * turns randomize_va_space off.
+ */
+static void init_transmeta(struct cpuinfo_x86 *c)
+{
+ unsigned int cap_mask, uk, max, dummy;
+ unsigned int cms_rev1, cms_rev2;
+ unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
+ /* Four CPUID leaves of 16 bytes each, plus the terminating NUL. */
+ char cpu_info[65];
+
+ early_init_transmeta(c);
+
+ cpu_detect_cache_sizes(c);
+
+ /* Print CMS and CPU revision */
+ max = cpuid_eax(0x80860000);
+ cpu_rev = 0;
+ if (max >= 0x80860001) {
+ cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
+ /*
+ * A revision of 0x02000000 is not printed here; in that case
+ * the revision reported by level 0x80860002 is printed below.
+ */
+ if (cpu_rev != 0x02000000) {
+ pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+ (cpu_rev >> 24) & 0xff,
+ (cpu_rev >> 16) & 0xff,
+ (cpu_rev >> 8) & 0xff,
+ cpu_rev & 0xff,
+ cpu_freq);
+ }
+ }
+ if (max >= 0x80860002) {
+ cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
+ if (cpu_rev == 0x02000000) {
+ pr_info("CPU: Processor revision %08X, %u MHz\n",
+ new_cpu_rev, cpu_freq);
+ }
+ pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+ (cms_rev1 >> 24) & 0xff,
+ (cms_rev1 >> 16) & 0xff,
+ (cms_rev1 >> 8) & 0xff,
+ cms_rev1 & 0xff,
+ cms_rev2);
+ }
+ if (max >= 0x80860006) {
+ /*
+ * Levels 0x80860003..0x80860006 each fill 16 consecutive bytes
+ * of cpu_info (EAX, EBX, ECX, EDX in that order).
+ */
+ cpuid(0x80860003,
+ (void *)&cpu_info[0],
+ (void *)&cpu_info[4],
+ (void *)&cpu_info[8],
+ (void *)&cpu_info[12]);
+ cpuid(0x80860004,
+ (void *)&cpu_info[16],
+ (void *)&cpu_info[20],
+ (void *)&cpu_info[24],
+ (void *)&cpu_info[28]);
+ cpuid(0x80860005,
+ (void *)&cpu_info[32],
+ (void *)&cpu_info[36],
+ (void *)&cpu_info[40],
+ (void *)&cpu_info[44]);
+ cpuid(0x80860006,
+ (void *)&cpu_info[48],
+ (void *)&cpu_info[52],
+ (void *)&cpu_info[56],
+ (void *)&cpu_info[60]);
+ cpu_info[64] = '\0';
+ pr_info("CPU: %s\n", cpu_info);
+ }
+
+ /*
+ * Unhide possibly hidden capability flags: save the low word of MSR
+ * 0x80860004, write all ones to it, re-read CPUID(1).EDX, then put the
+ * saved value back. The high word (uk) is written back unchanged.
+ */
+ rdmsr(0x80860004, cap_mask, uk);
+ wrmsr(0x80860004, ~0, uk);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
+ wrmsr(0x80860004, cap_mask, uk);
+
+ /* All Transmeta CPUs have a constant TSC */
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_SYSCTL
+ /*
+ * randomize_va_space slows us down enormously;
+ * it probably triggers retranslation of x86->native bytecode
+ */
+ randomize_va_space = 0;
+#endif
+}
+
+/* Vendor descriptor that hooks the Transmeta init routines into CPU setup. */
+static const struct cpu_dev transmeta_cpu_dev = {
+	.c_x86_vendor	= X86_VENDOR_TRANSMETA,
+	.c_vendor	= "Transmeta",
+	.c_ident	= { "GenuineTMx86", "TransmetaCPU" },
+	.c_init		= init_transmeta,
+	.c_early_init	= early_init_transmeta,
+};
+
+cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
new file mode 100644
index 0000000..65a58a3
--- /dev/null
+++ b/arch/x86/kernel/cpu/umc.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * UMC chips appear to be only either 386 or 486, so this descriptor
+ * carries no init hooks -- only the vendor identification and the names
+ * of the known family 4 models.
+ */
+
+static const struct cpu_dev umc_cpu_dev = {
+	.c_x86_vendor	= X86_VENDOR_UMC,
+	.c_vendor	= "UMC",
+	.c_ident	= { "UMC UMC UMC" },
+	.legacy_models	= {
+		{
+			.family = 4,
+			.model_names = {
+				[1] = "U5D",
+				[2] = "U5S",
+			},
+		},
+	},
+};
+
+cpu_dev_register(umc_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 0000000..d805202
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,214 @@
+/*
+ * VMware Detection code.
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/clocksource.h>
+#include <asm/div64.h>
+#include <asm/x86_init.h>
+#include <asm/hypervisor.h>
+#include <asm/timer.h>
+#include <asm/apic.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
+
+/* CPUID leaf whose EBX/ECX/EDX are compared against "VMwareVMware". */
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+/* Value loaded into EAX for every VMWARE_PORT() call. */
+#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
+/* I/O port number loaded into EDX for the "inl" in VMWARE_PORT(). */
+#define VMWARE_HYPERVISOR_PORT 0x5658
+
+/* Command numbers, loaded into ECX by VMWARE_PORT(). */
+#define VMWARE_PORT_CMD_GETVERSION 10
+#define VMWARE_PORT_CMD_GETHZ 45
+#define VMWARE_PORT_CMD_GETVCPU_INFO 68
+/*
+ * Despite the _CMD_ prefix, the next two are used in this file as bit
+ * numbers within the EAX value returned by GETVCPU_INFO.
+ */
+#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
+#define VMWARE_PORT_CMD_VCPU_RESERVED 31
+
+/*
+ * Issue command VMWARE_PORT_CMD_<cmd> through an "inl" on the hypervisor
+ * port. Inputs: EAX = magic, ECX = command, EDX = port, EBX = UINT_MAX.
+ * On return the four lvalue arguments receive EAX, EBX, ECX and EDX.
+ * Note that the expansion already ends with a semicolon.
+ */
+#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
+ __asm__("inl (%%dx)" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "0"(VMWARE_HYPERVISOR_MAGIC), \
+ "1"(VMWARE_PORT_CMD_##cmd), \
+ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
+ "memory");
+
+/*
+ * TSC frequency in kHz as reported by the hypervisor; set once in
+ * vmware_platform_setup() and left at 0 when the query fails.
+ */
+static unsigned long vmware_tsc_khz __ro_after_init;
+
+/*
+ * Probe the hypervisor port with GETVERSION.  Returns non-zero when EAX is
+ * not all ones and EBX comes back holding the hypervisor magic.
+ */
+static inline int __vmware_platform(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
+
+	if (eax == (uint32_t)-1)
+		return 0;
+
+	return ebx == VMWARE_HYPERVISOR_MAGIC;
+}
+
+/*
+ * Return the TSC frequency (kHz) cached in vmware_tsc_khz. Installed as
+ * both x86_platform.calibrate_tsc and x86_platform.calibrate_cpu by
+ * vmware_platform_setup().
+ */
+static unsigned long vmware_get_tsc_khz(void)
+{
+ return vmware_tsc_khz;
+}
+
+#ifdef CONFIG_PARAVIRT
+/* mult/shift/offset used by vmware_sched_clock() to turn TSC cycles into ns. */
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+/* Non-zero (the default) lets vmware_paravirt_ops_setup() install the PV sched clock. */
+static int vmw_sched_clock __initdata = 1;
+
+/*
+ * Handler for the "no-vmw-sched-clock" early parameter: clears
+ * vmw_sched_clock. The parameter value @s is ignored.
+ */
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = 0;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+/*
+ * sched_clock() replacement: scale the current TSC value to nanoseconds
+ * with the precomputed mult/shift pair and subtract the offset recorded by
+ * vmware_sched_clock_setup().
+ */
+static unsigned long long notrace vmware_sched_clock(void)
+{
+	unsigned long long scaled;
+
+	scaled = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+				 vmware_cyc2ns.cyc2ns_shift);
+
+	return scaled - vmware_cyc2ns.cyc2ns_offset;
+}
+
+/*
+ * Compute the cycles->ns conversion factors from vmware_tsc_khz, record the
+ * scaled value of the TSC sampled on entry as the clock offset, and install
+ * vmware_sched_clock() as the paravirt sched_clock.
+ */
+static void __init vmware_sched_clock_setup(void)
+{
+	unsigned long long boot_tsc = rdtsc();
+
+	clocks_calc_mult_shift(&vmware_cyc2ns.cyc2ns_mul,
+			       &vmware_cyc2ns.cyc2ns_shift,
+			       vmware_tsc_khz, NSEC_PER_MSEC, 0);
+	vmware_cyc2ns.cyc2ns_offset =
+		mul_u64_u32_shr(boot_tsc, vmware_cyc2ns.cyc2ns_mul,
+				vmware_cyc2ns.cyc2ns_shift);
+
+	pv_time_ops.sched_clock = vmware_sched_clock;
+	pr_info("using sched offset of %llu ns\n",
+		vmware_cyc2ns.cyc2ns_offset);
+}
+
+/*
+ * Fill in the paravirt hooks for VMware: name the hypervisor, make
+ * io_delay a no-op, and switch to the VMware sched clock when a TSC
+ * frequency is known and the clock was not disabled on the command line.
+ */
+static void __init vmware_paravirt_ops_setup(void)
+{
+	pv_info.name = "VMware hypervisor";
+	pv_cpu_ops.io_delay = paravirt_nop;
+
+	if (!vmware_tsc_khz || !vmw_sched_clock)
+		return;
+
+	vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
+/*
+ * The VMware hypervisor exports a reliable TSC to the guest, yet timing
+ * differences on virtual cpus can still get the TSC marked unstable:
+ *
+ *  - the boot-time TSC sync check can fail because of a marginal offset
+ *    between the vcpus' TSCs, even though they do not drift apart;
+ *  - the ACPI PM timer is not suitable as a clocksource watchdog under a
+ *    hypervisor, since a wrap of the counter may be missed when the vcpu
+ *    is descheduled for a long time.
+ *
+ * Forcing the capability bits below skips these runtime checks, so the
+ * kernel simply trusts the hypervisor to provide a virtual TSC that is
+ * suitable for timekeeping.
+ */
+static void __init vmware_set_capabilities(void)
+{
+	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+	setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
+}
+
+/*
+ * Platform init hook: query the hypervisor for the TSC frequency (and, with
+ * a local APIC, the bus clock), install the calibration overrides, then set
+ * up the paravirt hooks and the forced TSC capabilities.
+ */
+static void __init vmware_platform_setup(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+	uint64_t lpj, tsc_khz;
+
+	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+
+	if (ebx == UINT_MAX) {
+		pr_warn("Failed to get TSC freq from the hypervisor\n");
+	} else {
+		/* EBX:EAX form the 64-bit frequency; scale it down to kHz. */
+		tsc_khz = eax | (((uint64_t)ebx) << 32);
+		lpj = tsc_khz;
+		do_div(tsc_khz, 1000);
+		WARN_ON(tsc_khz >> 32);
+		pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+			(unsigned long) tsc_khz / 1000,
+			(unsigned long) tsc_khz % 1000);
+
+		/* Only derive loops-per-jiffy when none was preset. */
+		if (!preset_lpj) {
+			do_div(lpj, HZ);
+			preset_lpj = lpj;
+		}
+
+		vmware_tsc_khz = tsc_khz;
+		x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+		x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+		/* Skip lapic calibration since we know the bus frequency. */
+		lapic_timer_frequency = ecx / HZ;
+		pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+			ecx);
+#endif
+	}
+
+	vmware_paravirt_ops_setup();
+
+#ifdef CONFIG_X86_IO_APIC
+	no_timer_check = 1;
+#endif
+
+	vmware_set_capabilities();
+}
+
+/*
+ * Detect whether we run under the VMware hypervisor.
+ *
+ * With the HYPERVISOR cpu feature set, the vendor signature in the VMware
+ * CPUID leaf decides; the leaf number is returned on a match.  Without it,
+ * the DMI product serial is checked -- that alone should be enough, as it
+ * always carries a VMware specific string under the VMware hypervisor --
+ * and confirmed through the hypervisor port, returning 1 on success.
+ * Returns 0 when VMware is not detected.
+ */
+static uint32_t __init vmware_platform(void)
+{
+	unsigned int eax;
+	unsigned int hyper_vendor_id[3];
+
+	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		if (dmi_available && dmi_name_in_serial("VMware") &&
+		    __vmware_platform())
+			return 1;
+		return 0;
+	}
+
+	cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+	      &hyper_vendor_id[1], &hyper_vendor_id[2]);
+	if (memcmp(hyper_vendor_id, "VMwareVMware", 12))
+		return 0;
+
+	return CPUID_VMWARE_INFO_LEAF;
+}
+
+/*
+ * Checks if hypervisor supports x2apic without VT-D interrupt remapping:
+ * in the GETVCPU_INFO result the reserved bit must be clear and the
+ * legacy x2apic bit must be set.
+ */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
+
+	if (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED))
+		return false;
+
+	return (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+}
+
+/* Hypervisor descriptor wiring the VMware detection and init callbacks. */
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
+	.name	= "VMware",
+	.type	= X86_HYPER_VMWARE,
+	.detect	= vmware_platform,
+	.init	= {
+		.init_platform		= vmware_platform_setup,
+		.x2apic_available	= vmware_legacy_x2apic_available,
+	},
+};
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644
index 0000000..1d300f9
--- /dev/null
+++ b/arch/x86/kernel/cpuid.c
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * x86 CPUID access device
+ *
+ * This device is accessed by lseek() to the appropriate CPUID level
+ * and then read in chunks of 16 bytes. A larger size means multiple
+ * reads of consecutive levels.
+ *
+ * The lower 32 bits of the file position is used as the incoming %eax,
+ * and the upper 32 bits of the file position as the incoming %ecx,
+ * the latter intended for "counting" eax levels like eax=4.
+ *
+ * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/completion.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+/* Device class created in cpuid_init(); the per-CPU devices hang off it. */
+static struct class *cpuid_class;
+/* Hotplug state returned by cpuhp_setup_state(), needed again for removal. */
+static enum cpuhp_state cpuhp_cpuid_state;
+
+/*
+ * Request block handed to cpuid_smp_cpuid(): @regs holds the input
+ * eax/ecx on entry and all four result registers afterwards; @done is
+ * completed once the CPUID instruction has been executed.
+ */
+struct cpuid_regs_done {
+ struct cpuid_regs regs;
+ struct completion done;
+};
+
+/*
+ * Cross-call handler: execute CPUID with the eax/ecx found in the request
+ * block, store the four result registers back into it, and signal the
+ * waiter.
+ */
+static void cpuid_smp_cpuid(void *cmd_block)
+{
+	struct cpuid_regs_done *cmd = cmd_block;
+	struct cpuid_regs *r = &cmd->regs;
+
+	cpuid_count(r->eax, r->ecx, &r->eax, &r->ebx, &r->ecx, &r->edx);
+
+	complete(&cmd->done);
+}
+
+/*
+ * read() handler for /dev/cpu/N/cpuid.
+ *
+ * @count must be a multiple of 16. For every 16-byte chunk the low 32 bits
+ * of the file position are used as the input EAX and the high 32 bits as
+ * the input ECX; CPUID is executed on the CPU selected by the device minor
+ * and the resulting register block is copied to user space. The position
+ * then advances by one, so larger reads return consecutive levels.
+ *
+ * Returns the number of bytes copied, or -EINVAL for a bad chunk size. If
+ * nothing was copied, returns the error from the cross-call or -EFAULT.
+ */
+static ssize_t cpuid_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char __user *tmp = buf;
+ struct cpuid_regs_done cmd;
+ int cpu = iminor(file_inode(file));
+ u64 pos = *ppos;
+ ssize_t bytes = 0;
+ int err = 0;
+
+ if (count % 16)
+ return -EINVAL; /* Invalid chunk size */
+
+ init_completion(&cmd.done);
+ for (; count; count -= 16) {
+ call_single_data_t csd = {
+ .func = cpuid_smp_cpuid,
+ .info = &cmd,
+ };
+
+ cmd.regs.eax = pos;
+ cmd.regs.ecx = pos >> 32;
+
+ /*
+ * The call is asynchronous; cmd and csd live on this stack, so
+ * wait for the handler to finish before touching them again.
+ */
+ err = smp_call_function_single_async(cpu, &csd);
+ if (err)
+ break;
+ wait_for_completion(&cmd.done);
+ if (copy_to_user(tmp, &cmd.regs, 16)) {
+ err = -EFAULT;
+ break;
+ }
+ tmp += 16;
+ bytes += 16;
+ *ppos = ++pos;
+ /* Re-arm the completion for the next chunk. */
+ reinit_completion(&cmd.done);
+ }
+
+ /* A partial read wins over the error that stopped it. */
+ return bytes ? bytes : err;
+}
+
+/*
+ * open() handler: the device minor selects the CPU.  Fails with -ENXIO when
+ * that CPU does not exist or is offline, and with -EIO when it does not
+ * support CPUID.
+ */
+static int cpuid_open(struct inode *inode, struct file *file)
+{
+	unsigned int cpu = iminor(file_inode(file));
+
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return -ENXIO;	/* No such CPU */
+
+	if (cpu_data(cpu).cpuid_level < 0)
+		return -EIO;	/* CPUID not supported */
+
+	return 0;
+}
+
+/*
+ * File operations of the cpuid character device: open, seek to the wanted
+ * CPUID level, and read.
+ */
+static const struct file_operations cpuid_fops = {
+	.owner	= THIS_MODULE,
+	.open	= cpuid_open,
+	.llseek	= no_seek_end_llseek,
+	.read	= cpuid_read,
+};
+
+/*
+ * CPU online callback: create the "cpu<N>" device for @cpu in the cpuid
+ * class.  Returns 0 on success or the error encoded by device_create().
+ */
+static int cpuid_device_create(unsigned int cpu)
+{
+	struct device *dev = device_create(cpuid_class, NULL,
+					   MKDEV(CPUID_MAJOR, cpu), NULL,
+					   "cpu%d", cpu);
+
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	return 0;
+}
+
+/*
+ * CPU offline callback: remove the device created for @cpu by
+ * cpuid_device_create(). Always returns 0.
+ */
+static int cpuid_device_destroy(unsigned int cpu)
+{
+ device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
+}
+
+/*
+ * devnode callback of the cpuid class: returns a newly allocated
+ * "cpu/<minor>/cpuid" string for @dev (NULL if the allocation fails).
+ * @mode is left untouched.
+ */
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+}
+
+/*
+ * Module init: register the character device region, create the device
+ * class and register the CPU hotplug callbacks that create and destroy the
+ * per-CPU devices. On failure everything set up so far is undone.
+ *
+ * Returns 0 on success, -EBUSY if the chrdev region cannot be registered,
+ * or the error from class_create() / cpuhp_setup_state().
+ */
+static int __init cpuid_init(void)
+{
+ int err;
+
+ /* One minor per possible CPU, all under the fixed CPUID major. */
+ if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
+ "cpu/cpuid", &cpuid_fops)) {
+ printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
+ CPUID_MAJOR);
+ return -EBUSY;
+ }
+ cpuid_class = class_create(THIS_MODULE, "cpuid");
+ if (IS_ERR(cpuid_class)) {
+ err = PTR_ERR(cpuid_class);
+ goto out_chrdev;
+ }
+ cpuid_class->devnode = cpuid_devnode;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
+
+ /* A non-negative return is the hotplug state; cpuid_exit() removes it. */
+ cpuhp_cpuid_state = err;
+ return 0;
+
+out_class:
+ class_destroy(cpuid_class);
+out_chrdev:
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
+ return err;
+}
+module_init(cpuid_init);
+
+/*
+ * Module exit: undo cpuid_init() in reverse order -- hotplug state first,
+ * then the class, then the chrdev region.
+ */
+static void __exit cpuid_exit(void)
+{
+ cpuhp_remove_state(cpuhp_cpuid_state);
+ class_destroy(cpuid_class);
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
+}
+module_exit(cpuid_exit);
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic CPUID driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
new file mode 100644
index 0000000..f631a3f
--- /dev/null
+++ b/arch/x86/kernel/crash.c
@@ -0,0 +1,483 @@
+/*
+ * Architecture specific (i386/x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/e820/types.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+#include <linux/kdebug.h>
+#include <asm/cpu.h>
+#include <asm/reboot.h>
+#include <asm/virtext.h>
+#include <asm/intel_pt.h>
+
+/*
+ * Used while preparing memory map entries for second kernel: context
+ * passed to memmap_entry_callback() for each resource walked.
+ */
+struct crash_memmap_data {
+ /* boot_params whose e820 table receives the entries */
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
+
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+/*
+ * Zero-filled word used by crash_load_segments() as the source buffer of
+ * the backup segment.
+ */
+unsigned long crash_zero_bytes;
+
+/*
+ * Invoke the registered VMCLEAR callback, if there is one, inside an RCU
+ * read-side critical section.
+ */
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+	crash_vmclear_fn *vmclear;
+
+	rcu_read_lock();
+	vmclear = rcu_dereference(crash_vmclear_loaded_vmcss);
+	if (vmclear)
+		vmclear();
+	rcu_read_unlock();
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+
+/*
+ * Callback handed to nmi_shootdown_cpus() by kdump_nmi_shootdown_cpus().
+ * Saves the register state of @cpu for the crash dump, then quiesces
+ * virtualization, Intel PT and the local APIC on it.
+ */
+static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ struct pt_regs fixed_regs;
+
+ /*
+ * For non-user-mode regs, save a fixed-up copy instead.
+ * NOTE(review): presumably because kernel-mode pt_regs on 32-bit lack
+ * valid ss/esp -- confirm against crash_fixup_ss_esp().
+ */
+ if (!user_mode(regs)) {
+ crash_fixup_ss_esp(&fixed_regs, regs);
+ regs = &fixed_regs;
+ }
+#endif
+ crash_save_cpu(regs, cpu);
+
+ /*
+ * VMCLEAR VMCSs loaded on all cpus if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
+ /* Disable VMX or SVM if needed.
+ *
+ * We need to disable virtualization on all CPUs.
+ * Having VMX or SVM enabled on any CPU may break rebooting
+ * after the kdump kernel has finished its task.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
+ disable_local_APIC();
+}
+
+/*
+ * Stop the other CPUs through nmi_shootdown_cpus(), running
+ * kdump_nmi_callback() on them, then disable the local APIC of the calling
+ * CPU as well.
+ */
+void kdump_nmi_shootdown_cpus(void)
+{
+ nmi_shootdown_cpus(kdump_nmi_callback);
+
+ disable_local_APIC();
+}
+
+/*
+ * Override the weak function in kernel/panic.c
+ *
+ * Stops the other CPUs once only; later calls return immediately.  The
+ * crash_stop_other_cpus hook of smp_ops is used when it is set, otherwise
+ * smp_send_stop().
+ */
+void crash_smp_send_stop(void)
+{
+	static int cpus_stopped;
+
+	if (!cpus_stopped) {
+		if (smp_ops.crash_stop_other_cpus)
+			smp_ops.crash_stop_other_cpus();
+		else
+			smp_send_stop();
+
+		cpus_stopped = 1;
+	}
+}
+
+#else
+/* Variant for builds without both CONFIG_SMP and CONFIG_X86_LOCAL_APIC. */
+void crash_smp_send_stop(void)
+{
+ /* There are no cpus to shootdown */
+}
+#endif
+
+/*
+ * Crash shutdown for the crashing CPU: stop the other CPUs, disable
+ * virtualization and Intel PT, shut down the interrupt controllers and
+ * HPET, and finally save this CPU's register state (@regs) for the dump.
+ */
+void native_machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has panicked or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+ /* The kernel is broken so disable interrupts */
+ local_irq_disable();
+
+ crash_smp_send_stop();
+
+ /*
+ * VMCLEAR VMCSs loaded on this cpu if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
+ /* Booting kdump kernel with VMX or SVM enabled won't work,
+ * because (among other limitations) we can't disable paging
+ * with the virt flags.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
+#ifdef CONFIG_X86_IO_APIC
+ /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
+ ioapic_zap_locks();
+ clear_IO_APIC();
+#endif
+ lapic_shutdown();
+ restore_boot_irq_mode();
+#ifdef CONFIG_HPET_TIMER
+ hpet_disable();
+#endif
+ /* Record this CPU's own state last, after the hardware is quiesced. */
+ crash_save_cpu(regs, safe_smp_processor_id());
+}
+
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * Resource walk callback: bump the unsigned int counter that @arg points
+ * to, once per resource visited.  Always returns 0.
+ */
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
+{
+	unsigned int *count = arg;
+
+	*count += 1;
+	return 0;
+}
+
+/*
+ * Allocate a zeroed crash_mem large enough for every System RAM range plus
+ * two spare slots, as needed for preparing the ELF headers of the RAM
+ * regions.  Returns NULL when there are no RAM ranges or the allocation
+ * fails; the caller releases the result with vfree().
+ */
+static struct crash_mem *fill_up_crash_elf_data(void)
+{
+	struct crash_mem *cmem;
+	unsigned int nr_ranges = 0;
+
+	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
+	if (nr_ranges == 0)
+		return NULL;
+
+	/*
+	 * Excluding the crash region and/or crashk_low_res may each split
+	 * a range in two, hence the two extra slots.
+	 */
+	nr_ranges += 2;
+
+	cmem = vzalloc(sizeof(struct crash_mem) +
+		       sizeof(struct crash_mem_range) * nr_ranges);
+	if (cmem) {
+		cmem->max_nr_ranges = nr_ranges;
+		cmem->nr_ranges = 0;
+	}
+
+	return cmem;
+}
+
+/*
+ * Remove the crashkernel region and, when one is configured, the low
+ * crashkernel region from the ranges in @cmem.  An exclusion may split a
+ * range; split ranges are put in the cmem->ranges[] array.
+ *
+ * Returns 0 on success or the error from crash_exclude_mem_range().
+ */
+static int elf_header_exclude_ranges(struct crash_mem *cmem)
+{
+	int ret;
+
+	/* Exclude crashkernel region */
+	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+	if (ret || !crashk_low_res.end)
+		return ret;
+
+	return crash_exclude_mem_range(cmem, crashk_low_res.start,
+				       crashk_low_res.end);
+}
+
+/*
+ * Resource walk callback: append [res->start, res->end] to the crash_mem
+ * that @arg points to.  Always returns 0.
+ */
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
+{
+	struct crash_mem *cmem = arg;
+	struct crash_mem_range *slot = &cmem->ranges[cmem->nr_ranges];
+
+	slot->start = res->start;
+	slot->end = res->end;
+	cmem->nr_ranges++;
+
+	return 0;
+}
+
+/*
+ * Prepare elf headers.  On success *addr and *sz receive the header buffer
+ * and its size; the temporary range list is freed in every case.
+ *
+ * Returns 0 on success, -ENOMEM when the range list cannot be set up, or
+ * the error of the failing step.
+ */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+					unsigned long *sz)
+{
+	struct crash_mem *cmem;
+	Elf64_Ehdr *ehdr;
+	Elf64_Phdr *phdr;
+	int ret, i;
+
+	cmem = fill_up_crash_elf_data();
+	if (!cmem)
+		return -ENOMEM;
+
+	/* Collect the RAM ranges, drop the unwanted ones, build headers. */
+	ret = walk_system_ram_res(0, -1, cmem,
+				  prepare_elf64_ram_headers_callback);
+	if (!ret)
+		ret = elf_header_exclude_ranges(cmem);
+	if (!ret)
+		/* By default prepare 64bit headers */
+		ret = crash_prepare_elf64_headers(cmem,
+				IS_ENABLED(CONFIG_X86_64), addr, sz);
+	if (ret)
+		goto out;
+
+	/*
+	 * The PT_LOAD header matching the backup source region must point
+	 * at the backup segment instead; only the first match is adjusted.
+	 */
+	ehdr = (Elf64_Ehdr *)*addr;
+	phdr = (Elf64_Phdr *)(ehdr + 1);
+	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+		if (phdr->p_type != PT_LOAD)
+			continue;
+		if (phdr->p_paddr != image->arch.backup_src_start)
+			continue;
+		if (phdr->p_memsz != image->arch.backup_src_sz)
+			continue;
+
+		phdr->p_offset = image->arch.backup_load_addr;
+		break;
+	}
+out:
+	vfree(cmem);
+	return ret;
+}
+
+/*
+ * Append @entry to the e820 table carried in @params.
+ *
+ * Returns 0 on success, or 1 when the table already holds
+ * E820_MAX_ENTRIES_ZEROPAGE entries (the entry is then dropped).
+ */
+static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
+{
+	unsigned int nr_e820_entries;
+
+	nr_e820_entries = params->e820_entries;
+	if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
+		return 1;
+
+	/* Copy into the first free slot, then account for it. */
+	memcpy(&params->e820_table[nr_e820_entries], entry,
+			sizeof(struct e820_entry));
+	params->e820_entries++;
+	return 0;
+}
+
+/*
+ * Resource walk callback: add @res as an e820 entry of the type given in
+ * the crash_memmap_data that @arg points to.  The result of
+ * add_e820_entry() is not checked; always returns 0.
+ */
+static int memmap_entry_callback(struct resource *res, void *arg)
+{
+	struct crash_memmap_data *cmd = arg;
+	struct e820_entry ei;
+
+	ei.type = cmd->type;
+	ei.addr = res->start;
+	ei.size = resource_size(res);
+	add_e820_entry(cmd->params, &ei);
+
+	return 0;
+}
+
+/*
+ * Reset @cmem to the single range [mstart, mend] and cut the backup
+ * region and the ELF header region of @image out of it.
+ *
+ * Returns 0 on success or the error from crash_exclude_mem_range().
+ */
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+				 unsigned long long mstart,
+				 unsigned long long mend)
+{
+	unsigned long excl_start, excl_end;
+	int err;
+
+	cmem->nr_ranges = 1;
+	cmem->ranges[0].start = mstart;
+	cmem->ranges[0].end = mend;
+
+	/* Exclude Backup region */
+	excl_start = image->arch.backup_load_addr;
+	excl_end = excl_start + image->arch.backup_src_sz - 1;
+	err = crash_exclude_mem_range(cmem, excl_start, excl_end);
+	if (err)
+		return err;
+
+	/* Exclude elf header region */
+	excl_start = image->arch.elf_load_addr;
+	excl_end = excl_start + image->arch.elf_headers_sz - 1;
+
+	return crash_exclude_mem_range(cmem, excl_start, excl_end);
+}
+
+/*
+ * Prepare memory map for crash dump kernel.
+ *
+ * Fills the e820 table in @params with: the backup source region (first
+ * 640K), the ACPI table and ACPI NVS resources, the low crashkernel region
+ * if present, and what remains of the crashkernel region once the backup
+ * and ELF header segments of @image have been cut out.  Entries that do
+ * not fit into the table are silently dropped by add_e820_entry().
+ *
+ * Returns 0 on success, -ENOMEM if the scratch range list cannot be
+ * allocated, or the error from memmap_exclude_ranges().
+ */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+	/*
+	 * The scratch list starts as one range and memmap_exclude_ranges()
+	 * performs two exclusions, each of which may split a range in two.
+	 */
+	const unsigned int nr_ranges = 3;
+	int i, ret = 0;
+	unsigned long flags;
+	struct e820_entry ei;
+	struct crash_memmap_data cmd;
+	struct crash_mem *cmem;
+
+	/*
+	 * Reserve room for the ranges[] slots as well; the bare struct has
+	 * no space for even ranges[0], which memmap_exclude_ranges() writes.
+	 */
+	cmem = vzalloc(sizeof(struct crash_mem) +
+		       sizeof(struct crash_mem_range) * nr_ranges);
+	if (!cmem)
+		return -ENOMEM;
+	cmem->max_nr_ranges = nr_ranges;
+
+	memset(&cmd, 0, sizeof(struct crash_memmap_data));
+	cmd.params = params;
+
+	/* Add first 640K segment */
+	ei.addr = image->arch.backup_src_start;
+	ei.size = image->arch.backup_src_sz;
+	ei.type = E820_TYPE_RAM;
+	add_e820_entry(params, &ei);
+
+	/* Add ACPI tables */
+	cmd.type = E820_TYPE_ACPI;
+	flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
+		       memmap_entry_callback);
+
+	/* Add ACPI Non-volatile Storage */
+	cmd.type = E820_TYPE_NVS;
+	walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
+			memmap_entry_callback);
+
+	/* Add crashk_low_res region */
+	if (crashk_low_res.end) {
+		ei.addr = crashk_low_res.start;
+		ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+		ei.type = E820_TYPE_RAM;
+		add_e820_entry(params, &ei);
+	}
+
+	/* Exclude some ranges from crashk_res and add rest to memmap */
+	ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
+						crashk_res.end);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < cmem->nr_ranges; i++) {
+		ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+		/* If entry is less than a page, skip it */
+		if (ei.size < PAGE_SIZE)
+			continue;
+		ei.addr = cmem->ranges[i].start;
+		ei.type = E820_TYPE_RAM;
+		add_e820_entry(params, &ei);
+	}
+
+out:
+	vfree(cmem);
+	return ret;
+}
+
+/*
+ * Resource walk callback: record @res as the backup source region of the
+ * kimage that @arg points to.  Returns 1 so that the walk stops after the
+ * first range -- only one range is expected for the backup region.
+ */
+static int determine_backup_region(struct resource *res, void *arg)
+{
+	struct kimage *image = arg;
+
+	image->arch.backup_src_sz = resource_size(res);
+	image->arch.backup_src_start = res->start;
+
+	return 1;
+}
+
+/*
+ * Add the crash-specific segments to @image: a backup segment for the
+ * first 640K of RAM (when such a region is found) and a segment holding
+ * the ELF core headers. The load addresses and the header buffer are
+ * recorded in image->arch.
+ *
+ * Returns 0 on success or a negative error from the failing step.
+ */
+int crash_load_segments(struct kimage *image)
+{
+ int ret;
+ struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+ .buf_max = ULONG_MAX, .top_down = false };
+
+ /*
+ * Determine and load a segment for backup area. First 640K RAM
+ * region is backup source
+ */
+
+ ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+ image, determine_backup_region);
+
+ /* Zero or positive return values are ok */
+ if (ret < 0)
+ return ret;
+
+ /* Add backup segment. */
+ if (image->arch.backup_src_sz) {
+ kbuf.buffer = &crash_zero_bytes;
+ kbuf.bufsz = sizeof(crash_zero_bytes);
+ kbuf.memsz = image->arch.backup_src_sz;
+ kbuf.buf_align = PAGE_SIZE;
+ /*
+ * Ideally there is no source for backup segment. This is
+ * copied in purgatory after crash. Just add a zero filled
+ * segment for now to make sure checksum logic works fine.
+ */
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ return ret;
+ image->arch.backup_load_addr = kbuf.mem;
+ pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+ image->arch.backup_load_addr,
+ image->arch.backup_src_start, kbuf.memsz);
+ }
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
+ if (ret)
+ return ret;
+
+ image->arch.elf_headers = kbuf.buffer;
+ image->arch.elf_headers_sz = kbuf.bufsz;
+
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret) {
+ /* The header buffer was not handed over; release it here. */
+ vfree((void *)image->arch.elf_headers);
+ return ret;
+ }
+ image->arch.elf_load_addr = kbuf.mem;
+ /* memsz equals bufsz for this segment, hence bufsz is printed twice. */
+ pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
+
+ return ret;
+}
+#endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644
index 0000000..33ee476
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <linux/uaccess.h>
+
+/*
+ * Bounce page allocated by kdump_buf_page_init(); copy_oldmem_page() stages
+ * data here before copying it to user space.
+ */
+static void *kdump_buf_page;
+
+/*
+ * Tell whether @pfn of the crashed kernel can be mapped by this kernel.
+ * Always true with PAE; without PAE the pfn must survive a round trip
+ * through a pte.
+ */
+static inline bool is_crashed_pfn_valid(unsigned long pfn)
+{
+#ifdef CONFIG_X86_PAE
+	return true;
+#else
+	/*
+	 * non-PAE kdump kernel executed from a PAE one will crop high pte
+	 * bits and poke unwanted space counting again from address 0, we
+	 * don't want that. pte must fit into unsigned long. In fact the
+	 * test checks high 12 bits for being zero (pfn will be shifted left
+	 * by PAGE_SHIFT).
+	 */
+	return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
+#endif
+}
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
+ * The page of "oldmem" has no pte mapped in the current kernel, so one is
+ * stitched up, similar to kmap_atomic.
+ *
+ * copy_to_user() should not be called in atomic context.  For user buffers
+ * the page is therefore first copied into a pre-allocated kernel page and
+ * handed to user space only after the atomic mapping is gone.
+ *
+ * Returns @csize on success, 0 when @csize is 0, or -EFAULT when @pfn is
+ * not mappable, the bounce page is missing, or the user copy fails.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+			       size_t csize, unsigned long offset, int userbuf)
+{
+	void *vaddr;
+
+	if (!csize)
+		return 0;
+
+	if (!is_crashed_pfn_valid(pfn))
+		return -EFAULT;
+
+	vaddr = kmap_atomic_pfn(pfn);
+
+	/* Kernel destination: copy straight out of the atomic mapping. */
+	if (!userbuf) {
+		memcpy(buf, (vaddr + offset), csize);
+		kunmap_atomic(vaddr);
+		return csize;
+	}
+
+	if (!kdump_buf_page) {
+		printk(KERN_WARNING "Kdump: Kdump buffer page not"
+			" allocated\n");
+		kunmap_atomic(vaddr);
+		return -EFAULT;
+	}
+
+	/* User destination: go through the bounce page. */
+	copy_page(kdump_buf_page, vaddr);
+	kunmap_atomic(vaddr);
+	if (copy_to_user(buf, (kdump_buf_page + offset), csize))
+		return -EFAULT;
+
+	return csize;
+}
+
+/*
+ * Allocate the bounce page used by copy_oldmem_page() for user space
+ * copies.  Returns 0 on success, -ENOMEM (after a warning) otherwise.
+ */
+static int __init kdump_buf_page_init(void)
+{
+	kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (kdump_buf_page)
+		return 0;
+
+	printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
+	       " page\n");
+	return -ENOMEM;
+}
+arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
new file mode 100644
index 0000000..4f2e077
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/crash_dump.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
+ * The page of "oldmem" has no pte mapped in the current kernel, so it is
+ * mapped temporarily with ioremap_cache() for the duration of the copy.
+ *
+ * Returns @csize on success, 0 when @csize is 0, -ENOMEM when the page
+ * cannot be mapped, or -EFAULT when the user copy fails.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+		size_t csize, unsigned long offset, int userbuf)
+{
+	void *vaddr;
+
+	if (!csize)
+		return 0;
+
+	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!vaddr)
+		return -ENOMEM;
+
+	if (!userbuf) {
+		memcpy(buf, vaddr + offset, csize);
+	} else if (copy_to_user(buf, vaddr + offset, csize)) {
+		iounmap(vaddr);
+		return -EFAULT;
+	}
+
+	set_iounmap_nonlazy();
+	iounmap(vaddr);
+	return csize;
+}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 0000000..f39f3a0
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Architecture specific OF callbacks.
+ */
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/of_irq.h>
+#include <linux/libfdt.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/of_pci.h>
+#include <linux/initrd.h>
+
+#include <asm/irqdomain.h>
+#include <asm/hpet.h>
+#include <asm/apic.h>
+#include <asm/pci_x86.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+
+__initdata u64 initial_dtb; /* set by add_dtb(); x86_flattree_get_config() bails out while it is 0 */
+char __initdata cmd_line[COMMAND_LINE_SIZE];
+
+int __initdata of_ioapic; /* set to 1 by dtb_ioapic_setup() when nr_ioapics is non-zero */
+
+void __init early_init_dt_scan_chosen_arch(unsigned long node)
+{
+ BUG(); /* unconditional BUG(): presumably this hook must never be reached on x86 -- confirm */
+}
+
+void __init early_init_dt_add_memory_arch(u64 base, u64 size)
+{
+ BUG(); /* unconditional BUG(): both arguments are ignored; presumably never expected to run on x86 -- confirm */
+}
+
+void __init add_dtb(u64 data)
+{
+	initial_dtb = offsetof(struct setup_data, data) + data;	/* address of the data[] member, past the header fields */
+}
+
+/*
+ * CE4100 ids, used by add_bus_probe(). Will be moved to machine_device_initcall() once we have it.
+ */
+static struct of_device_id __initdata ce4100_ids[] = {
+ { .compatible = "intel,ce4100-cp", },
+ { .compatible = "isa", },
+ { .compatible = "pci", },
+ {}, /* empty terminating entry */
+};
+
+static int __init add_bus_probe(void)
+{
+	/* Probe the ce4100_ids buses only when a device tree is populated. */
+	if (of_have_populated_dt())
+		return of_platform_bus_probe(NULL, ce4100_ids, NULL);
+	return 0;
+}
+device_initcall(add_bus_probe);
+
+#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	struct device_node *node;
+
+	/* Match the bus number against the first cell of each "bus-range". */
+	for_each_node_by_type(node, "pci") {
+		const void *range = of_get_property(node, "bus-range", NULL);
+		unsigned int first_bus;
+
+		if (range == NULL)
+			continue;
+		first_bus = be32_to_cpup(range);
+		if (first_bus == bus->number)
+			return node;
+	}
+	return NULL;
+}
+
+static int x86_of_pci_irq_enable(struct pci_dev *dev)
+{
+	u8 irq_pin;
+	u32 mapped;
+	int err = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &irq_pin);
+
+	if (err != 0)
+		return err;
+	if (irq_pin == 0)	/* pin register reads 0: nothing to map */
+		return 0;
+
+	mapped = of_irq_parse_and_map_pci(dev, 0, 0);
+	if (!mapped)
+		return -EINVAL;
+
+	dev->irq = mapped;
+	return 0;
+}
+
+static void x86_of_pci_irq_disable(struct pci_dev *dev)
+{ /* no-op; installed as pcibios_disable_irq by x86_of_pci_init() */
+}
+
+void x86_of_pci_init(void)
+{
+	pcibios_disable_irq = x86_of_pci_irq_disable;	/* empty stub */
+	pcibios_enable_irq = x86_of_pci_irq_enable;	/* maps the IRQ via of_irq_parse_and_map_pci() */
+}
+#endif
+
+static void __init dtb_setup_hpet(void)
+{
+#ifdef CONFIG_HPET_TIMER
+	struct device_node *dn;
+	struct resource r;
+	int ret;
+
+	/* Take hpet_address from the "intel,ce4100-hpet" node, if there is one. */
+	dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
+	if (!dn)
+		return;
+	ret = of_address_to_resource(dn, 0, &r);
+	of_node_put(dn);	/* release the reference the lookup returned */
+	if (WARN_ON(ret))
+		return;
+	hpet_address = r.start;
+#endif
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static void __init dtb_cpu_setup(void)
+{
+	struct device_node *cpu_node;
+	u32 lapic_version, id;
+
+	lapic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+	/* Register each "cpu" node whose "reg" property can be read as a u32. */
+	for_each_node_by_type(cpu_node, "cpu") {
+		if (of_property_read_u32(cpu_node, "reg", &id) >= 0) {
+			generic_processor_info(id, lapic_version);
+			continue;
+		}
+		pr_warn("%pOF: missing local APIC ID\n", cpu_node);
+	}
+}
+
+static void __init dtb_lapic_setup(void)
+{
+	struct device_node *dn;
+	struct resource r;
+	unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE;
+	int ret;
+
+	/* An "intel,ce4100-lapic" node, if present, overrides the default base. */
+	dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
+	if (dn) {
+		ret = of_address_to_resource(dn, 0, &r);
+		of_node_put(dn);	/* release the reference the lookup returned */
+		if (WARN_ON(ret))
+			return;
+		lapic_addr = r.start;
+	}
+
+	/* Did the boot loader setup the local APIC ? */
+	if (!boot_cpu_has(X86_FEATURE_APIC) && apic_force_enable(lapic_addr))
+		return;
+	smp_found_config = 1;
+	pic_mode = 1;
+	register_lapic_address(lapic_addr);
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+static unsigned int ioapic_id; /* pre-incremented by dtb_add_ioapic() before each mp_register_ioapic() call */
+
+struct of_ioapic_type {
+ u32 out_type; /* an IRQ_TYPE_* value */
+ u32 trigger; /* IOAPIC_EDGE or IOAPIC_LEVEL; handed to ioapic_set_alloc_attr() */
+ u32 polarity; /* 0 or 1; handed to ioapic_set_alloc_attr() */
+};
+
+static struct of_ioapic_type of_ioapic_type[] = /* indexed by fwspec->param[1] in dt_irqdomain_alloc() */
+{
+ { /* [0] */
+ .out_type = IRQ_TYPE_EDGE_RISING,
+ .trigger = IOAPIC_EDGE,
+ .polarity = 1,
+ },
+ { /* [1] */
+ .out_type = IRQ_TYPE_LEVEL_LOW,
+ .trigger = IOAPIC_LEVEL,
+ .polarity = 0,
+ },
+ { /* [2] */
+ .out_type = IRQ_TYPE_LEVEL_HIGH,
+ .trigger = IOAPIC_LEVEL,
+ .polarity = 1,
+ },
+ { /* [3] */
+ .out_type = IRQ_TYPE_EDGE_FALLING,
+ .trigger = IOAPIC_EDGE,
+ .polarity = 0,
+ },
+};
+
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
+{
+	struct irq_fwspec *spec = arg;
+	struct irq_alloc_info info;
+	struct of_ioapic_type *entry;
+	int idx;
+
+	/* Need two cells: param[0] is the pin, param[1] selects the type. */
+	if (WARN_ON(spec->param_count < 2))
+		return -EINVAL;
+
+	idx = spec->param[1];
+	if (idx >= ARRAY_SIZE(of_ioapic_type))
+		return -EINVAL;
+	entry = &of_ioapic_type[idx];
+
+	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, entry->trigger, entry->polarity);
+	info.ioapic_pin = spec->param[0];
+	info.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+	return mp_irqdomain_alloc(domain, virq, nr_irqs, &info);
+}
+
+static const struct irq_domain_ops ioapic_irq_domain_ops = { /* installed as cfg.ops by dtb_add_ioapic() */
+ .alloc = dt_irqdomain_alloc, /* the only callback defined in this file */
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
+
+static void __init dtb_add_ioapic(struct device_node *dn)
+{
+	struct ioapic_domain_cfg domain_cfg = {
+		.type = IOAPIC_DOMAIN_DYNAMIC,
+		.ops = &ioapic_irq_domain_ops,
+		.dev = dn,
+	};
+	struct resource res;
+
+	if (of_address_to_resource(dn, 0, &res) != 0) {
+		printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn);
+		return;
+	}
+	/* Hand out the next ID: the file-local counter is bumped before use. */
+	ioapic_id++;
+	mp_register_ioapic(ioapic_id, res.start, gsi_top, &domain_cfg);
+}
+
+static void __init dtb_ioapic_setup(void)
+{
+	struct device_node *node;
+
+	for_each_compatible_node(node, NULL, "intel,ce4100-ioapic")
+		dtb_add_ioapic(node);
+
+	/* Flag of_ioapic only if nr_ioapics is non-zero; otherwise complain. */
+	if (!nr_ioapics)
+		printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+	else
+		of_ioapic = 1;
+}
+#else
+static void __init dtb_ioapic_setup(void) {} /* empty variant used when CONFIG_X86_IO_APIC is not defined */
+#endif
+
+static void __init dtb_apic_setup(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ dtb_lapic_setup(); /* local APIC address first ... */
+ dtb_cpu_setup(); /* ... then one generic_processor_info() per readable "cpu" node */
+#endif
+ dtb_ioapic_setup(); /* empty stub unless CONFIG_X86_IO_APIC is defined */
+}
+
+#ifdef CONFIG_OF_EARLY_FLATTREE
+static void __init x86_flattree_get_config(void)
+{
+	u32 fdt_size, mapped_len;
+	void *fdt;
+
+	if (initial_dtb == 0)
+		return;
+
+	/* Initial window: up to the end of the page, but never under 128 bytes. */
+	mapped_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+	fdt = early_memremap(initial_dtb, mapped_len);
+	fdt_size = fdt_totalsize(fdt);
+	if (fdt_size > mapped_len) {
+		/* The blob reports more than is mapped: remap its full size. */
+		early_memunmap(fdt, mapped_len);
+		mapped_len = fdt_size;
+		fdt = early_memremap(initial_dtb, mapped_len);
+	}
+	early_init_dt_verify(fdt);
+	unflatten_and_copy_device_tree();
+	early_memunmap(fdt, mapped_len);
+}
+#else
+static inline void x86_flattree_get_config(void) { } /* empty variant used when CONFIG_OF_EARLY_FLATTREE is not defined */
+#endif
+
+void __init x86_dtb_init(void)
+{
+	x86_flattree_get_config();
+
+	/* Skip HPET/APIC setup unless a device tree has been populated. */
+	if (of_have_populated_dt()) {
+		dtb_setup_hpet();
+		dtb_apic_setup();
+	}
+}
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
new file mode 100644
index 0000000..0b8cedb
--- /dev/null
+++ b/arch/x86/kernel/doublefault.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_X86_32
+
+#define DOUBLEFAULT_STACKSIZE (1024) /* counted in unsigned longs (see the array below), not bytes */
+static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) /* address one past the array's last element */
+
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) /* true only strictly inside (PAGE_OFFSET, PAGE_OFFSET + MAXMEM) */
+
+static void doublefault_fn(void) /* referenced as .ip of doublefault_tss; prints diagnostics, then spins forever */
+{
+ struct desc_ptr gdt_desc = {0, 0};
+ unsigned long gdt, tss;
+
+ native_store_gdt(&gdt_desc); /* fills gdt_desc; its .address and .size are printed below */
+ gdt = gdt_desc.address;
+
+ printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+ if (ptr_ok(gdt)) { /* only follow the GDT address if it passes the ptr_ok() range test */
+ gdt += GDT_ENTRY_TSS << 3; /* byte offset of entry GDT_ENTRY_TSS: index scaled by 8 */
+ tss = get_desc_base((struct desc_struct *)gdt);
+ printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+
+ if (ptr_ok(tss)) { /* same range test before reading the TSS contents */
+ struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+
+ printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
+ t->ip, t->sp);
+
+ printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
+ t->ax, t->bx, t->cx, t->dx);
+ printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
+ t->si, t->di);
+ }
+ }
+
+ for (;;) /* never returns */
+ cpu_relax();
+}
+
+struct x86_hw_tss doublefault_tss __cacheline_aligned = { /* NOTE(review): presumably the TSS switched to on a 32-bit double fault -- confirm against the IDT/GDT setup */
+ .sp0 = STACK_START, /* end of doublefault_stack[] */
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .ip = (unsigned long) doublefault_fn, /* handler defined above */
+ /* 0x2 bit is always set */
+ .flags = X86_EFLAGS_SF | 0x2,
+ .sp = STACK_START, /* same stack top as .sp0 */
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
+};
+
+/* CONFIG_X86_32 variant: empty dummy for the do_double_fault() call */
+void df_debug(struct pt_regs *regs, long error_code) {}
+
+#else /* !CONFIG_X86_32 */
+
+void df_debug(struct pt_regs *regs, long error_code) /* !CONFIG_X86_32 variant */
+{
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ show_regs(regs);
+ panic("Machine halted."); /* last step, after the error code and registers are logged */
+}
+#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 0000000..2b58864
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/ftrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+#include <linux/kasan.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+int panic_on_unrecovered_nmi; /* not referenced in this file's visible code */
+int panic_on_io_nmi; /* not referenced in this file's visible code */
+static int die_counter; /* incremented by __die() for each oops; printed as [#N] */
+
+static struct pt_regs exec_summary_regs; /* copy of the first oops' regs (__die()), printed again by oops_end() */
+
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+		   struct stack_info *info)
+{
+	void *base = task_stack_page(task);
+	unsigned long *lo = base;
+	unsigned long *hi = base + THREAD_SIZE;	/* THREAD_SIZE bytes above lo */
+
+	if (!(stack >= lo && stack < hi))	/* outside [lo, hi) */
+		return false;
+
+	info->next_sp = NULL;
+	info->end = hi;
+	info->begin = lo;
+	info->type = STACK_TYPE_TASK;
+	return true;
+}
+
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
+{
+	struct entry_stack *es = cpu_entry_stack(smp_processor_id());
+	void *lo = es;		/* first byte of this CPU's entry stack */
+	void *hi = es + 1;	/* one past its last byte */
+	void *sp = stack;
+
+	/* Half-open range [lo, hi). */
+	if (!(sp >= lo && sp < hi))
+		return false;
+
+	info->next_sp = NULL;
+	info->end = hi;
+	info->begin = lo;
+	info->type = STACK_TYPE_ENTRY;
+	return true;
+}
+
+static void printk_stack_address(unsigned long address, int reliable,
+				 char *log_lvl)
+{
+	touch_nmi_watchdog();	/* called once per printed address */
+	printk("%s %s%pB\n", log_lvl, !reliable ? "? " : "", (void *)address);
+}
+
+/*
+ * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
+ *
+ * In case where we don't have the exact kernel image (which, if we did, we can
+ * simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus make sense of the register dump.
+ *
+ * What is more, the additional complication of a variable length insn arch like
+ * x86 warrants having longer byte sequence before rIP so that the disassembler
+ * can "sync" up properly and find instruction boundaries when decoding the
+ * opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
+ * guesstimate in attempt to achieve all of the above.
+ */
+void show_opcodes(struct pt_regs *regs, const char *loglvl)
+{
+#define PROLOGUE_SIZE 42
+#define EPILOGUE_SIZE 21
+#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
+	unsigned long start = regs->ip - PROLOGUE_SIZE;
+	u8 code[OPCODE_BUFSIZE];
+	bool unreadable;
+
+	/*
+	 * A user-mode IP must not be allowed to point the dump window at
+	 * kernel memory, so range-check it against TASK_SIZE_MAX first.
+	 */
+	unreadable = user_mode(regs) &&
+		__chk_range_not_ok(start, OPCODE_BUFSIZE, TASK_SIZE_MAX);
+	if (!unreadable)
+		unreadable = probe_kernel_read(code, (u8 *)start, OPCODE_BUFSIZE);
+	if (unreadable) {
+		printk("%sCode: Bad RIP value.\n", loglvl);
+		return;
+	}
+	printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
+	       __stringify(EPILOGUE_SIZE) "ph\n", loglvl, code,
+	       code[PROLOGUE_SIZE], code + PROLOGUE_SIZE + 1);
+}
+
+void show_ip(struct pt_regs *regs, const char *loglvl)
+{
+#ifdef CONFIG_X86_32
+ printk("%sEIP: %pS\n", loglvl, (void *)regs->ip); /* 32-bit: symbolised ip only */
+#else
+ printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip); /* otherwise: cs value, then symbolised ip */
+#endif
+ show_opcodes(regs, loglvl); /* followed by the "Code:" line */
+}
+
+void show_iret_regs(struct pt_regs *regs) /* prints ip (via show_ip()), then ss:sp and flags */
+{
+ show_ip(regs, KERN_DEFAULT);
+ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, /* NOTE: this format has no trailing newline */
+ regs->sp, regs->flags);
+}
+
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+				  bool partial)
+{
+	/*
+	 * The unwinder has already validated 'regs', so the on_stack() tests
+	 * below are not strictly necessary; they exist for ordering.
+	 * Registers living on the *next* stack must not be printed while the
+	 * current one is being dumped, or they would be shown as part of the
+	 * wrong stack.  When show_trace_log_lvl() moves on to that stack it
+	 * calls back in here with the same regs and they get printed in the
+	 * right context.
+	 */
+	if (!partial) {
+		if (on_stack(info, regs, sizeof(*regs)))
+			__show_regs(regs, SHOW_REGS_SHORT);
+		return;
+	}
+
+	/*
+	 * Partial regs: an interrupt or exception hit entry code before the
+	 * full pt_regs were saved, so only the iret frame can be shown.
+	 */
+	if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE))
+		show_iret_regs(regs);
+}
+
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+			unsigned long *stack, char *log_lvl)
+{
+	struct unwind_state state;
+	struct stack_info stack_info = {0};
+	unsigned long visit_mask = 0;
+	int graph_idx = 0;
+	bool partial = false;
+
+	printk("%sCall Trace:\n", log_lvl);
+
+	unwind_start(&state, task, regs, stack);
+	stack = stack ? : get_stack_pointer(task, regs);
+	regs = unwind_get_entry_regs(&state, &partial);
+
+	/*
+	 * Iterate through the stacks, starting with the current stack pointer.
+	 * Each stack has a pointer to the next one.
+	 *
+	 * x86-64 can have several stacks:
+	 * - task stack
+	 * - interrupt stack
+	 * - HW exception stacks (double fault, nmi, debug, mce)
+	 * - entry stack
+	 *
+	 * x86-32 can have up to four stacks:
+	 * - task stack
+	 * - softirq stack
+	 * - hardirq stack
+	 * - entry stack
+	 */
+	for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+		const char *stack_name;
+
+		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+			/*
+			 * We weren't on a valid stack.  It's possible that
+			 * we overflowed a valid stack into a guard page.
+			 * See if the next page up is valid so that we can
+			 * generate some kind of backtrace if this happens.
+			 */
+			stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+			if (get_stack_info(stack, task, &stack_info, &visit_mask))
+				break;
+		}
+
+		stack_name = stack_type_name(stack_info.type);
+		if (stack_name)
+			printk("%s <%s>\n", log_lvl, stack_name);
+
+		if (regs)
+			show_regs_if_on_stack(&stack_info, regs, partial);
+
+		/*
+		 * Scan the stack, printing any text addresses we find.  At the
+		 * same time, follow proper stack frames with the unwinder.
+		 *
+		 * Addresses found during the scan which are not reported by
+		 * the unwinder are considered to be additional clues which are
+		 * sometimes useful for debugging and are prefixed with '?'.
+		 * This also serves as a failsafe option in case the unwinder
+		 * goes off in the weeds.
+		 */
+		for (; stack < stack_info.end; stack++) {
+			unsigned long real_addr;
+			int reliable = 0;
+			unsigned long addr = READ_ONCE_NOCHECK(*stack);
+			unsigned long *ret_addr_p =
+				unwind_get_return_address_ptr(&state);
+
+			if (!__kernel_text_address(addr))
+				continue;
+
+			/*
+			 * Don't print regs->ip again if it was already printed
+			 * by show_regs_if_on_stack(): skip the stack slot that
+			 * is the ip member of the current entry regs.
+			 */
+			if (regs && stack == &regs->ip)
+				goto next;
+
+			if (stack == ret_addr_p)
+				reliable = 1;
+
+			/*
+			 * When function graph tracing is enabled for a
+			 * function, its return address on the stack is
+			 * replaced with the address of an ftrace handler
+			 * (return_to_handler).  In that case, before printing
+			 * the "real" address, we want to print the handler
+			 * address as an "unreliable" hint that function graph
+			 * tracing was involved.
+			 */
+			real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+							  addr, stack);
+			if (real_addr != addr)
+				printk_stack_address(addr, 0, log_lvl);
+			printk_stack_address(real_addr, reliable, log_lvl);
+
+			if (!reliable)
+				continue;
+
+next:
+			/*
+			 * Get the next frame from the unwinder.  No need to
+			 * check for an error: if anything goes wrong, the rest
+			 * of the addresses will just be printed as unreliable.
+			 */
+			unwind_next_frame(&state);
+
+			/* if the frame has entry regs, print them */
+			regs = unwind_get_entry_regs(&state, &partial);
+			if (regs)
+				show_regs_if_on_stack(&stack_info, regs, partial);
+		}
+
+		if (stack_name)
+			printk("%s </%s>\n", log_lvl, stack_name);
+	}
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+	if (!task)
+		task = current;
+
+	/*
+	 * For %current with no explicit sp, start at our own stack pointer:
+	 * the frames below this one aren't interesting.
+	 */
+	if (task == current && sp == NULL)
+		sp = get_stack_pointer(current, NULL);
+	show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
+}
+
+void show_stack_regs(struct pt_regs *regs)
+{
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); /* trace for current, starting from regs; no explicit stack pointer */
+}
+
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* taken in oops_begin(), released in oops_end() */
+static int die_owner = -1; /* CPU recorded by oops_begin(); reset to -1 by oops_end() */
+static unsigned int die_nest_count; /* ++ in oops_begin(), -- in oops_end(); die_lock is released when it reaches 0 */
+
+unsigned long oops_begin(void) /* returns the saved IRQ flags, to be handed back to oops_end() */
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!arch_spin_trylock(&die_lock)) {
+ if (cpu == die_owner) /* this CPU is already the recorded owner: do not try to take the lock again */
+ /* nested oops. should stop eventually */;
+ else
+ arch_spin_lock(&die_lock); /* recorded owner is a different CPU (or none): wait for the lock */
+ }
+ die_nest_count++; /* balanced by the decrement in oops_end() */
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1); /* paired with bust_spinlocks(0) in oops_end() */
+ return flags;
+}
+NOKPROBE_SYMBOL(oops_begin);
+
+void __noreturn rewind_stack_do_exit(int signr);
+
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr) /* flags: value returned by oops_begin(); signr == 0 means return to the caller */
+{
+ if (regs && kexec_should_crash(current))
+ crash_kexec(regs);
+
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
+ die_nest_count--; /* undo the increment done by oops_begin() */
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ arch_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ oops_exit();
+
+ /* Executive summary in case the oops scrolled away */
+ __show_regs(&exec_summary_regs, SHOW_REGS_ALL); /* regs saved by __die() on the first oops */
+
+ if (!signr)
+ return;
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+
+ /*
+ * We're not going to return, but we might be on an IST stack or
+ * have very little stack space left. Rewind the stack and kill
+ * the task.
+ * Before we rewind the stack, we have to tell KASAN that we're going to
+ * reuse the task stack and that existing poisons are invalid.
+ */
+ kasan_unpoison_task_stack(current);
+ rewind_stack_do_exit(signr); /* declared __noreturn above */
+}
+NOKPROBE_SYMBOL(oops_end);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+	/*
+	 * Only the first oops feeds the executive summary shown by oops_end().
+	 */
+	if (die_counter == 0)
+		exec_summary_regs = *regs;
+
+	printk(KERN_DEFAULT
+	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+	       IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
+	       debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+	       IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+
+	show_regs(regs);
+	print_modules();
+
+	/* Result is 1 when notify_die() answers NOTIFY_STOP, 0 otherwise. */
+	return notify_die(DIE_OOPS, str, regs, err,
+			  current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP;
+}
+NOKPROBE_SYMBOL(__die);
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+	unsigned long irq_flags = oops_begin();
+	int signr;
+
+	/* A non-zero __die() result turns the SIGSEGV into "no signal" (0). */
+	signr = __die(str, regs, err) ? 0 : SIGSEGV;
+	oops_end(irq_flags, regs, signr);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+	show_regs_print_info(KERN_DEFAULT);
+
+	if (user_mode(regs)) {
+		__show_regs(regs, SHOW_REGS_USER);
+		return;
+	}
+	/* In-kernel regs: full dump, plus the stack at the time of the fault. */
+	__show_regs(regs, SHOW_REGS_ALL);
+	show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
+}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 0000000..cd53f30
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/sched/debug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sysfs.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+
+#include <asm/stacktrace.h>
+
+const char *stack_type_name(enum stack_type type)
+{
+	switch (type) {	/* every other type has no printable name */
+	case STACK_TYPE_IRQ:
+		return "IRQ";
+	case STACK_TYPE_SOFTIRQ:
+		return "SOFTIRQ";
+	case STACK_TYPE_ENTRY:
+		return "ENTRY_TRAMPOLINE";
+	default:
+		return NULL;
+	}
+}
+
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
+{
+	unsigned long *bottom = (unsigned long *)this_cpu_read(hardirq_stack);
+	unsigned long *top = bottom + (THREAD_SIZE / sizeof(long));
+
+	/*
+	 * Accept (bottom, top]: this is a software stack, so a pointer equal
+	 * to 'top' is valid and simply means the stack is empty.
+	 */
+	if (!(stack > bottom && stack <= top))
+		return false;
+
+	/*
+	 * The first word of the stack holds the stack pointer to continue
+	 * with (see irq_32.c).
+	 */
+	info->next_sp = (unsigned long *)*bottom;
+
+	info->type = STACK_TYPE_IRQ;
+	info->begin = bottom;
+	info->end = top;
+
+	return true;
+}
+
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
+{
+	unsigned long *base = (unsigned long *)this_cpu_read(softirq_stack);
+	unsigned long *limit = base + (THREAD_SIZE / sizeof(long));
+
+	/*
+	 * Software stack: 'limit' itself is an acceptable stack pointer (it
+	 * means the stack is empty), whereas 'base' is not.
+	 */
+	if (stack > limit || stack <= base)
+		return false;
+
+	info->begin = base;
+	info->end = limit;
+	info->type = STACK_TYPE_SOFTIRQ;
+
+	/*
+	 * The word at the very start of the stack stores the next stack
+	 * pointer (see irq_32.c).
+	 */
+	info->next_sp = (unsigned long *)*base;
+
+	return true;
+}
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+		   struct stack_info *info, unsigned long *visit_mask)
+{
+	unsigned long type_bit;
+
+	if (stack == NULL)
+		goto unknown;
+
+	if (task == NULL)
+		task = current;
+
+	/*
+	 * Classify the pointer.  The task stack can be checked for any task;
+	 * the entry, hardirq and softirq stacks are only looked at for
+	 * 'current', in that order.
+	 */
+	if (!in_task_stack(stack, task, info)) {
+		if (task != current)
+			goto unknown;
+
+		if (!in_entry_stack(stack, info) &&
+		    !in_hardirq_stack(stack, info) &&
+		    !in_softirq_stack(stack, info))
+			goto unknown;
+	}
+
+	/*
+	 * Each stack type may be walked only once per visit_mask.  Seeing a
+	 * type a second time means something is wrong: warn once and report
+	 * the stack as unknown.
+	 */
+	if (visit_mask) {
+		type_bit = 1UL << info->type;
+		if (*visit_mask & type_bit) {
+			printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+			goto unknown;
+		}
+		*visit_mask |= type_bit;
+	}
+	return 0;
+
+unknown:
+	info->type = STACK_TYPE_UNKNOWN;
+	return -EINVAL;
+}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 0000000..5cdb9e8
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/sched/debug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sysfs.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+
+#include <asm/stacktrace.h>
+
+static char *exception_stack_names[N_EXCEPTION_STACKS] = { /* indexed by (type - STACK_TYPE_EXCEPTION) in stack_type_name() */
+ [ DOUBLEFAULT_STACK-1 ] = "#DF",
+ [ NMI_STACK-1 ] = "NMI",
+ [ DEBUG_STACK-1 ] = "#DB",
+ [ MCE_STACK-1 ] = "#MC",
+};
+
+static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { /* sizes in bytes: in_exception_stack() divides by sizeof(long) */
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, /* default for every slot ... */
+ [DEBUG_STACK - 1] = DEBUG_STKSZ /* ... then overridden for the debug stack */
+};
+
+const char *stack_type_name(enum stack_type type)
+{
+	BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+	if (type == STACK_TYPE_IRQ)
+		return "IRQ";
+
+	/*
+	 * 64-bit uses one generic entry stack for every kernel entry point,
+	 * SYSENTER included.
+	 */
+	if (type == STACK_TYPE_ENTRY)
+		return "ENTRY_TRAMPOLINE";
+
+	/* Anything outside the exception-stack range has no name. */
+	if (type < STACK_TYPE_EXCEPTION || type > STACK_TYPE_EXCEPTION_LAST)
+		return NULL;
+
+	/* Exception stacks are named via the exception_stack_names[] table. */
+	return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+}
+
+static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
+{
+	unsigned int idx;
+
+	BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+	for (idx = 0; idx < N_EXCEPTION_STACKS; idx++) {
+		unsigned long *top, *bottom;
+		struct pt_regs *frame;
+
+		top = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[idx];
+		bottom = top - (exception_stack_sizes[idx] / sizeof(long));
+		/* Only pointers strictly inside (bottom, top) belong here. */
+		if (!(stack > bottom && stack < top))
+			continue;
+
+		/* The topmost sizeof(struct pt_regs) bytes are read as a pt_regs. */
+		frame = (struct pt_regs *)top - 1;
+		info->type = STACK_TYPE_EXCEPTION + idx;
+		info->begin = bottom;
+		info->end = top;
+		info->next_sp = (unsigned long *)frame->sp;
+		return true;
+	}
+	return false;
+}
+
+static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
+{
+	unsigned long *top = (unsigned long *)this_cpu_read(irq_stack_ptr);
+	unsigned long *bottom = top - (IRQ_STACK_SIZE / sizeof(long));
+
+	/*
+	 * Accept (bottom, top]: on this software stack a pointer equal to
+	 * 'top' is legitimate and just means the stack is empty.
+	 */
+	if (!(stack > bottom && stack <= top))
+		return false;
+
+	/*
+	 * The entry code pushes the next stack pointer first thing after
+	 * switching to the irq stack, so it is read from the topmost word.
+	 */
+	info->next_sp = (unsigned long *)top[-1];
+
+	info->type = STACK_TYPE_IRQ;
+	info->begin = bottom;
+	info->end = top;
+
+	return true;
+}
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+		   struct stack_info *info, unsigned long *visit_mask)
+{
+	unsigned long type_bit;
+
+	if (stack == NULL)
+		goto unknown;
+
+	if (task == NULL)
+		task = current;
+
+	/*
+	 * Classify the pointer.  The task stack can be checked for any task;
+	 * the exception, irq and entry stacks are only looked at for
+	 * 'current', in that order.
+	 */
+	if (!in_task_stack(stack, task, info)) {
+		if (task != current)
+			goto unknown;
+
+		if (!in_exception_stack(stack, info) &&
+		    !in_irq_stack(stack, info) &&
+		    !in_entry_stack(stack, info))
+			goto unknown;
+	}
+
+	/*
+	 * Each stack type may be walked only once per visit_mask.  Seeing a
+	 * type a second time means something is wrong: warn once and report
+	 * the stack as unknown.
+	 */
+	if (visit_mask) {
+		type_bit = 1UL << info->type;
+		if (*visit_mask & type_bit) {
+			printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+			goto unknown;
+		}
+		*visit_mask |= type_bit;
+	}
+	return 0;
+
+unknown:
+	info->type = STACK_TYPE_UNKNOWN;
+	return -EINVAL;
+}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 0000000..d1f25c8
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,1280 @@
+/*
+ * Low level x86 E820 memory map handling functions.
+ *
+ * The firmware and bootloader pass us the "E820 table", which is the primary
+ * physical memory layout description available about x86 systems.
+ *
+ * The kernel takes the E820 memory layout and optionally modifies it with
+ * quirks and other tweaks, and feeds that into the generic Linux memory
+ * allocation code routines via a platform independent interface (memblock, etc.).
+ */
+#include <linux/crash_dump.h>
+#include <linux/bootmem.h>
+#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <linux/firmware-map.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/e820/api.h>
+#include <asm/setup.h>
+
+/*
+ * We organize the E820 table into three main data structures:
+ *
+ * - 'e820_table_firmware': the original firmware version passed to us by the
+ * bootloader - not modified by the kernel. It is composed of two parts:
+ * the first 128 E820 memory entries in boot_params.e820_table and the remaining
+ * (if any) entries of the SETUP_E820_EXT nodes. We use this to:
+ *
+ * - inform the user about the firmware's notion of memory layout
+ * via /sys/firmware/memmap
+ *
+ * - the hibernation code uses it to generate a kernel-independent MD5
+ * fingerprint of the physical memory layout of a system.
+ *
+ * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
+ * passed to us by the bootloader - the major difference between
+ * e820_table_firmware[] and this one is that, the latter marks the setup_data
+ * list created by the EFI boot stub as reserved, so that kexec can reuse the
+ * setup_data information in the second kernel. Besides, e820_table_kexec[]
+ * might also be modified by the kexec itself to fake a mptable.
+ * We use this to:
+ *
+ * - kexec, which is a bootloader in disguise, uses the original E820
+ * layout to pass to the kexec-ed kernel. This way the original kernel
+ * can have a restricted E820 map while the kexec()-ed kexec-kernel
+ * can have access to full memory - etc.
+ *
+ * - 'e820_table': this is the main E820 table that is massaged by the
+ * low level x86 platform code, or modified by boot parameters, before
+ * passed on to higher level MM layers.
+ *
+ * Once the E820 map has been converted to the standard Linux memory layout
+ * information its role stops - modifying it has no effect and does not get
+ * re-propagated. So its main role is a temporary bootstrap storage of firmware
+ * specific memory layout data during early bootup.
+ */
+/*
+ * Boot-time backing storage for the three tables. It lives in __initdata,
+ * so it is only usable during init.
+ */
+static struct e820_table e820_table_init __initdata;
+static struct e820_table e820_table_kexec_init __initdata;
+static struct e820_table e820_table_firmware_init __initdata;
+
+/*
+ * The table pointers used by the rest of the code. They start out pointing
+ * at the __initdata storage above; e820__reallocate_tables() later repoints
+ * them at kmalloc()ed copies.
+ */
+struct e820_table *e820_table __refdata = &e820_table_init;
+struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init;
+struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init;
+
+/*
+ * For PCI or other memory-mapped resources.
+ *
+ * The initializer is only a placeholder (NOTE(review): it looks like a
+ * recognizable poison value - confirm); e820__setup_pci_gap() overwrites it
+ * with the start of the gap it selects.
+ */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * Check whether any part of the range <start,end> overlaps an entry of the
+ * main E820 table that has the given 'type'. A 'type' of 0 matches entries
+ * of every type.
+ */
+bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
+{
+	int idx;
+
+	for (idx = 0; idx < e820_table->nr_entries; idx++) {
+		struct e820_entry *e = &e820_table->entries[idx];
+		bool type_ok = !type || e->type == type;
+
+		/* Overlap: the entry begins before 'end' and ends after 'start' */
+		if (type_ok && e->addr < end && e->addr + e->size > start)
+			return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(e820__mapped_any);
+
+/*
+ * Check whether the whole <start,end> range is covered by entries of the
+ * given 'type' (0 matches every type).
+ *
+ * Returns the entry that completes the coverage, or NULL if some part of
+ * the range is not covered.
+ *
+ * Note: the result is only reliable once the E820 table is sorted and free
+ * of overlaps (at least within the requested range), which is normally the
+ * case.
+ */
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+					     enum e820_type type)
+{
+	int idx;
+
+	for (idx = 0; idx < e820_table->nr_entries; idx++) {
+		struct e820_entry *e = &e820_table->entries[idx];
+		u64 e_end = e->addr + e->size;
+
+		if (type && e->type != type)
+			continue;
+
+		/* Entries that do not touch the remaining range are irrelevant: */
+		if (e->addr >= end || e_end <= start)
+			continue;
+
+		/*
+		 * The entry covers the current beginning of the range, so
+		 * everything up to the end of the entry is accounted for:
+		 */
+		if (e->addr <= start)
+			start = e_end;
+
+		/* Nothing left to cover? Then the whole range is mapped: */
+		if (start >= end)
+			return e;
+	}
+
+	return NULL;
+}
+
+/*
+ * Boolean front end of __e820__mapped_all(): true if the entire range
+ * <start,end> is mapped with 'type', false otherwise.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+	return __e820__mapped_all(start, end, type) != NULL;
+}
+
+/*
+ * Return the type of the entry that completes the coverage of the range
+ * <start,end> (entries of any type are accepted), or -EINVAL if the range
+ * is not fully covered by the E820 table.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+	struct e820_entry *covering = __e820__mapped_all(start, end, 0);
+
+	if (!covering)
+		return -EINVAL;
+
+	return covering->type;
+}
+
+/*
+ * Append the region [start, start + size) of the given type to 'table'.
+ *
+ * The entry is stored after the current last entry; no sorting or merging
+ * is done here. If the table is already full the region is dropped and an
+ * error is logged.
+ */
+static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
+{
+	int slot = table->nr_entries;
+	struct e820_entry *entry;
+
+	if (slot >= ARRAY_SIZE(table->entries)) {
+		pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+		       start, start + size - 1);
+		return;
+	}
+
+	entry = &table->entries[slot];
+	entry->addr = start;
+	entry->size = size;
+	entry->type = type;
+
+	table->nr_entries++;
+}
+
+/* Append a region to the main kernel E820 table (e820_table). */
+void __init e820__range_add(u64 start, u64 size, enum e820_type type)
+{
+	struct e820_table *table = e820_table;
+
+	__e820__range_add(table, start, size, type);
+}
+
+/*
+ * Continue the current log line (pr_cont) with a human readable name for
+ * 'type'. Types without a dedicated name are printed numerically.
+ */
+static void __init e820_print_type(enum e820_type type)
+{
+	switch (type) {
+	case E820_TYPE_RAM:
+	case E820_TYPE_RESERVED_KERN:
+		pr_cont("usable");
+		break;
+	case E820_TYPE_RESERVED:
+		pr_cont("reserved");
+		break;
+	case E820_TYPE_ACPI:
+		pr_cont("ACPI data");
+		break;
+	case E820_TYPE_NVS:
+		pr_cont("ACPI NVS");
+		break;
+	case E820_TYPE_UNUSABLE:
+		pr_cont("unusable");
+		break;
+	case E820_TYPE_PMEM:
+	case E820_TYPE_PRAM:
+		pr_cont("persistent (type %u)", type);
+		break;
+	default:
+		pr_cont("type %u", type);
+		break;
+	}
+}
+
+/*
+ * Log every entry of the main E820 table, one line per entry, each line
+ * prefixed with 'who' and showing the inclusive address range and the type.
+ */
+void __init e820__print_table(char *who)
+{
+	int idx;
+
+	for (idx = 0; idx < e820_table->nr_entries; idx++) {
+		struct e820_entry *entry = &e820_table->entries[idx];
+
+		pr_info("%s: [mem %#018Lx-%#018Lx] ",
+			who,
+			entry->addr,
+			entry->addr + entry->size - 1);
+
+		e820_print_type(entry->type);
+		pr_cont("\n");
+	}
+}
+
+/*
+ * Sanitize an E820 map.
+ *
+ * Some E820 layouts include overlapping entries. The following
+ * replaces the original E820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter 'entries' points to an array of 'struct
+ * e820_entry' which on entry has elements in the range [0, *nr_entries)
+ * valid, and which has space for up to max_nr_entries entries.
+ * On return, the resulting sanitized E820 map entries will be
+ * overwritten in the same location, starting at 'entries'.
+ *
+ * The integer pointed to by nr_entries must be valid on entry (the
+ * current number of valid entries located at 'entries'). If the
+ * sanitizing succeeds the *nr_entries will be updated with the new
+ * number of valid entries (something no more than max_nr_entries).
+ *
+ * The return value from e820__update_table() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ *
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
+ *
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
+ */
+/*
+ * One change point: an address at which some E820 entry either starts or
+ * ends. Whether it is a start or an end is derived by comparing 'addr' with
+ * entry->addr.
+ */
+struct change_member {
+ /* Pointer to the original entry: */
+ struct e820_entry *entry;
+ /* Address for this change point: */
+ unsigned long long addr;
+};
+
+/*
+ * Scratch storage used by e820__update_table(). Every entry contributes up
+ * to two change points (start and end), hence the 2*E820_MAX_ENTRIES sizing
+ * of the change point arrays. 'change_point' holds pointers into
+ * 'change_point_list' so that sorting only moves pointers.
+ */
+static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
+static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
+static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
+static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
+
+/*
+ * sort() comparison callback for the change_point[] array.
+ *
+ * 'a' and 'b' point at two elements of change_point[], i.e. at pointers to
+ * struct change_member. Change points are ordered by address; when two
+ * addresses are equal, a change point that marks the end of its region
+ * sorts after one that does not.
+ */
+static int __init cpcompare(const void *a, const void *b)
+{
+	const struct change_member *lhs = *(struct change_member * const *)a;
+	const struct change_member *rhs = *(struct change_member * const *)b;
+	int lhs_is_end, rhs_is_end;
+
+	if (lhs->addr > rhs->addr)
+		return 1;
+	if (lhs->addr < rhs->addr)
+		return -1;
+
+	/* Same address: a change point is an 'end' if it is not its entry's start */
+	lhs_is_end = lhs->addr != lhs->entry->addr;
+	rhs_is_end = rhs->addr != rhs->entry->addr;
+
+	return lhs_is_end - rhs_is_end;
+}
+
+/*
+ * Sanitize 'table' in place: remove overlaps between entries and resolve
+ * conflicting types in favor of the highest numbered type.
+ *
+ * The function collects the start and end address of every non-empty entry
+ * as "change points", sorts them, and then sweeps over them while tracking
+ * the set of entries that are currently "open". Whenever the effective type
+ * changes, the previous output entry is closed and a new one is started.
+ * The result is built in the file-scope scratch array new_entries[] and
+ * copied back over table->entries at the end.
+ *
+ * Returns 0 on success, or -1 if nothing was done because the table has
+ * fewer than two entries or contains an entry whose end wraps around.
+ */
+int __init e820__update_table(struct e820_table *table)
+{
+ struct e820_entry *entries = table->entries;
+ u32 max_nr_entries = ARRAY_SIZE(table->entries);
+ enum e820_type current_type, last_type;
+ unsigned long long last_addr;
+ u32 new_nr_entries, overlap_entries;
+ u32 i, chg_idx, chg_nr;
+
+ /* If there's only one memory region, don't bother: */
+ if (table->nr_entries < 2)
+ return -1;
+
+ BUG_ON(table->nr_entries > max_nr_entries);
+
+ /* Bail out if we find any unreasonable addresses in the map: */
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].addr + entries[i].size < entries[i].addr)
+ return -1;
+ }
+
+ /* Create pointers for initial change-point information (for sorting): */
+ for (i = 0; i < 2 * table->nr_entries; i++)
+ change_point[i] = &change_point_list[i];
+
+ /*
+ * Record all known change-points (starting and ending addresses),
+ * omitting empty memory regions:
+ */
+ chg_idx = 0;
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].size != 0) {
+ /* Start of the region: */
+ change_point[chg_idx]->addr = entries[i].addr;
+ change_point[chg_idx++]->entry = &entries[i];
+ /* End of the region (first address past it): */
+ change_point[chg_idx]->addr = entries[i].addr + entries[i].size;
+ change_point[chg_idx++]->entry = &entries[i];
+ }
+ }
+ chg_nr = chg_idx;
+
+ /* Sort change-point list by memory addresses (low -> high): */
+ sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
+
+ /* Create a new memory map, removing overlaps: */
+ overlap_entries = 0; /* Number of entries in the overlap table */
+ new_nr_entries = 0; /* Index for creating new map entries */
+ last_type = 0; /* Start with undefined memory type */
+ last_addr = 0; /* Start with 0 as last starting address */
+
+ /* Loop through change-points, determining effect on the new map: */
+ for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
+ /* Keep track of all overlapping entries */
+ if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
+ /* Add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
+ } else {
+ /* Remove entry from list (order independent, so swap with last): */
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] == change_point[chg_idx]->entry)
+ overlap_list[i] = overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /*
+ * If there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ */
+ current_type = 0;
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ }
+
+ /*
+ * Continue building up new map based on this information.
+ *
+ * An output entry is closed/opened whenever the effective type
+ * changes. While the effective type is E820_TYPE_PRAM this is done
+ * at every change point (presumably so that separate PRAM regions
+ * are never merged into one entry - confirm).
+ */
+ if (current_type != last_type || current_type == E820_TYPE_PRAM) {
+ if (last_type != 0) {
+ new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
+ /* Move forward only if the new size was non-zero: */
+ if (new_entries[new_nr_entries].size != 0)
+ /* No more space left for new entries? */
+ if (++new_nr_entries >= max_nr_entries)
+ break;
+ }
+ if (current_type != 0) {
+ new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
+ new_entries[new_nr_entries].type = current_type;
+ last_addr = change_point[chg_idx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+
+ /* Copy the new entries into the original location: */
+ memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
+ table->nr_entries = new_nr_entries;
+
+ return 0;
+}
+
+/*
+ * Append 'nr_entries' boot-protocol E820 entries to the main E820 table.
+ *
+ * Returns 0 when all entries were handed to e820__range_add(). Returns -1
+ * as soon as a non-empty entry is found whose end address overflows 64
+ * bits; entries before it have already been added, the rest are skipped.
+ */
+static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
+{
+	u32 idx;
+
+	for (idx = 0; idx < nr_entries; idx++) {
+		struct boot_e820_entry *entry = &entries[idx];
+		u64 start = entry->addr;
+		u64 size = entry->size;
+		u64 end = start + size - 1;
+		u32 type = entry->type;
+
+		/* Stop on 64-bit overflow of a non-empty entry: */
+		if (start > end && likely(size))
+			return -1;
+
+		e820__range_add(start, size, type);
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the BIOS E820 map into a safe place, sanity-checking it on the way.
+ *
+ * On a modern system the setup code has handed us a memory map that can be
+ * used to set up memory properly. A map with fewer than two entries is
+ * rejected with -1 so that the caller can fall back to a faked map.
+ */
+static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
+{
+	/* A map with zero or one region is not considered usable: */
+	if (nr_entries >= 2)
+		return __append_e820_table(entries, nr_entries);
+
+	return -1;
+}
+
+/*
+ * Change the type of every part of [start, start + size) that currently has
+ * type 'old_type' in 'table' to 'new_type', splitting entries where the
+ * range only covers part of them.
+ *
+ * 'size' is clamped so that start + size does not overflow. Split-off
+ * pieces are appended via __e820__range_add(); the table is not re-sorted
+ * here. It is a BUG to call this with old_type == new_type.
+ *
+ * Returns the number of bytes whose type was changed.
+ */
+static u64 __init
+__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+{
+ u64 end;
+ unsigned int i;
+ u64 real_updated_size = 0;
+
+ BUG_ON(old_type == new_type);
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ end = start + size;
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
+ e820_print_type(old_type);
+ pr_cont(" ==> ");
+ e820_print_type(new_type);
+ pr_cont("\n");
+
+ /*
+ * table->nr_entries is re-read on every iteration, so entries appended
+ * below are visited as well.
+ */
+ for (i = 0; i < table->nr_entries; i++) {
+ struct e820_entry *entry = &table->entries[i];
+ u64 final_start, final_end;
+ u64 entry_end;
+
+ if (entry->type != old_type)
+ continue;
+
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered by new range? */
+ if (entry->addr >= start && entry_end <= end) {
+ entry->type = new_type;
+ real_updated_size += entry->size;
+ continue;
+ }
+
+ /* New range is completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ /* Split into three: head (this entry), new range, tail */
+ __e820__range_add(table, start, size, new_type);
+ __e820__range_add(table, end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
+ real_updated_size += size;
+ continue;
+ }
+
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
+ if (final_start >= final_end)
+ continue;
+
+ __e820__range_add(table, final_start, final_end - final_start, new_type);
+
+ real_updated_size += final_end - final_start;
+
+ /*
+ * Left range could be head or tail, so need to update
+ * its size first:
+ */
+ entry->size -= final_end - final_start;
+ /* The old entry keeps its head: only the size had to shrink */
+ if (entry->addr < final_start)
+ continue;
+
+ /* The old entry keeps its tail: move its start past the update */
+ entry->addr = final_end;
+ }
+ return real_updated_size;
+}
+
+/*
+ * Retype a range in the main E820 table (e820_table); see
+ * __e820__range_update(). Returns the number of bytes that were retyped.
+ */
+u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+{
+	struct e820_table *table = e820_table;
+
+	return __e820__range_update(table, start, size, old_type, new_type);
+}
+
+/*
+ * Retype a range in the kexec E820 table (e820_table_kexec); see
+ * __e820__range_update(). Returns the number of bytes that were retyped.
+ */
+static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+{
+	struct e820_table *table = e820_table_kexec;
+
+	return __e820__range_update(table, start, size, old_type, new_type);
+}
+
+/*
+ * Remove a range of memory from the E820 table.
+ *
+ * Removes [start, start + size) from the main E820 table. When 'check_type'
+ * is true only entries of type 'old_type' are affected, otherwise entries
+ * of every type are. 'size' is clamped so that start + size does not
+ * overflow.
+ *
+ * Entries that fall completely inside the range are zeroed in place (they
+ * stay in the array as empty entries); partially covered entries are
+ * trimmed or split.
+ *
+ * Returns the number of bytes that were removed.
+ */
+u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
+{
+ int i;
+ u64 end;
+ u64 real_removed_size = 0;
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ end = start + size;
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
+ if (check_type)
+ e820_print_type(old_type);
+ pr_cont("\n");
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+ u64 final_start, final_end;
+ u64 entry_end;
+
+ if (check_type && entry->type != old_type)
+ continue;
+
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered? */
+ if (entry->addr >= start && entry_end <= end) {
+ real_removed_size += entry->size;
+ memset(entry, 0, sizeof(*entry));
+ continue;
+ }
+
+ /* Is the new range completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ /* Keep the head in this entry and append the tail as a new one */
+ e820__range_add(end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
+ real_removed_size += size;
+ continue;
+ }
+
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
+ if (final_start >= final_end)
+ continue;
+
+ real_removed_size += final_end - final_start;
+
+ /*
+ * Left range could be head or tail, so need to update
+ * the size first:
+ */
+ entry->size -= final_end - final_start;
+ /* The entry keeps its head: only the size had to shrink */
+ if (entry->addr < final_start)
+ continue;
+
+ /* The entry keeps its tail: move its start past the removed part */
+ entry->addr = final_end;
+ }
+ return real_removed_size;
+}
+
+/*
+ * Sanitize the main E820 table and, if e820__update_table() reports
+ * success, log the resulting map.
+ */
+void __init e820__update_table_print(void)
+{
+	if (e820__update_table(e820_table) == 0) {
+		pr_info("modified physical RAM map:\n");
+		e820__print_table("modified");
+	}
+}
+
+/* Sanitize the kexec E820 table; the result code is deliberately ignored. */
+static void __init e820__update_table_kexec(void)
+{
+	struct e820_table *table = e820_table_kexec;
+
+	e820__update_table(table);
+}
+
+#define MAX_GAP_END 0x100000000ull
+
+/*
+ * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
+ *
+ * On entry *gapsize holds the minimum acceptable gap size. The table is
+ * walked from the last entry to the first; every gap that is at least as
+ * large as the current *gapsize replaces the previous candidate, updating
+ * *gapstart and *gapsize.
+ *
+ * Returns 1 if a gap was recorded, 0 otherwise (in which case *gapstart and
+ * *gapsize are left untouched).
+ */
+static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
+{
+	unsigned long long ceiling = MAX_GAP_END;
+	int found = 0;
+	int idx;
+
+	for (idx = (int)e820_table->nr_entries - 1; idx >= 0; idx--) {
+		unsigned long long e_start = e820_table->entries[idx].addr;
+		unsigned long long e_end = e_start + e820_table->entries[idx].size;
+
+		/*
+		 * 'ceiling' never exceeds 4GB, so whenever this condition
+		 * holds the gap is known to fit in 32 bits:
+		 */
+		if (ceiling > e_end) {
+			unsigned long gap = ceiling - e_end;
+
+			if (gap >= *gapsize) {
+				*gapstart = e_end;
+				*gapsize = gap;
+				found = 1;
+			}
+		}
+
+		/* The next gap, if any, has to end below this entry: */
+		if (e_start < ceiling)
+			ceiling = e_start;
+	}
+
+	return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the E820 memory space.
+ * The start of that gap is handed to the PCI subsystem via pci_mem_start,
+ * so that it can assign MMIO resources for hotplug or unconfigured devices
+ * there.
+ *
+ * Hopefully the BIOS left enough space.
+ */
+__init void e820__setup_pci_gap(void)
+{
+	unsigned long gapstart;
+	unsigned long gapsize = 0x400000;	/* Smallest gap worth taking */
+
+	if (!e820_search_gap(&gapstart, &gapsize)) {
+		/* No suitable gap: fall back to a fixed/derived start address */
+#ifdef CONFIG_X86_64
+		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
+		pr_err("Cannot find an available gap in the 32-bit address range\n");
+		pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
+#else
+		gapstart = 0x10000000;
+#endif
+	}
+
+	/*
+	 * e820__reserve_resources_late() protects stolen RAM already:
+	 */
+	pci_mem_start = gapstart;
+
+	pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+		gapstart, gapstart + gapsize - 1);
+}
+
+/*
+ * Return a kmalloc()ed copy of 'table' that is sized precisely after its
+ * number of valid entries. Allocation failure is fatal (BUG).
+ */
+static struct e820_table *__init e820__dup_table(const struct e820_table *table)
+{
+	size_t size = offsetof(struct e820_table, entries) +
+		      sizeof(struct e820_entry) * table->nr_entries;
+	struct e820_table *copy;
+
+	copy = kmalloc(size, GFP_KERNEL);
+	BUG_ON(!copy);
+	memcpy(copy, table, size);
+
+	return copy;
+}
+
+/*
+ * Called late during init, in free_initmem().
+ *
+ * Initial e820_table, e820_table_kexec and e820_table_firmware are largish
+ * __initdata arrays.
+ *
+ * Copy them to a (usually much smaller) dynamically allocated area that is
+ * sized precisely after the number of e820 entries.
+ *
+ * This is done after we've performed all the fixes and tweaks to the tables.
+ * All functions which modify them are __init functions, which won't exist
+ * after free_initmem().
+ */
+__init void e820__reallocate_tables(void)
+{
+	e820_table = e820__dup_table(e820_table);
+	e820_table_kexec = e820__dup_table(e820_table_kexec);
+	e820_table_firmware = e820__dup_table(e820_table_firmware);
+}
+
+/*
+ * Because of the small fixed size of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
+ * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
+ * struct setup_data, which is parsed here.
+ *
+ * 'phys_addr' is the physical address of the setup_data node and 'data_len'
+ * the number of bytes to map. The extra entries are appended to the main
+ * E820 table, which is then sanitized and copied over both the kexec and
+ * the firmware tables.
+ *
+ * If the node cannot be mapped, a warning is logged and the tables are left
+ * unchanged.
+ */
+void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
+{
+	int entries;
+	struct boot_e820_entry *extmap;
+	struct setup_data *sdata;
+
+	sdata = early_memremap(phys_addr, data_len);
+	/* early_memremap() can fail; don't dereference a NULL mapping: */
+	if (!sdata) {
+		pr_warn("failed to memremap SETUP_E820_EXT data\n");
+		return;
+	}
+
+	entries = sdata->len / sizeof(*extmap);
+	extmap = (struct boot_e820_entry *)(sdata->data);
+
+	__append_e820_table(extmap, entries);
+	e820__update_table(e820_table);
+
+	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+	early_memunmap(sdata, data_len);
+	pr_info("extended physical RAM map:\n");
+	e820__print_table("extended");
+}
+
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * E820 RAM areas and register the corresponding pages as 'nosave' for
+ * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
+ *
+ * Both the holes between entries and the entries that are neither
+ * E820_TYPE_RAM nor E820_TYPE_RESERVED_KERN are registered. The walk stops
+ * once an entry ends at or beyond 'limit_pfn'.
+ *
+ * This function requires the E820 map to be sorted and without any
+ * overlapping entries.
+ */
+void __init e820__register_nosave_regions(unsigned long limit_pfn)
+{
+	unsigned long next_pfn = 0;
+	int idx;
+
+	for (idx = 0; idx < e820_table->nr_entries; idx++) {
+		struct e820_entry *e = &e820_table->entries[idx];
+		bool is_ram = e->type == E820_TYPE_RAM ||
+			      e->type == E820_TYPE_RESERVED_KERN;
+
+		/* Hole between the previous entry and this one: */
+		if (next_pfn < PFN_UP(e->addr))
+			register_nosave_region(next_pfn, PFN_UP(e->addr));
+
+		next_pfn = PFN_DOWN(e->addr + e->size);
+
+		/* The entry itself, unless it is RAM: */
+		if (!is_ram)
+			register_nosave_region(PFN_UP(e->addr), next_pfn);
+
+		if (next_pfn >= limit_pfn)
+			break;
+	}
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * Register ACPI NVS memory regions, so that we can save/restore them during
+ * hibernation and the subsequent resume.
+ *
+ * Runs as a core_initcall and always returns 0.
+ */
+static int __init e820__register_nvs_regions(void)
+{
+	int idx;
+
+	for (idx = 0; idx < e820_table->nr_entries; idx++) {
+		struct e820_entry *e = &e820_table->entries[idx];
+
+		if (e->type != E820_TYPE_NVS)
+			continue;
+
+		acpi_nvs_register(e->addr, e->size);
+	}
+
+	return 0;
+}
+core_initcall(e820__register_nvs_regions);
+#endif
+
+/*
+ * Allocate the requested number of bytes with the requested alignment
+ * and return (the physical address) to the caller. Also register this
+ * range in the 'kexec' E820 table as a reserved range.
+ *
+ * This allows kexec to fake a new mptable, as if it came from the real
+ * system.
+ *
+ * Returns 0 if the allocation failed; the kexec table is not touched then.
+ */
+u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
+{
+	u64 addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+
+	if (!addr)
+		return addr;
+
+	e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+	pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+	e820__update_table_kexec();
+
+	return addr;
+}
+
+/*
+ * MAX_ARCH_PFN: the page frame number limit applied by e820_end_pfn().
+ *
+ * On 32-bit kernels it follows from a 36-bit (PAE) or 32-bit physical
+ * address width, on 64-bit kernels it is derived from MAXMEM.
+ *
+ * Every expansion is fully parenthesized so that the macro can be used
+ * inside larger expressions without operator precedence surprises.
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
+# else
+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN ((MAXMEM) >> PAGE_SHIFT)
+#endif
+
+/*
+ * Find the highest page frame number we have available.
+ *
+ * Only entries of the given 'type' that start below 'limit_pfn' are
+ * considered. An entry reaching beyond 'limit_pfn' ends the search with
+ * 'limit_pfn' as the result. The result is additionally capped at
+ * MAX_ARCH_PFN and logged.
+ */
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
+{
+	unsigned long max_arch_pfn = MAX_ARCH_PFN;
+	unsigned long highest_pfn = 0;
+	int idx;
+
+	for (idx = 0; idx < e820_table->nr_entries; idx++) {
+		struct e820_entry *e = &e820_table->entries[idx];
+		unsigned long first_pfn;
+		unsigned long past_pfn;
+
+		if (e->type != type)
+			continue;
+
+		first_pfn = e->addr >> PAGE_SHIFT;
+		past_pfn = (e->addr + e->size) >> PAGE_SHIFT;
+
+		if (first_pfn >= limit_pfn)
+			continue;
+
+		/* Entry crosses the limit: the limit itself is the answer */
+		if (past_pfn > limit_pfn) {
+			highest_pfn = limit_pfn;
+			break;
+		}
+
+		if (past_pfn > highest_pfn)
+			highest_pfn = past_pfn;
+	}
+
+	if (highest_pfn > max_arch_pfn)
+		highest_pfn = max_arch_pfn;
+
+	pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+		highest_pfn, max_arch_pfn);
+
+	return highest_pfn;
+}
+
+/* Highest E820_TYPE_RAM page frame number, limited only by MAX_ARCH_PFN. */
+unsigned long __init e820__end_of_ram_pfn(void)
+{
+	unsigned long limit_pfn = MAX_ARCH_PFN;
+
+	return e820_end_pfn(limit_pfn, E820_TYPE_RAM);
+}
+
+/* Highest E820_TYPE_RAM page frame number below the 4GB boundary. */
+unsigned long __init e820__end_of_low_ram_pfn(void)
+{
+	unsigned long limit_pfn = 1UL << (32 - PAGE_SHIFT);
+
+	return e820_end_pfn(limit_pfn, E820_TYPE_RAM);
+}
+
+/*
+ * Report a fatal early-boot error: print 'msg' through early_printk() and
+ * then panic with the same message.
+ *
+ * 'msg' is passed as an argument to a "%s" format rather than as the format
+ * string itself, so any '%' characters in it are printed literally.
+ */
+static void __init early_panic(char *msg)
+{
+	early_printk("%s", msg);
+	panic("%s", msg);
+}
+
+/* Set once a boot parameter has modified the E820 table: */
+static int userdef __initdata;
+
+/*
+ * Handler for the "mem=" early parameter.
+ *
+ * "mem=nopentium" disables 4MB page tables on 32-bit kernels (and is
+ * rejected with a warning elsewhere). Any other value is parsed as a size,
+ * and all E820_TYPE_RAM above that address is removed from the E820 table.
+ *
+ * Returns 0 on success and -EINVAL for a missing or invalid value.
+ */
+static int __init parse_memopt(char *p)
+{
+	u64 limit;
+
+	if (!p)
+		return -EINVAL;
+
+	if (strcmp(p, "nopentium") == 0) {
+#ifdef CONFIG_X86_32
+		setup_clear_cpu_cap(X86_FEATURE_PSE);
+		return 0;
+#else
+		pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
+		return -EINVAL;
+#endif
+	}
+
+	userdef = 1;
+	limit = memparse(p, &p);
+
+	/* Don't remove all memory when getting "mem={invalid}" parameter: */
+	if (!limit)
+		return -EINVAL;
+
+	e820__range_remove(limit, ULLONG_MAX - limit, E820_TYPE_RAM, 1);
+
+	return 0;
+}
+early_param("mem", parse_memopt);
+
+/*
+ * Parse a single "memmap=" item.
+ *
+ * Recognized forms, where the character following the size selects the
+ * action:
+ *
+ *   exactmap            - drop the whole E820 table
+ *   size@start          - add a RAM region
+ *   size#start          - add an ACPI data region
+ *   size$start          - add a reserved region
+ *   size!start          - add a PRAM region
+ *   size%start-old+new  - retype a range; with only "+new" a region is
+ *                         added, with only "-old" regions of that type are
+ *                         removed, with neither the range is removed
+ *                         regardless of type
+ *   size                - remove all RAM above 'size'
+ *
+ * Returns 0 on success and -EINVAL on a missing or malformed item.
+ */
+static int __init parse_memmap_one(char *p)
+{
+	char *size_str;
+	u64 start_at, mem_size;
+
+	if (!p)
+		return -EINVAL;
+
+	if (!strncmp(p, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+		/*
+		 * If we are doing a crash dump, we still need to know
+		 * the real memory size before the original memory map is
+		 * reset.
+		 */
+		saved_max_pfn = e820__end_of_ram_pfn();
+#endif
+		e820_table->nr_entries = 0;
+		userdef = 1;
+		return 0;
+	}
+
+	size_str = p;
+	mem_size = memparse(p, &p);
+	/* No size was consumed at all: */
+	if (p == size_str)
+		return -EINVAL;
+
+	userdef = 1;
+
+	switch (*p) {
+	case '@':
+		start_at = memparse(p+1, &p);
+		e820__range_add(start_at, mem_size, E820_TYPE_RAM);
+		break;
+	case '#':
+		start_at = memparse(p+1, &p);
+		e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
+		break;
+	case '$':
+		start_at = memparse(p+1, &p);
+		e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
+		break;
+	case '!':
+		start_at = memparse(p+1, &p);
+		e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
+		break;
+	case '%': {
+		enum e820_type from = 0, to = 0;
+
+		start_at = memparse(p + 1, &p);
+		if (*p == '-')
+			from = simple_strtoull(p + 1, &p, 0);
+		if (*p == '+')
+			to = simple_strtoull(p + 1, &p, 0);
+
+		/* Reject trailing garbage before touching the table: */
+		if (*p != '\0')
+			return -EINVAL;
+
+		if (from && to)
+			e820__range_update(start_at, mem_size, from, to);
+		else if (to)
+			e820__range_add(start_at, mem_size, to);
+		else if (from)
+			e820__range_remove(start_at, mem_size, from, 1);
+		else
+			e820__range_remove(start_at, mem_size, 0, 0);
+		break;
+	}
+	default:
+		e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+		break;
+	}
+
+	if (*p != '\0')
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Handler for the "memmap=" early parameter: split the value at commas (the
+ * string is modified in place) and feed every item to parse_memmap_one().
+ * Errors of individual items are ignored; the handler always returns 0.
+ */
+static int __init parse_memmap_opt(char *str)
+{
+	char *next;
+
+	for (; str; str = next) {
+		next = strchr(str, ',');
+		if (next)
+			*next++ = 0;
+
+		parse_memmap_one(str);
+	}
+
+	return 0;
+}
+early_param("memmap", parse_memmap_opt);
+
+/*
+ * Reserve all entries from the bootloader's extensible data nodes list,
+ * because if present we are going to use it later on to fetch e820
+ * entries from it.
+ *
+ * Every node (header plus payload) is retyped from E820_TYPE_RAM to
+ * E820_TYPE_RESERVED_KERN in both the main and the kexec E820 table.
+ * Afterwards both tables are sanitized and the main table is logged.
+ *
+ * If a node cannot be mapped the walk stops there, since the link to the
+ * next node is unknown. The nodes handled so far stay reserved and the
+ * tables are still sanitized.
+ */
+void __init e820__reserve_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+
+	pa_data = boot_params.hdr.setup_data;
+	if (!pa_data)
+		return;
+
+	while (pa_data) {
+		u64 node_len;
+
+		data = early_memremap(pa_data, sizeof(*data));
+		/* early_memremap() can fail; don't dereference a NULL mapping: */
+		if (!data) {
+			pr_warn("failed to memremap setup_data entry\n");
+			break;
+		}
+
+		node_len = sizeof(*data) + data->len;
+		e820__range_update(pa_data, node_len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+		e820__range_update_kexec(pa_data, node_len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+
+		pa_data = data->next;
+		early_memunmap(data, sizeof(*data));
+	}
+
+	e820__update_table(e820_table);
+	e820__update_table(e820_table_kexec);
+
+	pr_info("extended physical RAM map:\n");
+	e820__print_table("reserve setup_data");
+}
+
+/*
+ * Called after parse_early_param(), after early parameters (such as mem=)
+ * have been processed, in which case we already have an E820 table filled in
+ * via the parameter callback function(s), but it's not sorted and printed yet.
+ *
+ * Does nothing unless a parameter modified the table ('userdef'). Panics if
+ * the user supplied map cannot be sanitized.
+ */
+void __init e820__finish_early_params(void)
+{
+	if (!userdef)
+		return;
+
+	if (e820__update_table(e820_table) < 0)
+		early_panic("Invalid user supplied memory map");
+
+	pr_info("user-defined physical RAM map:\n");
+	e820__print_table("user");
+}
+
+/*
+ * Map the type of 'entry' to a descriptive name. The name is used for the
+ * resources and firmware map entries created by e820__reserve_resources().
+ */
+static const char *__init e820_type_to_string(struct e820_entry *entry)
+{
+	switch (entry->type) {
+	case E820_TYPE_RESERVED_KERN:
+	case E820_TYPE_RAM:
+		return "System RAM";
+	case E820_TYPE_ACPI:
+		return "ACPI Tables";
+	case E820_TYPE_NVS:
+		return "ACPI Non-volatile Storage";
+	case E820_TYPE_UNUSABLE:
+		return "Unusable memory";
+	case E820_TYPE_PRAM:
+		return "Persistent Memory (legacy)";
+	case E820_TYPE_PMEM:
+		return "Persistent Memory";
+	case E820_TYPE_RESERVED:
+		return "Reserved";
+	default:
+		return "Unknown E820 type";
+	}
+}
+
+/*
+ * Map the type of 'entry' to resource flags: RAM (including
+ * E820_TYPE_RESERVED_KERN) is IORESOURCE_SYSTEM_RAM, every other type -
+ * known or unknown - is plain IORESOURCE_MEM.
+ */
+static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
+{
+	if (entry->type == E820_TYPE_RESERVED_KERN ||
+	    entry->type == E820_TYPE_RAM)
+		return IORESOURCE_SYSTEM_RAM;
+
+	return IORESOURCE_MEM;
+}
+
+/*
+ * Map the type of 'entry' to a resource descriptor. Only the ACPI and the
+ * persistent memory types have a dedicated descriptor; everything else is
+ * IORES_DESC_NONE.
+ */
+static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
+{
+	switch (entry->type) {
+	case E820_TYPE_ACPI:
+		return IORES_DESC_ACPI_TABLES;
+	case E820_TYPE_NVS:
+		return IORES_DESC_ACPI_NV_STORAGE;
+	case E820_TYPE_PMEM:
+		return IORES_DESC_PERSISTENT_MEMORY;
+	case E820_TYPE_PRAM:
+		return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+	default:
+		/* RAM, RESERVED_KERN, UNUSABLE, RESERVED and unknown types */
+		return IORES_DESC_NONE;
+	}
+}
+
+/*
+ * Decide whether e820__reserve_resources() should mark 'res' busy and
+ * insert it into the resource tree right away.
+ *
+ * Everything that starts in the first megabyte is marked busy. Above that,
+ * reserved and persistent memory regions are not; all other types are.
+ */
+static bool __init do_mark_busy(enum e820_type type, struct resource *res)
+{
+	/* this is the legacy bios/dos rom-shadow + mmio region */
+	if (res->start < (1ULL<<20))
+		return true;
+
+	/*
+	 * Treat persistent memory like device memory, i.e. reserve it
+	 * for exclusive use of a driver
+	 */
+	switch (type) {
+	case E820_TYPE_RESERVED:
+	case E820_TYPE_PRAM:
+	case E820_TYPE_PMEM:
+		return false;
+	default:
+		/* RESERVED_KERN, RAM, ACPI, NVS, UNUSABLE and unknown types */
+		return true;
+	}
+}
+
+/*
+ * Mark E820 reserved areas as busy for the resource manager.
+ *
+ * 'e820_res' remembers the resource array allocated here (one slot per
+ * E820 entry) so that e820__reserve_resources_late() can insert the
+ * resources that were not inserted right away.
+ */
+
+static struct resource __initdata *e820_res;
+
+void __init e820__reserve_resources(void)
+{
+	struct resource *res;
+	u64 last;
+	int idx;
+
+	res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
+	e820_res = res;
+
+	/* 'res' advances in lockstep with the entry index, skipped or not */
+	for (idx = 0; idx < e820_table->nr_entries; idx++, res++) {
+		struct e820_entry *entry = e820_table->entries + idx;
+
+		last = entry->addr + entry->size - 1;
+
+		/* Skip entries whose end does not fit into resource_size_t: */
+		if (last != (resource_size_t)last)
+			continue;
+
+		res->start = entry->addr;
+		res->end = last;
+		res->name = e820_type_to_string(entry);
+		res->flags = e820_type_to_iomem_type(entry);
+		res->desc = e820_type_to_iores_desc(entry);
+
+		/*
+		 * Don't register the region that could be conflicted with
+		 * PCI device BAR resources and insert them later in
+		 * pcibios_resource_survey():
+		 */
+		if (!do_mark_busy(entry->type, res))
+			continue;
+
+		res->flags |= IORESOURCE_BUSY;
+		insert_resource(&iomem_resource, res);
+	}
+
+	/* Expose the bootloader-provided memory layout to the sysfs. */
+	for (idx = 0; idx < e820_table_firmware->nr_entries; idx++) {
+		struct e820_entry *entry = e820_table_firmware->entries + idx;
+
+		firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
+	}
+}
+
+/*
+ * How much should we pad the end of RAM, depending on where it is?
+ */
+static unsigned long __init ram_alignment(resource_size_t pos)
+{
+	const unsigned long megabyte = pos >> 20;
+
+	/*
+	 * The alignment grows with the position:
+	 *   in the first megabyte -> 64kB
+	 *   in the first 16MB     -> 1MB
+	 *   anything above that   -> 64MB
+	 */
+	if (megabyte >= 16)
+		return 64*1024*1024;
+
+	return megabyte ? 1024*1024 : 64*1024;
+}
+
+#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
+
+void __init e820__reserve_resources_late(void)
+{
+	struct resource *res = e820_res;
+	int i;
+
+	/* Insert the slots e820__reserve_resources() filled in but did not register: */
+	for (i = 0; i < e820_table->nr_entries; i++, res++) {
+		if (res->parent || !res->end)
+			continue;
+
+		insert_resource_expand_to_fit(&iomem_resource, res);
+	}
+
+	/* Try to bump up RAM regions to reasonable boundaries, to avoid stolen RAM: */
+	for (i = 0; i < e820_table->nr_entries; i++) {
+		struct e820_entry *entry = e820_table->entries + i;
+		u64 start, end;
+
+		if (entry->type != E820_TYPE_RAM)
+			continue;
+
+		/* The buffer runs from the end of this RAM range to the next aligned boundary: */
+		start = entry->addr + entry->size;
+		end = round_up(start, ram_alignment(start)) - 1;
+		if (end > MAX_RESOURCE_SIZE)
+			end = MAX_RESOURCE_SIZE;
+		if (start >= end)
+			continue;
+
+		printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
+		reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
+	}
+}
+
+/*
+ * Pass the firmware (bootloader) E820 map to the kernel and process it:
+ */
+char *__init e820__memory_setup_default(void)
+{
+	char *who = "BIOS-e820";
+	u64 mem_size;
+
+	/* Try to copy the BIOS-supplied E820-map: */
+	if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) >= 0)
+		goto sanitize;
+
+	/*
+	 * Otherwise fake a memory map; one section from 0k->640k,
+	 * the next section from 1mb->appropriate_mem_k
+	 *
+	 * Compare results from other methods and take the one that gives more RAM:
+	 */
+	who = "BIOS-e801";
+	mem_size = boot_params.alt_mem_k;
+	if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
+		who = "BIOS-88";
+		mem_size = boot_params.screen_info.ext_mem_k;
+	}
+
+	e820_table->nr_entries = 0;
+	e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
+	e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
+
+sanitize:
+	/* We just appended a lot of ranges, sanitize the table: */
+	e820__update_table(e820_table);
+
+	return who;
+}
+
+/*
+ * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
+ * E820 map - with an optional platform quirk available for virtual platforms
+ * to override this method of boot environment processing:
+ */
+void __init e820__memory_setup(void)
+{
+	char *origin;
+
+	/* This is a firmware interface ABI - make sure we don't break it: */
+	BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
+
+	/* The memory_setup hook returns the label that e820__print_table() is given: */
+	origin = x86_init.resources.memory_setup();
+	memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+	memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+	pr_info("BIOS-provided physical RAM map:\n");
+	e820__print_table(origin);
+}
+
+void __init e820__memblock_setup(void)
+{
+	int i;
+
+	/*
+	 * The bootstrap memblock region count maximum is 128 entries
+	 * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
+	 * than that - so allow memblock resizing.
+	 *
+	 * This is safe, because this call happens pretty late during x86 setup,
+	 * so we know about reserved memory regions already. (This is important
+	 * so that memblock resizing does not stomp over reserved areas.)
+	 */
+	memblock_allow_resize();
+
+	for (i = 0; i < e820_table->nr_entries; i++) {
+		struct e820_entry *entry = e820_table->entries + i;
+		u64 end = entry->addr + entry->size;
+
+		/* Ignore ranges whose end does not fit into resource_size_t: */
+		if (end != (resource_size_t)end)
+			continue;
+
+		/* Only RAM and RESERVED_KERN entries are added to memblock: */
+		if (entry->type == E820_TYPE_RAM ||
+		    entry->type == E820_TYPE_RESERVED_KERN)
+			memblock_add(entry->addr, entry->size);
+	}
+
+	/* Throw away partial pages: */
+	memblock_trim_memory(PAGE_SIZE);
+
+	memblock_dump_all();
+}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
new file mode 100644
index 0000000..50d5848
--- /dev/null
+++ b/arch/x86/kernel/early-quirks.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Various workarounds for chipset bugs.
+ This code runs very early and can't use the regular PCI subsystem.
+ The entries are keyed to PCI bridges which usually identify chipsets
+ uniquely.
+ This is only for whole classes of chipsets with specific problems which
+ need early invasive action (e.g. before the timers are initialized).
+ Most PCI device specific workarounds can be done later and should be
+ in standard PCI quirks.
+ Mainboard specific bugs should be handled by DMI entries.
+ CPU specific bugs in setup.c */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
+#include <linux/platform_data/x86/apple.h>
+#include <drm/i915_drm.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/hpet.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
+
+static void __init fix_hypertransport_config(int num, int slot, int func)
+{
+	u32 htcfg;
+
+	/*
+	 * we found a hypertransport bus
+	 * make sure that we are broadcasting
+	 * interrupts to all cpus on the ht bus
+	 * if we're using extended apic ids
+	 */
+	htcfg = read_pci_config(num, slot, func, 0x68);
+
+	/* Bit 18 of config register 0x68: extended apic ids are in use */
+	if (!(htcfg & (1 << 18)))
+		return;
+	printk(KERN_INFO "Detected use of extended apic ids on hypertransport bus\n");
+
+	/* Bit 17 already set: the broadcast is enabled, nothing to fix */
+	if (htcfg & (1 << 17))
+		return;
+
+	printk(KERN_INFO "Enabling hypertransport extended apic interrupt broadcast\n");
+	printk(KERN_INFO "Note this is a bios bug, please contact your hw vendor\n");
+	write_pci_config(num, slot, func, 0x68, htcfg | (1 << 17));
+}
+
+static void __init via_bugs(int num, int slot, int func)
+{
+#ifdef CONFIG_GART_IOMMU
+	/* Keep the aperture if it was explicitly allowed (iommu=allowed): */
+	if (gart_iommu_aperture_allowed)
+		return;
+	if (max_pfn <= MAX_DMA32_PFN && !force_iommu)
+		return;
+	printk(KERN_INFO "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
+	gart_iommu_aperture_disabled = 1;
+#endif
+}
+
+#ifdef CONFIG_ACPI
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init nvidia_hpet_check(struct acpi_table_header *header)
+{
+ return 0; /* handler nvidia_bugs() hands to acpi_table_parse(); ignores the table and always reports 0 */
+}
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
+
+static void __init nvidia_bugs(int num, int slot, int func)
+{
+#ifdef CONFIG_ACPI
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * Only applies to Nvidia root ports (bus 0) and not to
+	 * Nvidia graphics cards with PCI ports on secondary buses.
+	 */
+	if (num != 0)
+		return;
+
+	/*
+	 * All timer overrides on Nvidia are
+	 * wrong unless HPET is enabled.
+	 * Unfortunately that's not true on many Asus boards.
+	 * We don't know yet how to detect this automatically, but
+	 * at least allow a command line override.
+	 */
+	if (acpi_use_timer_override)
+		return;
+
+	/*
+	 * Non-zero from acpi_table_parse() triggers the quirk (presumably: no HPET table).
+	 */
+	if (!acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check))
+		return;
+
+	acpi_skip_timer_override = 1;
+	printk(KERN_INFO "Nvidia board detected. Ignoring ACPI timer override.\n");
+	printk(KERN_INFO "If you got timer trouble try acpi_use_timer_override\n");
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
+	/* RED-PEN skip them on mptables too? */
+}
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+/* Adjust two config bits, then return the low byte of the config dword at 0x8. */
+static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
+{
+	u8 reg_ac;
+	u32 reg_70;
+
+	/* Clear bit 5 of the byte at config offset 0xac: */
+	reg_ac = read_pci_config_byte(num, slot, func, 0xac);
+	write_pci_config_byte(num, slot, func, 0xac, reg_ac & ~(1<<5));
+
+	/* Set bit 8 of the dword at config offset 0x70: */
+	reg_70 = read_pci_config(num, slot, func, 0x70);
+	write_pci_config(num, slot, func, 0x70, reg_70 | (1<<8));
+
+	/* ati_bugs() treats this byte as the SB4X0 revision: */
+	return read_pci_config(num, slot, func, 0x8) & 0xff;
+}
+
+/* SB4X0 quirk: decide whether the ACPI timer override has to be ignored. */
+static void __init ati_bugs(int num, int slot, int func)
+{
+	u32 rev;
+
+	if (acpi_use_timer_override)
+		return;
+
+	rev = ati_ixp4x0_rev(num, slot, func);
+	if (rev < 0x82) {
+		acpi_skip_timer_override = 1;
+	} else {
+		/* check for IRQ0 interrupt swap */
+		outb(0x72, 0xcd6);
+		if (!(inb(0xcd7) & 0x2))
+			acpi_skip_timer_override = 1;
+	}
+
+	if (!acpi_skip_timer_override)
+		return;
+
+	printk(KERN_INFO "SB4X0 revision 0x%x\n", rev);
+	printk(KERN_INFO "Ignoring ACPI timer override.\n");
+	printk(KERN_INFO "If you got timer trouble try acpi_use_timer_override\n");
+}
+
+/*
+ * Read the config dword at offset 0x8 and keep only its low byte,
+ * the value ati_bugs_contd() compares against the SB600/SB700/SB800
+ * revision numbers.
+ */
+static u32 __init ati_sbx00_rev(int num, int slot, int func)
+{
+	return read_pci_config(num, slot, func, 0x8) & 0xff;
+}
+
+/* SBx00 quirk: pin2 polarity fix for rev >= 0x40, timer override check below 0x39. */
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+	const u32 rev = ati_sbx00_rev(num, slot, func);
+
+	/* Revisions from 0x40 (SB800) on get the pin2 polarity fix: */
+	if (rev >= 0x40)
+		acpi_fix_pin2_polarity = 1;
+
+	/*
+	 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
+	 * SB700: revisions 0x39, 0x3a, ...
+	 * SB800: revisions 0x40, 0x41, ...
+	 */
+	if (rev >= 0x39)
+		return;
+
+	if (acpi_use_timer_override)
+		return;
+
+	/* check for IRQ0 interrupt swap */
+	if (!(read_pci_config(num, slot, func, 0x64) & (1<<14)))
+		acpi_skip_timer_override = 1;
+
+	if (!acpi_skip_timer_override)
+		return;
+
+	printk(KERN_INFO "SB600 revision 0x%x\n", rev);
+	printk(KERN_INFO "Ignoring ACPI timer override.\n");
+	printk(KERN_INFO "If you got timer trouble try acpi_use_timer_override\n");
+}
+#else
+static void __init ati_bugs(int num, int slot, int func)
+{ /* no-op stub for builds without both CONFIG_ACPI and CONFIG_X86_IO_APIC */
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{ /* no-op stub, compiled under the same condition as the ati_bugs() stub */
+}
+#endif
+
+/* Flag irq remapping as broken for the affected device revisions. */
+static void __init intel_remapping_check(int num, int slot, int func)
+{
+	const u16 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+	const u8 revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
+	bool old_rev, bad_3405;
+
+	/*
+	 * Revisions <= 0x13 of all devices that trigger this quirk
+	 * have a problem draining interrupts when irq remapping is
+	 * enabled, and should be flagged as broken. Additionally
+	 * revision 0x22 of device id 0x3405 has this problem.
+	 */
+	old_rev = revision <= 0x13;
+	bad_3405 = device == 0x3405 && revision == 0x22;
+
+	if (old_rev || bad_3405)
+		set_irq_remapping_broken();
+}
+
+/*
+ * Systems with Intel graphics controllers set aside memory exclusively
+ * for gfx driver use. This memory is not marked in the E820 as reserved
+ * or as RAM, and so is subject to overlap from E820 manipulation later
+ * in the boot process. On some systems, MMIO space is allocated on top,
+ * despite the efforts of the "RAM buffer" approach, which simply rounds
+ * memory boundaries up to 64M to try to catch space that may decode
+ * as RAM and so is not suitable for MMIO.
+ */
+
+#define KB(x) ((x) * 1024UL) /* x kilobytes expressed in bytes (unsigned long arithmetic) */
+#define MB(x) (KB (KB (x))) /* x megabytes expressed in bytes */
+
+/*
+ * TSEG size per I830_ESMRAMC of device 0:0.0: 0 if disabled, else 1MB or 512kB.
+ */
+static resource_size_t __init i830_tseg_size(void)
+{
+	const u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+
+	if (esmramc & TSEG_ENABLE)
+		return (esmramc & I830_TSEG_SIZE_1M) ? MB(1) : KB(512);
+
+	return 0;
+}
+
+/* TSEG size per I845_ESMRAMC: 0 when TSEG is disabled or the size field is unknown. */
+static resource_size_t __init i845_tseg_size(void)
+{
+	const u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+
+	if (!(esmramc & TSEG_ENABLE))
+		return 0;
+
+	switch (esmramc & I845_TSEG_SIZE_MASK) {
+	case I845_TSEG_SIZE_512K:	return KB(512);
+	case I845_TSEG_SIZE_1M:		return MB(1);
+	default:
+		WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc);
+		return 0;
+	}
+}
+
+/*
+ * TSEG size per I85X_ESMRAMC: a fixed 1MB whenever TSEG is enabled, else 0.
+ */
+static resource_size_t __init i85x_tseg_size(void)
+{
+	const u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+
+	return (esmramc & TSEG_ENABLE) ? MB(1) : 0;
+}
+
+static resource_size_t __init i830_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); /* the DRB3 byte of device 0:0.0 counts 32MB units */
+}
+
+static resource_size_t __init i85x_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); /* DRB3 is read from function 1 here; it counts 32MB units */
+}
+
+/*
+ * On 830/845/85x the stolen memory base isn't available in any
+ * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
+ */
+static resource_size_t __init i830_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i830_tseg_size() - stolen_size; /* TOM - TSEG_SIZE - stolen_size; num/slot/func are unused */
+}
+
+static resource_size_t __init i845_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i845_tseg_size() - stolen_size; /* i830 memory size, but the i845 TSEG decoding; num/slot/func are unused */
+}
+
+static resource_size_t __init i85x_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size; /* TOM - TSEG_SIZE - stolen_size; num/slot/func are unused */
+}
+
+static resource_size_t __init i865_stolen_base(int num, int slot, int func,
+					       resource_size_t stolen_size)
+{
+	u16 toud;
+
+	/* TOUD counts 64kB units; the TSEG size is added on top of it: */
+	toud = read_pci_config_16(0, 0, 0, I865_TOUD);
+	return toud * KB(64) + i845_tseg_size();
+}
+
+static resource_size_t __init gen3_stolen_base(int num, int slot, int func,
+					       resource_size_t stolen_size)
+{
+	/*
+	 * Almost universally we can find the Graphics Base of Stolen Memory
+	 * at register BSM (0x5c) in the igfx configuration space. On a few
+	 * (desktop) machines this is also mirrored in the bridge device at
+	 * different locations, or in the MCHBAR.
+	 *
+	 * Only the bits selected by INTEL_BSM_MASK are returned;
+	 * @stolen_size is not needed for this lookup.
+	 */
+	return read_pci_config(num, slot, func, INTEL_BSM) & INTEL_BSM_MASK;
+}
+
+static resource_size_t __init gen11_stolen_base(int num, int slot, int func,
+						resource_size_t stolen_size)
+{
+	u32 lo, hi;
+
+	/* The base is split over two dwords; INTEL_BSM_MASK applies to DW0 only: */
+	lo = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0) & INTEL_BSM_MASK;
+	hi = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1);
+
+	return ((u64)hi << 32) | lo;
+}
+
+/* Stolen memory size decoded from the I830 GMS field of GMCH_CTRL (device 0:0.0). */
+static resource_size_t __init i830_stolen_size(int num, int slot, int func)
+{
+	const u16 gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+
+	switch (gmch_ctrl & I830_GMCH_GMS_MASK) {
+	case I830_GMCH_GMS_STOLEN_512:
+		return KB(512);
+	case I830_GMCH_GMS_STOLEN_1024:
+		return MB(1);
+	case I830_GMCH_GMS_STOLEN_8192:
+		return MB(8);
+	case I830_GMCH_GMS_LOCAL:
+		/* local memory isn't part of the normal address space */
+		return 0;
+	default:
+		WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+		return 0;
+	}
+}
+
+/*
+ * Stolen memory size: the I855 GMS field of GMCH_CTRL (device 0:0.0) picks a fixed size.
+ */
+static resource_size_t __init gen3_stolen_size(int num, int slot, int func)
+{
+	const u16 gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+
+	switch (gmch_ctrl & I855_GMCH_GMS_MASK) {
+	case I855_GMCH_GMS_STOLEN_1M:		return MB(1);
+	case I855_GMCH_GMS_STOLEN_4M:		return MB(4);
+	case I855_GMCH_GMS_STOLEN_8M:		return MB(8);
+	case I855_GMCH_GMS_STOLEN_16M:		return MB(16);
+	case I855_GMCH_GMS_STOLEN_32M:		return MB(32);
+	case I915_GMCH_GMS_STOLEN_48M:		return MB(48);
+	case I915_GMCH_GMS_STOLEN_64M:		return MB(64);
+	case G33_GMCH_GMS_STOLEN_128M:		return MB(128);
+	case G33_GMCH_GMS_STOLEN_256M:		return MB(256);
+	case INTEL_GMCH_GMS_STOLEN_96M:		return MB(96);
+	case INTEL_GMCH_GMS_STOLEN_160M:	return MB(160);
+	case INTEL_GMCH_GMS_STOLEN_224M:	return MB(224);
+	case INTEL_GMCH_GMS_STOLEN_352M:	return MB(352);
+	default:
+		WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Stolen memory size: the SNB GMS field of SNB_GMCH_CTRL counts 32MB units.
+ */
+static resource_size_t __init gen6_stolen_size(int num, int slot, int func)
+{
+	const u16 gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+	const u16 gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+	return MB(32) * gms;
+}
+
+/*
+ * Stolen memory size: the BDW GMS field of SNB_GMCH_CTRL counts 32MB units.
+ */
+static resource_size_t __init gen8_stolen_size(int num, int slot, int func)
+{
+	const u16 gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+	const u16 gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+	return MB(32) * gms;
+}
+
+/* Stolen memory size on CHV: the SNB GMS field uses three encoding ranges. */
+static resource_size_t __init chv_stolen_size(int num, int slot, int func)
+{
+	const u16 gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+	const u16 gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+	/*
+	 * 0x0 to 0x10: 32MB increments starting at 0MB
+	 * 0x11 to 0x16: 4MB increments starting at 8MB
+	 * 0x17 to 0x1d: 4MB increments start at 36MB
+	 * (values above 0x1d are computed like the last range)
+	 */
+	if (gms >= 0x17)
+		return MB(36) + (gms - 0x17) * MB(4);
+
+	if (gms >= 0x11)
+		return MB(8) + (gms - 0x11) * MB(4);
+
+	return gms * MB(32);
+}
+
+/* Stolen memory size on gen9+: the BDW GMS field uses two encoding ranges. */
+static resource_size_t __init gen9_stolen_size(int num, int slot, int func)
+{
+	const u16 gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+	const u16 gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+	/*
+	 * 0x0 to 0xef: 32MB increments starting at 0MB
+	 * 0xf0 to 0xfe: 4MB increments starting at 4MB
+	 */
+	if (gms >= 0xf0)
+		return MB(4) + (gms - 0xf0) * MB(4);
+
+	return gms * MB(32);
+}
+
+struct intel_early_ops {
+ resource_size_t (*stolen_size)(int num, int slot, int func); /* called first by intel_graphics_stolen() */
+ resource_size_t (*stolen_base)(int num, int slot, int func,
+ resource_size_t size); /* receives the stolen_size() result as @size */
+};
+
+static const struct intel_early_ops i830_early_ops __initconst = {
+	.stolen_size = i830_stolen_size,
+	.stolen_base = i830_stolen_base,
+};
+
+static const struct intel_early_ops i845_early_ops __initconst = {
+	.stolen_size = i830_stolen_size,
+	.stolen_base = i845_stolen_base,
+};
+
+static const struct intel_early_ops i85x_early_ops __initconst = {
+	.stolen_size = gen3_stolen_size,
+	.stolen_base = i85x_stolen_base,
+};
+
+static const struct intel_early_ops i865_early_ops __initconst = {
+	.stolen_size = gen3_stolen_size,
+	.stolen_base = i865_stolen_base,
+};
+
+static const struct intel_early_ops gen3_early_ops __initconst = {
+	.stolen_size = gen3_stolen_size,
+	.stolen_base = gen3_stolen_base,
+};
+
+static const struct intel_early_ops gen6_early_ops __initconst = {
+	.stolen_size = gen6_stolen_size,
+	.stolen_base = gen3_stolen_base,
+};
+
+static const struct intel_early_ops gen8_early_ops __initconst = {
+	.stolen_size = gen8_stolen_size,
+	.stolen_base = gen3_stolen_base,
+};
+
+static const struct intel_early_ops gen9_early_ops __initconst = {
+	.stolen_size = gen9_stolen_size,
+	.stolen_base = gen3_stolen_base,
+};
+
+static const struct intel_early_ops chv_early_ops __initconst = {
+	.stolen_size = chv_stolen_size,
+	.stolen_base = gen3_stolen_base,
+};
+
+static const struct intel_early_ops gen11_early_ops __initconst = {
+	.stolen_size = gen9_stolen_size,
+	.stolen_base = gen11_stolen_base,
+};
+
+static const struct pci_device_id intel_early_ids[] __initconst = { /* device id -> intel_early_ops (via driver_data); searched in order by intel_graphics_quirks() */
+ INTEL_I830_IDS(&i830_early_ops),
+ INTEL_I845G_IDS(&i845_early_ops),
+ INTEL_I85X_IDS(&i85x_early_ops),
+ INTEL_I865G_IDS(&i865_early_ops),
+ INTEL_I915G_IDS(&gen3_early_ops),
+ INTEL_I915GM_IDS(&gen3_early_ops),
+ INTEL_I945G_IDS(&gen3_early_ops),
+ INTEL_I945GM_IDS(&gen3_early_ops),
+ INTEL_VLV_IDS(&gen6_early_ops),
+ INTEL_PINEVIEW_IDS(&gen3_early_ops),
+ INTEL_I965G_IDS(&gen3_early_ops),
+ INTEL_G33_IDS(&gen3_early_ops),
+ INTEL_I965GM_IDS(&gen3_early_ops),
+ INTEL_GM45_IDS(&gen3_early_ops),
+ INTEL_G45_IDS(&gen3_early_ops),
+ INTEL_IRONLAKE_D_IDS(&gen3_early_ops),
+ INTEL_IRONLAKE_M_IDS(&gen3_early_ops),
+ INTEL_SNB_D_IDS(&gen6_early_ops),
+ INTEL_SNB_M_IDS(&gen6_early_ops),
+ INTEL_IVB_M_IDS(&gen6_early_ops),
+ INTEL_IVB_D_IDS(&gen6_early_ops),
+ INTEL_HSW_IDS(&gen6_early_ops),
+ INTEL_BDW_IDS(&gen8_early_ops),
+ INTEL_CHV_IDS(&chv_early_ops),
+ INTEL_SKL_IDS(&gen9_early_ops),
+ INTEL_BXT_IDS(&gen9_early_ops),
+ INTEL_KBL_IDS(&gen9_early_ops),
+ INTEL_CFL_IDS(&gen9_early_ops),
+ INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
+ INTEL_ICL_11_IDS(&gen11_early_ops),
+};
+
+struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
+EXPORT_SYMBOL(intel_graphics_stolen_res); /* start/end are filled in by intel_graphics_stolen() when a stolen range is found */
+
+/*
+ * Query the stolen memory range through @early_ops, record it in
+ * intel_graphics_stolen_res and add it to the E820 table as reserved.
+ */
+static void __init
+intel_graphics_stolen(int num, int slot, int func,
+		      const struct intel_early_ops *early_ops)
+{
+	const resource_size_t size = early_ops->stolen_size(num, slot, func);
+	const resource_size_t base = early_ops->stolen_base(num, slot, func, size);
+
+	/* Nothing to reserve without both a size and a base: */
+	if (size == 0 || base == 0)
+		return;
+
+	intel_graphics_stolen_res.start = base;
+	intel_graphics_stolen_res.end = base + size - 1;
+
+	printk(KERN_INFO "Reserving Intel graphics memory at %pR\n",
+	       &intel_graphics_stolen_res);
+
+	/* Mark this space as reserved */
+	e820__range_add(base, size, E820_TYPE_RESERVED);
+	e820__update_table(e820_table);
+}
+
+/* Reserve stolen memory using the first intel_early_ids[] entry matching this device. */
+static void __init intel_graphics_quirks(int num, int slot, int func)
+{
+	const u16 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
+		const struct pci_device_id *id = &intel_early_ids[i];
+		const struct intel_early_ops *ops;
+
+		if (id->device != device)
+			continue;
+
+		/* driver_data carries the pointer to this device's intel_early_ops: */
+		ops = (const struct intel_early_ops *)id->driver_data;
+		intel_graphics_stolen(num, slot, func, ops);
+
+		/* Only the first matching table entry is used: */
+		return;
+	}
+}
+
+static void __init force_disable_hpet(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+ boot_hpet_disable = true; /* only sets the flag and logs; num/slot/func are unused */
+ pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif /* CONFIG_HPET_TIMER */
+}
+
+#define BCM4331_MMIO_SIZE 16384 /* bytes mapped and unmapped by apple_airport_reset() */
+#define BCM4331_PM_CAP 0x40 /* config offset that PCI_PM_CTRL is added to */
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg) /* expects a local 'mmio' in scope */
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg) /* expects a local 'mmio' in scope */
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+ /* Apple machines only: bring the card to D0 if needed, then pulse its reset through MMIO. */
+ if (!x86_apple_machine)
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK; /* clear the power state field, then verify D0 below */
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+ }
+ /* Combine BAR0 (low dword) and BAR1 (high dword) into the MMIO address: */
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+ /* Poll while BCMA_RESET_ST reads non-zero, at most 30 times with 10us delays: */
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+ /* Set the reset bit; the read-back presumably flushes the write (TODO confirm): */
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+ /* Write 0 to BCMA_RESET_CTL again, followed by the same read-back: */
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
+
+#define QFLAG_APPLY_ONCE 0x1 /* run the quirk only for the first matching device */
+#define QFLAG_APPLIED 0x2 /* set by check_dev_quirk() whenever the entry matched */
+#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) /* apply-once entry that already matched: skipped */
+struct chipset {
+ u32 vendor; /* PCI vendor id, or PCI_ANY_ID */
+ u32 device; /* PCI device id, or PCI_ANY_ID */
+ u32 class; /* compared with the device's 16-bit class code under class_mask */
+ u32 class_mask; /* bits of 'class' that must match */
+ u32 flags; /* QFLAG_* */
+ void (*f)(int num, int slot, int func); /* quirk handler; a NULL f ends the table */
+};
+
+static struct chipset early_qrk[] __initdata = { /* walked in order by check_dev_quirk() for every probed function */
+ { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
+ { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
+ { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+ { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
+ QFLAG_APPLY_ONCE, intel_graphics_quirks },
+ /*
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet:
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
+ */
+ { PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
+ {} /* terminator: f == NULL stops the table walk */
+};
+
+static void __init early_pci_scan_bus(int bus);
+
+/**
+ * check_dev_quirk - apply early quirks to a given PCI device
+ * @num: bus number
+ * @slot: slot number
+ * @func: PCI function
+ *
+ * Check the vendor & device ID against the early quirks table.
+ *
+ * If the device is single function, let early_pci_scan_bus() know so we don't
+ * poke at this device again.
+ *
+ * Return: 0 if the device is multi-function, so the remaining functions of
+ * the slot should be probed too; -1 otherwise.
+ */
+static int __init check_dev_quirk(int num, int slot, int func)
+{
+	struct chipset *qrk;
+	u16 class, vendor, device;
+	u8 type;
+
+	class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
+	if (class == 0xffff)
+		return -1; /* no class, treat as single function */
+
+	vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
+	device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+
+	for (qrk = early_qrk; qrk->f != NULL; qrk++) {
+		if (qrk->vendor != PCI_ANY_ID && qrk->vendor != vendor)
+			continue;
+		if (qrk->device != PCI_ANY_ID && qrk->device != device)
+			continue;
+		if ((qrk->class ^ class) & qrk->class_mask)
+			continue;
+
+		/* Run the quirk unless it is apply-once and was applied already: */
+		if ((qrk->flags & QFLAG_DONE) != QFLAG_DONE)
+			qrk->f(num, slot, func);
+		qrk->flags |= QFLAG_APPLIED;
+	}
+
+	type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE);
+
+	/* Descend into a bridge's secondary bus if it is numbered above @num: */
+	if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+		u8 sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+
+		if (sec > num)
+			early_pci_scan_bus(sec);
+	}
+
+	/* Bit 7 of the header type is set for multi-function devices: */
+	if (!(type & 0x80))
+		return -1;
+
+	return 0;
+}
+
+/* Poor man's PCI discovery: walk all 32 slots of @bus, up to 8 functions each. */
+static void __init early_pci_scan_bus(int bus)
+{
+	int slot, func;
+
+	for (slot = 0; slot < 32; slot++) {
+		/* Only probe function 0 on single fn devices */
+		func = 0;
+		while (func < 8 && !check_dev_quirk(bus, slot, func))
+			func++;
+	}
+}
+
+/* Entry point of the early quirks: scan starts at bus 0. */
+void __init early_quirks(void)
+{
+	/* Do nothing unless early PCI config access is allowed: */
+	if (early_pci_allowed())
+		early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644
index 0000000..374a52f
--- /dev/null
+++ b/arch/x86/kernel/early_printk.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/screen_info.h>
+#include <linux/usb/ch9.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/fcntl.h>
+#include <asm/setup.h>
+#include <xen/hvc-console.h>
+#include <asm/pci-direct.h>
+#include <asm/fixmap.h>
+#include <asm/intel-mid.h>
+#include <asm/pgtable.h>
+#include <linux/usb/ehci_def.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <asm/pci_x86.h>
+
+/* Simple VGA output */
+#define VGABASE (__ISA_IO_base + 0xb8000)
+
+static int max_ypos = 25, max_xpos = 80; /* text screen size: rows, columns */
+static int current_ypos = 25, current_xpos; /* cursor; starting at max_ypos makes the first write scroll */
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+ char c;
+ int i, k, j;
+ /* Write up to n chars of str as 16-bit cells (char | 0x7 << 8) at VGABASE, scrolling when the screen is full. */
+ while ((c = *str++) != '\0' && n-- > 0) {
+ if (current_ypos >= max_ypos) {
+ /* scroll 1 line up: copy every row k onto row j = k - 1 */
+ for (k = 1, j = 0; k < max_ypos; k++, j++) {
+ for (i = 0; i < max_xpos; i++) {
+ writew(readw(VGABASE+2*(max_xpos*k+i)),
+ VGABASE + 2*(max_xpos*j + i));
+ }
+ }
+ for (i = 0; i < max_xpos; i++)
+ writew(0x720, VGABASE + 2*(max_xpos*j + i)); /* blank the last row: 0x720 is ' ' with attribute 0x7 */
+ current_ypos = max_ypos-1;
+ }
+#ifdef CONFIG_KGDB_KDB
+ if (c == '\b') {
+ if (current_xpos > 0)
+ current_xpos--;
+ } else if (c == '\r') {
+ current_xpos = 0;
+ } else /* chains into the "if (c == '\n')" that follows the #endif */
+#endif
+ if (c == '\n') {
+ current_xpos = 0;
+ current_ypos++;
+ } else if (c != '\r') { /* '\r' is dropped here; any other char is drawn */
+ writew(((0x7 << 8) | (unsigned short) c),
+ VGABASE + 2*(max_xpos*current_ypos +
+ current_xpos++));
+ if (current_xpos >= max_xpos) {
+ current_xpos = 0;
+ current_ypos++;
+ }
+ }
+ }
+}
+
+static struct console early_vga_console = {
+ .name = "earlyvga",
+ .write = early_vga_write, /* VGA text-mode writer */
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
+
+#define XMTRDY 0x20
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+static unsigned int io_serial_in(unsigned long addr, int offset)
+{
+ return inb(addr + offset); /* port I/O read of UART register 'offset' at base 'addr' */
+}
+
+static void io_serial_out(unsigned long addr, int offset, int value)
+{
+ outb(value, addr + offset); /* port I/O write of UART register 'offset' at base 'addr' */
+}
+
+static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; /* register read accessor; port I/O by default */
+static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; /* register write accessor; port I/O by default */
+
+static int early_serial_putc(unsigned char ch)
+{
+ unsigned timeout = 0xffff;
+ /* Poll LSR until XMTRDY is set, giving up once the 0xffff countdown reaches zero: */
+ while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+ serial_out(early_serial_base, TXR, ch); /* written even if the wait timed out */
+ return timeout ? 0 : -1; /* -1: the countdown ran out */
+}
+
+/* Send up to @n characters of @s to the early serial port, with '\r' before each '\n'. */
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+	for (; *s && n-- > 0; s++) {
+		if (*s == '\n')
+			early_serial_putc('\r');
+		early_serial_putc(*s);
+	}
+}
+
+static __init void early_serial_hw_init(unsigned divisor)
+{
+ unsigned char c;
+ /* Basic line setup through the current serial_in/serial_out accessors: */
+ serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
+ serial_out(early_serial_base, IER, 0); /* no interrupt */
+ serial_out(early_serial_base, FCR, 0); /* no fifo */
+ serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
+ /* Set DLAB in LCR, write the 16-bit divisor to DLL/DLH, then clear DLAB again: */
+ c = serial_in(early_serial_base, LCR);
+ serial_out(early_serial_base, LCR, c | DLAB);
+ serial_out(early_serial_base, DLL, divisor & 0xff);
+ serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ serial_out(early_serial_base, LCR, c & ~DLAB);
+}
+
+#define DEFAULT_BAUD 9600
+
+static __init void early_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ char *e;
+ /* Parse "[,]<0xPORT | [ttyS]N>[,<baud>]" from s, then program the UART: */
+ if (*s == ',')
+ ++s;
+ /* Port: a "0x"-prefixed I/O base, or index 0/1 into bases[] (optionally "ttyS"-prefixed): */
+ if (*s) {
+ unsigned port;
+ if (!strncmp(s, "0x", 2)) {
+ early_serial_base = simple_strtoul(s, &e, 16);
+ } else {
+ static const int __initconst bases[] = { 0x3f8, 0x2f8 };
+
+ if (!strncmp(s, "ttyS", 4))
+ s += 4;
+ port = simple_strtoul(s, &e, 10);
+ if (port > 1 || s == e) /* out of range or not a number: use bases[0] */
+ port = 0;
+ early_serial_base = bases[port];
+ }
+ s += strcspn(s, ","); /* skip whatever is left of the port field */
+ if (*s == ',')
+ s++;
+ }
+ /* Baud: a missing, zero or unparsable value keeps DEFAULT_BAUD: */
+ if (*s) {
+ baud = simple_strtoull(s, &e, 0);
+
+ if (baud == 0 || s == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value (NOTE(review): integer division, yields 0 for baud > 115200) */
+ divisor = 115200 / baud;
+
+ /* These will always be IO based ports */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+
+#ifdef CONFIG_PCI
+/* MMIO register write: @offset counts 32-bit registers from @addr. */
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+	/* shift implied by pointer type */
+	writel(value, (u32 __iomem *)addr + offset);
+}
+
+/* MMIO register read: @offset counts 32-bit registers from @addr. */
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+	/* shift implied by pointer type */
+	return readl((u32 __iomem *)addr + offset);
+}
+
+/*
+ * early_pci_serial_init()
+ *
+ * Invoked when the earlyprintk param starts with "pciserial". The rest of the
+ * param should be "[force,]B:D.F[,baud]": B, D & F (each parsed as hex) give
+ * the location of a PCI device that must be a UART. "force" accepts a device
+ * with a wrong PCI class code; "nocfg" in place of baud skips the UART setup.
+ */
+static __init void early_pci_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ u8 bus, slot, func;
+ u32 classcode, bar0;
+ u16 cmdreg;
+ char *e;
+ int force = 0;
+
+ if (*s == ',')
+ ++s;
+
+ if (*s == 0)
+ return;
+
+ /* Force the use of an UART device with wrong class code */
+ if (!strncmp(s, "force,", 6)) {
+ force = 1;
+ s += 6;
+ }
+
+ /*
+ * Parse the param to get the BDF values
+ */
+ bus = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != ':')
+ return;
+ ++s;
+ slot = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != '.')
+ return;
+ ++s;
+ func = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+
+ /* A baud might be following */
+ if (*s == ',')
+ s++;
+
+ /*
+ * Find the device from the BDF
+ */
+ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+ classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+
+ /*
+ * Verify it is a UART type device
+ */
+ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+ (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+ (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ {
+ if (!force)
+ return;
+ }
+
+ /*
+ * Determine if it is IO or memory mapped (bit 0 of BAR0 set means IO)
+ */
+ if (bar0 & 0x01) {
+ /* it is IO mapped */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+ early_serial_base = bar0&0xfffffffc; /* drop the two low BAR bits */
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_IO);
+ } else {
+ /* It is memory mapped - assume 32-bit alignment */
+ serial_in = mem32_serial_in;
+ serial_out = mem32_serial_out;
+ /* WARNING! assuming the address is always in the first 4G */
+ early_serial_base =
+ (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_MEMORY);
+ }
+
+ /*
+ * Initialize the hardware
+ */
+ if (*s) {
+ if (strcmp(s, "nocfg") == 0)
+ /* Sometimes, we want to leave the UART alone
+ * and assume the BIOS has set it up correctly.
+ * "nocfg" tells us this is the case, and we
+ * should do no more setup.
+ */
+ return;
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+#endif
+
+static struct console early_serial_console = {
+	.name =		"earlyser",
+	.index =	-1,	/* early_console_register() refuses any other value */
+	.flags =	CON_PRINTBUFFER,
+	.write =	early_serial_write,
+};
+
+static void early_console_register(struct console *con, int keep_early)
+{
+	/* An index other than -1 is treated as "this console is taken" */
+	if (con->index != -1) {
+		printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
+		       con->name);
+		return;
+	}
+
+	early_console = con;
+	/* CON_BOOT is set on the console unless it was asked to be kept */
+	con->flags = keep_early ? con->flags & ~CON_BOOT : con->flags | CON_BOOT;
+	register_console(con);
+}
+
+static int __init setup_early_printk(char *buf)
+{
+	int keep;
+
+	/* Nothing to parse, or an early console has been chosen already */
+	if (!buf || early_console)
+		return 0;
+
+	/* "keep" anywhere in the option string is passed on to registration */
+	keep = (strstr(buf, "keep") != NULL);
+
+	/* Try every keyword at every offset of the string, one byte at a time */
+	while (*buf != '\0') {
+		if (!strncmp(buf, "serial", 6)) {
+			buf += 6;
+			early_serial_init(buf);
+			early_console_register(&early_serial_console, keep);
+			if (!strncmp(buf, ",ttyS", 5))
+				buf += 5;
+		}
+		if (!strncmp(buf, "ttyS", 4)) {
+			early_serial_init(buf + 4);
+			early_console_register(&early_serial_console, keep);
+		}
+#ifdef CONFIG_PCI
+		if (!strncmp(buf, "pciserial", 9)) {
+			early_pci_serial_init(buf + 9);
+			early_console_register(&early_serial_console, keep);
+			buf += 9; /* Keep this from matching "serial" above */
+		}
+#endif
+		if (!strncmp(buf, "vga", 3) &&
+		    boot_params.screen_info.orig_video_isVGA == 1) {
+			max_xpos = boot_params.screen_info.orig_video_cols;
+			max_ypos = boot_params.screen_info.orig_video_lines;
+			current_ypos = boot_params.screen_info.orig_y;
+			early_console_register(&early_vga_console, keep);
+		}
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+		if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
+			early_console_register(&early_dbgp_console, keep);
+#endif
+#ifdef CONFIG_HVC_XEN
+		if (!strncmp(buf, "xen", 3))
+			early_console_register(&xenboot_console, keep);
+#endif
+#ifdef CONFIG_EARLY_PRINTK_EFI
+		if (!strncmp(buf, "efi", 3))
+			early_console_register(&early_efi_console, keep);
+#endif
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+		if (!strncmp(buf, "xdbc", 4))
+			early_xdbc_parse_parameter(buf + 4);
+#endif
+		if (*buf != '\0')	/* a keyword skip above may already sit on the NUL */
+			buf++;
+	}
+	return 0;
+}
+
+early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
new file mode 100644
index 0000000..38e7d59
--- /dev/null
+++ b/arch/x86/kernel/ebda.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+/*
+ * This function reserves all conventional PC system BIOS related
+ * firmware memory areas (some of which are data, some of which
+ * are code), that must not be used by the kernel as available
+ * RAM.
+ *
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too.
+ *
+ * This means that as a first approximation on most systems we can
+ * guess the reserved BIOS area by looking at the low BIOS RAM size
+ * value and assume that everything above that value (up to 1MB) is
+ * reserved.
+ *
+ * But life in firmware country is not that simple:
+ *
+ * - This code also contains a quirk for Dell systems that neglect
+ * to reserve the EBDA area in the 'RAM size' value ...
+ *
+ * - The same quirk also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). (Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.)
+ *
+ * - Plus paravirt systems don't have a reliable value in the
+ * 'BIOS RAM size' pointer we can rely on, so we must quirk
+ * them too.
+ *
+ * Due to those various problems this function is deliberately
+ * very conservative and tries to err on the side of reserving
+ * too much, to not risk reserving too little.
+ *
+ * Losing a small amount of memory in the bottom megabyte is
+ * rarely a problem, as long as we have enough memory to install
+ * the SMP bootup trampoline which *must* be in this area.
+ *
+ * Using memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem to the kernel,
+ * obviously.
+ */
+
+#define BIOS_RAM_SIZE_KB_PTR 0x413
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+void __init reserve_bios_regions(void)
+{
+	unsigned int start, ebda;
+
+	/*
+	 * NOTE: In a paravirtual environment the BIOS reserved
+	 * area is absent. If the platform's reserve_bios_regions
+	 * flag is clear we assume it can handle memory setup
+	 * correctly, without our help.
+	 */
+	if (!x86_platform.legacy.reserve_bios_regions)
+		return;
+
+	/*
+	 * First guess at where the BIOS firmware area starts:
+	 * the BIOS RAM size value, which is encoded in
+	 * kilobytes and therefore scaled to bytes here.
+	 */
+	start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR);
+	start <<= 10;
+
+	/*
+	 * Only a value within [BIOS_START_MIN, BIOS_START_MAX] is
+	 * believed. Anything lower is assumed to be bogus, anything
+	 * higher is not trusted; both fall back to BIOS_START_MAX.
+	 */
+	if (!(start >= BIOS_START_MIN && start <= BIOS_START_MAX))
+		start = BIOS_START_MAX;
+
+	/* Fetch the start address of the EBDA page: */
+	ebda = get_bios_ebda();
+
+	/*
+	 * A sane EBDA start that lies below the BIOS region pulls
+	 * the reservation down, so that everything from the EBDA
+	 * up to the BIOS region is covered as well.
+	 */
+	if (ebda < start && ebda >= BIOS_START_MIN)
+		start = ebda;
+
+	/* Everything from start up to the 1MB mark gets reserved: */
+	memblock_reserve(start, 0x100000 - start);
+}
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 0000000..e8c8c5d
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,25 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+#include <xen/xen.h>
+
+static __init int eisa_bus_probe(void)
+{
+	const u32 magic = 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24); /* "EISA" */
+	void __iomem *sig;
+
+	if (xen_pv_domain() && !xen_initial_domain())
+		return 0;
+	/* Map the four bytes at 0x0FFFD9 and compare them with the signature */
+	sig = ioremap(0x0FFFD9, 4);
+	if (sig && readl(sig) == magic)
+		EISA_bus = 1;
+	iounmap(sig);
+	return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 0000000..aebd0d5
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET faults we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more virtual address space for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); /* same page offset, inside the backing stack page */
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Backing stack page pointers - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random; /* set by init_espfix_random() */
+
+/*
+ * Compute the bottom address of the espfix stack that belongs to @cpu.
+ * The arithmetic also copes with an ESPFIX_STACK_SIZE that is not a power
+ * of two, in which case each page simply ends in some unused padding.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+	/* Randomized page number and slot-within-page for this CPU */
+	unsigned long page_idx = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+	unsigned long slot_idx = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+	unsigned long linear = (page_idx << PAGE_SHIFT) + (slot_idx * ESPFIX_STACK_SIZE);
+	/* Keep the low 16 bits in place and move everything above them up by 16 */
+	unsigned long low16 = linear & 0xffffUL;
+	unsigned long upper = (linear & ~0xffffUL) << 16;
+
+	return ESPFIX_BASE_ADDR + (low16 | upper);
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE) /* PTE slots per 64K of address space */
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) /* 64K-spaced copies per PTE page */
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) /* clones at all levels multiply to 65536 */
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) /* kernel table bits, RW cleared, NX set */
+
+static void init_espfix_random(void)
+{
+	unsigned long seed;
+
+	/*
+	 * The entropy pools are not initialized yet at this point. Use
+	 * what arch_get_random_long() delivers, and if it delivers nothing
+	 * fall back to the timestamp counter scrambled with an arbitrary
+	 * large prime - hopefully better than nothing.
+	 */
+	if (!arch_get_random_long(&seed))
+		seed = rdtsc() * 0xc345c6b72fd16123UL;
+
+	/* The remainder selects the slot offset, the quotient the page mask */
+	slot_random = seed % ESPFIX_STACKS_PER_PAGE;
+	page_random = (seed / ESPFIX_STACKS_PER_PAGE)
+		& (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+	/* Top-level kernel page-table slot that covers ESPFIX_BASE_ADDR */
+	pgd_t *top = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+	/* NOTE(review): the p4d_alloc() result is used without a NULL check */
+	p4d_t *p4d = p4d_alloc(&init_mm, top, ESPFIX_BASE_ADDR);
+
+	/* Hook the static espfix PUD page in below it */
+	p4d_populate(&init_mm, p4d, espfix_pud_page);
+
+	/* Pick page_random/slot_random before any address is computed */
+	init_espfix_random();
+
+	/* From here on CPU 0 is set up exactly like every other CPU */
+	init_espfix_ap(0);
+}
+
+void init_espfix_ap(int cpu)
+{
+ unsigned int page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n, node;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* Per-CPU setup happens once; a non-zero espfix_stack marks it as done */
+ if (likely(per_cpu(espfix_stack, cpu)))
+ return; /* Already initialized */
+
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU that shares this stack page already set it up? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? Re-check now that we hold the mutex. */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ node = cpu_to_node(cpu);
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); /* NOTE(review): shadows 'page' above; result unchecked */
+
+ pmd_p = (pmd_t *)page_address(page);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++) /* same PMD page behind every clone */
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); /* NOTE(review): shadows 'page' above; result unchecked */
+
+ pte_p = (pte_t *)page_address(page);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++) /* same PTE page behind every clone */
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); /* NOTE(review): result unchecked */
+ /*
+ * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since
+ * this is mapped to userspace.
+ */
+ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++) /* one read-only alias every PTE_STRIDE entries */
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ WRITE_ONCE(espfix_pages[page], stack_page);
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK); /* same page offset, inside stack_page */
+}
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile
new file mode 100644
index 0000000..68279ef
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,5 @@
+#
+# Build rules for the FPU support code:
+#
+
+obj-y += init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
new file mode 100644
index 0000000..2954fab
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 FPU bug checks:
+ */
+#include <asm/fpu/internal.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+void __init fpu__init_check_bugs(void)
+{
+ s32 fdiv_bug;
+
+ /* kernel_fpu_begin/end() relies on patched alternative instructions. */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ kernel_fpu_begin();
+
+ /*
+ * trap_init() enabled FXSR and company _before_ testing for FP
+ * problems here.
+ *
+ * Test for the FDIV bug: http://en.wikipedia.org/wiki/Fdiv_bug
+ */
+ __asm__("fninit\n\t"
+ "fldl %1\n\t" /* load x */
+ "fdivl %2\n\t" /* x / y */
+ "fmull %2\n\t" /* (x / y) * y */
+ "fldl %1\n\t" /* load x again */
+ "fsubp %%st,%%st(1)\n\t" /* difference between the two, zero if exact */
+ "fistpl %0\n\t" /* store it as a 32-bit integer into fdiv_bug */
+ "fwait\n\t"
+ "fninit"
+ : "=m" (*&fdiv_bug)
+ : "m" (*&x), "m" (*&y));
+
+ kernel_fpu_end();
+
+ if (fdiv_bug) { /* a non-zero difference flags the CPU as buggy */
+ set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+ pr_warn("Hmm, FPU with FDIV bug\n");
+ }
+}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
new file mode 100644
index 0000000..2ea85b3
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/types.h>
+#include <asm/traps.h>
+#include <asm/irq_regs.h>
+
+#include <linux/hardirq.h>
+#include <linux/pkeys.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/fpu.h>
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+union fpregs_state init_fpstate __read_mostly;
+
+/*
+ * Track whether the kernel is using the FPU state
+ * currently.
+ *
+ * This flag is used:
+ *
+ * - by IRQ context code to potentially use the FPU
+ * if it's unused.
+ *
+ * - to debug kernel_fpu_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+static void kernel_fpu_disable(void)
+{
+ WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); /* flag already set: warn */
+ this_cpu_write(in_kernel_fpu, true); /* mark the kernel as using this CPU's FPU */
+}
+
+static void kernel_fpu_enable(void)
+{
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); /* flag was not set: warn */
+ this_cpu_write(in_kernel_fpu, false); /* the kernel is done with this CPU's FPU */
+}
+
+static bool kernel_fpu_disabled(void)
+{
+ return this_cpu_read(in_kernel_fpu); /* this CPU's flag, set by kernel_fpu_disable() */
+}
+
+static bool interrupted_kernel_fpu_idle(void)
+{
+ return !kernel_fpu_disabled(); /* "idle" means in_kernel_fpu is clear on this CPU */
+}
+
+/*
+ * Did the interrupt we are running in arrive while the CPU was in
+ * user mode (or vm86 mode)?
+ *
+ * If it did, kernel_fpu_begin/end() may be used from this interrupt
+ * context - the FPU state simply gets saved as required. Without
+ * saved interrupt registers the answer is "no".
+ */
+static bool interrupted_user_mode(void)
+{
+	struct pt_regs *irq_regs = get_irq_regs();
+	return irq_regs != NULL && user_mode(irq_regs);
+}
+
+/*
+ * May the kernel_fpu_begin()/kernel_fpu_end() sequence be used right now?
+ *
+ * Yes outside interrupt context. Inside it, only if the interrupt hit
+ * user mode or no kernel FPU section was active on this CPU.
+ */
+bool irq_fpu_usable(void)
+{
+	if (!in_interrupt())
+		return true;
+
+	return interrupted_user_mode() || interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void __kernel_fpu_begin(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	WARN_ON_FPU(!irq_fpu_usable());
+	/* Mark the FPU as in kernel use before touching the registers */
+	kernel_fpu_disable();
+
+	if (fpu->initialized) {
+		/*
+		 * Save the current task's registers into its fpstate.
+		 * Ignore return value -- we don't care if reg state
+		 * is clobbered.
+		 */
+		copy_fpregs_to_fpstate(fpu);
+	} else {
+		__cpu_invalidate_fpregs_state();
+	}
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+void __kernel_fpu_end(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	/* Reload the current task's fpstate, if it has one, then release the FPU */
+	if (fpu->initialized)
+		copy_kernel_to_fpregs(&fpu->state);
+	kernel_fpu_enable();
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
+
+void kernel_fpu_begin(void)
+{
+ preempt_disable(); /* re-enabled by kernel_fpu_end() */
+ __kernel_fpu_begin();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+ __kernel_fpu_end();
+ preempt_enable(); /* pairs with the preempt_disable() in kernel_fpu_begin() */
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+/*
+ * Save the FPU state (mark it for reload if necessary):
+ *
+ * This only ever gets called for the current task.
+ */
+void fpu__save(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu);
+
+	preempt_disable();
+	trace_x86_fpu_before_save(fpu);
+	/*
+	 * A zero return from copy_fpregs_to_fpstate() is answered by
+	 * loading the registers back from the image that was just saved.
+	 */
+	if (fpu->initialized && !copy_fpregs_to_fpstate(fpu))
+		copy_kernel_to_fpregs(&fpu->state);
+	trace_x86_fpu_after_save(fpu);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__save);
+
+/*
+ * Legacy x87 fpstate init: fills the cwd, swd, twd and fos words of an fsave image:
+ */
+static inline void fpstate_init_fstate(struct fregs_state *fp)
+{
+ fp->cwd = 0xffff037fu;
+ fp->swd = 0xffff0000u;
+ fp->twd = 0xffffffffu;
+ fp->fos = 0xffff0000u;
+}
+
+void fpstate_init(union fpregs_state *state)
+{
+	/* Without a hardware FPU only the soft image is set up */
+	if (!static_cpu_has(X86_FEATURE_FPU)) {
+		fpstate_init_soft(&state->soft);
+		return;
+	}
+	/* Start from all-zeroes, then fill in the format-specific fields */
+	memset(state, 0, fpu_kernel_xstate_size);
+	if (static_cpu_has(X86_FEATURE_XSAVES))
+		fpstate_init_xstate(&state->xsave);
+	if (!static_cpu_has(X86_FEATURE_FXSR))
+		fpstate_init_fstate(&state->fsave);
+	else
+		fpstate_init_fxstate(&state->fxsave);
+}
+EXPORT_SYMBOL_GPL(fpstate_init);
+
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+	dst_fpu->last_cpu = -1;
+
+	/* Nothing to copy if the source has no FPU state or there is no FPU */
+	if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU))
+		return 0;
+	WARN_ON_FPU(src_fpu != &current->thread.fpu);
+
+	/*
+	 * Don't let 'init optimized' areas of the XSAVE area
+	 * leak into the child task:
+	 */
+	memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+
+	/*
+	 * Save current FPU registers directly into the child
+	 * FPU context, without any memory-to-memory copying.
+	 *
+	 * ( The function 'fails' in the FNSAVE case, which destroys
+	 * register contents so we have to copy them back. )
+	 */
+	if (!copy_fpregs_to_fpstate(dst_fpu)) {
+		memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
+		copy_kernel_to_fpregs(&src_fpu->state);
+	}
+
+	trace_x86_fpu_copy_src(src_fpu);
+	trace_x86_fpu_copy_dst(dst_fpu);
+
+	return 0;
+}
+
+/*
+ * Activate the current task's in-memory FPU context,
+ * if it has not been used before:
+ */
+void fpu__initialize(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu);
+
+	if (!fpu->initialized) {
+		fpstate_init(&fpu->state);
+		trace_x86_fpu_init_state(fpu);
+
+		trace_x86_fpu_activate_state(fpu);
+		/* Safe to do for the current task: */
+		fpu->initialized = 1;
+	}
+}
+EXPORT_SYMBOL_GPL(fpu__initialize);
+
+/*
+ * This function must be called before we read a task's fpstate.
+ *
+ * There's two cases where this gets called:
+ *
+ * - for the current task (when coredumping), in which case we have
+ * to save the latest FPU registers into the fpstate,
+ *
+ * - or it's called for stopped tasks (ptrace), in which case the
+ * registers were already saved by the context-switch code when
+ * the task scheduled out - we only have to initialize the registers
+ * if they've never been initialized.
+ *
+ * If the task has used the FPU before then save it.
+ */
+void fpu__prepare_read(struct fpu *fpu)
+{
+	if (fpu == &current->thread.fpu) {
+		fpu__save(fpu);
+	} else if (!fpu->initialized) {
+		/*
+		 * Never-used context: give the reader an initialized image.
+		 */
+		fpstate_init(&fpu->state);
+		trace_x86_fpu_init_state(fpu);
+		trace_x86_fpu_activate_state(fpu);
+		/* Safe to do for current and for stopped child tasks: */
+		fpu->initialized = 1;
+	}
+}
+
+/*
+ * This function must be called before we write a task's fpstate.
+ *
+ * If the task has used the FPU before then invalidate any cached FPU registers.
+ * If the task has not used the FPU before then initialize its fpstate.
+ *
+ * After this function call, after registers in the fpstate are
+ * modified and the child task has woken up, the child task will
+ * restore the modified FPU state from the modified context. If we
+ * didn't clear its cached status here then the cached in-registers
+ * state pending on its former CPU could be restored, corrupting
+ * the modifications.
+ */
+void fpu__prepare_write(struct fpu *fpu)
+{
+	/*
+	 * Only stopped child tasks can be used to modify the FPU
+	 * state in the fpstate buffer:
+	 */
+	WARN_ON_FPU(fpu == &current->thread.fpu);
+
+	if (fpu->initialized) {
+		/* Invalidate any cached state: */
+		__fpu_invalidate_fpregs_state(fpu);
+	} else {
+		fpstate_init(&fpu->state);
+		trace_x86_fpu_init_state(fpu);
+
+		trace_x86_fpu_activate_state(fpu);
+		/* Safe to do for stopped child tasks: */
+		fpu->initialized = 1;
+	}
+}
+
+/*
+ * 'fpu__restore()' is called to copy FPU registers from
+ * the FPU fpstate to the live hw registers and to activate
+ * access to the hardware registers, so that FPU instructions
+ * can be used afterwards.
+ *
+ * Must be called with kernel preemption disabled (for example
+ * with local interrupts disabled, as it is in the case of
+ * do_device_not_available()).
+ */
+void fpu__restore(struct fpu *fpu)
+{
+ fpu__initialize(fpu); /* sets up the fpstate first if it was never used */
+
+ /* Avoid __kernel_fpu_begin() right after fpregs_activate() */
+ kernel_fpu_disable();
+ trace_x86_fpu_before_restore(fpu);
+ fpregs_activate(fpu);
+ copy_kernel_to_fpregs(&fpu->state); /* fpstate -> hardware registers */
+ trace_x86_fpu_after_restore(fpu);
+ kernel_fpu_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__restore);
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct fpu *fpu)
+{
+	preempt_disable();
+
+	if (fpu == &current->thread.fpu) {
+		if (fpu->initialized) {
+			/* Ignore delayed exceptions from user space */
+			asm volatile("1: fwait\n"
+				     "2:\n"
+				     _ASM_EXTABLE(1b, 2b));
+			fpregs_deactivate(fpu);
+		}
+	}
+
+	fpu->initialized = 0;
+
+	trace_x86_fpu_dropped(fpu);
+
+	preempt_enable();
+}
+
+/*
+ * Reset the live FPU registers by loading them from init_fpstate:
+ */
+static inline void copy_init_fpstate_to_fpregs(void)
+{
+	if (use_xsave()) {
+		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+	} else if (!static_cpu_has(X86_FEATURE_FXSR)) {
+		copy_kernel_to_fregs(&init_fpstate.fsave);
+	} else {
+		copy_kernel_to_fxregs(&init_fpstate.fxsave);
+	}
+
+	if (boot_cpu_has(X86_FEATURE_OSPKE))
+		copy_init_pkru_to_fpregs();
+}
+
+/*
+ * Clear the FPU state back to init state.
+ *
+ * Called by sys_execve(), by the signal handler code and by various
+ * error paths.
+ */
+void fpu__clear(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+
+	fpu__drop(fpu);
+
+	/*
+	 * Make sure fpstate is cleared and initialized.
+	 */
+	if (static_cpu_has(X86_FEATURE_FPU)) {
+		preempt_disable();
+		fpu__initialize(fpu);
+		user_fpu_begin();
+		copy_init_fpstate_to_fpregs();
+		preempt_enable();
+	}
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+	int pending;
+
+	if (trap_nr == X86_TRAP_MF) {
+		unsigned short ctrl, status;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status. 0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit. We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception.
+		 */
+		if (!boot_cpu_has(X86_FEATURE_FXSR)) {
+			ctrl = (unsigned short)fpu->state.fsave.cwd;
+			status = (unsigned short)fpu->state.fsave.swd;
+		} else {
+			ctrl = fpu->state.fxsave.cwd;
+			status = fpu->state.fxsave.swd;
+		}
+
+		pending = status & ~ctrl;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register. Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = MXCSR_DEFAULT;
+
+		if (boot_cpu_has(X86_FEATURE_XMM))
+			mxcsr = fpu->state.fxsave.mxcsr;
+
+		pending = ~(mxcsr >> 7) & mxcsr;
+	}
+
+	/*
+	 * Bits are tested in a fixed order and the first hit wins.
+	 * Invalid operation comes first. For x87: swd & 0x240 == 0x040
+	 * is a stack underflow, swd & 0x240 == 0x240 a stack overflow;
+	 * the user must clear the SF bit (0x40) if it is set.
+	 */
+	if (pending & 0x001)
+		return FPE_FLTINV;
+	if (pending & 0x004)	/* Divide by Zero */
+		return FPE_FLTDIV;
+	if (pending & 0x008)	/* Overflow */
+		return FPE_FLTOVF;
+	if (pending & 0x012)	/* Denormal, Underflow */
+		return FPE_FLTUND;
+	if (pending & 0x020)	/* Precision */
+		return FPE_FLTRES;
+
+	/*
+	 * If we're using IRQ 13, or supposedly even some trap
+	 * X86_TRAP_MF implementations, it's possible
+	 * we get a spurious trap, which is not an error.
+	 */
+	return 0;
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
new file mode 100644
index 0000000..6abd835
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,303 @@
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/internal.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
+
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/init.h>
+
+/*
+ * Set up CR4 and CR0 for FPU use on this CPU, then reset pending x87 state:
+ */
+static void fpu__init_cpu_generic(void)
+{
+ unsigned long cr0;
+ unsigned long cr4_mask = 0; /* CR4 bits to set, chosen from CPU features */
+
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask) /* skip the CR4 update when neither feature is present */
+ cr4_set_bits(cr4_mask);
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ cr0 |= X86_CR0_EM; /* no hardware FPU: set EM again */
+ write_cr0(cr0);
+
+ /* Flush out any pending x87 state: */
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ fpstate_init_soft(&current->thread.fpu.state.soft);
+ else
+#endif
+ asm volatile ("fninit"); /* runs unconditionally without CONFIG_MATH_EMULATION */
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{ /* Also invoked by fpu__init_system() in this file during boot. */
+ fpu__init_cpu_generic(); /* CR4/CR0 setup and x87 reset */
+ fpu__init_cpu_xstate(); /* not defined in this file; presumably the XSAVE-side per-CPU setup */
+}
+
+static bool fpu__probe_without_cpuid(void)
+{ /* Probe for an x87 FPU by running fninit and reading back the status/control words. */
+ unsigned long cr0;
+ u16 fsw, fcw;
+
+ fsw = fcw = 0xffff; /* sentinels; presumably left intact if no FPU stores to them - confirm */
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS | X86_CR0_EM); /* clear TS and EM before issuing x87 instructions */
+ write_cr0(cr0);
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+ pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw);
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f; /* true only for FSW == 0 and FCW bits 0x103f == 0x003f */
+}
+
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{ /* Settle X86_FEATURE_FPU early in boot; 'c' is not referenced in this body. */
+ if (!boot_cpu_has(X86_FEATURE_CPUID) &&
+ !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+ if (fpu__probe_without_cpuid()) /* no CPUID and FPU not already cleared: probe the hardware */
+ setup_force_cpu_cap(X86_FEATURE_FPU);
+ else
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+ }
+
+#ifndef CONFIG_MATH_EMULATION
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) {
+ pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+ for (;;) /* never returns: halt in an endless loop */
+ asm volatile("hlt");
+ }
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; /* ANDed down by fpu__init_system_mxcsr() */
+EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
+
+static void __init fpu__init_system_mxcsr(void)
+{ /* Narrow mxcsr_feature_mask using the mxcsr_mask field of a freshly stored FXSAVE image. */
+ unsigned int mask = 0; /* stays 0 without FXSR, which makes the global mask 0 as well */
+
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ /* Static because GCC does not get 16-byte stack alignment right: */
+ static struct fxregs_state fxregs __initdata;
+
+ asm volatile("fxsave %0" : "+m" (fxregs));
+
+ mask = fxregs.mxcsr_mask;
+
+ /*
+ * If zero then use the default features mask,
+ * which has all features set, except the
+ * denormals-are-zero feature bit:
+ */
+ if (mask == 0)
+ mask = 0x0000ffbf;
+ }
+ mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once-per-boot generic FPU setup: initialize init_fpstate and the MXCSR mask:
+ */
+static void __init fpu__init_system_generic(void)
+{
+ /*
+ * Set up the legacy init FPU context. (xstate init might overwrite this
+ * with a more modern format, if the CPU supports it.)
+ */
+ fpstate_init(&init_fpstate);
+
+ fpu__init_system_mxcsr(); /* derive mxcsr_feature_mask */
+}
+
+/*
+ * Size of the FPU context state. All tasks in the system use the
+ * same context size, regardless of what portion they use.
+ * This is inherent to the XSAVE architecture which puts all state
+ * components into a single, contiguous memory block:
+ */
+unsigned int fpu_kernel_xstate_size; /* first assigned in fpu__init_system_xstate_size_legacy() */
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
+
+/* Alignment of TYPE: the offset of a TYPE member placed right after a single char. */
+#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
+
+/*
+ * Build-time check (BUILD_BUG_ON) that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * The end offset of MEMBER is rounded up to the alignment of TYPE before it
+ * is compared with sizeof(TYPE), because that's how C aligns structs.
+ */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
+ TYPE_ALIGN(TYPE)))
+
+/*
+ * Set arch_task_struct_size; the trailing 'struct fpu' state is sized at boot:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct); /* start from the compile-time size */
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); /* sizeof only: nothing is dereferenced */
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += fpu_kernel_xstate_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * it be at the end of 'thread_struct' and that
+ * 'thread_struct' be at the end of 'task_struct'. If
+ * you hit a compile error here, check the structure to
+ * see if something got added to the end.
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+ CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+ arch_task_struct_size = task_size; /* publish the computed size */
+}
+
+/*
+ * Set up the user and kernel xstate sizes based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+ static int on_boot_cpu __initdata = 1; /* one-shot guard: warn on a second call */
+
+ WARN_ON_FPU(!on_boot_cpu);
+ on_boot_cpu = 0;
+
+ /*
+ * Note that xstate sizes might be overwritten later during
+ * fpu__init_system_xstate().
+ */
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ /*
+ * Disable xsave as we do not support it if i387
+ * emulation is enabled.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ fpu_kernel_xstate_size = sizeof(struct swregs_state); /* no FPU: size of the soft state */
+ } else {
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ fpu_kernel_xstate_size =
+ sizeof(struct fxregs_state); /* FXSR available */
+ else
+ fpu_kernel_xstate_size =
+ sizeof(struct fregs_state); /* FPU without FXSR */
+ }
+
+ fpu_user_xstate_size = fpu_kernel_xstate_size; /* legacy case: user and kernel sizes are equal */
+}
+
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
+{
+ return XCNTXT_MASK; /* this body only returns XCNTXT_MASK; it reads no CPU or command-line state itself */
+}
+
+/* Legacy code to initialize eager fpu mode. */
+static void __init fpu__init_system_ctx_switch(void)
+{ /* Only sanity checks are performed here; nothing is configured. */
+ static bool on_boot_cpu __initdata = 1;
+
+ WARN_ON_FPU(!on_boot_cpu); /* warn if this runs more than once */
+ on_boot_cpu = 0;
+
+ WARN_ON_FPU(current->thread.fpu.initialized); /* warn if the current task's FPU is already marked initialized */
+}
+
+/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+ char arg[32]; /* receives the value of the "clearcpuid" option */
+ char *argptr = arg;
+ int bit;
+
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ fpu__xstate_clear_all_cpu_caps();
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+ sizeof(arg)) &&
+ get_option(&argptr, &bit) &&
+ bit >= 0 &&
+ bit < NCAPINTS * 32) /* only a single, in-range capability bit number is accepted */
+ setup_clear_cpu_cap(bit);
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(struct cpuinfo_x86 *c)
+{
+ fpu__init_parse_early_param(); /* apply no387/nofxsr/noxsave*/clearcpuid first */
+ fpu__init_system_early_generic(c); /* may probe for, force or clear X86_FEATURE_FPU */
+
+ /*
+ * The FPU has to be operational for some of the
+ * later FPU init activities:
+ */
+ fpu__init_cpu();
+
+ fpu__init_system_generic(); /* init_fpstate and mxcsr_feature_mask */
+ fpu__init_system_xstate_size_legacy(); /* legacy sizes, possibly overwritten by the next call */
+ fpu__init_system_xstate();
+ fpu__init_task_struct_size(); /* uses the final fpu_kernel_xstate_size */
+
+ fpu__init_system_ctx_switch(); /* sanity checks only */
+}
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
new file mode 100644
index 0000000..bc02f51
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU register's regset abstraction, for ptrace, core dumps, etc.
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+#include <linux/sched/task_stack.h>
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilities supported by the xsave.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ struct fpu *target_fpu = &target->thread.fpu;
+
+ return target_fpu->initialized ? regset->n : 0; /* regset->n if target's FPU state is initialized, else 0 */
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{ /* Like regset_fpregs_active(), but additionally requires X86_FEATURE_FXSR. */
+ struct fpu *target_fpu = &target->thread.fpu;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized)
+ return regset->n;
+ else
+ return 0; /* no FXSR, or FPU state not initialized */
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{ /* Copy target's fxsave image out to kbuf/ubuf; returns -ENODEV without FXSR. */
+ struct fpu *fpu = &target->thread.fpu;
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ fpu__prepare_read(fpu);
+ fpstate_sanitize_xstate(fpu);
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fxsave, 0, -1); /* result of the copy-out is the return value */
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{ /* Copy an fxsave image from kbuf/ubuf into target; returns -ENODEV without FXSR. */
+ struct fpu *fpu = &target->thread.fpu;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ fpu__prepare_write(fpu);
+ fpstate_sanitize_xstate(fpu);
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fxsave, 0, -1);
+
+ /*
+ * mxcsr reserved bits must be masked to zero for security reasons.
+ */
+ fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; /* applied whether or not the copy-in succeeded */
+
+ /*
+ * update the header bits in the xsave header, indicating the
+ * presence of FP and SSE state.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+
+ return ret; /* status of user_regset_copyin() */
+}
+
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{ /* Copy target's xsave area out to kbuf/ubuf; returns -ENODEV without XSAVE. */
+ struct fpu *fpu = &target->thread.fpu;
+ struct xregs_state *xsave;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ xsave = &fpu->state.xsave;
+
+ fpu__prepare_read(fpu);
+
+ if (using_compacted_format()) {
+ if (kbuf) /* compacted format: copy via the copy_xstate_to_*() helpers; kbuf wins if non-NULL */
+ ret = copy_xstate_to_kernel(kbuf, xsave, pos, count);
+ else
+ ret = copy_xstate_to_user(ubuf, xsave, pos, count);
+ } else {
+ fpstate_sanitize_xstate(fpu);
+ /*
+ * Copy the 48 bytes defined by the software into the xsave
+ * area in the thread struct, so that we can copy the whole
+ * area to user using one user_regset_copyout().
+ */
+ memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+
+ /*
+ * Copy the xstate memory layout.
+ */
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ }
+ return ret;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{ /* Replace target's xsave area from kbuf/ubuf; -ENODEV without XSAVE, -EFAULT on a partial buffer. */
+ struct fpu *fpu = &target->thread.fpu;
+ struct xregs_state *xsave;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ /*
+ * A whole standard-format XSAVE buffer is needed:
+ */
+ if ((pos != 0) || (count < fpu_user_xstate_size))
+ return -EFAULT;
+
+ xsave = &fpu->state.xsave;
+
+ fpu__prepare_write(fpu);
+
+ if (using_compacted_format()) {
+ if (kbuf) /* compacted format: copy via the copy_*_to_xstate() helpers; kbuf wins if non-NULL */
+ ret = copy_kernel_to_xstate(xsave, kbuf);
+ else
+ ret = copy_user_to_xstate(xsave, ubuf);
+ } else {
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ if (!ret) /* validate the header only after a successful copy-in */
+ ret = validate_xstate_header(&xsave->header);
+ }
+
+ /*
+ * mxcsr reserved bits must be masked to zero for security reasons.
+ */
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+ /*
+ * In case of failure, mark all states as init:
+ */
+ if (ret)
+ fpstate_init(&fpu->state);
+
+ return ret;
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{ /* Compress a tag word of eight 2-bit tags into eight 1-bit valid/empty flags (low byte). */
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd; /* after inversion only the pair 11 becomes 00 */
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
+ return tmp; /* truncated to unsigned short; only the low 8 bits can be set */
+}
+
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) /* address of slot n, 16 bytes apart, in f->st_space */
+#define FP_EXP_TAG_VALID 0 /* 2-bit tag values produced by twd_fxsr_to_i387() */
+#define FP_EXP_TAG_ZERO 1
+#define FP_EXP_TAG_SPECIAL 2
+#define FP_EXP_TAG_EMPTY 3
+
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{ /* Expand the 1-bit-per-register fxsave tag word into 2-bit tags by inspecting each register. */
+ struct _fpxreg *st;
+ u32 tos = (fxsave->swd >> 11) & 7; /* 3-bit field at bits 11-13 of the status word */
+ u32 twd = (unsigned long) fxsave->twd;
+ u32 tag;
+ u32 ret = 0xffff0000u; /* upper 16 bits are always set in the result */
+ int i;
+
+ for (i = 0; i < 8; i++, twd >>= 1) {
+ if (twd & 0x1) { /* register i is in use: classify its contents */
+ st = FPREG_ADDR(fxsave, (i - tos) & 7); /* slot index is taken relative to tos */
+
+ switch (st->exponent & 0x7fff) {
+ case 0x7fff: /* all exponent bits set */
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ case 0x0000: /* zero exponent: ZERO only if the whole significand is zero too */
+ if (!st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3])
+ tag = FP_EXP_TAG_ZERO;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ default: /* any other exponent: VALID only if bit 0x8000 of significand[3] is set */
+ if (st->significand[3] & 0x8000)
+ tag = FP_EXP_TAG_VALID;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ }
+ } else {
+ tag = FP_EXP_TAG_EMPTY;
+ }
+ ret |= tag << (2 * i); /* two tag bits per register */
+ }
+ return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{ /* Fill *env (user_i387_ia32_struct layout) from tsk's fxsave state. */
+ struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+ struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ env->cwd = fxsave->cwd | 0xffff0000u; /* upper 16 bits of cwd/swd are forced to ones */
+ env->swd = fxsave->swd | 0xffff0000u;
+ env->twd = twd_fxsr_to_i387(fxsave); /* expand to the 2-bit-per-register tag word */
+
+#ifdef CONFIG_X86_64
+ env->fip = fxsave->rip;
+ env->foo = fxsave->rdp;
+ /*
+ * should be actually ds/cs at fpu exception time, but
+ * that information is not available in 64bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
+ if (tsk == current) {
+ savesegment(ds, env->fos); /* current task: take ds via savesegment() */
+ } else {
+ env->fos = tsk->thread.ds; /* other task: use the ds stored in its thread struct */
+ }
+ env->fos |= 0xffff0000;
+#else
+ env->fip = fxsave->fip;
+ env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); /* fop is packed into the upper half of fcs */
+ env->foo = fxsave->foo;
+ env->fos = fxsave->fos;
+#endif
+
+ for (i = 0; i < 8; ++i) /* copy sizeof(struct _fpreg) bytes from each fxsave register slot */
+ memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+void convert_to_fxsr(struct task_struct *tsk,
+ const struct user_i387_ia32_struct *env)
+
+{ /* Inverse of convert_from_fxsr(): write *env into tsk's fxsave state. */
+ struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+ struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ fxsave->cwd = env->cwd;
+ fxsave->swd = env->swd;
+ fxsave->twd = twd_i387_to_fxsr(env->twd); /* compress to the 1-bit-per-register tag word */
+ fxsave->fop = (u16) ((u32) env->fcs >> 16); /* fop is carried in the upper half of env->fcs */
+#ifdef CONFIG_X86_64
+ fxsave->rip = env->fip;
+ fxsave->rdp = env->foo;
+ /* cs and ds ignored */
+#else
+ fxsave->fip = env->fip;
+ fxsave->fcs = (env->fcs & 0xffff);
+ fxsave->foo = env->foo;
+ fxsave->fos = env->fos;
+#endif
+
+ for (i = 0; i < 8; ++i) /* copy sizeof(struct _fpreg) bytes into each fxsave register slot */
+ memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{ /* Copy target's FPU state out in user_i387_ia32_struct form, whatever the in-kernel format is. */
+ struct fpu *fpu = &target->thread.fpu;
+ struct user_i387_ia32_struct env;
+
+ fpu__prepare_read(fpu);
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) /* no FPU: defer to the soft-FPU getter */
+ return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR)) /* no FXSR: the fsave image is copied out as-is */
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fsave, 0,
+ -1);
+
+ fpstate_sanitize_xstate(fpu);
+
+ if (kbuf && pos == 0 && count == sizeof(env)) { /* whole-struct kernel read: convert straight into kbuf */
+ convert_from_fxsr(kbuf, target);
+ return 0;
+ }
+
+ convert_from_fxsr(&env, target); /* otherwise convert into a local copy and copy out the requested range */
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{ /* Write FPU state supplied in user_i387_ia32_struct form into target. */
+ struct fpu *fpu = &target->thread.fpu;
+ struct user_i387_ia32_struct env;
+ int ret;
+
+ fpu__prepare_write(fpu);
+ fpstate_sanitize_xstate(fpu);
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) /* no FPU: defer to the soft-FPU setter */
+ return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR)) /* no FXSR: copy directly into the fsave image */
+ return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fsave, 0,
+ -1);
+
+ if (pos > 0 || count < sizeof(env)) /* partial write: pre-fill env with the current state first */
+ convert_from_fxsr(&env, target);
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ if (!ret) /* convert back only when the copy-in succeeded */
+ convert_to_fxsr(target, &env);
+
+ /*
+ * update the header bit in the xsave header, indicating the
+ * presence of FP.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+ return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ * This is only used for a.out dumps now.
+ * It is declared generically using elf_fpregset_t (which is
+ * struct user_i387_struct) but is in fact only used for 32-bit
+ * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
+{ /* Returns nonzero if *ufpu was filled; 'regs' is not referenced in this body. */
+ struct task_struct *tsk = current;
+ struct fpu *fpu = &tsk->thread.fpu;
+ int fpvalid;
+
+ fpvalid = fpu->initialized;
+ if (fpvalid) /* only dump when the current task's FPU state is initialized */
+ fpvalid = !fpregs_get(tsk, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ ufpu, NULL); /* ufpu is passed as the kernel buffer; 0 from fpregs_get() means success */
+
+ return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
new file mode 100644
index 0000000..d99a8ee
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+
+#include <asm/sigframe.h>
+#include <asm/trace/fpu.h>
+
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; /* filled by fpu__init_prepare_fx_sw_frame(), copied out by save_xstate_epilog() */
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext. Returns 0 if present, -1 otherwise.
+ */
+static inline int check_for_xstate(struct fxregs_state __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *fx_sw)
+{
+ int min_xstate_size = sizeof(struct fxregs_state) +
+ sizeof(struct xstate_header); /* smallest frame that can hold an xstate header */
+ unsigned int magic2;
+
+ if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) /* fetch the SW-reserved bytes into *fx_sw */
+ return -1;
+
+ /* Check for the first magic field and other error scenarios. */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+ fx_sw->xstate_size < min_xstate_size ||
+ fx_sw->xstate_size > fpu_user_xstate_size ||
+ fx_sw->xstate_size > fx_sw->extended_size)
+ return -1;
+
+ /*
+ * Check for the presence of second magic word at the end of memory
+ * layout. This detects the case where the user just copied the legacy
+ * fpstate layout without copying the extended state information
+ * in the memory layout.
+ */
+ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+ || magic2 != FP_XSTATE_MAGIC2)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+{ /* Write the fsave-format header part of the frame at 'buf'; returns 0 on success, -1 on a fault. */
+ if (use_fxsr()) {
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct user_i387_ia32_struct env;
+ struct _fpstate_32 __user *fp = buf;
+
+ convert_from_fxsr(&env, tsk); /* build the header from the task's in-kernel fxsave state */
+
+ if (__copy_to_user(buf, &env, sizeof(env)) ||
+ __put_user(xsave->i387.swd, &fp->status) ||
+ __put_user(X86_FXSR_MAGIC, &fp->magic))
+ return -1;
+ } else {
+ struct fregs_state __user *fp = buf;
+ u32 swd;
+ if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) /* duplicate the frame's swd into its status field */
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{ /* Finish a user frame at 'buf': SW-reserved bytes, trailing magic and xfeatures fix-up. Nonzero on fault. */
+ struct xregs_state __user *x = buf;
+ struct _fpx_sw_bytes *sw_bytes;
+ u32 xfeatures; /* only 32 bits of header.xfeatures are read and written back below */
+ int err;
+
+ /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+ sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+ err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+
+ if (!use_xsave()) /* without xsave only the SW-reserved bytes are written */
+ return err;
+
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 *)(buf + fpu_user_xstate_size)); /* magic goes right after the xstate area */
+
+ /*
+ * Read the xfeatures which we copied (directly from the cpu or
+ * from the state in task struct) to the user buffers.
+ */
+ err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+ /*
+ * For legacy compatibility, we always set FP/SSE bits in the bit
+ * vector while saving the state to the user context. This will
+ * enable us capturing any changes(during sigreturn) to
+ * the FP/SSE bits by the legacy applications which don't touch
+ * xfeatures in the xsave header.
+ *
+ * xsave aware apps can change the xfeatures in the xsave
+ * header as well as change any contents in the memory layout.
+ * xrestore as part of sigreturn will capture all the changes.
+ */
+ xfeatures |= XFEATURE_MASK_FPSSE;
+
+ err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+ return err; /* OR of all the user-access results above */
+}
+
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+{ /* Save the registers to the user frame using the best available of xsave/fxsave/fsave. */
+ int err;
+
+ if (use_xsave())
+ err = copy_xregs_to_user(buf);
+ else if (use_fxsr())
+ err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+ else
+ err = copy_fregs_to_user((struct fregs_state __user *) buf);
+
+ if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) /* on failure try to zero the buffer */
+ err = -EFAULT; /* even the clearing faulted */
+ return err;
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save state is copied.
+ * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ * 'size' is the number of bytes at 'buf' that are checked with access_ok().
+ *
+ * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ * buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ * Returns 0 on success, -EACCES if access_ok() fails, -1 on other errors (soft-FPU path: 1 or -1).
+ */
+int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ struct xregs_state *xsave = &fpu->state.xsave;
+ struct task_struct *tsk = current;
+ int ia32_fxstate = (buf != buf_fx);
+
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION)); /* only meaningful when 32-bit frames can exist */
+
+ if (!access_ok(VERIFY_WRITE, buf, size))
+ return -EACCES;
+
+ if (!static_cpu_has(X86_FEATURE_FPU)) /* no FPU: dump the soft-FPU state instead */
+ return fpregs_soft_get(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct), NULL,
+ (struct _fpstate_32 __user *) buf) ? -1 : 1;
+
+ if (fpu->initialized || using_compacted_format()) {
+ /* Save the live register state to the user directly. */
+ if (copy_fpregs_to_sigframe(buf_fx))
+ return -1;
+ /* Update the thread's fxstate to save the fsave header. */
+ if (ia32_fxstate)
+ copy_fxregs_to_kernel(fpu);
+ } else {
+ /*
+ * It is a *bug* if kernel uses compacted-format for xsave
+ * area and we copy it out directly to a signal frame. It
+ * should have been handled above by saving the registers
+ * directly.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n");
+ return -1;
+ }
+
+ fpstate_sanitize_xstate(fpu);
+ if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
+ return -1;
+ }
+
+ /* Save the fsave header for the 32-bit frames. */
+ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+ return -1;
+
+ if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) /* SW-reserved bytes and trailing magic */
+ return -1;
+
+ return 0;
+}
+
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+ struct user_i387_ia32_struct *ia32_env,
+ u64 xfeatures, int fx_only)
+{ /* Clean up state just copied from a user frame into tsk's xsave area. */
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xstate_header *header = &xsave->header;
+
+ if (use_xsave()) {
+ /*
+ * Note: we don't need to zero the reserved bits in the
+ * xstate_header here because we either didn't copy them at all,
+ * or we checked earlier that they aren't set.
+ */
+
+ /*
+ * Init the state that is not present in the memory
+ * layout and not enabled by the OS.
+ */
+ if (fx_only) /* frame carried FP/SSE state only */
+ header->xfeatures = XFEATURE_MASK_FPSSE;
+ else
+ header->xfeatures &= xfeatures; /* keep only the features passed in by the caller */
+ }
+
+ if (use_fxsr()) {
+ /*
+ * mxcsr reserved bits must be masked to zero for security
+ * reasons.
+ */
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+ convert_to_fxsr(tsk, ia32_env); /* overwrite the x87 part from the ia32 environment */
+ }
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ */
+static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+{ /* Features that are not restored from 'buf' are loaded from init_fpstate instead. */
+ if (use_xsave()) {
+ if ((unsigned long)buf % 64 || fx_only) { /* buffer not 64-byte aligned, or FP/SSE-only frame */
+ u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+ copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); /* init everything except FP/SSE */
+ return copy_user_to_fxregs(buf);
+ } else {
+ u64 init_bv = xfeatures_mask & ~xbv; /* supported features absent from the frame */
+ if (unlikely(init_bv))
+ copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+ return copy_user_to_xregs(buf, xbv);
+ }
+ } else if (use_fxsr()) {
+ return copy_user_to_fxregs(buf);
+ } else
+ return copy_user_to_fregs(buf);
+}
+
+static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+{ /* Restore the current task's FPU state from a user frame; 0 on success, nonzero on failure. */
+ int ia32_fxstate = (buf != buf_fx);
+ struct task_struct *tsk = current;
+ struct fpu *fpu = &tsk->thread.fpu;
+ int state_size = fpu_kernel_xstate_size;
+ u64 xfeatures = 0;
+ int fx_only = 0;
+
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION)); /* only meaningful when 32-bit frames can exist */
+
+ if (!buf) { /* no frame supplied: fpu__clear() and report success */
+ fpu__clear(fpu);
+ return 0;
+ }
+
+ if (!access_ok(VERIFY_READ, buf, size))
+ return -EACCES;
+
+ fpu__initialize(fpu);
+
+ if (!static_cpu_has(X86_FEATURE_FPU)) /* no FPU: load the soft-FPU state instead */
+ return fpregs_soft_set(current, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ NULL, buf) != 0;
+
+ if (use_xsave()) {
+ struct _fpx_sw_bytes fx_sw_user;
+ if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+ /*
+ * Couldn't find the extended state information in the
+ * memory layout. Restore just the FP/SSE and init all
+ * the other extended state.
+ */
+ state_size = sizeof(struct fxregs_state);
+ fx_only = 1;
+ trace_x86_fpu_xstate_check_failed(fpu);
+ } else {
+ state_size = fx_sw_user.xstate_size; /* size and feature set come from the frame itself */
+ xfeatures = fx_sw_user.xfeatures;
+ }
+ }
+
+ if (ia32_fxstate) {
+ /*
+ * For 32-bit frames with fxstate, copy the user state to the
+ * thread's fpu state, reconstruct fxstate from the fsave
+ * header. Validate and sanitize the copied state.
+ */
+ struct user_i387_ia32_struct env;
+ int err = 0;
+
+ /*
+ * Drop the current fpu which clears fpu->initialized. This ensures
+ * that any context-switch during the copy of the new state,
+ * avoids the intermediate state from getting restored/saved.
+ * Thus avoiding the new restored state from getting corrupted.
+ * We will be ready to restore/save the state only after
+ * fpu->initialized is again set.
+ */
+ fpu__drop(fpu);
+
+ if (using_compacted_format()) {
+ err = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+ } else {
+ err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
+
+ if (!err && state_size > offsetof(struct xregs_state, header)) /* validate only if a header was copied */
+ err = validate_xstate_header(&fpu->state.xsave.header);
+ }
+
+ if (err || __copy_from_user(&env, buf, sizeof(env))) { /* any failure: fall back to the init state */
+ fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+ err = -1;
+ } else {
+ sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
+ }
+
+ local_bh_disable(); /* mark initialized and load registers with bottom halves disabled */
+ fpu->initialized = 1;
+ fpu__restore(fpu);
+ local_bh_enable();
+
+ return err;
+ } else {
+ /*
+ * For 64-bit frames and 32-bit fsave frames, restore the user
+ * state to the registers directly (with exceptions handled).
+ */
+ user_fpu_begin();
+ if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
+ fpu__clear(fpu); /* restore failed: fpu__clear() and report -1 */
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int xstate_sigframe_size(void)
+{ /* Frame size for the user xstate; with xsave it also covers the trailing FP_XSTATE_MAGIC2 word. */
+ return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
+ fpu_user_xstate_size;
+}
+
+/*
+ * Restore FPU state from a sigframe; returns what __fpu__restore_sig() returns:
+ */
+int fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+ void __user *buf_fx = buf; /* by default the state starts right at 'buf' */
+ int size = xstate_sigframe_size();
+
+ if (ia32_frame && use_fxsr()) { /* ia32 frame with fxsr: an fsave header precedes the state */
+ buf_fx = buf + sizeof(struct fregs_state);
+ size += sizeof(struct fregs_state);
+ }
+
+ return __fpu__restore_sig(buf, buf_fx, size);
+}
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+ unsigned long *buf_fx, unsigned long *size)
+{ /* Reserve room for the FPU frame below 'sp'; returns the new sp, *buf_fx and *size are outputs. */
+ unsigned long frame_size = xstate_sigframe_size();
+
+ *buf_fx = sp = round_down(sp - frame_size, 64); /* state area is 64-byte aligned */
+ if (ia32_frame && use_fxsr()) { /* ia32 frame with fxsr: make room for the fsave header below it */
+ frame_size += sizeof(struct fregs_state);
+ sp -= sizeof(struct fregs_state);
+ }
+
+ *size = frame_size;
+
+ return sp;
+}
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed by the fpstate pointer in the sigcontext.
+ * This will be saved whenever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+void fpu__init_prepare_fx_sw_frame(void)
+{
+ int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; /* xstate area plus the trailing magic word */
+
+ fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+ fx_sw_reserved.extended_size = size;
+ fx_sw_reserved.xfeatures = xfeatures_mask;
+ fx_sw_reserved.xstate_size = fpu_user_xstate_size;
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) {
+ int fsave_header_size = sizeof(struct fregs_state);
+
+ fx_sw_reserved_ia32 = fx_sw_reserved; /* ia32 variant differs only in extended_size */
+ fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
+ }
+}
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
new file mode 100644
index 0000000..87a57b7
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,1247 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/compat.h>
+#include <linux/cpu.h>
+#include <linux/mman.h>
+#include <linux/pkeys.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ *
+ * The table is indexed by xfeature bit number: cpu_has_xfeatures()
+ * looks a name up with fls64(mask) - 1. The final entry is the fallback
+ * name, because cpu_has_xfeatures() clamps larger bit numbers to the
+ * last index.
+ */
+static const char *xfeature_names[] =
+{
+ "x87 floating point registers" ,
+ "SSE registers" ,
+ "AVX registers" ,
+ "MPX bounds registers" ,
+ "MPX CSR" ,
+ "AVX-512 opmask" ,
+ "AVX-512 Hi256" ,
+ "AVX-512 ZMM_Hi256" ,
+ "Processor Trace (unused)" ,
+ "Protection Keys User registers",
+ "unknown xstate feature" ,
+};
+
+/*
+ * CPUID feature flag that gates each xfeature, indexed by xfeature bit
+ * number: fpu__init_system_xstate() clears bit i of xfeatures_mask when
+ * boot_cpu_has(xsave_cpuid_features[i]) is false.
+ */
+static short xsave_cpuid_features[] __initdata = {
+ X86_FEATURE_FPU,
+ X86_FEATURE_XMM,
+ X86_FEATURE_AVX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_INTEL_PT,
+ X86_FEATURE_PKU,
+};
+
+/*
+ * Mask of xstate features supported by the CPU and the kernel:
+ */
+u64 xfeatures_mask __read_mostly;
+
+/*
+ * Per-xfeature layout tables, indexed by xfeature bit number.
+ * xstate_offsets[] and xstate_sizes[] start out as -1 and are filled in
+ * from CPUID by setup_xstate_features(); xstate_comp_offsets[] is filled
+ * in by setup_xstate_comp().
+ */
+static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+
+/*
+ * The XSAVE area of kernel can be in standard or compacted format;
+ * it is always in standard format for user mode. This is the user
+ * mode standard format size used for signal and ptrace frames.
+ */
+unsigned int fpu_user_xstate_size;
+
+/*
+ * Clear all of the X86_FEATURE_* bits that are unavailable
+ * when the CPU has no XSAVE support.
+ *
+ * Only X86_FEATURE_XSAVE is cleared explicitly here.
+ * NOTE(review): presumably setup_clear_cpu_cap() also clears the feature
+ * bits that depend on XSAVE - confirm against its implementation.
+ */
+void fpu__xstate_clear_all_cpu_caps(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+}
+
+/*
+ * Return whether the system supports a given xfeature.
+ *
+ * @xfeatures_needed: mask of xfeature bits the caller requires
+ * @feature_name: optional; when non-NULL it receives a pointer to an
+ * entry of xfeature_names[]
+ *
+ * Returns 1 if every requested bit is set in xfeatures_mask, 0 otherwise.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+
+	if (unlikely(feature_name)) {
+		long xfeature_idx, max_idx;
+		u64 xfeatures_print;
+		/*
+		 * So we use FLS here to be able to print the most advanced
+		 * feature that was requested but is missing. So if a driver
+		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
+		 * missing AVX feature - this is the most informative message
+		 * to users:
+		 */
+		if (xfeatures_missing)
+			xfeatures_print = xfeatures_missing;
+		else
+			xfeatures_print = xfeatures_needed;
+
+		xfeature_idx = fls64(xfeatures_print)-1;
+		max_idx = ARRAY_SIZE(xfeature_names)-1;
+
+		/*
+		 * An empty mask makes fls64() return 0 and the index -1,
+		 * which would read in front of the table. Report the
+		 * fallback (last) entry instead.
+		 */
+		if (xfeature_idx < 0)
+			xfeature_idx = max_idx;
+
+		xfeature_idx = min(xfeature_idx, max_idx);
+
+		*feature_name = xfeature_names[xfeature_idx];
+	}
+
+	if (xfeatures_missing)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+
+/*
+ * Return 1 if CPUID reports xfeature 'xfeature_nr' as a supervisor state
+ * component, 0 if it reports a user state component.
+ */
+static int xfeature_is_supervisor(int xfeature_nr)
+{
+	u32 eax, ebx, flags, edx;
+
+	/*
+	 * We currently do not support supervisor states, but if
+	 * we did, we could find out like this.
+	 *
+	 * SDM says: If state component 'i' is a user state component,
+	 * ECX[0] return 0; if state component i is a supervisor
+	 * state component, ECX[0] returns 1.
+	 */
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &flags, &edx);
+
+	return (flags & 1) ? 1 : 0;
+}
+
+/* Return 1 if xfeature 'xfeature_nr' is not a supervisor state, else 0. */
+static int xfeature_is_user(int xfeature_nr)
+{
+	int supervisor = xfeature_is_supervisor(xfeature_nr);
+
+	return supervisor == 0;
+}
+
+/*
+ * When executing XSAVEOPT (or other optimized XSAVE instructions), if
+ * a processor implementation detects that an FPU state component is still
+ * (or is again) in its initialized state, it may clear the corresponding
+ * bit in the header.xfeatures field, and can skip the writeout of registers
+ * to the corresponding memory layout.
+ *
+ * This means that when the bit is zero, the state component might still contain
+ * some previous - non-initialized register state.
+ *
+ * Before writing xstate information to user-space we sanitize those components,
+ * to always ensure that the memory layout of a feature will be in the init state
+ * if the corresponding header bit is zero. This is to ensure that user-space doesn't
+ * see some stale state in the memory layout during signal handling, debugging etc.
+ *
+ * Does nothing when use_xsaveopt() is false, or when every feature in
+ * xfeatures_mask has its header bit set.
+ */
+void fpstate_sanitize_xstate(struct fpu *fpu)
+{
+ struct fxregs_state *fx = &fpu->state.fxsave;
+ int feature_bit;
+ u64 xfeatures;
+
+ if (!use_xsaveopt())
+ return;
+
+ xfeatures = fpu->state.xsave.header.xfeatures;
+
+ /*
+ * None of the feature bits are in init state. So nothing else
+ * to do for us, as the memory layout is up to date.
+ */
+ if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+ return;
+
+ /*
+ * FP is in init state
+ */
+ if (!(xfeatures & XFEATURE_MASK_FP)) {
+ fx->cwd = 0x37f;
+ fx->swd = 0;
+ fx->twd = 0;
+ fx->fop = 0;
+ fx->rip = 0;
+ fx->rdp = 0;
+ memset(&fx->st_space[0], 0, 128);
+ }
+
+ /*
+ * SSE is in init state
+ */
+ if (!(xfeatures & XFEATURE_MASK_SSE))
+ memset(&fx->xmm_space[0], 0, 256);
+
+ /*
+ * First two features are FPU and SSE, which above we handled
+ * in a special way already:
+ */
+ feature_bit = 0x2;
+ /* Enabled features whose header bit is clear, shifted so bit 0 is feature 2 */
+ xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+
+ /*
+ * Update all the remaining memory layouts according to their
+ * standard xstate layout, if their header bit is in the init
+ * state:
+ */
+ while (xfeatures) {
+ if (xfeatures & 0x1) {
+ int offset = xstate_comp_offsets[feature_bit];
+ int size = xstate_sizes[feature_bit];
+
+ /*
+ * The same offset is applied to this buffer and to
+ * init_fpstate: the component is copied from the
+ * init image.
+ */
+ memcpy((void *)fx + offset,
+ (void *)&init_fpstate.xsave + offset,
+ size);
+ }
+
+ xfeatures >>= 1;
+ feature_bit++;
+ }
+}
+
+/*
+ * Enable the extended processor state save/restore feature.
+ * Called once per CPU onlining.
+ *
+ * Does nothing when the boot CPU lacks X86_FEATURE_XSAVE or when
+ * xfeatures_mask is empty. Otherwise supervisor bits are stripped from
+ * xfeatures_mask, CR4.OSXSAVE is set and the mask is written with xsetbv().
+ */
+void fpu__init_cpu_xstate(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
+ return;
+ /*
+ * Make it clear that XSAVES supervisor states are not yet
+ * implemented should anyone expect it to work by changing
+ * bits in XFEATURE_MASK_* macros and XCR0.
+ */
+ WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
+ "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
+
+ xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
+
+ /* Set CR4.OSXSAVE first, then program the enabled-feature mask */
+ cr4_set_bits(X86_CR4_OSXSAVE);
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Note that in the future we will likely need a pair of
+ * functions here: one for user xstates and the other for
+ * system xstates. For now, they are the same.
+ *
+ * Returns 1 if bit 'xfeature' is set in xfeatures_mask, 0 otherwise.
+ */
+static int xfeature_enabled(enum xfeature xfeature)
+{
+	/*
+	 * xfeatures_mask is a u64, so build the bit as a u64 as well:
+	 * '1UL' is only 32 bits wide on 32-bit kernels.
+	 */
+	return !!(xfeatures_mask & ((u64)1 << xfeature));
+}
+
+/*
+ * Record the offsets and sizes of various xstates contained
+ * in the XSAVE state memory layout.
+ *
+ * Fills xstate_offsets[] and xstate_sizes[] for the two legacy states and
+ * for every enabled extended state, as reported by CPUID. Supervisor
+ * states get a size, but their offset stays at -1.
+ */
+static void __init setup_xstate_features(void)
+{
+	u32 eax, ebx, ecx, edx, i;
+	/* start at the beginning of the "extended state" */
+	unsigned int last_good_offset = offsetof(struct xregs_state,
+						 extended_state_area);
+	/*
+	 * The FP xstates and SSE xstates are legacy states. They are always
+	 * in the fixed offsets in the xsave area in either compacted form
+	 * or standard form.
+	 */
+	xstate_offsets[0] = 0;
+	xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
+	xstate_offsets[1] = xstate_sizes[0];
+	xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
+
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (!xfeature_enabled(i))
+			continue;
+
+		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+
+		xstate_sizes[i] = eax;
+
+		/*
+		 * ECX[0] set means supervisor state: the offset in EBX is
+		 * invalid, so leave it at -1. Such a state must not take
+		 * part in the ordering check below either, or the -1 would
+		 * become 'last_good_offset' and every later user state
+		 * would be reported as misordered.
+		 */
+		if (ecx & 1)
+			continue;
+
+		xstate_offsets[i] = ebx;
+
+		/*
+		 * In our xstate size checks, we assume that the
+		 * highest-numbered xstate feature has the
+		 * highest offset in the buffer. Ensure it does.
+		 */
+		WARN_ONCE(last_good_offset > xstate_offsets[i],
+			"x86/fpu: misordered xstate at %d\n", last_good_offset);
+		last_good_offset = xstate_offsets[i];
+	}
+}
+
+/* Log one line for 'xstate_mask' if cpu_has_xfeatures() reports it as supported. */
+static void __init print_xstate_feature(u64 xstate_mask)
+{
+	const char *name;
+
+	if (!cpu_has_xfeatures(xstate_mask, &name))
+		return;
+
+	pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, name);
+}
+
+/*
+ * Print out all the supported xstate features, one mask at a time and
+ * in the order listed below:
+ */
+static void __init print_xstate_features(void)
+{
+	const u64 masks[] = {
+		XFEATURE_MASK_FP,
+		XFEATURE_MASK_SSE,
+		XFEATURE_MASK_YMM,
+		XFEATURE_MASK_BNDREGS,
+		XFEATURE_MASK_BNDCSR,
+		XFEATURE_MASK_OPMASK,
+		XFEATURE_MASK_ZMM_Hi256,
+		XFEATURE_MASK_Hi16_ZMM,
+		XFEATURE_MASK_PKRU,
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(masks); idx++)
+		print_xstate_feature(masks[idx]);
+}
+
+/*
+ * This check is important because it is easy to confuse an xfeature
+ * number (XFEATURE_*) with an xfeature mask (XFEATURE_MASK_*).
+ *
+ * Warns when 'nr' lies outside [FIRST_EXTENDED_XFEATURE, XFEATURE_MAX).
+ * The argument is parenthesized so that an expression can be passed.
+ */
+#define CHECK_XFEATURE(nr) do {			\
+	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
+	WARN_ON((nr) >= XFEATURE_MAX);		\
+} while (0)
+
+/*
+ * We could cache this like xstate_size[], but we only use
+ * it here, so it would be a waste of space.
+ *
+ * Returns 1 if CPUID ECX[1] is set for 'xfeature_nr', 0 otherwise.
+ */
+static int xfeature_is_aligned(int xfeature_nr)
+{
+	u32 eax, ebx, flags, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &flags, &edx);
+
+	/*
+	 * The value returned by ECX[1] indicates the alignment
+	 * of state component 'i' when the compacted format
+	 * of the extended region of an XSAVE area is used:
+	 */
+	return (flags >> 1) & 1;
+}
+
+/*
+ * This function sets up the offsets of all extended states in the
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave area.
+ *
+ * Without XSAVES the offsets recorded in xstate_offsets[] are reused
+ * as-is. With XSAVES the enabled components are packed one after another
+ * behind the legacy area and the XSAVE header; a component is moved up to
+ * the next 64-byte boundary when xfeature_is_aligned() says so.
+ */
+static void __init setup_xstate_comp(void)
+{
+	unsigned int next_offset;
+	int i;
+
+	/*
+	 * The FP xstates and SSE xstates are legacy states. They are always
+	 * in the fixed offsets in the xsave area in either compacted form
+	 * or standard form.
+	 */
+	xstate_comp_offsets[0] = 0;
+	xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+
+	if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
+		for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+			if (xfeature_enabled(i))
+				xstate_comp_offsets[i] = xstate_offsets[i];
+		}
+		return;
+	}
+
+	next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		/*
+		 * A disabled feature occupies no space, so it must neither
+		 * advance nor align the running offset; otherwise the
+		 * components behind it would be placed too far out.
+		 */
+		if (!xfeature_enabled(i))
+			continue;
+
+		if (xfeature_is_aligned(i))
+			next_offset = ALIGN(next_offset, 64);
+
+		xstate_comp_offsets[i] = next_offset;
+		next_offset += xstate_sizes[i];
+	}
+}
+
+/*
+ * Print out the offset (from xstate_comp_offsets[]) and the size of
+ * every enabled extended xstate component
+ */
+static void __init print_xstate_offset_size(void)
+{
+	int nr;
+
+	for (nr = FIRST_EXTENDED_XFEATURE; nr < XFEATURE_MAX; nr++) {
+		if (xfeature_enabled(nr))
+			pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
+				nr, xstate_comp_offsets[nr], nr, xstate_sizes[nr]);
+	}
+}
+
+/*
+ * setup the xstate image representing the init state
+ *
+ * Warns (WARN_ON_FPU) if called more than once. Returns without doing
+ * anything else when the boot CPU lacks X86_FEATURE_XSAVE.
+ */
+static void __init setup_init_fpu_buf(void)
+{
+ static int on_boot_cpu __initdata = 1;
+
+ WARN_ON_FPU(!on_boot_cpu);
+ on_boot_cpu = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return;
+
+ setup_xstate_features();
+ print_xstate_features();
+
+ /* With XSAVES, set bit 63 of xcomp_bv together with the enabled features */
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+
+ /*
+ * Init all the features state with header.xfeatures being 0x0
+ */
+ copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+
+ /*
+ * Dump the init state again. This is to identify the init state
+ * of any feature which is not represented by all zero's.
+ */
+ copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+}
+
+/*
+ * Return the offset that CPUID (EBX) reports for 'xfeature_nr', or -1
+ * (after a one-time warning) if the feature is a supervisor state.
+ */
+static int xfeature_uncompacted_offset(int xfeature_nr)
+{
+	u32 eax, offset, ecx, edx;
+
+	/*
+	 * Only XSAVES supports supervisor states and it uses compacted
+	 * format. Checking a supervisor state's uncompacted offset is
+	 * an error.
+	 */
+	if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
+		WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
+		return -1;
+	}
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &offset, &ecx, &edx);
+
+	return offset;
+}
+
+/* Return the size that CPUID (EAX) reports for xfeature 'xfeature_nr'. */
+static int xfeature_size(int xfeature_nr)
+{
+	u32 size, ebx, ecx, edx;
+
+	CHECK_XFEATURE(xfeature_nr);
+	cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &ebx, &ecx, &edx);
+
+	return size;
+}
+
+/*
+ * 'XSAVES' implies two different things:
+ * 1. saving of supervisor/system state
+ * 2. using the compacted format
+ *
+ * Use this function when dealing with the compacted format so
+ * that it is obvious which aspect of 'XSAVES' is being handled
+ * by the calling code.
+ *
+ * Returns the result of boot_cpu_has(X86_FEATURE_XSAVES).
+ */
+int using_compacted_format(void)
+{
+ return boot_cpu_has(X86_FEATURE_XSAVES);
+}
+
+/*
+ * Validate an xstate header supplied by userspace (ptrace or sigreturn).
+ *
+ * Returns 0 if the header is acceptable and -EINVAL if it names an
+ * unknown or supervisor feature, has xcomp_bv set, or has any non-zero
+ * reserved byte.
+ */
+int validate_xstate_header(const struct xstate_header *hdr)
+{
+	u64 forbidden = ~xfeatures_mask | XFEATURE_MASK_SUPERVISOR;
+
+	/*
+	 * If 'reserved' is shrunken to add a new field, make sure to validate
+	 * that new field here!
+	 */
+	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
+
+	/* No unknown or supervisor features may be set */
+	if (hdr->xfeatures & forbidden)
+		return -EINVAL;
+
+	/* Userspace must use the uncompacted format */
+	if (hdr->xcomp_bv)
+		return -EINVAL;
+
+	/* No reserved bits may be set */
+	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Print the raw CPUID output of the XSTATE_CPUID leaf for a range of
+ * sub-leaves. Only the first call prints anything.
+ */
+static void __xstate_dump_leaves(void)
+{
+	static int should_dump = 1;
+	u32 eax, ebx, ecx, edx;
+	int subleaf;
+
+	if (should_dump == 0)
+		return;
+	should_dump = 0;
+
+	/*
+	 * Dump out a few leaves past the ones that we support
+	 * just in case there are some goodies up there
+	 */
+	for (subleaf = 0; subleaf < XFEATURE_MAX + 10; subleaf++) {
+		cpuid_count(XSTATE_CPUID, subleaf, &eax, &ebx, &ecx, &edx);
+		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+			XSTATE_CPUID, subleaf, eax, ebx, ecx, edx);
+	}
+}
+
+/*
+ * Warn once if 'x' is true and, when the warning triggers, dump the CPUID
+ * leaves via __xstate_dump_leaves().
+ */
+#define XSTATE_WARN_ON(x) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+/*
+ * When 'nr' equals 'nr_macro', warn once and dump the CPUID leaves if the
+ * CPU-reported size 'sz' differs from sizeof(__struct).
+ */
+#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \
+ if ((nr == nr_macro) && \
+ WARN_ONCE(sz != sizeof(__struct), \
+ "%s: struct is %zu bytes, cpu state %d bytes\n", \
+ __stringify(nr_macro), sizeof(__struct), sz)) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+/*
+ * We have a C struct for each 'xstate'. We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ *
+ * 'nr' is the xfeature number to check. Mismatches and feature numbers
+ * without a structure only produce warnings; nothing is returned.
+ */
+static void check_xstate_against_struct(int nr)
+{
+ /*
+ * Ask the CPU for the size of the state.
+ */
+ int sz = xfeature_size(nr);
+ /*
+ * Match each CPU state with the corresponding software
+ * structure.
+ */
+ XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
+ XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
+ XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
+ XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
+ XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
+ XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
+ XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
+
+ /*
+ * Make *SURE* to add any feature numbers in below if
+ * there are "holes" in the xsave state component
+ * numbers.
+ */
+ if ((nr < XFEATURE_YMM) ||
+ (nr >= XFEATURE_MAX) ||
+ (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
+ WARN_ONCE(1, "no structure for xstate: %d\n", nr);
+ XSTATE_WARN_ON(1);
+ }
+}
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be. We are recalculating
+ * it to be safe.
+ *
+ * The recomputed size is compared against fpu_kernel_xstate_size; a
+ * mismatch only triggers XSTATE_WARN_ON().
+ */
+static void do_extra_xstate_size_checks(void)
+{
+ /* Start right behind the legacy area and the XSAVE header */
+ int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+
+ check_xstate_against_struct(i);
+ /*
+ * Supervisor state components can be managed only by
+ * XSAVES, which is compacted-format only.
+ */
+ if (!using_compacted_format())
+ XSTATE_WARN_ON(xfeature_is_supervisor(i));
+
+ /* Align from the end of the previous feature */
+ if (xfeature_is_aligned(i))
+ paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
+ /*
+ * The offset of a given state in the non-compacted
+ * format is given to us in a CPUID leaf. We check
+ * them for being ordered (increasing offsets) in
+ * setup_xstate_features().
+ */
+ if (!using_compacted_format())
+ paranoid_xstate_size = xfeature_uncompacted_offset(i);
+ /*
+ * The compacted-format offset always depends on where
+ * the previous state ended.
+ */
+ paranoid_xstate_size += xfeature_size(i);
+ }
+ XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
+}
+
+
+/*
+ * Get total size of enabled xstates in XCR0/xfeatures_mask.
+ *
+ * Note the SDM's wording here. "sub-function 0" only enumerates
+ * the size of the *user* states. If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * Note that we do not currently set any bits on IA32_XSS so
+ * 'XCR0 | IA32_XSS == XCR0' for now.
+ */
+static unsigned int __init get_xsaves_size(void)
+{
+	unsigned int eax, size, ecx, edx;
+
+	/*
+	 * - CPUID function 0DH, sub-function 1:
+	 * EBX enumerates the size (in bytes) required by
+	 * the XSAVES instruction for an XSAVE area
+	 * containing all the state components
+	 * corresponding to bits currently set in
+	 * XCR0 | IA32_XSS.
+	 */
+	cpuid_count(XSTATE_CPUID, 1, &eax, &size, &ecx, &edx);
+
+	return size;
+}
+
+/* Return the XSAVE area size that CPUID leaf 0DH, sub-function 0 reports in EBX. */
+static unsigned int __init get_xsave_size(void)
+{
+	unsigned int eax, size, ecx, edx;
+
+	/*
+	 * - CPUID function 0DH, sub-function 0:
+	 * EBX enumerates the size (in bytes) required by
+	 * the XSAVE instruction for an XSAVE area
+	 * containing all the *user* state components
+	 * corresponding to bits currently set in XCR0.
+	 */
+	cpuid_count(XSTATE_CPUID, 0, &eax, &size, &ecx, &edx);
+
+	return size;
+}
+
+/*
+ * Will the runtime-enumerated 'xstate_size' fit in the init
+ * task's statically-allocated buffer?
+ *
+ * Returns true if 'test_xstate_size' does not exceed
+ * sizeof(union fpregs_state); otherwise logs a warning and returns false.
+ */
+static bool is_supported_xstate_size(unsigned int test_xstate_size)
+{
+	if (test_xstate_size > sizeof(union fpregs_state)) {
+		pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
+			sizeof(union fpregs_state), test_xstate_size);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Recompute the context sizes for the enabled features and publish them
+ * in fpu_kernel_xstate_size and fpu_user_xstate_size.
+ *
+ * Returns 0 on success, or -EINVAL if the kernel size does not pass
+ * is_supported_xstate_size(); in that case neither size is updated.
+ */
+static int init_xstate_size(void)
+{
+	unsigned int user_size = get_xsave_size();
+	unsigned int kernel_size;
+
+	kernel_size = boot_cpu_has(X86_FEATURE_XSAVES) ? get_xsaves_size()
+						       : user_size;
+
+	/* Ensure we have the space to store all enabled: */
+	if (!is_supported_xstate_size(kernel_size))
+		return -EINVAL;
+
+	/*
+	 * The size is OK, we are definitely going to use xsave,
+	 * make it known to the world that we need more space.
+	 */
+	fpu_kernel_xstate_size = kernel_size;
+	do_extra_xstate_size_checks();
+
+	/*
+	 * User space is always in standard format.
+	 */
+	fpu_user_xstate_size = user_size;
+
+	return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we can not use it. Disable it.
+ *
+ * Empties xfeatures_mask, clears CR4.OSXSAVE and clears the XSAVE CPU
+ * capability via fpu__xstate_clear_all_cpu_caps().
+ */
+static void fpu__init_disable_system_xstate(void)
+{
+ xfeatures_mask = 0;
+ cr4_clear_bits(X86_CR4_OSXSAVE);
+ fpu__xstate_clear_all_cpu_caps();
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ * Called once per system bootup.
+ *
+ * Reads the supported xfeatures from CPUID, trims them by the CPUID
+ * feature flags in xsave_cpuid_features[] and by
+ * fpu__get_supported_xfeatures_mask(), enables XSAVE on this CPU, sizes the
+ * buffers and sets up the init state and the offset tables. If FP/SSE are
+ * missing from the mask or the size setup fails, XSAVE support is disabled
+ * again.
+ */
+void __init fpu__init_system_xstate(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	static int on_boot_cpu __initdata = 1;
+	int err;
+	int i;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	if (!boot_cpu_has(X86_FEATURE_FPU)) {
+		pr_info("x86/fpu: No FPU detected\n");
+		return;
+	}
+
+	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+		pr_info("x86/fpu: x87 FPU will use %s\n",
+			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
+		return;
+	}
+
+	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+		WARN_ON_FPU(1);
+		return;
+	}
+
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	xfeatures_mask = eax + ((u64)edx << 32);
+
+	if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+		/*
+		 * This indicates that something really unexpected happened
+		 * with the enumeration. Disable XSAVE and try to continue
+		 * booting without it. This is too early to BUG().
+		 */
+		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+		goto out_disable;
+	}
+
+	/*
+	 * Clear XSAVE features that are disabled in the normal CPUID.
+	 *
+	 * The bit is built as a u64: ~BIT(i) is only 'unsigned long' wide,
+	 * so on 32-bit kernels it would also clear bits 32..63 of the
+	 * 64-bit xfeatures_mask.
+	 */
+	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+		if (!boot_cpu_has(xsave_cpuid_features[i]))
+			xfeatures_mask &= ~((u64)1 << i);
+	}
+
+	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
+
+	/* Enable xstate instructions to be able to continue with initialization: */
+	fpu__init_cpu_xstate();
+	err = init_xstate_size();
+	if (err)
+		goto out_disable;
+
+	/*
+	 * Update info used for ptrace frames; use standard-format size and no
+	 * supervisor xstates:
+	 */
+	update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
+
+	fpu__init_prepare_fx_sw_frame();
+	setup_init_fpu_buf();
+	setup_xstate_comp();
+	print_xstate_offset_size();
+
+	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
+		xfeatures_mask,
+		fpu_kernel_xstate_size,
+		boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
+	return;
+
+out_disable:
+	/* something went wrong, try to boot without any XSAVE support */
+	fpu__init_disable_system_xstate();
+}
+
+/*
+ * Restore minimal FPU state after suspend: on xsave capable CPUs,
+ * write xfeatures_mask back to XCR0.
+ */
+void fpu__resume_cpu(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_XSAVE))
+		return;
+
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Given an xstate feature mask, calculate where in the xsave
+ * buffer the state is. Callers should ensure that the buffer
+ * is valid.
+ *
+ * The feature number is the highest bit set in 'xstate_feature_mask'.
+ * Its offset is taken from xstate_comp_offsets[], which
+ * setup_xstate_comp() fills in for the format the kernel buffer uses.
+ *
+ * Returns NULL (after WARN_ON_FPU) if that feature is not enabled.
+ */
+void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+{
+	int feature_nr = fls64(xstate_feature_mask) - 1;
+
+	if (xfeature_enabled(feature_nr))
+		return (void *)xsave + xstate_comp_offsets[feature_nr];
+
+	WARN_ON_FPU(1);
+	return NULL;
+}
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Note that if there is no data for the field in the xsave buffer
+ * this will return NULL.
+ *
+ * Inputs:
+ * xstate: the thread's storage area for all FPU data
+ * xstate_feature: state which is defined in xsave.h (e.g.
+ * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
+ * Output:
+ * address of the state in the xsave area, or NULL if the
+ * field is not present in the xsave buffer.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
+{
+ /*
+ * Do we even *have* xsave state?
+ */
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return NULL;
+
+ /*
+ * We should not ever be requesting features that we
+ * have not enabled. Remember that xfeatures_mask is
+ * what we write to the XCR0 register.
+ */
+ WARN_ONCE(!(xfeatures_mask & xstate_feature),
+ "get of unsupported state");
+ /*
+ * This assumes the last 'xsave*' instruction to
+ * have requested that 'xstate_feature' be saved.
+ * If it did not, we might be seeing an old value
+ * of the field in the buffer.
+ *
+ * This can happen because the last 'xsave' did not
+ * request that this feature be saved (unlikely)
+ * or because the "init optimization" caused it
+ * to not be saved.
+ */
+ if (!(xsave->header.xfeatures & xstate_feature))
+ return NULL;
+
+ return __raw_xsave_addr(xsave, xstate_feature);
+}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
+
+/*
+ * This wraps up the common operations that need to occur when retrieving
+ * data from xsave state. It first ensures that the current task was
+ * using the FPU and retrieves the data in to a buffer. It then calculates
+ * the offset of the requested field in the buffer.
+ *
+ * This function is safe to call whether the FPU is in use or not.
+ *
+ * Note that this only works on the current task.
+ *
+ * Inputs:
+ * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ * XFEATURE_MASK_SSE, etc...)
+ * Output:
+ * address of the state in the xsave area or NULL if the state
+ * is not present or is in its 'init state'.
+ */
+const void *get_xsave_field_ptr(int xsave_state)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (!fpu->initialized)
+		return NULL;
+	/*
+	 * fpu__save() takes the CPU's xstate registers
+	 * and saves them off to the 'fpu memory buffer.
+	 */
+	fpu__save(fpu);
+
+	return get_xsave_addr(&fpu->state.xsave, xsave_state);
+}
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
+#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
+/*
+ * NOTE(review): this is "number of valid bits minus one", not a bit mask
+ * covering those bits, and nothing in view uses it - confirm the intent.
+ */
+#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+/*
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
+ *
+ * @tsk is not used by this function: PKRU is read and written through
+ * read_pkru()/write_pkru().
+ * @init_val may contain PKEY_DISABLE_ACCESS and/or PKEY_DISABLE_WRITE.
+ *
+ * Returns 0 on success, or -EINVAL when the boot CPU lacks
+ * X86_FEATURE_OSPKE.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ u32 old_pkru;
+ int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+ u32 new_pkru_bits = 0;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return -EINVAL;
+
+ /* Set the bits we need in PKRU: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey: */
+ new_pkru_bits <<= pkey_shift;
+
+ /* Get old PKRU and mask off any old bits in place: */
+ old_pkru = read_pkru();
+ old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+
+ /* Write old part along with new part: */
+ write_pkru(old_pkru | new_pkru_bits);
+
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_PKEYS */
+
+/*
+ * Weird legacy quirk: SSE and YMM states store information in the
+ * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
+ * area is marked as unused in the xfeatures header, we need to copy
+ * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
+ *
+ * Returns true exactly when SSE or YMM is set in 'xfeatures' while FP
+ * is not.
+ */
+static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+{
+	const u64 sse_or_ymm = XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
+
+	return (xfeatures & sse_or_ymm) && !(xfeatures & XFEATURE_MASK_FP);
+}
+
+/*
+ * This is similar to user_regset_copyout(), but will not add offset to
+ * the source data pointer or increment pos, count, kbuf, and ubuf.
+ *
+ * Copies at most 'size' bytes of 'data' to kbuf + offset, truncated so
+ * that nothing is written at or beyond 'size_total'. Nothing is copied
+ * when 'offset' already lies at or past 'size_total'.
+ */
+static inline void
+__copy_xstate_to_kernel(void *kbuf, const void *data,
+			unsigned int offset, unsigned int size, unsigned int size_total)
+{
+	unsigned int copy;
+
+	if (offset >= size_total)
+		return;
+
+	copy = min(size, size_total - offset);
+	memcpy(kbuf + offset, data, copy);
+}
+
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a kernel-space ptrace buffer.
+ *
+ * It supports partial copy but pos always starts from zero. This is called
+ * from xstateregs_get() and there we check the CPU has XSAVES.
+ *
+ * Returns 0 on success, or -EFAULT when 'offset_start' is not zero.
+ *
+ * NOTE(review): only the header, the in-use components, MXCSR (quirk case)
+ * and the sw_reserved bytes are written; other bytes of 'kbuf' are left
+ * untouched here - presumably the caller provides a zeroed buffer, verify.
+ */
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
+{
+ unsigned int offset, size;
+ struct xstate_header header;
+ int i;
+
+ /*
+ * Currently copy_regset_to_user() starts from pos 0:
+ */
+ if (unlikely(offset_start != 0))
+ return -EFAULT;
+
+ /*
+ * The destination is a ptrace buffer; we put in only user xstates:
+ */
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+ header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+
+ /*
+ * Copy xregs_state->header:
+ */
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(header);
+
+ __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ /*
+ * Copy only in-use xstates:
+ */
+ if ((header.xfeatures >> i) & 1) {
+ void *src = __raw_xsave_addr(xsave, 1 << i);
+
+ /* Destination uses the offsets recorded in xstate_offsets[] */
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ /* The next component has to fit fully into the output buffer: */
+ if (offset + size > size_total)
+ break;
+
+ __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
+ }
+
+ }
+
+ if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
+ }
+
+ /*
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
+ */
+ offset = offsetof(struct fxregs_state, sw_reserved);
+ size = sizeof(xstate_fx_sw_bytes);
+
+ __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
+
+ return 0;
+}
+
+/*
+ * Copy at most 'size' bytes of 'data' to ubuf + offset, truncated so that
+ * nothing is written at or beyond 'size_total'.
+ *
+ * Returns 0 if nothing had to be copied or the copy succeeded, and
+ * -EFAULT if __copy_to_user() reported a failure.
+ */
+static inline int
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total)
+{
+	unsigned int copy;
+
+	if (!size || offset >= size_total)
+		return 0;
+
+	copy = min(size, size_total - offset);
+
+	return __copy_to_user(ubuf + offset, data, copy) ? -EFAULT : 0;
+}
+
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a user-space buffer. It supports partial copy but pos always starts from
+ * zero. This is called from xstateregs_get() and there we check the CPU
+ * has XSAVES.
+ *
+ * Returns 0 on success, or -EFAULT when 'offset_start' is not zero or when
+ * any of the copies to user space fails.
+ */
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
+{
+	unsigned int offset, size;
+	int ret, i;
+	struct xstate_header header;
+
+	/*
+	 * Currently copy_regset_to_user() starts from pos 0:
+	 */
+	if (unlikely(offset_start != 0))
+		return -EFAULT;
+
+	/*
+	 * The destination is a ptrace buffer; we put in only user xstates:
+	 */
+	memset(&header, 0, sizeof(header));
+	header.xfeatures = xsave->header.xfeatures;
+	header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+
+	/*
+	 * Copy xregs_state->header:
+	 */
+	offset = offsetof(struct xregs_state, header);
+	size = sizeof(header);
+
+	ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < XFEATURE_MAX; i++) {
+		/*
+		 * Copy only in-use xstates:
+		 */
+		if ((header.xfeatures >> i) & 1) {
+			void *src = __raw_xsave_addr(xsave, 1 << i);
+
+			offset = xstate_offsets[i];
+			size = xstate_sizes[i];
+
+			/* The next component has to fit fully into the output buffer: */
+			if (offset + size > size_total)
+				break;
+
+			ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
+			if (ret)
+				return ret;
+		}
+
+	}
+
+	if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+		offset = offsetof(struct fxregs_state, mxcsr);
+		size = MXCSR_AND_FLAGS_SIZE;
+		/* A fault here must be reported like every other copy */
+		ret = __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total);
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Fill xsave->i387.sw_reserved value for ptrace frame:
+	 */
+	offset = offsetof(struct fxregs_state, sw_reserved);
+	size = sizeof(xstate_fx_sw_bytes);
+
+	ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
+ * and copy to the target thread. This is called from xstateregs_set().
+ *
+ * Returns 0 on success, or -EINVAL if the header found in 'kbuf' does not
+ * pass validate_xstate_header(); nothing is copied in that case.
+ */
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
+{
+	struct xstate_header hdr;
+	int i;
+
+	memcpy(&hdr, kbuf + offsetof(struct xregs_state, header), sizeof(hdr));
+
+	if (validate_xstate_header(&hdr))
+		return -EINVAL;
+
+	for (i = 0; i < XFEATURE_MAX; i++) {
+		void *dst;
+
+		if (!(hdr.xfeatures & ((u64)1 << i)))
+			continue;
+
+		dst = __raw_xsave_addr(xsave, 1 << i);
+		memcpy(dst, kbuf + xstate_offsets[i], xstate_sizes[i]);
+	}
+
+	if (xfeatures_mxcsr_quirk(hdr.xfeatures))
+		memcpy(&xsave->i387.mxcsr,
+		       kbuf + offsetof(struct fxregs_state, mxcsr),
+		       MXCSR_AND_FLAGS_SIZE);
+
+	/*
+	 * The state that came in from userspace was user-state only.
+	 * Keep only the supervisor bits of the old 'xfeatures' and add
+	 * in the features that came in from userspace:
+	 */
+	xsave->header.xfeatures =
+		(xsave->header.xfeatures & XFEATURE_MASK_SUPERVISOR) | hdr.xfeatures;
+
+	return 0;
+}
+
+/*
+ * Convert from a ptrace or sigreturn standard-format user-space buffer to
+ * kernel XSAVES format and copy to the target thread. This is called from
+ * xstateregs_set(), as well as potentially from the sigreturn() and
+ * rt_sigreturn() system calls.
+ *
+ * Returns 0 on success, -EFAULT if reading from 'ubuf' fails, or -EINVAL
+ * if the header does not pass validate_xstate_header().
+ */
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
+{
+	struct xstate_header hdr;
+	int i;
+
+	if (__copy_from_user(&hdr, ubuf + offsetof(struct xregs_state, header),
+			     sizeof(hdr)))
+		return -EFAULT;
+
+	if (validate_xstate_header(&hdr))
+		return -EINVAL;
+
+	for (i = 0; i < XFEATURE_MAX; i++) {
+		void *dst;
+
+		if (!(hdr.xfeatures & ((u64)1 << i)))
+			continue;
+
+		dst = __raw_xsave_addr(xsave, 1 << i);
+		if (__copy_from_user(dst, ubuf + xstate_offsets[i], xstate_sizes[i]))
+			return -EFAULT;
+	}
+
+	if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
+		if (__copy_from_user(&xsave->i387.mxcsr,
+				     ubuf + offsetof(struct fxregs_state, mxcsr),
+				     MXCSR_AND_FLAGS_SIZE))
+			return -EFAULT;
+	}
+
+	/*
+	 * The state that came in from userspace was user-state only.
+	 * Keep only the supervisor bits of the old 'xfeatures' and add
+	 * in the features that came in from userspace:
+	 */
+	xsave->header.xfeatures =
+		(xsave->header.xfeatures & XFEATURE_MASK_SUPERVISOR) | hdr.xfeatures;
+
+	return 0;
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 0000000..7ee8067
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,1051 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic function tracing support.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/module.h>
+
+#include <trace/syscall.h>
+
+#include <asm/set_memory.h>
+#include <asm/kprobes.h>
+#include <asm/ftrace.h>
+#include <asm/nops.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+int ftrace_arch_code_modify_prepare(void) /* always returns 0 */
+{
+ set_kernel_text_rw(); /* core kernel text first ... */
+ set_all_modules_text_rw(); /* ... then the text of all modules */
+ return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void) /* always returns 0 */
+{
+ set_all_modules_text_ro(); /* reverse order of the prepare hook: modules ... */
+ set_kernel_text_ro(); /* ... then core kernel text */
+ return 0;
+}
+
+union ftrace_code_union {
+ char code[MCOUNT_INSN_SIZE]; /* the instruction viewed as raw bytes */
+ struct {
+ unsigned char e8; /* opcode byte: 0xe8 for a call; ftrace_jmp_replace() stores 0xe9 here */
+ int offset; /* displacement computed by ftrace_calc_offset() */
+ } __attribute__((packed));
+};
+
+static int ftrace_calc_offset(long ip, long addr)
+{
+ return (int)(addr - ip); /* distance from @ip to @addr, truncated to int */
+}
+
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+ static union ftrace_code_union calc;
+
+ calc.e8 = 0xe8; /* call opcode */
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); /* relative to the end of the instruction */
+
+ /*
+ * The result lives in the static buffer above and is overwritten by the
+ * next call. No locking needed, this must be called via kstop_machine.
+ */
+ return calc.code;
+}
+
+static inline int	/* 1 iff start <= addr < end, else 0 */
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return !(addr < start || addr >= end);
+}
+
+static unsigned long text_ip_addr(unsigned long ip)
+{
+	unsigned long text_start = (unsigned long)_text;
+	unsigned long text_end = (unsigned long)_etext;
+
+	if (!within(ip, text_start, text_end))
+		return ip;	/* outside [_text, _etext): use as is */
+
+	/*
+	 * On x86_64 the kernel text mapping is read-only, so kernel text is
+	 * modified through the kernel identity mapping instead.  On 32bit
+	 * kernels both mappings are the same, so this works there as well.
+	 */
+	return (unsigned long)__va(__pa_symbol(ip));
+}
+
+static const unsigned char *ftrace_nop_replace(void)
+{
+ return ideal_nops[NOP_ATOMIC5]; /* the nop that stands in for a disabled call site */
+}
+
+static int
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+
+ ftrace_expected = old_code;
+
+ /*
+ * Replace the MCOUNT_INSN_SIZE bytes at @ip with @new_code in a single
+ * write, without breakpoints. Returns 0, -EFAULT, -EINVAL or -EPERM.
+ * We are paranoid about modifying text: a bug here could make us read
+ * or write someplace harmful. So read and modify the code with
+ * probe_kernel_*() and check that what we read is what we expected.
+ */
+
+ /* read the text we want to modify */
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ ip = text_ip_addr(ip); /* kernel text is written through its alias, see text_ip_addr() */
+
+ /* replace the text with the new text */
+ if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+ return -EPERM;
+
+ sync_core();
+
+ return 0;
+}
+
+int ftrace_make_nop(struct module *mod,
+		    struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned const char *call_insn, *nop_insn;
+
+	/* Expected current contents (a call to @addr) and its replacement. */
+	call_insn = ftrace_call_replace(ip, addr);
+	nop_insn = ftrace_nop_replace();
+
+	if (addr != MCOUNT_ADDR) {
+		/* Normal cases use add_brk_on_nop */
+		ftrace_expected = NULL;
+		WARN_ONCE(1, "invalid use of ftrace_make_nop");
+		return -EINVAL;
+	}
+
+	/*
+	 * On boot up, and when modules are loaded, the MCOUNT_ADDR call
+	 * is converted to a nop, and will never become MCOUNT_ADDR again.
+	 * This code is either running before SMP (on boot up) or before
+	 * the code will ever be executed (module load), so the breakpoint
+	 * version is not wanted here: just modify the code directly.
+	 */
+	return ftrace_modify_code_direct(rec->ip, call_insn, nop_insn);
+}
+
+/* Turn the nop at @rec into a call to @addr by patching it directly. */
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip = rec->ip;
+	unsigned const char *nop_insn = ftrace_nop_replace();
+	unsigned const char *call_insn = ftrace_call_replace(ip, addr);
+
+	/* Should only be called when module is loaded */
+
+	return ftrace_modify_code_direct(rec->ip, nop_insn, call_insn);
+}
+
+/*
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic_writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. It is also considered
+ * that we can not read the modifying_ftrace_code before
+ * executing the breakpoint. That would be quite remarkable if
+ * it could do that. Here's the flow that is required:
+ *
+ * CPU-0 CPU-1
+ *
+ * atomic_inc(mfc);
+ * write int3s
+ * <trap-int3> // implicit (r)mb
+ * if (atomic_read(mfc))
+ * call ftrace_int3_handler()
+ *
+ * Then when we are finished:
+ *
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
+ */
+atomic_t modifying_ftrace_code __read_mostly; /* raised around patching by update_ftrace_func() and arch_ftrace_update_code() */
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code); /* forward declaration; defined further down */
+
+/*
+ * Should never be called:
+ * As it is only called by __ftrace_replace_code() which is called by
+ * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
+ * which is called to turn mcount into nops or nops into function calls
+ * but not to convert a function from not using regs to one that uses
+ * regs, which ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ WARN_ON(1); /* reaching this function at all is unexpected, see above */
+ ftrace_expected = NULL; /* there is no expected instruction to report */
+ return -EINVAL;
+}
+
+static unsigned long ftrace_update_func; /* site being patched; compared against by is_ftrace_caller() */
+
+static int update_ftrace_func(unsigned long ip, void *new)
+{
+ unsigned char old[MCOUNT_INSN_SIZE];
+ int ret;
+
+ memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); /* the bytes now at @ip are the expected "old" code */
+
+ ftrace_update_func = ip;
+ /* Make sure the breakpoints see the ftrace_update_func update */
+ smp_wmb();
+
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ret = ftrace_modify_code(ip, old, new); /* breakpoint-based replacement of @old by @new */
+
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret; /* whatever ftrace_modify_code() returned */
+}
+
+/* Retarget the ftrace_call and ftrace_regs_call call sites to @func. */
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+	unsigned long target = (unsigned long)func;
+	unsigned long ip;
+	int ret;
+
+	ip = (unsigned long)(&ftrace_call);
+	ret = update_ftrace_func(ip, ftrace_call_replace(ip, target));
+	if (ret)
+		return ret;
+
+	/* Also update the regs callback function */
+	ip = (unsigned long)(&ftrace_regs_call);
+	ret = update_ftrace_func(ip, ftrace_call_replace(ip, target));
+
+	return ret;
+}
+
+/* Returns 1 if @ip equals ftrace_update_func, 0 otherwise. */
+static int is_ftrace_caller(unsigned long ip)
+{
+	unsigned long patched = ftrace_update_func;
+
+	return ip == patched ? 1 : 0;
+}
+
+/*
+ * Breakpoint handler used while ftrace is changing a nop into a trace call
+ * or a trace call into a nop. While the change is taking place the site is
+ * treated just like a nop: the handler skips over it and returns 1.
+ * Returns 0 if the breakpoint is not on a location ftrace knows about.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+	unsigned long bp_addr;
+
+	if (WARN_ON_ONCE(!regs))
+		return 0;
+
+	/* regs->ip is one byte past the breakpoint; step back onto it. */
+	bp_addr = regs->ip - 1;
+	if (ftrace_location(bp_addr) || is_ftrace_caller(bp_addr)) {
+		regs->ip += MCOUNT_INSN_SIZE - 1;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Write @size bytes of @val at text_ip_addr(@ip); 0 on success, else -EPERM.
+ */
+static int ftrace_write(unsigned long ip, const char *val, int size)
+{
+	void *dst = (void *)text_ip_addr(ip);
+
+	return probe_kernel_write(dst, val, size) ? -EPERM : 0;
+}
+
+/* Verify that @ip holds @old, then overwrite its first byte with a breakpoint. */
+static int add_break(unsigned long ip, const char *old)
+{
+	unsigned char int3 = BREAKPOINT_INSTRUCTION;
+	unsigned char cur[MCOUNT_INSN_SIZE];
+
+	if (probe_kernel_read(cur, (void *)ip, MCOUNT_INSN_SIZE))
+		return -EFAULT;
+
+	/* Publish the expected bytes before comparing against them. */
+	ftrace_expected = old;
+	if (memcmp(cur, old, MCOUNT_INSN_SIZE))
+		return -EINVAL;
+
+	return ftrace_write(ip, &int3, 1);
+}
+
+/* Breakpoint @rec's site, which must currently hold a call to @addr. */
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long site = rec->ip;
+	unsigned const char *expected;
+
+	expected = ftrace_call_replace(site, addr);
+	return add_break(site, expected);
+}
+
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
+{
+	unsigned const char *expected;
+
+	/* The site must currently hold the ftrace nop. */
+	expected = ftrace_nop_replace();
+	return add_break(rec->ip, expected);
+}
+
+/*
+ * First pass of ftrace_replace_code(): put a breakpoint on @rec when
+ * ftrace_test_record() says the site is going to change.
+ * Returns 0, or the error reported by add_break().
+ */
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+{
+	unsigned long cur_addr = ftrace_get_addr_curr(rec);
+	int action = ftrace_test_record(rec, enable);
+
+	if (action == FTRACE_UPDATE_MAKE_CALL) {
+		/* converting nop to call */
+		return add_brk_on_nop(rec);
+	}
+
+	if (action == FTRACE_UPDATE_MODIFY_CALL ||
+	    action == FTRACE_UPDATE_MAKE_NOP) {
+		/* the site holds a call now: retargeted or made a nop */
+		return add_brk_on_call(rec, cur_addr);
+	}
+
+	/* FTRACE_UPDATE_IGNORE (or an unknown action): leave it alone */
+	return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to be done
+ * carefully. If the address does not currently have a breakpoint,
+ * we know we are done. Otherwise, we look at the remaining 4 bytes
+ * of the instruction. If it matches a nop we replace the breakpoint
+ * with the nop. Otherwise we replace it with the call instruction.
+ * Returns 0, -EFAULT, -EINVAL or the result of ftrace_write().
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+ unsigned char ins[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+ const unsigned char *nop; /* candidate instruction: the nop first, later a call */
+ unsigned long ftrace_addr;
+ unsigned long ip = rec->ip;
+
+ /* If we fail the read, just give up */
+ if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* If this does not have a breakpoint, we are done */
+ if (ins[0] != brk)
+ return 0;
+
+ nop = ftrace_nop_replace();
+
+ /*
+ * If the last 4 bytes of the instruction do not match
+ * a nop, then we assume that this is a call to ftrace_addr.
+ */
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+ /*
+ * For extra paranoidism, we check if the breakpoint is on
+ * a call that would actually jump to the ftrace_addr.
+ * If not, don't touch the breakpoint, we may just create
+ * a disaster.
+ */
+ ftrace_addr = ftrace_get_addr_new(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
+ goto update;
+
+ /* Check both ftrace_addr and ftrace_old_addr */
+ ftrace_addr = ftrace_get_addr_curr(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ ftrace_expected = nop;
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+ return -EINVAL;
+ }
+
+ update:
+ return ftrace_write(ip, nop, 1); /* only the first byte: the tail was just found to match */
+}
+
+/* Write every byte of @new except the first, which holds the breakpoint. */
+static int add_update_code(unsigned long ip, unsigned const char *new)
+{
+	const int tail = MCOUNT_INSN_SIZE - 1;
+
+	return ftrace_write(ip + 1, new + 1, tail);
+}
+
+/* Second pass, call case: write the tail bytes of a call to @addr. */
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long site = rec->ip;
+	unsigned const char *insn = ftrace_call_replace(site, addr);
+
+	return add_update_code(site, insn);
+}
+
+/* Second pass, nop case: write the tail bytes of the ftrace nop. */
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+	unsigned long site = rec->ip;
+	unsigned const char *insn = ftrace_nop_replace();
+
+	return add_update_code(site, insn);
+}
+
+/*
+ * Second pass of ftrace_replace_code(): with the breakpoint covering the
+ * first byte, write the remaining bytes of the new instruction for @rec.
+ * Returns 0, or the error reported by ftrace_write().
+ */
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+	int action = ftrace_test_record(rec, enable);
+	unsigned long new_addr = ftrace_get_addr_new(rec);
+
+	if (action == FTRACE_UPDATE_MODIFY_CALL ||
+	    action == FTRACE_UPDATE_MAKE_CALL) {
+		/* the site ends up as a call to new_addr */
+		return add_update_call(rec, new_addr);
+	}
+
+	if (action == FTRACE_UPDATE_MAKE_NOP) {
+		/* converting a call to a nop */
+		return add_update_nop(rec);
+	}
+
+	/* FTRACE_UPDATE_IGNORE (or an unknown action): leave it alone */
+
+	return 0;
+}
+
+/* Final pass, call case: swap the breakpoint for the call's first byte. */
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long site = rec->ip;
+	unsigned const char *insn;
+
+	insn = ftrace_call_replace(site, addr);
+	return ftrace_write(site, insn, 1);
+}
+
+/* Final pass, nop case: swap the breakpoint for the nop's first byte. */
+static int finish_update_nop(struct dyn_ftrace *rec)
+{
+	unsigned long site = rec->ip;
+	unsigned const char *insn;
+
+	insn = ftrace_nop_replace();
+	return ftrace_write(site, insn, 1);
+}
+
+/*
+ * Final pass of ftrace_replace_code(): replace the breakpoint byte of @rec
+ * with the first byte of the new instruction. Unlike the earlier passes
+ * this one calls ftrace_update_record() instead of ftrace_test_record().
+ */
+static int finish_update(struct dyn_ftrace *rec, int enable)
+{
+	int action = ftrace_update_record(rec, enable);
+	unsigned long new_addr = ftrace_get_addr_new(rec);
+
+	if (action == FTRACE_UPDATE_MODIFY_CALL ||
+	    action == FTRACE_UPDATE_MAKE_CALL) {
+		/* the site ends up as a call to new_addr */
+		return finish_update_call(rec, new_addr);
+	}
+
+	if (action == FTRACE_UPDATE_MAKE_NOP) {
+		/* converting a call to a nop */
+		return finish_update_nop(rec);
+	}
+
+	/* FTRACE_UPDATE_IGNORE (or an unknown action): leave it alone */
+
+	return 0;
+}
+
+static void do_sync_core(void *data) /* on_each_cpu() callback used by run_sync(); @data is unused */
+{
+ sync_core();
+}
+
+/* Run sync_core() on every online CPU by way of do_sync_core(). */
+static void run_sync(void)
+{
+	int irqs_were_off;
+
+	/* No need to sync if there's only one CPU */
+	if (num_online_cpus() == 1)
+		return;
+
+	/* We may be called with interrupts disabled (on bootup). */
+	irqs_were_off = irqs_disabled();
+	if (irqs_were_off)
+		local_irq_enable();
+	on_each_cpu(do_sync_core, NULL, 1);
+	if (irqs_were_off)
+		local_irq_disable();
+}
+
+void ftrace_replace_code(int enable)
+{
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *report = "adding breakpoints"; /* name of the current pass, for the failure message */
+ int count = 0; /* records handled so far in the current pass */
+ int ret;
+
+ for_ftrace_rec_iter(iter) { /* pass 1: breakpoint every record that is about to change */
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_breakpoints(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "updating code";
+ count = 0;
+
+ for_ftrace_rec_iter(iter) { /* pass 2: write the bytes that follow each breakpoint */
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "removing breakpoints";
+ count = 0;
+
+ for_ftrace_rec_iter(iter) { /* pass 3: replace each breakpoint with the new first byte */
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = finish_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ return;
+
+ remove_breakpoints:
+ pr_warn("Failed on %s (%d):\n", report, count);
+ ftrace_bug(ret, rec); /* rec is the record whose update failed */
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ /*
+ * Breakpoints are handled only when this function is in
+ * progress. The system could not work with them.
+ */
+ if (remove_breakpoint(rec))
+ BUG();
+ }
+ run_sync();
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = add_break(ip, old_code); /* step 1: verify @old_code, then breakpoint the first byte */
+ if (ret)
+ goto out;
+
+ run_sync();
+
+ ret = add_update_code(ip, new_code); /* step 2: write the tail of @new_code behind the breakpoint */
+ if (ret)
+ goto fail_update;
+
+ run_sync();
+
+ ret = ftrace_write(ip, new_code, 1); /* step 3: replace the breakpoint with the new first byte */
+ /*
+ * The breakpoint is handled only when this function is in progress.
+ * The system could not work if we could not remove it.
+ */
+ BUG_ON(ret);
+ out:
+ run_sync();
+ return ret;
+
+ fail_update:
+ /* Also here the system could not work with the breakpoint */
+ if (ftrace_write(ip, old_code, 1))
+ BUG();
+ goto out; /* ret still holds the add_update_code() error */
+}
+
+void arch_ftrace_update_code(int command)
+{
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ftrace_modify_all_code(command); /* runs with modifying_ftrace_code raised */
+
+ atomic_dec(&modifying_ftrace_code);
+}
+
+int __init ftrace_dyn_arch_init(void)
+{
+ return 0; /* nothing to set up here; always reports success */
+}
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+{
+ static union ftrace_code_union calc;
+
+ /* Jmp not a call (ignore the .e8) */
+ calc.e8 = 0xe9;
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); /* relative to the end of the instruction */
+
+ /*
+ * Returns the static buffer; ftrace external locks synchronize access.
+ */
+ return calc.code;
+}
+#endif
+
+/* Currently only x86_64 supports dynamic trampolines */
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_MODULES
+#include <linux/moduleloader.h>
+/* Module allocation simplifies allocating memory for code */
+static inline void *alloc_tramp(unsigned long size)
+{
+ return module_alloc(size); /* result is checked for NULL by create_trampoline() */
+}
+static inline void tramp_free(void *tramp, int size)
+{
+ int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; /* @size rounded up to whole pages */
+
+ set_memory_nx((unsigned long)tramp, npages); /* drop execute permission ... */
+ set_memory_rw((unsigned long)tramp, npages); /* ... and make the pages writable before freeing */
+ module_memfree(tramp);
+}
+#else
+/* Trampolines can only be created if modules are supported */
+static inline void *alloc_tramp(unsigned long size)
+{
+ return NULL; /* makes create_trampoline() return 0 */
+}
+static inline void tramp_free(void *tramp, int size) { } /* nothing was allocated, nothing to free */
+#endif
+
+/* Defined as markers to the end of the ftrace default trampolines */
+extern void ftrace_regs_caller_end(void);
+extern void ftrace_epilogue(void);
+extern void ftrace_caller_op_ptr(void);
+extern void ftrace_regs_caller_op_ptr(void);
+
+/* movq function_trace_op(%rip), %rdx */
+/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
+#define OP_REF_SIZE 7
+
+/*
+ * The ftrace_ops is passed to the function callback. Since the
+ * trampoline only services a single ftrace_ops, we can pass in
+ * that ops directly.
+ *
+ * The ftrace_op_code_union is used to create a pointer to the
+ * ftrace_ops that will be passed to the callback function.
+ */
+union ftrace_op_code_union {
+ char code[OP_REF_SIZE]; /* the whole 7-byte movq instruction */
+ struct {
+ char op[3]; /* opcode bytes; create_trampoline() expects 0x48 0x8b 0x15 */
+ int offset; /* %rip-relative displacement, rewritten by create_trampoline() */
+ } __attribute__((packed));
+};
+
+static unsigned long
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* returns 0 on failure */
+{
+ unsigned const char *jmp;
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long op_offset;
+ unsigned long offset;
+ unsigned long size;
+ unsigned long ip;
+ unsigned long *ptr;
+ void *trampoline;
+ /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
+ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
+ union ftrace_op_code_union op_ptr;
+ int ret;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_epilogue;
+ op_offset = (unsigned long)ftrace_caller_op_ptr;
+ }
+
+ size = end_offset - start_offset; /* bytes of caller code to copy */
+
+ /*
+ * Allocate enough size to store the ftrace_caller code,
+ * the jmp to ftrace_epilogue, as well as the address of
+ * the ftrace_ops this trampoline is used for.
+ */
+ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
+ if (!trampoline)
+ return 0;
+
+ *tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *); /* same total as the allocation above */
+
+ /* Copy ftrace_caller onto the trampoline memory */
+ ret = probe_kernel_read(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0)) {
+ tramp_free(trampoline, *tramp_size);
+ return 0;
+ }
+
+ ip = (unsigned long)trampoline + size; /* first byte after the copied code */
+
+ /* The trampoline ends with a jmp to ftrace_epilogue */
+ jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue);
+ memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE);
+
+ /*
+ * The address of the ftrace_ops that is used for this trampoline
+ * is stored at the end of the trampoline. This will be used to
+ * load the third parameter for the callback. Basically, that
+ * location at the end of the trampoline takes the place of
+ * the global function_trace_op variable.
+ */
+
+ ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE);
+ *ptr = (unsigned long)ops;
+
+ op_offset -= start_offset; /* now an offset from the start of the trampoline */
+ memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
+
+ /* Are we pointing to the reference? */
+ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
+ tramp_free(trampoline, *tramp_size);
+ return 0;
+ }
+
+ /* Load the contents of ptr into the callback parameter */
+ offset = (unsigned long)ptr;
+ offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE; /* measured from the end of the movq */
+
+ op_ptr.offset = offset;
+
+ /* put in the new offset to the ftrace_ops */
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+
+ /* ALLOC_TRAMP flags lets us know we created it */
+ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+
+ return (unsigned long)trampoline;
+}
+
+/*
+ * Offset of the patched call instruction from the start of its caller:
+ * ftrace_regs_call within ftrace_regs_caller when @save_regs is set,
+ * ftrace_call within ftrace_caller otherwise. Callers add the result
+ * to the start of a trampoline to find the call inside it.
+ */
+static unsigned long calc_trampoline_call_offset(bool save_regs)
+{
+	if (save_regs)
+		return (unsigned long)ftrace_regs_call -
+		       (unsigned long)ftrace_regs_caller;
+
+	/* plain (non-regs) caller */
+	return (unsigned long)ftrace_call - (unsigned long)ftrace_caller;
+}
+
+void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+ ftrace_func_t func;
+ unsigned char *new;
+ unsigned long offset;
+ unsigned long ip;
+ unsigned int size;
+ int ret, npages;
+
+ if (ops->trampoline) {
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+ npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+ set_memory_rw(ops->trampoline, npages); /* existing trampoline: make it writable again */
+ } else {
+ ops->trampoline = create_trampoline(ops, &size);
+ if (!ops->trampoline)
+ return; /* creation failed; ops is left without a trampoline */
+ ops->trampoline_size = size;
+ npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ }
+
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ ip = ops->trampoline + offset; /* address of the call inside the trampoline */
+
+ func = ftrace_ops_get_func(ops);
+
+ /* Do a safe modify in case the trampoline is executing */
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+ set_memory_ro(ops->trampoline, npages); /* done whether or not the update succeeded */
+
+ /* The update should never fail */
+ WARN_ON(ret);
+}
+
+/* Return the address of the function the trampoline calls */
+static void *addr_from_call(void *ptr)
+{
+	union ftrace_code_union insn;
+	int err = probe_kernel_read(&insn, ptr, MCOUNT_INSN_SIZE);
+
+	if (WARN_ON_ONCE(err < 0))
+		return NULL;
+
+	/* Make sure this is a call */
+	if (WARN_ON_ONCE(insn.e8 != 0xe8)) {
+		pr_warn("Expected e8, got %x\n", insn.e8);
+		return NULL;
+	}
+
+	/* target = end of the call instruction + its stored offset */
+	return ptr + MCOUNT_INSN_SIZE + insn.offset;
+}
+
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+ unsigned long frame_pointer);
+
+/*
+ * If the ops->trampoline was not allocated, then it probably
+ * has a static trampoline func, or is the ftrace caller itself.
+ */
+static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
+ bool save_regs = rec->flags & FTRACE_FL_REGS_EN;
+ void *ptr;
+
+ if (ops && ops->trampoline) { /* a trampoline that was not allocated by this file */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /*
+ * We only know about function graph tracer setting as static
+ * trampoline.
+ */
+ if (ops->trampoline == FTRACE_GRAPH_ADDR)
+ return (void *)prepare_ftrace_return;
+#endif
+ return NULL; /* unknown static trampoline */
+ }
+
+ offset = calc_trampoline_call_offset(save_regs);
+
+ if (save_regs)
+ ptr = (void *)FTRACE_REGS_ADDR + offset;
+ else
+ ptr = (void *)FTRACE_ADDR + offset;
+
+ return addr_from_call(ptr); /* decode the target of the call found at ptr */
+}
+
+void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+	unsigned long call_off;
+
+	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+		call_off = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+		return addr_from_call((void *)ops->trampoline + call_off);
+	}
+	/* If we didn't allocate this trampoline, consider it static */
+	return static_tramp_func(ops, rec);
+}
+
+/* Free a trampoline that was allocated here and clear ops->trampoline. */
+void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+		tramp_free((void *)ops->trampoline, ops->trampoline_size);
+		ops->trampoline = 0;
+	}
+}
+
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+/* Patch the instruction at @ip into a jmp to @func. */
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+	unsigned char *jmp_insn;
+
+	jmp_insn = ftrace_jmp_replace(ip, (unsigned long)func);
+	return update_ftrace_func(ip, jmp_insn);
+}
+
+/* Make the ftrace_graph_call site jump to ftrace_graph_caller. */
+int ftrace_enable_ftrace_graph_caller(void)
+{
+	return ftrace_mod_jmp((unsigned long)(&ftrace_graph_call),
+			      &ftrace_graph_caller);
+}
+
+/* Make the ftrace_graph_call site jump to ftrace_stub again. */
+int ftrace_disable_ftrace_graph_caller(void)
+{
+	return ftrace_mod_jmp((unsigned long)(&ftrace_graph_call),
+			      &ftrace_stub);
+}
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ * @parent points at the return-address slot that gets rewritten; @self_addr
+ * and @frame_pointer are handed on to function_graph_enter().
+ */
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+			   unsigned long frame_pointer)
+{
+	unsigned long return_hooker = (unsigned long)&return_to_handler;
+	unsigned long old;
+	int faulted;
+
+	/*
+	 * When resuming from suspend-to-ram, this function can be indirectly
+	 * called from early CPU startup code while the CPU is in real mode,
+	 * which would fail miserably.  Make sure the stack pointer is a
+	 * virtual address.  This check isn't as accurate as
+	 * virt_addr_valid(), but it is good enough here, and it's fast.
+	 */
+	if (unlikely((long)__builtin_frame_address(0) >= 0))
+		return;
+
+	if (unlikely(ftrace_graph_is_dead()))
+		return;
+
+	if (unlikely(atomic_read(&current->tracing_graph_pause)))
+		return;
+
+	/*
+	 * Protect against fault, even if it shouldn't
+	 * happen. This tool is too much intrusive to
+	 * ignore such a protection.
+	 */
+	asm volatile(
+		"1: " _ASM_MOV " (%[parent]), %[old]\n"
+		"2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
+		" movl $0, %[faulted]\n"
+		"3:\n"
+
+		".section .fixup, \"ax\"\n"
+		"4: movl $1, %[faulted]\n"
+		" jmp 3b\n"
+		".previous\n"
+
+		_ASM_EXTABLE(1b, 4b)
+		_ASM_EXTABLE(2b, 4b)
+
+		: [old] "=&r" (old), [faulted] "=r" (faulted)
+		: [parent] "r" (parent), [return_hooker] "r" (return_hooker)
+		: "memory"
+	);
+
+	if (unlikely(faulted)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		return;
+	}
+
+	/* Non-zero from function_graph_enter(): put the original address back. */
+	if (function_graph_enter(old, self_addr, frame_pointer, parent))
+		*parent = old;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
new file mode 100644
index 0000000..4c8440d
--- /dev/null
+++ b/arch/x86/kernel/ftrace_32.S
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2017 Steven Rostedt, VMware Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+#include <asm/export.h>
+#include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
+#else
+# define function_hook mcount
+EXPORT_SYMBOL(mcount)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */
+#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER)
+# define USING_FRAME_POINTER
+#endif
+
+#ifdef USING_FRAME_POINTER
+# define MCOUNT_FRAME 1 /* using frame = true */
+#else
+# define MCOUNT_FRAME 0 /* using frame = false */
+#endif
+
+ENTRY(function_hook)
+ ret /* stub: returns straight to the caller */
+END(function_hook)
+
+ENTRY(ftrace_caller)
+
+#ifdef USING_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+ /*
+ * Frame pointers are of ip followed by bp.
+ * Since fentry is an immediate jump, we are left with
+ * parent-ip, function-ip. We need to add a frame with
+ * parent-ip followed by ebp.
+ */
+ pushl 4(%esp) /* parent ip */
+ pushl %ebp
+ movl %esp, %ebp
+ pushl 2*4(%esp) /* function ip */
+# endif
+ /* For mcount, the function ip is directly above */
+ pushl %ebp
+ movl %esp, %ebp
+#endif
+ pushl %eax /* eax, ecx and edx are reloaded with the callback arguments below */
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+
+#ifdef USING_FRAME_POINTER
+ /* Load parent ebp into edx */
+ movl 4*4(%esp), %edx
+#else
+ /* There's no frame pointer, load the appropriate stack addr instead */
+ lea 4*4(%esp), %edx
+#endif
+
+ movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the function ip */
+ /* Get the parent ip */
+ movl 4(%edx), %edx /* edx has ebp */
+
+ movl function_trace_op, %ecx /* ecx = function_trace_op */
+ subl $MCOUNT_INSN_SIZE, %eax /* step back from the return address to the call site */
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub /* call site rewritten by ftrace_update_ftrace_func() */
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+#ifdef USING_FRAME_POINTER
+ popl %ebp
+# ifdef CC_USING_FENTRY
+ addl $4,%esp /* skip function ip */
+ popl %ebp /* this is the orig bp */
+ addl $4, %esp /* skip parent ip */
+# endif
+#endif
+.Lftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub /* rewritten by ftrace_{enable,disable}_ftrace_graph_caller() */
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+WEAK(ftrace_stub)
+ ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * regs->ip location, and move flags into the return ip location.
+ */
+ pushl $__KERNEL_CS
+ pushl 4(%esp) /* Save the return ip */
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+
+ /* Get flags and place them into the return ip slot */
+ pushf
+ popl %eax
+ movl %eax, 8*4(%esp)
+
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+#ifdef CC_USING_FENTRY
+ movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */
+#else
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+#endif
+ movl function_trace_op, %ecx /* Load function_trace_op as 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub /* call site rewritten by ftrace_update_ftrace_func() */
+
+ addl $4, %esp /* Skip pt_regs */
+
+ /* restore flags */
+ push 14*4(%esp)
+ popf
+
+ /* Move return ip back to its original location */
+ movl 12*4(%esp), %eax
+ movl %eax, 14*4(%esp)
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+
+ /* use lea to not affect flags */
+ lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */
+
+ jmp .Lftrace_ret /* finish through the tail of ftrace_caller */
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
+ cmpl $ftrace_stub, ftrace_trace_function /* is a function other than the stub installed? */
+ jnz .Ltrace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return /* same test for the graph return hook ... */
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry /* ... and for the graph entry hook */
+ jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+.Ltrace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax /* return address, found above the three saved registers */
+ movl 0x4(%ebp), %edx /* parent ip, read through the frame pointer */
+ subl $MCOUNT_INSN_SIZE, %eax /* step back from the return address to the call site */
+
+ movl ftrace_trace_function, %ecx
+ CALL_NOSPEC %ecx /* indirect call to the installed trace function */
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 3*4(%esp), %eax /* return address, found above the three saved registers */
+ /* Even with frame pointers, fentry doesn't have one here */
+#ifdef CC_USING_FENTRY
+ lea 4*4(%esp), %edx /* address of the stack slot one above that return address */
+ movl $0, %ecx /* no frame pointer value to pass */
+#else
+ lea 0x4(%ebp), %edx /* address of the slot at 4(%ebp) */
+ movl (%ebp), %ecx /* value stored at (%ebp) */
+#endif
+ subl $MCOUNT_INSN_SIZE, %eax /* step back from the return address to the call site */
+ call prepare_ftrace_return /* eax, edx, ecx presumably map to self_addr, parent, frame_pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl %eax /* eax and edx are preserved across the call below */
+ pushl %edx
+#ifdef CC_USING_FENTRY
+ movl $0, %eax /* no frame pointer value to pass */
+#else
+ movl %ebp, %eax /* pass the current frame pointer */
+#endif
+ call ftrace_return_to_handler
+ movl %eax, %ecx /* keep the returned address while eax is restored */
+ popl %edx
+ popl %eax
+ JMP_NOSPEC %ecx /* continue at the address ftrace_return_to_handler() returned */
+#endif
+#endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
new file mode 100644
index 0000000..91b2cff
--- /dev/null
+++ b/arch/x86/kernel/ftrace_64.S
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+#include <asm/unwind_hints.h>
+
+ .code64
+ .section .entry.text, "ax"
+
+/* function_hook is the name this file uses for the compiler-inserted hook */
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
+#else
+# define function_hook mcount
+EXPORT_SYMBOL(mcount)
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+/*
+ * Save parent and function stack frames (rip and rbp): 8 bytes for the
+ * original rbp plus two 16-byte (rip, rbp) pairs, matching the five
+ * pushes done by save_mcount_regs.
+ */
+# define MCOUNT_FRAME_SIZE (8+16*2)
+# else
+/*
+ * Save just function stack frame (rip and rbp): 8 bytes for the original
+ * rbp plus one 16-byte (rip, rbp) pair.
+ */
+# define MCOUNT_FRAME_SIZE (8+16)
+# endif
+#else
+/* No need to save a stack frame */
+# define MCOUNT_FRAME_SIZE 0
+#endif /* CONFIG_FRAME_POINTER */
+
+/*
+ * Size of stack used to save mcount regs in save_mcount_regs: a register
+ * area of SS+8 bytes (addressed with the pt_regs offsets RAX, RIP, ... SS)
+ * plus the optional frame-pointer frames above it.
+ */
+#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE)
+
+/*
+ * gcc -pg option adds a call to 'mcount' in most functions.
+ * When -mfentry is used, the call is to '__fentry__' and not 'mcount'
+ * and is done before the function's stack frame is set up.
+ * They both require a set of regs to be saved before calling
+ * any C code and restored before returning back to the function.
+ *
+ * On boot up, all these calls are converted into nops. When tracing
+ * is enabled, the call can jump to either ftrace_caller or
+ * ftrace_regs_caller. Callbacks (tracing functions) that require
+ * ftrace_regs_caller (like kprobes) need to have pt_regs passed to
+ * it. For this reason, the size of the pt_regs structure will be
+ * allocated on the stack and the required mcount registers will
+ * be saved in the locations that pt_regs has them in.
+ */
+
+/*
+ * save_mcount_regs - build the register save area used by the trampolines
+ *
+ * @added: the amount of stack added before calling this
+ *
+ * After this is called, the following registers contain:
+ *
+ * %rdi - holds the address that called the trampoline
+ * %rsi - holds the parent function (traced function's return address)
+ * %rdx - holds the original %rbp
+ *
+ * and %rsp has been lowered by MCOUNT_REG_SIZE, so MCOUNT_REG_SIZE(%rsp)
+ * addresses the word that was on top of the stack when the macro started.
+ */
+.macro save_mcount_regs added=0
+
+#ifdef CONFIG_FRAME_POINTER
+ /* Save the original rbp */
+ pushq %rbp
+
+ /*
+ * Stack traces will stop at the ftrace trampoline if the frame pointer
+ * is not set up properly. If fentry is used, we need to save a frame
+ * pointer for the parent as well as the function traced, because the
+ * fentry is called before the stack frame is set up, whereas mcount
+ * is called afterward.
+ */
+#ifdef CC_USING_FENTRY
+ /* Save the parent pointer (skip orig rbp and our return address) */
+ pushq \added+8*2(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
+ /* Save the return address (now skip orig rbp, rbp and parent) */
+ pushq \added+8*3(%rsp)
+#else
+ /* Can't assume that rip is before this (unless added was zero) */
+ pushq \added+8(%rsp)
+#endif
+ pushq %rbp
+ movq %rsp, %rbp
+#endif /* CONFIG_FRAME_POINTER */
+
+ /*
+ * We add enough stack to save all regs.
+ */
+ subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp
+ movq %rax, RAX(%rsp)
+ movq %rcx, RCX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rsi, RSI(%rsp)
+ movq %rdi, RDI(%rsp)
+ movq %r8, R8(%rsp)
+ movq %r9, R9(%rsp)
+ /*
+ * Save the original RBP. Even though the mcount ABI does not
+ * require this, it helps out callers.
+ *
+ * With frame pointers %rbp was already changed above, so the original
+ * value is read back from the first word pushed by this macro.
+ */
+#ifdef CONFIG_FRAME_POINTER
+ movq MCOUNT_REG_SIZE-8(%rsp), %rdx
+#else
+ movq %rbp, %rdx
+#endif
+ movq %rdx, RBP(%rsp)
+
+ /* Copy the parent address into %rsi (second parameter) */
+#ifdef CC_USING_FENTRY
+ movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi
+#else
+ /* %rdx contains original %rbp */
+ movq 8(%rdx), %rsi
+#endif
+
+ /* Move RIP to its proper location */
+ movq MCOUNT_REG_SIZE+\added(%rsp), %rdi
+ movq %rdi, RIP(%rsp)
+
+ /*
+ * Now %rdi (the first parameter) has the return address of
+ * where ftrace_call returns. But the callbacks expect the
+ * address of the call itself.
+ */
+ subq $MCOUNT_INSN_SIZE, %rdi
+ .endm
+
+/*
+ * restore_mcount_regs - undo save_mcount_regs
+ *
+ * Reloads %r9, %r8, %rdi, %rsi, %rdx, %rcx, %rax and %rbp from the save
+ * area and then releases the whole MCOUNT_REG_SIZE bytes, i.e. the
+ * register area together with any frame-pointer frames.
+ */
+.macro restore_mcount_regs
+ movq R9(%rsp), %r9
+ movq R8(%rsp), %r8
+ movq RDI(%rsp), %rdi
+ movq RSI(%rsp), %rsi
+ movq RDX(%rsp), %rdx
+ movq RCX(%rsp), %rcx
+ movq RAX(%rsp), %rax
+
+ /* ftrace_regs_caller can modify %rbp */
+ movq RBP(%rsp), %rbp
+
+ addq $MCOUNT_REG_SIZE, %rsp
+
+ .endm
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/*
+ * With CONFIG_DYNAMIC_FTRACE the hook itself does nothing but return; as
+ * described at the top of this file, the call sites are turned into nops
+ * at boot and redirected to ftrace_caller or ftrace_regs_caller when
+ * tracing is enabled.
+ */
+ENTRY(function_hook)
+ retq
+ENDPROC(function_hook)
+
+/*
+ * ftrace_caller - trampoline for callbacks that do not need pt_regs
+ *
+ * Saves the mcount registers, loads the four callback parameters (call
+ * site, parent, function_trace_op, NULL regs), makes the call at the
+ * ftrace_call label and restores the registers.
+ * NOTE(review): the 'call ftrace_stub' at ftrace_call and the
+ * 'jmp ftrace_stub' at ftrace_graph_call are presumably rewritten at
+ * runtime to point at the active tracer - confirm in the C side of ftrace.
+ */
+ENTRY(ftrace_caller)
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+GLOBAL(ftrace_caller_op_ptr)
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ restore_mcount_regs
+
+ /*
+ * The copied trampoline must call ftrace_epilogue as it
+ * still may need to call the function graph tracer.
+ *
+ * The code up to this label is copied into trampolines so
+ * think twice before adding any new code or changing the
+ * layout here.
+ */
+GLOBAL(ftrace_epilogue)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+WEAK(ftrace_stub)
+ retq
+ENDPROC(ftrace_caller)
+
+/*
+ * ftrace_regs_caller - trampoline for callbacks that want a pt_regs
+ *
+ * Like ftrace_caller, but additionally fills in the remaining pt_regs
+ * slots (callee registers, flags, segments, stack pointer) and passes a
+ * pointer to the save area as the 4th parameter. After the callback the
+ * flags and the RIP stored in pt_regs are copied back to the stack, so a
+ * handler that changed them affects where and how execution resumes.
+ *
+ * Stack layout while the save area is live:
+ * MCOUNT_REG_SIZE(%rsp) - flags pushed by pushfq
+ * MCOUNT_REG_SIZE+8(%rsp) - return address into the traced function
+ */
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before any operations that can change them */
+ pushfq
+
+ /* added 8 bytes to save flags */
+ save_mcount_regs 8
+ /* save_mcount_regs fills in first two parameters */
+
+GLOBAL(ftrace_regs_caller_op_ptr)
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq MCOUNT_REG_SIZE(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address and flags */
+ leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to the slot written by pushfq, so popfq restores them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
+
+ /*
+ * restore the rest of pt_regs
+ * NOTE(review): %r11 is stored above but not reloaded here - confirm
+ * this is intentional.
+ */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBX(%rsp), %rbx
+
+ restore_mcount_regs
+
+ /* Restore flags */
+ popfq
+
+ /*
+ * As this jmp to ftrace_epilogue can be a short jump
+ * it must not be copied into the trampoline.
+ * The trampoline will add the code to jump
+ * to the return.
+ */
+GLOBAL(ftrace_regs_caller_end)
+
+ jmp ftrace_epilogue
+
+ENDPROC(ftrace_regs_caller)
+
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * function_hook - profiling entry point used when ftrace is built without
+ * CONFIG_DYNAMIC_FTRACE.
+ *
+ * If ftrace_trace_function is not ftrace_stub it is called (at 'trace')
+ * with the call site and parent address, after which control comes back
+ * to fgraph_trace. There, with CONFIG_FUNCTION_GRAPH_TRACER, graph hooks
+ * other than their stubs divert to ftrace_graph_caller; otherwise the
+ * hook simply returns through ftrace_stub.
+ */
+ENTRY(function_hook)
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+fgraph_trace:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+ /*
+ * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
+ * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
+ * ip and parent ip are used and the list function is called when
+ * function tracing is enabled.
+ */
+ movq ftrace_trace_function, %r8
+ CALL_NOSPEC %r8
+ restore_mcount_regs
+
+ jmp fgraph_trace
+ENDPROC(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * ftrace_graph_caller - pass the traced function's entry on to
+ * prepare_ftrace_return().
+ *
+ * Parameters handed over: %rdi is the call site (set by
+ * save_mcount_regs), %rsi is the address of the stack slot holding the
+ * traced function's return address, and %rdx is a frame pointer value
+ * (0 with fentry). All mcount registers are restored before returning.
+ */
+ENTRY(ftrace_graph_caller)
+ /* Saves rbp into %rdx and fills first parameter */
+ save_mcount_regs
+
+#ifdef CC_USING_FENTRY
+ /* Address of the slot right above our own return address */
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rsi
+ movq $0, %rdx /* No framepointers needed */
+#else
+ /* Save address of the return address of traced function */
+ leaq 8(%rdx), %rsi
+ /* ftrace does sanity checks against frame pointers */
+ movq (%rdx), %rdx
+#endif
+ call prepare_ftrace_return
+
+ restore_mcount_regs
+
+ retq
+ENDPROC(ftrace_graph_caller)
+
+/*
+ * return_to_handler - calls ftrace_return_to_handler() with the current
+ * %rbp as argument and jumps to the address it returns.
+ *
+ * %rax and %rdx (the return values) are parked on the stack around the
+ * call and are back in place when the final jump is taken.
+ * NOTE(review): presumably reached in place of a traced function's normal
+ * return - confirm against prepare_ftrace_return().
+ */
+ENTRY(return_to_handler)
+ UNWIND_HINT_EMPTY
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ /* %rax now holds the address to continue at */
+ movq %rax, %rdi
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $24, %rsp
+ JMP_NOSPEC %rdi
+END(return_to_handler)
+#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
new file mode 100644
index 0000000..ec6fefb
--- /dev/null
+++ b/arch/x86/kernel/head32.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/x86/kernel/head32.c -- prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/init.h>
+#include <linux/start_kernel.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820/api.h>
+#include <asm/page.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/bios_ebda.h>
+#include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
+
+/*
+ * Early setup used when the boot parameters name no special
+ * sub-architecture: hook the 32-bit specific callbacks into x86_init.
+ */
+static void __init i386_default_early_setup(void)
+{
+	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
+	x86_init.resources.reserve_resources = i386_reserve_resources;
+}
+
+/*
+ * C entry point of the 32-bit kernel: bring up the early IDT, the CR4
+ * shadow and the boot parameters, run the sub-architecture specific early
+ * setup and hand over to start_kernel().
+ */
+asmlinkage __visible void __init i386_start_kernel(void)
+{
+	/* The IDT has to be in place before any exception can occur */
+	idt_setup_early_handler();
+
+	cr4_init_shadow();
+
+	sanitize_boot_params(&boot_params);
+
+	x86_early_init_platform_quirks();
+
+	/* Pick the early setup routine of the reported sub-architecture */
+	if (boot_params.hdr.hardware_subarch == X86_SUBARCH_INTEL_MID)
+		x86_intel_mid_early_setup();
+	else if (boot_params.hdr.hardware_subarch == X86_SUBARCH_CE4100)
+		x86_ce4100_early_setup();
+	else
+		i386_default_early_setup();
+
+	start_kernel();
+}
+
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD. Note the upper half of each PMD or PTE are
+ * always zero at this stage.
+ *
+ * Every global is reached through the local __pa() below, i.e. via its
+ * address minus PAGE_OFFSET.
+ * NOTE(review): presumably because this runs before paging is enabled -
+ * confirm against the caller in head_32.S.
+ */
+void __init mk_early_pgtbl_32(void)
+{
+#ifdef __pa
+#undef __pa
+#endif
+#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
+ pte_t pte, *ptep;
+ int i;
+ unsigned long *ptr;
+ /* Enough space to fit pagetables for the low memory linear map */
+ const unsigned long limit = __pa(_end) +
+ (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+ /* pl2/pl2p: second-level entry being built and the slot it goes into */
+#ifdef CONFIG_X86_PAE
+ pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd);
+#define SET_PL2(pl2, val) { (pl2).pmd = (val); }
+#else
+ pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table);
+#define SET_PL2(pl2, val) { (pl2).pgd = (val); }
+#endif
+
+ ptep = (pte_t *)__pa(__brk_base);
+ pte.pte = PTE_IDENT_ATTR;
+
+ /* One iteration fills one page table and hooks it into the level above */
+ while ((pte.pte & PTE_PFN_MASK) < limit) {
+
+ SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR);
+ *pl2p = pl2;
+#ifndef CONFIG_X86_PAE
+ /* Kernel PDE entry */
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+#endif
+ /* Map PTRS_PER_PTE consecutive pages, advancing pte by PAGE_SIZE each */
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ *ptep = pte;
+ pte.pte += PAGE_SIZE;
+ ptep++;
+ }
+
+ pl2p++;
+ }
+
+ /* Record the first page frame number that was not mapped */
+ ptr = (unsigned long *)__pa(&max_pfn_mapped);
+ /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
+ *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+
+ /* _brk_end: virtual address just past the last page table written */
+ ptr = (unsigned long *)__pa(&_brk_end);
+ *ptr = (unsigned long)ptep + PAGE_OFFSET;
+}
+
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
new file mode 100644
index 0000000..ddee1f0
--- /dev/null
+++ b/arch/x86/kernel/head64.c
@@ -0,0 +1,471 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+#include <linux/start_kernel.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/e820/api.h>
+#include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
+#include <asm/kasan.h>
+#include <asm/fixmap.h>
+
+/*
+ * Manage page tables very early on.
+ *
+ * early_dynamic_pgts[] is a pool of page-table pages handed out one at a
+ * time; next_early_pgt is the index of the next unused one.
+ * early_pmd_flags are the flags early_make_pgtable() adds to the physical
+ * address when it builds a PMD entry.
+ */
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt;
+pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+
+#ifdef CONFIG_X86_5LEVEL
+/* Initial values; check_la57_support() overwrites them when CR4.LA57 is set */
+unsigned int __pgtable_l5_enabled __ro_after_init;
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+#endif
+
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
+/* Start out with the _L4 bases; check_la57_support() may switch to the _L5 ones */
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
+#endif
+
+/* Places a function in the .head.text section */
+#define __head __section(.head.text)
+
+/*
+ * Rebase a link-time pointer onto the location the image is running at:
+ * the object's offset from _text is kept and applied to @physaddr.
+ */
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+	unsigned long offset = ptr - (void *)_text;
+
+	return (void *)(physaddr + offset);
+}
+
+/* fixup_pointer() for callers that want an 'unsigned long *' */
+static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
+{
+	unsigned long *fixed = fixup_pointer(ptr, physaddr);
+
+	return fixed;
+}
+
+#ifdef CONFIG_X86_5LEVEL
+/* fixup_pointer() for callers that want an 'unsigned int *' */
+static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+{
+	unsigned int *fixed = fixup_pointer(ptr, physaddr);
+
+	return fixed;
+}
+
+/*
+ * Report whether 5-level paging is active and, if so, switch the paging
+ * geometry and memory layout variables over to their 5-level values.
+ * All globals are written through the fixup helpers.
+ */
+static bool __head check_la57_support(unsigned long physaddr)
+{
+	/*
+	 * 5-level paging is detected and enabled at the kernel
+	 * decompression stage, so all that is left to do here is to
+	 * look at whether it has been turned on.
+	 */
+	if (native_read_cr4() & X86_CR4_LA57) {
+		*fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
+		*fixup_int(&pgdir_shift, physaddr) = 48;
+		*fixup_int(&ptrs_per_p4d, physaddr) = 512;
+		*fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
+		*fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
+		*fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
+
+		return true;
+	}
+
+	return false;
+}
+#else
+/* Without CONFIG_X86_5LEVEL 5-level paging is never reported as enabled */
+static bool __head check_la57_support(unsigned long physaddr)
+{
+ return false;
+}
+#endif
+
+/*
+ * __startup_64 - relocate the early page tables and build the identity map
+ * @physaddr: physical address the kernel image (_text) is running at
+ * @bp: boot parameters, handed on to the SME helpers
+ *
+ * Code in __startup_64() can be relocated during execution, but the compiler
+ * doesn't have to generate PC-relative relocations when accessing globals from
+ * that function. Clang actually does not generate them, which leads to
+ * boot-time crashes. To work around this problem, every global pointer must
+ * be adjusted using fixup_pointer().
+ *
+ * Spins forever if @physaddr is beyond MAX_PHYSMEM_BITS or the image is not
+ * 2M aligned.
+ *
+ * Return: the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+unsigned long __head __startup_64(unsigned long physaddr,
+				  struct boot_params *bp)
+{
+	unsigned long vaddr, vaddr_end;
+	unsigned long load_delta, *p;
+	unsigned long pgtable_flags;
+	pgdval_t *pgd;
+	p4dval_t *p4d;
+	pudval_t *pud;
+	pmdval_t *pmd, pmd_entry;
+	pteval_t *mask_ptr;
+	bool la57;
+	int i;
+	unsigned int *next_pgt_ptr;
+
+	la57 = check_la57_support(physaddr);
+
+	/* Is the address too large? */
+	if (physaddr >> MAX_PHYSMEM_BITS)
+		for (;;);
+
+	/*
+	 * Compute the delta between the address I am compiled to run at
+	 * and the address I am actually running at.
+	 */
+	load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+	/* Is the address not 2M aligned? */
+	if (load_delta & ~PMD_PAGE_MASK)
+		for (;;);
+
+	/* Activate Secure Memory Encryption (SME) if supported and enabled */
+	sme_enable(bp);
+
+	/* Include the SME encryption mask in the fixup value */
+	load_delta += sme_get_me_mask();
+
+	/* Fixup the physical addresses in the page table */
+
+	pgd = fixup_pointer(&early_top_pgt, physaddr);
+	p = pgd + pgd_index(__START_KERNEL_map);
+	if (la57)
+		*p = (unsigned long)level4_kernel_pgt;
+	else
+		*p = (unsigned long)level3_kernel_pgt;
+	*p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+
+	if (la57) {
+		p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+		p4d[511] += load_delta;
+	}
+
+	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+	pud[510] += load_delta;
+	pud[511] += load_delta;
+
+	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+	for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+		pmd[i] += load_delta;
+
+	/*
+	 * Set up the identity mapping for the switchover.  These
+	 * entries should *NOT* have the global bit set!  This also
+	 * creates a bunch of nonsense entries but that is fine --
+	 * it avoids problems around wraparound.
+	 *
+	 * Each level gets two consecutive entries pointing at the same
+	 * lower-level table, so an image that straddles a table boundary
+	 * is still covered.  The second index (and the PMD index below)
+	 * must wrap within its table: without the modulo, an image placed
+	 * in the last slot of a table would write past the end of that
+	 * page-table page.
+	 */
+
+	next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
+	pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+	pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+
+	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+	if (la57) {
+		/*
+		 * next_early_pgt is a global: like every other global in
+		 * this function it has to be reached through the fixed-up
+		 * pointer, never by name.
+		 */
+		p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
+				    physaddr);
+
+		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+		i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
+		p4d[i] = (pgdval_t)pud + pgtable_flags;
+		p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+	} else {
+		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+	}
+
+	i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
+	pud[i] = (pudval_t)pmd + pgtable_flags;
+	pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+	/* Filter out unsupported __PAGE_KERNEL_* bits: */
+	mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
+	pmd_entry &= *mask_ptr;
+	pmd_entry += sme_get_me_mask();
+	pmd_entry += physaddr;
+
+	for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+		/* Wrap around: both PUD entries above share this PMD page */
+		int idx = (i + (physaddr >> PMD_SHIFT)) % PTRS_PER_PMD;
+
+		pmd[idx] = pmd_entry + i * PMD_SIZE;
+	}
+
+	/*
+	 * Fixup the kernel text+data virtual addresses. Note that
+	 * we might write invalid pmds, when the kernel is relocated
+	 * cleanup_highmap() fixes this up along with the mappings
+	 * beyond _end.
+	 */
+
+	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (pmd[i] & _PAGE_PRESENT)
+			pmd[i] += load_delta;
+	}
+
+	/*
+	 * Fixup phys_base - remove the memory encryption mask to obtain
+	 * the true physical address.
+	 */
+	*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
+
+	/* Encrypt the kernel and related (if SME is active) */
+	sme_encrypt_kernel(bp);
+
+	/*
+	 * Clear the memory encryption mask from the .bss..decrypted section.
+	 * The bss section will be memset to zero later in the initialization so
+	 * there is no need to zero it after changing the memory encryption
+	 * attribute.
+	 */
+	if (mem_encrypt_active()) {
+		vaddr = (unsigned long)__start_bss_decrypted;
+		vaddr_end = (unsigned long)__end_bss_decrypted;
+		for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+			i = pmd_index(vaddr);
+			pmd[i] -= sme_get_me_mask();
+		}
+	}
+
+	/*
+	 * Return the SME encryption mask (if SME is active) to be used as a
+	 * modifier for the initial pgdir entry programmed into CR3.
+	 */
+	return sme_get_me_mask();
+}
+
+/*
+ * Secondary-CPU counterpart of __startup_64(): only hands back the SME
+ * encryption mask (if SME is active), which is used as a modifier for
+ * the initial pgdir entry programmed into CR3.
+ */
+unsigned long __startup_secondary_64(void)
+{
+	unsigned long me_mask = sme_get_me_mask();
+
+	return me_mask;
+}
+
+/*
+ * Drop every early mapping except the kernel symbol map, which sits in
+ * the last top-level entry and is left alone.
+ */
+static void __init reset_early_page_tables(void)
+{
+	/* Clear all top-level entries but the last one */
+	memset(early_top_pgt, 0, (PTRS_PER_PGD - 1) * sizeof(pgd_t));
+
+	/* The whole pool of dynamic page-table pages is available again */
+	next_early_pgt = 0;
+
+	/* Load the trimmed top-level table into CR3 */
+	write_cr3(__sme_pa_nodebug(early_top_pgt));
+}
+
+/*
+ * Create a new PMD entry
+ *
+ * Walks early_top_pgt for @address, taking missing intermediate tables
+ * from early_dynamic_pgts[], and stores @pmd in the PMD slot for @address.
+ * When the pool of dynamic tables is exhausted, all early page tables are
+ * reset and the walk starts over.
+ *
+ * Returns 0 on success, or -1 if @address - __PAGE_OFFSET is not below
+ * MAXMEM or CR3 no longer points at early_top_pgt.
+ */
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
+ pudval_t pud, *pud_p;
+ pmdval_t *pmd_p;
+
+ /* Invalid address or early pgt is done ? */
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
+ return -1;
+
+again:
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ /* Without 5-level paging the pgd slot is used as the p4d slot as well */
+ if (!pgtable_l5_enabled())
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ /* Out of dynamic tables: wipe everything and start over */
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ /* Same scheme one level down: reuse the PUD table or allocate one */
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ /* ... and again for the PMD table */
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd_p[pmd_index(address)] = pmd;
+
+ return 0;
+}
+
+/*
+ * Map the large page containing @address: the PMD value is the
+ * PMD-aligned physical address (@address - __PAGE_OFFSET) plus
+ * early_pmd_flags. Returns whatever __early_make_pgtable() returns.
+ */
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long paddr = address - __PAGE_OFFSET;
+
+	return __early_make_pgtable(address,
+				    (paddr & PMD_MASK) + early_pmd_flags);
+}
+
+/*
+ * Zero everything between __bss_start and __bss_stop.
+ *
+ * Do not add a printk here: printk relies on the PDA, which is not
+ * initialized yet.
+ */
+static void __init clear_bss(void)
+{
+	unsigned long start = (unsigned long) __bss_start;
+	unsigned long stop = (unsigned long) __bss_stop;
+
+	memset(__bss_start, 0, stop - start);
+}
+
+/*
+ * Assemble the command line address from boot_params: hdr.cmd_line_ptr
+ * supplies the low 32 bits, ext_cmd_line_ptr the high 32 bits.
+ */
+static unsigned long get_cmd_line_ptr(void)
+{
+	unsigned long low = boot_params.hdr.cmd_line_ptr;
+	u64 high = boot_params.ext_cmd_line_ptr;
+
+	return low | (high << 32);
+}
+
+/*
+ * Copy the boot parameters found at @real_mode_data into boot_params
+ * and, if a command line address is set, the command line into
+ * boot_command_line.
+ */
+static void __init copy_bootdata(char *real_mode_data)
+{
+	unsigned long cmdline_pa;
+
+	/*
+	 * With SME active this sets up decrypted mappings of the boot
+	 * data ahead of the copies below.
+	 */
+	sme_map_bootdata(real_mode_data);
+
+	memcpy(&boot_params, real_mode_data, sizeof(boot_params));
+	sanitize_boot_params(&boot_params);
+
+	cmdline_pa = get_cmd_line_ptr();
+	if (cmdline_pa)
+		memcpy(boot_command_line, __va(cmdline_pa), COMMAND_LINE_SIZE);
+
+	/*
+	 * The old boot data is not needed any more and will not be
+	 * reserved, which frees that memory for use by the system. With
+	 * SME active the mappings created above have to go away again so
+	 * that the memory does not stay mapped as decrypted.
+	 */
+	sme_unmap_bootdata(real_mode_data);
+}
+
+/*
+ * C entry point of the 64-bit kernel.
+ *
+ * Drops the identity mapping, clears the BSS and init_top_pgt, performs
+ * the early SME, KASAN and IDT setup, copies the boot data from
+ * @real_mode_data, loads microcode on the boot CPU and then continues in
+ * x86_64_start_reservations().
+ */
+asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
+{
+ /*
+ * Build-time sanity checks on the kernel image and module
+ * area mappings. (these are purely build-time and produce no code)
+ */
+ BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
+ BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+ BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ (__START_KERNEL & PGDIR_MASK)));
+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+
+ cr4_init_shadow();
+
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
+ clear_bss();
+
+ clear_page(init_top_pgt);
+
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
+
+ kasan_early_init();
+
+ idt_setup_early_handler();
+
+ copy_bootdata(__va(real_mode_data));
+
+ /*
+ * Load microcode early on BSP.
+ */
+ load_ucode_bsp();
+
+ /* set init_top_pgt kernel high mapping*/
+ init_top_pgt[511] = early_top_pgt[511];
+
+ x86_64_start_reservations(real_mode_data);
+}
+
+/*
+ * Final arch-specific step before start_kernel(): make sure boot_params
+ * is populated, apply the platform quirks and run the sub-architecture
+ * early setup where one exists.
+ */
+void __init x86_64_start_reservations(char *real_mode_data)
+{
+	/* A copied boot_params always carries a non-zero version */
+	if (boot_params.hdr.version == 0)
+		copy_bootdata(__va(real_mode_data));
+
+	x86_early_init_platform_quirks();
+
+	/* Intel MID is the only sub-architecture with a hook here */
+	if (boot_params.hdr.hardware_subarch == X86_SUBARCH_INTEL_MID)
+		x86_intel_mid_early_setup();
+
+	start_kernel();
+}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644
index 0000000..30f9cb2
--- /dev/null
+++ b/arch/x86/kernel/head_32.S
@@ -0,0 +1,623 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Enhanced CPU detection and feature setting code by Mike Jagdis
+ * and Martin Mares, November 1997.
+ */
+
+.text
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/setup.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeatures.h>
+#include <asm/percpu.h>
+#include <asm/nops.h>
+#include <asm/bootparam.h>
+#include <asm/export.h>
+#include <asm/pgtable_32.h>
+
+/* Physical address of link-time (virtual) address X; for code that runs before paging is enabled */
+#define pa(X) ((X) - __PAGE_OFFSET)
+
+/*
+ * References to members of the new_cpu_data structure (symbol + CPUINFO_* constant).
+ */
+
+#define X86 new_cpu_data+CPUINFO_x86 /* set to 4 ("at least 486") or to the CPUID family */
+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL new_cpu_data+CPUINFO_x86_model /* CPUID(1) %al bits 7:4 */
+#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping /* CPUID(1) %al bits 3:0 */
+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level /* -1 means "no CPUID instruction" */
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability /* receives CPUID(1) %edx */
+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id /* 12 bytes: CPUID(0) %ebx, %edx, %ecx */
+
+
+#define SIZEOF_PTREGS (17*4) /* 17 four-byte slots; parenthesized so it expands safely inside larger expressions */
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
+ */
+KERNEL_PAGES = LOWMEM_PAGES /* assembler symbol: pages the early kernel mapping must be able to cover */
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE /* presumably the bytes of page tables needed for that - confirm PAGE_TABLE_SIZE() */
+RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* NOTE(review): looks like a brk reservation named "pagetables" of that size */
+
+/*
+ * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
+ * %esi points to the real-mode code as a 32-bit pointer.
+ * CS and DS must be 4 GB flat segments, but we don't depend on
+ * any particular GDT layout, because we load our own as soon as we
+ * can.
+ */
+__HEAD
+ENTRY(startup_32)
+ movl pa(initial_stack),%ecx /* %ecx = (virtual) stack top stored in initial_stack */
+
+ /* test KEEP_SEGMENTS flag to see if the bootloader is asking
+ us to not reload segments */
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
+ jnz 2f /* flag set: keep the bootloader's segments */
+
+/*
+ * Set segments to known values.
+ */
+ lgdt pa(boot_gdt_descr)
+ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+ movl %eax,%ss
+2:
+ leal -__PAGE_OFFSET(%ecx),%esp /* use the physical stack address; shifted back to virtual once paging is on */
+
+/*
+ * Clear BSS first so that there are no surprises...
+ */
+ cld
+ xorl %eax,%eax /* fill value: zero */
+ movl $pa(__bss_start),%edi /* destination: physical start of BSS */
+ movl $pa(__bss_stop),%ecx
+ subl %edi,%ecx /* %ecx = BSS size in bytes */
+ shrl $2,%ecx /* ...as a count of 32-bit words */
+ rep ; stosl
+/*
+ * Copy bootup parameters out of the way.
+ * Note: %esi still has the pointer to the real-mode data.
+ * With the kexec as boot loader, parameter segment might be loaded beyond
+ * kernel image and might not even be addressable by early boot page tables.
+ * (kexec on panic case). Hence copy out the parameters before initializing
+ * page tables.
+ */
+ movl $pa(boot_params),%edi
+ movl $(PARAM_SIZE/4),%ecx /* PARAM_SIZE bytes, copied as 32-bit words */
+ cld
+ rep
+ movsl
+ movl pa(boot_params) + NEW_CL_POINTER,%esi /* %esi = command-line pointer read from the copied boot_params */
+ andl %esi,%esi
+ jz 1f # No command line
+ movl $pa(boot_command_line),%edi
+ movl $(COMMAND_LINE_SIZE/4),%ecx /* always copies the full COMMAND_LINE_SIZE bytes */
+ rep
+ movsl
+1:
+
+#ifdef CONFIG_OLPC
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
+
+#ifdef CONFIG_MICROCODE
+ /* Early load ucode on BSP. */
+ call load_ucode_bsp
+#endif
+
+ /* Create early pagetables. */
+ call mk_early_pgtbl_32
+
+ /* Do early initialization of the fixmap area */
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax /* entry value: physical fixmap page table + attribute bits */
+#ifdef CONFIG_X86_PAE
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) /* low half of the last 8-byte entry in the last kernel PMD page */
+#else
+ movl %eax,pa(initial_page_table+0xffc) /* last 4-byte entry (index 1023) of the page directory */
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ /* This can only trip for a broken bootloader... */
+ cmpw $0x207, pa(boot_params + BP_version)
+ jb .Ldefault_entry /* version field below 0x207: take the default path */
+
+ /* Paravirt-compatible boot parameters. Look to see what architecture
+ we're booting under. */
+ movl pa(boot_params + BP_hardware_subarch), %eax
+ cmpl $num_subarch_entries, %eax
+ jae .Lbad_subarch /* unsigned compare: index is beyond the table */
+
+ movl pa(subarch_entries)(,%eax,4), %eax /* table holds virtual entry addresses */
+ subl $__PAGE_OFFSET, %eax /* convert to physical before jumping */
+ jmp *%eax
+
+.Lbad_subarch:
+WEAK(xen_entry)
+ /* Unknown implementation; there's really
+ nothing we can do at this point. */
+ ud2a
+
+ __INITDATA
+
+subarch_entries:
+ .long .Ldefault_entry /* normal x86/PC */
+ .long xen_entry /* Xen hypervisor */
+ .long .Ldefault_entry /* Moorestown MID */
+num_subarch_entries = (. - subarch_entries) / 4
+.previous
+#else
+ jmp .Ldefault_entry
+#endif /* CONFIG_PARAVIRT */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
+ * up already except stack. We just set up stack here. Then call
+ * start_secondary().
+ */
+ENTRY(start_cpu0)
+ movl initial_stack, %ecx /* virtual addresses are used directly here (no pa()) */
+ movl %ecx, %esp
+ call *(initial_code) /* indirect call through the initial_code pointer */
+1: jmp 1b /* spin forever should the call return */
+ENDPROC(start_cpu0)
+#endif
+
+/*
+ * Non-boot CPU entry point; entered from trampoline.S
+ * We can't lgdt here, because lgdt itself uses a data segment, but
+ * we know the trampoline has already loaded the boot_gdt for us.
+ *
+ * If cpu hotplug is not supported then this code can go in init section
+ * which will be freed later
+ */
+ENTRY(startup_32_smp)
+ cld
+ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+ movl pa(initial_stack),%ecx
+ movl %eax,%ss
+ leal -__PAGE_OFFSET(%ecx),%esp /* physical stack address; shifted back to virtual once paging is on */
+
+#ifdef CONFIG_MICROCODE
+ /* Early load ucode on AP. */
+ call load_ucode_ap
+#endif
+
+.Ldefault_entry: /* startup_32 (boot CPU) jumps in here as well */
+ movl $(CR0_STATE & ~X86_CR0_PG),%eax /* CR0_STATE with the paging bit masked off */
+ movl %eax,%cr0
+
+/*
+ * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
+ * bits like NT set. This would confuse the debugger if this code is traced. So
+ * initialize them properly now before switching to protected mode. That means
+ * DF in particular (even though we have cleared it earlier after copying the
+ * command line) because GCC expects it.
+ */
+ pushl $0
+ popfl
+
+/*
+ * New page tables may be in 4Mbyte page mode and may be using the global pages.
+ *
+ * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
+ * if and only if CPUID exists and has flags other than the FPU flag set.
+ */
+ movl $-1,pa(X86_CPUID) # preset CPUID level
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl # set EFLAGS=ID
+ pushfl
+ popl %eax # get EFLAGS
+ testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remain set?
+ jz .Lenable_paging # hw disallowed setting of ID bit
+ # which means no CPUID and no CR4
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,pa(X86_CPUID) # save largest std CPUID function
+
+ movl $1,%eax
+ cpuid
+ andl $~1,%edx # Ignore CPUID.FPU
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
+
+ movl pa(mmu_cr4_features),%eax
+ movl %eax,%cr4
+
+ testb $X86_CR4_PAE, %al # check if PAE is enabled
+ jz .Lenable_paging
+
+ /* Check if extended functions are implemented */
+ movl $0x80000000, %eax
+ cpuid
+ /* Value must be in the range 0x80000001 to 0x8000ffff */
+ subl $0x80000001, %eax
+ cmpl $(0x8000ffff-0x80000001), %eax
+ ja .Lenable_paging /* unsigned compare also rejects values below 0x80000001 (they wrap) */
+
+ /* Clear bogus XD_DISABLE bits */
+ call verify_cpu
+
+ mov $0x80000001, %eax
+ cpuid
+ /* Execute Disable bit supported? */
+ btl $(X86_FEATURE_NX & 31), %edx
+ jnc .Lenable_paging
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+
+ btsl $_EFER_NX, %eax
+ /* Make changes effective */
+ wrmsr
+
+.Lenable_paging:
+
+/*
+ * Enable paging
+ */
+ movl $pa(initial_page_table), %eax
+ movl %eax,%cr3 /* set the page table pointer.. */
+ movl $CR0_STATE,%eax
+ movl %eax,%cr0 /* ..and set paging (PG) bit */
+ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
+1:
+ /* Shift the stack pointer to a virtual address */
+ addl $__PAGE_OFFSET, %esp
+
+/*
+ * start system 32-bit setup. We need to re-do some of the things done
+ * in 16-bit mode for the "real" operations.
+ */
+ movl setup_once_ref,%eax
+ andl %eax,%eax
+ jz 1f # Did we do this already?
+ call *%eax
+1:
+
+/*
+ * Check if it is 486
+ */
+ movb $4,X86 # at least 486
+ cmpl $-1,X86_CPUID
+ je .Lis486
+
+ /* get vendor info */
+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
+ cpuid
+ movl %eax,X86_CPUID # save CPUID level
+ movl %ebx,X86_VENDOR_ID # lo 4 chars
+ movl %edx,X86_VENDOR_ID+4 # next 4 chars
+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
+
+ orl %eax,%eax # do we have processor info as well?
+ je .Lis486
+
+ movl $1,%eax # Use the CPUID instruction to get CPU type
+ cpuid
+ movb %al,%cl # save reg for future use
+ andb $0x0f,%ah # mask processor family
+ movb %ah,X86
+ andb $0xf0,%al # mask model
+ shrb $4,%al
+ movb %al,X86_MODEL
+ andb $0x0f,%cl # mask mask revision (the stepping)
+ movb %cl,X86_STEPPING
+ movl %edx,X86_CAPABILITY
+
+.Lis486:
+ movl $0x50022,%ecx # set AM, WP, NE and MP
+ movl %cr0,%eax
+ andl $0x80000011,%eax # Save PG,PE,ET
+ orl %ecx,%eax
+ movl %eax,%cr0
+
+ lgdt early_gdt_descr
+ ljmp $(__KERNEL_CS),$1f
+1: movl $(__KERNEL_DS),%eax # reload all the segment registers
+ movl %eax,%ss # after changing gdt.
+
+ movl $(__USER_DS),%eax # DS/ES contains default USER segment
+ movl %eax,%ds
+ movl %eax,%es
+
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+
+ movl $(__KERNEL_STACK_CANARY),%eax
+ movl %eax,%gs
+
+ xorl %eax,%eax # Clear LDT
+ lldt %ax
+
+ call *(initial_code) /* indirect call through the initial_code pointer */
+1: jmp 1b /* spin forever should the call return */
+ENDPROC(startup_32_smp)
+
+#include "verify_cpu.S"
+
+/*
+ * setup_once
+ *
+ * The setup work we only want to run on the BSP.
+ *
+ * Warning: %esi is live across this function.
+ */
+__INIT
+setup_once:
+#ifdef CONFIG_STACKPROTECTOR
+ /*
+ * Configure the stack canary. The linker can't handle this by
+ * relocation. Manually set base address in stack canary
+ * segment descriptor.
+ */
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) /* descriptor bytes 2-3: base bits 15:0 */
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) /* descriptor byte 4: base bits 23:16 */
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) /* descriptor byte 7: base bits 31:24 */
+#endif
+
+ andl $0,setup_once_ref /* Once is enough, thanks: a zero pointer makes later callers skip this */
+ ret
+
+ENTRY(early_idt_handler_array)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+ # 24(%esp) error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler_common
+ i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc /* pad every stub to EARLY_IDT_HANDLER_SIZE bytes with 0xcc */
+ .endr
+ENDPROC(early_idt_handler_array)
+
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
+
+ incl %ss:early_recursion_flag /* %ss override: %ds is only fixed up further down */
+
+ /* The vector number is in pt_regs->gs */
+
+ cld /* (DF was already cleared at the top of this handler) */
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
+ pushl %eax /* pt_regs->ax */
+ pushl %ebp /* pt_regs->bp */
+ pushl %edi /* pt_regs->di */
+ pushl %esi /* pt_regs->si */
+ pushl %edx /* pt_regs->dx */
+ pushl %ecx /* pt_regs->cx */
+ pushl %ebx /* pt_regs->bx */
+
+ /* Fix up DS and ES */
+ movl $(__KERNEL_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+
+ /* Load the vector number into EDX */
+ movl PT_GS(%esp), %edx
+
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
+ movw %gs, PT_GS(%esp)
+
+ movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
+ call early_fixup_exception
+
+ popl %ebx /* pt_regs->bx */
+ popl %ecx /* pt_regs->cx */
+ popl %edx /* pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+ popl %eax /* pt_regs->ax */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
+ decl %ss:early_recursion_flag
+ addl $4, %esp /* pop pt_regs->orig_ax */
+ iret
+ENDPROC(early_idt_handler_common)
+
+/* This is the default interrupt "handler" :-) */
+ENTRY(early_ignore_irq)
+ cld
+#ifdef CONFIG_PRINTK
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl %es
+ pushl %ds
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop /* counter already at 2: stop reporting and halt */
+ incl early_recursion_flag /* NOTE(review): never decremented in this handler */
+ pushl 16(%esp)
+ pushl 24(%esp)
+ pushl 32(%esp)
+ pushl 40(%esp)
+ pushl $int_msg /* format string; NOTE(review): it has three %p for the four values pushed above */
+ call printk
+
+ call dump_stack
+
+ addl $(5*4),%esp /* drop the five words pushed for printk */
+ popl %ds
+ popl %es
+ popl %edx
+ popl %ecx
+ popl %eax
+#endif
+ iret
+
+hlt_loop:
+ hlt
+ jmp hlt_loop /* halt again after any wake-up; never returns */
+ENDPROC(early_ignore_irq)
+
+__INITDATA
+ .align 4
+GLOBAL(early_recursion_flag)
+ .long 0 /* counter: inc/dec by early_idt_handler_common, tested against 2 by early_ignore_irq */
+
+__REFDATA
+ .align 4
+ENTRY(initial_code)
+ .long i386_start_kernel /* initial target of "call *(initial_code)" */
+ENTRY(setup_once_ref)
+ .long setup_once /* zeroed by setup_once itself once it has run */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#define PGD_ALIGN (2 * PAGE_SIZE)
+#define PTI_USER_PGD_FILL 1024 /* 1024 extra 4-byte entries (4096 bytes) appended to swapper_pg_dir */
+#else
+#define PGD_ALIGN (PAGE_SIZE)
+#define PTI_USER_PGD_FILL 0
+#endif
+/*
+ * BSS section: zero-filled early page tables and the zero page
+ */
+__PAGE_ALIGNED_BSS
+ .align PGD_ALIGN
+#ifdef CONFIG_X86_PAE
+.globl initial_pg_pmd
+initial_pg_pmd:
+ .fill 1024*KPMDS,4,0 /* KPMDS blocks of 4096 bytes */
+#else
+.globl initial_page_table
+initial_page_table:
+ .fill 1024,4,0 /* 1024 four-byte entries */
+#endif
+ .align PGD_ALIGN
+initial_pg_fixmap:
+ .fill 1024,4,0 /* hooked into the last directory/PMD entry by startup_32 */
+.globl swapper_pg_dir
+ .align PGD_ALIGN
+swapper_pg_dir:
+ .fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
+.globl empty_zero_page
+empty_zero_page:
+ .fill 4096,1,0 /* 4096 zero bytes */
+EXPORT_SYMBOL(empty_zero_page)
+
+/*
+ * This starts the data section.
+ */
+#ifdef CONFIG_X86_PAE
+__PAGE_ALIGNED_DATA
+ /* Page-aligned for the benefit of paravirt? */
+ .align PGD_ALIGN
+ENTRY(initial_page_table) /* four 8-byte entries, each written as two longs with the high long 0 */
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+# if KPMDS == 3
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+# elif KPMDS == 2
+ .long 0,0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+# elif KPMDS == 1
+ .long 0,0
+ .long 0,0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+# else
+# error "Kernel PMDs should be 1, 2 or 3"
+# endif
+ .align PAGE_SIZE /* needs to be page-sized too */
+#endif
+
+.data
+.balign 4
+ENTRY(initial_stack)
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
+ TOP_OF_KERNEL_STACK_PADDING;
+
+__INITRODATA
+int_msg: /* printk format string used by early_ignore_irq */
+ .asciz "Unknown interrupt or fault at: %p %p %p\n"
+
+#include "../../x86/xen/xen-head.S"
+
+/*
+ * The IDT and GDT 'descriptors' are a strange 48-bit object
+ * only used by the lidt and lgdt instructions. They are not
+ * like usual segment descriptors - they consist of a 16-bit
+ * segment size, and 32-bit linear address value:
+ */
+
+ .data
+.globl boot_gdt_descr
+
+ ALIGN
+# early boot GDT descriptor (must use 1:1 address mapping)
+ .word 0 # 32 bit align gdt_desc.address
+boot_gdt_descr:
+ .word __BOOT_DS+7 /* limit: offset of the last byte of the __BOOT_DS descriptor */
+ .long boot_gdt - __PAGE_OFFSET /* base: physical address of boot_gdt */
+
+# boot GDT descriptor (later on used by CPU#0):
+ .word 0 # 32 bit align gdt_desc.address
+ENTRY(early_gdt_descr)
+ .word GDT_ENTRIES*8-1 /* limit: GDT_ENTRIES eight-byte descriptors */
+ .long gdt_page /* Overwritten for secondary CPUs */
+
+/*
+ * The boot_gdt must mirror the equivalent in setup.S and is
+ * used only for booting.
+ */
+ .align L1_CACHE_BYTES
+ENTRY(boot_gdt)
+ .fill GDT_ENTRY_BOOT_CS,8,0 /* zeroed descriptors in front of the boot CS slot */
+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
new file mode 100644
index 0000000..a3618cf
--- /dev/null
+++ b/arch/x86/kernel/head_64.S
@@ -0,0 +1,485 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <asm/segment.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+#include <asm/processor-flags.h>
+#include <asm/percpu.h>
+#include <asm/nops.h>
+#include "../entry/calling.h"
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/asm-offsets.h>
+#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg /* goes through %rax, so %rax is overwritten */
+#else
+#define GET_CR2_INTO(reg) movq %cr2, reg /* direct read of %cr2 */
+#define INTERRUPT_RETURN iretq
+#endif
+
+/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+ * because we need identity-mapped pages.
+ *
+ */
+
+#define l4_index(x) (((x) >> 39) & 511) /* 9-bit table index taken from address bits 47:39 */
+#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
+ .text
+ __HEAD
+ .code64
+ .globl startup_64
+startup_64:
+ UNWIND_HINT_EMPTY
+ /*
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
+ * and someone has loaded an identity mapped page table
+ * for us. These identity mapped page tables map all of the
+ * kernel pages and possibly all of memory.
+ *
+ * %rsi holds a physical pointer to real_mode_data.
+ *
+ * We come here either directly from a 64bit bootloader, or from
+ * arch/x86/boot/compressed/head_64.S.
+ *
+ * We only come here initially at boot; nothing else comes here.
+ *
+ * Since we may be loaded at an address different from what we were
+ * compiled to run at we first fixup the physical addresses in our page
+ * tables and then reload them.
+ */
+
+ /* Set up the stack for verify_cpu(), similar to initial_stack below */
+ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
+
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
+ /*
+ * Perform pagetable fixups. Additionally, if SME is active, encrypt
+ * the kernel and retrieve the modifier (SME encryption mask if SME
+ * is active) to be added to the initial pgdir entry that will be
+ * programmed into CR3.
+ */
+ leaq _text(%rip), %rdi /* first argument: address of _text as currently loaded */
+ pushq %rsi /* keep the real_mode_data pointer across the call */
+ call __startup_64
+ popq %rsi
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(early_top_pgt - __START_KERNEL_map), %rax /* %rax holds the modifier returned by __startup_64 */
+ jmp 1f /* join the common path inside secondary_startup_64 */
+ENTRY(secondary_startup_64)
+ UNWIND_HINT_EMPTY
+ /*
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
+ * and someone has loaded a mapped page table.
+ *
+ * %rsi holds a physical pointer to real_mode_data.
+ *
+ * We come here either from startup_64 (using physical addresses)
+ * or from trampoline.S (using virtual addresses).
+ *
+ * Using virtual addresses from trampoline.S removes the need
+ * to have any identity mapped pages in the kernel page table
+ * after the boot processor executes this code.
+ */
+
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
+ /*
+ * Retrieve the modifier (SME encryption mask if SME is active) to be
+ * added to the initial pgdir entry that will be programmed into CR3.
+ */
+ pushq %rsi /* keep the real_mode_data pointer across the call */
+ call __startup_secondary_64
+ popq %rsi
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
+1: /* startup_64 jumps in here with its own %rax (early_top_pgt based) */
+
+ /* Enable PAE mode, PGE and LA57 */
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+ testl $1, __pgtable_l5_enabled(%rip)
+ jz 1f /* 5-level paging not enabled: leave LA57 clear */
+ orl $X86_CR4_LA57, %ecx
+1:
+#endif
+ movq %rcx, %cr4
+
+ /* Setup early boot stage 4-/5-level pagetables. */
+ addq phys_base(%rip), %rax /* add phys_base to the CR3 value formed above */
+ movq %rax, %cr3
+
+ /* Ensure I am executing from virtual addresses */
+ movq $1f, %rax /* absolute (link-time) address of the local label */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rax
+1:
+ UNWIND_HINT_EMPTY
+
+ /* Check if nx is implemented */
+ movl $0x80000001, %eax
+ cpuid
+ movl %edx,%edi /* keep the feature bits; rdmsr below overwrites %edx */
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_SCE, %eax /* Enable System Call */
+ btl $20,%edi /* No Execute supported? */
+ jnc 1f
+ btsl $_EFER_NX, %eax
+ btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) /* also set the NX bit in early_pmd_flags */
+1: wrmsr /* Make changes effective */
+
+ /* Setup cr0 */
+ movl $CR0_STATE, %eax
+ /* Make changes effective */
+ movq %rax, %cr0
+
+ /* Setup a boot time stack */
+ movq initial_stack(%rip), %rsp
+
+ /* zero EFLAGS after setting rsp */
+ pushq $0
+ popfq
+
+ /*
+ * We must switch to a new descriptor in kernel space for the GDT
+ * because soon the kernel won't have access anymore to the userspace
+ * addresses where we're currently running on. We have to do that here
+ * because in 32bit we couldn't load a 64bit linear address.
+ */
+ lgdt early_gdt_descr(%rip)
+
+ /* set up data segments */
+ xorl %eax,%eax /* null selector */
+ movl %eax,%ds
+ movl %eax,%ss
+ movl %eax,%es
+
+ /*
+ * We don't really need to load %fs or %gs, but load them anyway
+ * to kill any stale realmode selectors. This allows execution
+ * under VT hardware.
+ */
+ movl %eax,%fs
+ movl %eax,%gs
+
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ movl initial_gs(%rip),%eax /* low 32 bits of initial_gs */
+ movl initial_gs+4(%rip),%edx /* high 32 bits of initial_gs */
+ wrmsr
+
+ /* rsi is pointer to real mode structure with interesting info.
+ pass it to C */
+ movq %rsi, %rdi
+
+.Ljump_to_C_code:
+ /*
+ * Jump to run C code and to be on a real kernel address.
+ * Since we are running on identity-mapped space we have to jump
+ * to the full 64bit address, this is only possible as indirect
+ * jump. In addition we need to ensure %cs is set so we make this
+ * a far return.
+ *
+ * Note: do not change to far jump indirect with 64bit offset.
+ *
+ * AMD does not support far jump indirect with 64bit offset.
+ * AMD64 Architecture Programmer's Manual, Volume 3: states only
+ * JMP FAR mem16:16 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ * JMP FAR mem16:32 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ *
+ * Intel64 does support 64bit offset.
+ * Software Developer Manual Vol 2: states:
+ * FF /5 JMP m16:16 Jump far, absolute indirect,
+ * address given in m16:16
+ * FF /5 JMP m16:32 Jump far, absolute indirect,
+ * address given in m16:32.
+ * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+ * address given in m16:64.
+ */
+ pushq $.Lafter_lret # put return address on stack for unwinder
+ xorl %ebp, %ebp # clear frame pointer
+ movq initial_code(%rip), %rax
+ pushq $__KERNEL_CS # set correct cs
+ pushq %rax # target address in negative space
+ lretq
+.Lafter_lret:
+END(secondary_startup_64)
+
+#include "verify_cpu.S"
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
+ * up already except stack. We just set up stack here. Then call
+ * start_secondary() via .Ljump_to_C_code.
+ */
+ENTRY(start_cpu0)
+ movq initial_stack(%rip), %rsp
+ UNWIND_HINT_EMPTY
+ jmp .Ljump_to_C_code /* far-returns into the function stored in initial_code */
+ENDPROC(start_cpu0)
+#endif
+
+ /* Both SMP bootup and ACPI suspend change these variables */
+ __REFDATA
+ .balign 8
+ GLOBAL(initial_code)
+ .quad x86_64_start_kernel /* initial target of the lretq in .Ljump_to_C_code */
+ GLOBAL(initial_gs)
+ .quad INIT_PER_CPU_VAR(irq_stack_union) /* written to MSR_GS_BASE during startup */
+ GLOBAL(initial_stack)
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
+ __FINITDATA
+
+ __INIT
+ENTRY(early_idt_handler_array)
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
+ .endif
+ pushq $i # 72(%rsp) Vector number
+ jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
+ i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc /* pad every stub to EARLY_IDT_HANDLER_SIZE bytes with 0xcc */
+ .endr
+ UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
+
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
+
+ incl early_recursion_flag(%rip)
+
+ /* The vector number is currently in the pt_regs->di slot. */
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* RSI = vector number */
+ movq %rdi, 8(%rsp) /* pt_regs->di = RDI */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->bx */
+ pushq %rbp /* pt_regs->bp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
+
+ cmpq $14,%rsi /* Page fault? */
+ jnz 10f /* any other vector goes straight to early_fixup_exception */
+ GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */
+ call early_make_pgtable /* argument in %rdi: the value read from CR2 */
+ andl %eax,%eax
+ jz 20f /* All good */
+
+10:
+ movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
+ call early_fixup_exception
+
+20:
+ decl early_recursion_flag(%rip)
+ jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
+
+ __INITDATA
+
+ .balign 4
+GLOBAL(early_recursion_flag)
+ .long 0 /* incremented on entry to early_idt_handler_common, decremented on exit */
+
+#define NEXT_PAGE(name) \
+ .balign PAGE_SIZE; \
+GLOBAL(name)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) /* without PTI a PGD is declared like any other page */
+#define PTI_USER_PGD_FILL 0
+#endif
+
+/* Emit COUNT consecutive PMD entries: START + (i << PMD_SHIFT) + PERM for i = 0..COUNT-1 (reuses assembler symbol i) */
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
+ .endr
+
+ __INITDATA
+NEXT_PGD_PAGE(early_top_pgt)
+ .fill 512,8,0 /* 512 zeroed 8-byte entries */
+ .fill PTI_USER_PGD_FILL,8,0
+
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 /* EARLY_DYNAMIC_PAGE_TABLES zeroed tables of 512 entries */
+
+ .data
+
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
+NEXT_PGD_PAGE(init_top_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC /* slot 0 */
+ .org init_top_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC /* slot L4_PAGE_OFFSET: same table as slot 0 */
+ .org init_top_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
+
+NEXT_PAGE(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .fill 511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+ /*
+ * Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ *
+ * Note: This sets _PAGE_GLOBAL despite whether
+ * the CPU supports it or it is enabled. But,
+ * the CPU should ignore the bit.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PGD_PAGE(init_top_pgt)
+ .fill 512,8,0 /* all zero here; x86_64_start_kernel() copies slot 511 from early_top_pgt */
+ .fill PTI_USER_PGD_FILL,8,0
+#endif
+
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* only the last slot (511) is populated */
+#endif
+
+NEXT_PAGE(level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* the slot right after the kernel one */
+
+NEXT_PAGE(level2_kernel_pgt)
+ /*
+ * 512 MB kernel mapping. We spend a full page on this pagetable
+ * anyway.
+ *
+ * The kernel code+data+bss must not be bigger than that.
+ *
+ * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
+ * If you want to increase this then increase MODULES_VADDR
+ * too.)
+ *
+ * This table is eventually used by the kernel during normal
+ * runtime. Care must be taken to clear out undesired bits
+ * later, like _PAGE_RW or _PAGE_GLOBAL in some cases.
+ */
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
+ KERNEL_IMAGE_SIZE/PMD_SIZE)
+
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
+ .fill 512,8,0 /* one zeroed 512-entry table per entry emitted in level2_fixmap_pgt */
+ .endr
+
+#undef PMDS
+
+ .data
+ .align 16
+ .globl early_gdt_descr
+early_gdt_descr:
+ .word GDT_ENTRIES*8-1 /* limit: GDT_ENTRIES eight-byte descriptors */
+early_gdt_descr_base:
+ .quad INIT_PER_CPU_VAR(gdt_page) /* 64-bit base address loaded by lgdt */
+
+ENTRY(phys_base)
+ /* This must match the first entry in level2_kernel_pgt */
+ .quad 0x0000000000000000 /* added to the CR3 value in secondary_startup_64 */
+EXPORT_SYMBOL(phys_base)
+
+#include "../../x86/xen/xen-head.S"
+
+ __PAGE_ALIGNED_BSS
+NEXT_PAGE(empty_zero_page)
+ .skip PAGE_SIZE /* one page of BSS */
+EXPORT_SYMBOL(empty_zero_page)
+
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
new file mode 100644
index 0000000..b0acb22
--- /dev/null
+++ b/arch/x86/kernel/hpet.c
@@ -0,0 +1,1364 @@
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/i8253.h>
+#include <linux/slab.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/pm.h>
+#include <linux/io.h>
+
+#include <asm/cpufeature.h>
+#include <asm/irqdomain.h>
+#include <asm/fixmap.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+/* Counter mask used by clocksource_hpet: only 32 bits of the counter are read */
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+
+/*
+ * FSEC = 10^-15
+ * NSEC = 10^-9
+ * i.e. 10^6 femtoseconds per nanosecond.
+ */
+#define FSEC_PER_NSEC 1000000L
+
+/*
+ * Bits stored in hpet_dev::flags. HPET_DEV_USED_BIT is the bit number
+ * passed to test_and_set_bit(); HPET_DEV_USED is the matching mask.
+ */
+#define HPET_DEV_USED_BIT 2
+#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID 0x8
+#define HPET_DEV_FSB_CAP 0x1000
+#define HPET_DEV_PERI_CAP 0x2000
+
+/*
+ * hpet_next_event() reports -ETIME when fewer than HPET_MIN_CYCLES remain
+ * until the programmed event; the minimum delta given to the clockevents
+ * core is 1.5 times that.
+ */
+#define HPET_MIN_CYCLES 128
+#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
+
+/*
+ * HPET address is set in acpi/boot.c, when an ACPI entry exists.
+ * It is the address handed to ioremap_nocache() in hpet_set_mapping() and
+ * is reset to 0 by hpet_enable() when the HPET turns out to be unusable.
+ */
+unsigned long hpet_address;
+u8 hpet_blockid; /* OS timer block num */
+/* When set, the MSI capability lookup and per-cpu hotplug setup are skipped */
+bool hpet_msi_disable;
+
+#ifdef CONFIG_PCI_MSI
+/* Number of entries allocated for hpet_devs */
+static unsigned int hpet_num_timers;
+#endif
+/* Register mapping; NULL while the HPET is not mapped */
+static void __iomem *hpet_virt_address;
+
+/*
+ * State of one MSI capable HPET comparator that can serve as a per-CPU
+ * clock event device.
+ */
+struct hpet_dev {
+	/* Embedded clock event device; EVT_TO_HPET_DEV() maps it back */
+	struct clock_event_device evt;
+	/* Index of the HPET comparator */
+	unsigned int num;
+	/* CPU the comparator is bound to */
+	int cpu;
+	/* Interrupt assigned from hpet_domain */
+	unsigned int irq;
+	/*
+	 * HPET_DEV_* bits. This has to be an unsigned long because
+	 * hpet_get_unused_timer() passes its address to test_and_set_bit(),
+	 * which operates on a full unsigned long. With a 32-bit field the
+	 * locked access would cover the first bytes of name[] as well and
+	 * might not be naturally aligned.
+	 */
+	unsigned long flags;
+	/* "hpet<N>", used as interrupt and clock event device name */
+	char name[10];
+};
+
+/* Map an embedded clock event device back to the hpet_dev that contains it. */
+static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
+{
+	struct hpet_dev *owner = container_of(evtdev, struct hpet_dev, evt);
+
+	return owner;
+}
+
+/* Read the 32-bit HPET register at byte offset @a of the mapping. */
+inline unsigned int hpet_readl(unsigned int a)
+{
+	void __iomem *reg = hpet_virt_address + a;
+
+	return readl(reg);
+}
+
+/* Write the 32-bit value @d to the HPET register at byte offset @a. */
+static inline void hpet_writel(unsigned int d, unsigned int a)
+{
+	void __iomem *reg = hpet_virt_address + a;
+
+	writel(d, reg);
+}
+
+#ifdef CONFIG_X86_64
+#include <asm/pgtable.h>
+#endif
+
+/*
+ * Map HPET_MMAP_SIZE bytes at hpet_address uncached and remember the
+ * result in hpet_virt_address. The result is stored unchecked.
+ */
+static inline void hpet_set_mapping(void)
+{
+	void __iomem *base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+
+	hpet_virt_address = base;
+}
+
+/* Drop the register mapping and mark the HPET as unmapped. */
+static inline void hpet_clear_mapping(void)
+{
+	void __iomem *base = hpet_virt_address;
+
+	iounmap(base);
+	hpet_virt_address = NULL;
+}
+
+/*
+ * HPET command line enable / disable
+ *
+ * boot_hpet_disable is set by "hpet=disable" and "nohpet", hpet_force_user
+ * by "hpet=force" and hpet_verbose by "hpet=verbose". hpet_verbose gates
+ * the register dumps done through hpet_print_config().
+ */
+bool boot_hpet_disable;
+bool hpet_force_user;
+static bool hpet_verbose;
+
+/*
+ * Parse the comma separated "hpet=" option list. The list is split in
+ * place; every element is matched by prefix against "disable", "force"
+ * and "verbose". Always returns 1.
+ */
+static int __init hpet_setup(char *str)
+{
+	char *opt, *rest;
+
+	for (opt = str; opt; opt = rest) {
+		rest = strchr(opt, ',');
+		if (rest)
+			*rest++ = 0;
+
+		if (strncmp("disable", opt, 7) == 0)
+			boot_hpet_disable = true;
+		if (strncmp("force", opt, 5) == 0)
+			hpet_force_user = true;
+		if (strncmp("verbose", opt, 7) == 0)
+			hpet_verbose = true;
+	}
+	return 1;
+}
+__setup("hpet=", hpet_setup);
+
+/* "nohpet" boot option: same effect as "hpet=disable". @str is unused. */
+static int __init disable_hpet(char *str)
+{
+ boot_hpet_disable = true;
+ return 1;
+}
+__setup("nohpet", disable_hpet);
+
+/* Non-zero when the HPET was not disabled at boot and an address is known. */
+static inline int is_hpet_capable(void)
+{
+	if (boot_hpet_disable)
+		return 0;
+
+	return hpet_address != 0;
+}
+
+/*
+ * HPET timer interrupt enable / disable
+ */
+static bool hpet_legacy_int_enabled;
+
+/**
+ * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ *
+ * Returns 1 when the HPET is usable (see is_hpet_capable()) and the legacy
+ * interrupt routing has been switched on, 0 otherwise.
+ */
+int is_hpet_enabled(void)
+{
+	if (!is_hpet_capable())
+		return 0;
+
+	return hpet_legacy_int_enabled ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(is_hpet_enabled);
+
+/*
+ * Dump the global HPET registers and the CFG/CMP/ROUTE registers of every
+ * comparator to the kernel log. @function and @line identify the caller.
+ */
+static void _hpet_print_config(const char *function, int line)
+{
+	u32 idx, nr, lo, hi;
+
+	printk(KERN_INFO "hpet: %s(%d):\n", function, line);
+
+	lo = hpet_readl(HPET_ID);
+	hi = hpet_readl(HPET_PERIOD);
+	/* The ID field holds the index of the last comparator */
+	nr = ((lo & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+	printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", lo, hi);
+
+	lo = hpet_readl(HPET_CFG);
+	hi = hpet_readl(HPET_STATUS);
+	printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", lo, hi);
+
+	lo = hpet_readl(HPET_COUNTER);
+	hi = hpet_readl(HPET_COUNTER+4);
+	printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", lo, hi);
+
+	for (idx = 0; idx < nr; idx++) {
+		lo = hpet_readl(HPET_Tn_CFG(idx));
+		hi = hpet_readl(HPET_Tn_CFG(idx)+4);
+		printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
+		       idx, lo, hi);
+
+		lo = hpet_readl(HPET_Tn_CMP(idx));
+		hi = hpet_readl(HPET_Tn_CMP(idx)+4);
+		printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
+		       idx, lo, hi);
+
+		lo = hpet_readl(HPET_Tn_ROUTE(idx));
+		hi = hpet_readl(HPET_Tn_ROUTE(idx)+4);
+		printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
+		       idx, lo, hi);
+	}
+}
+
+/* Dump the configuration, tagged with the call site, if "hpet=verbose" */
+#define hpet_print_config()					\
+do {								\
+	if (hpet_verbose)					\
+		_hpet_print_config(__func__, __LINE__);		\
+} while (0)
+
+/*
+ * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * timer 0 and timer 1 in case of RTC emulation.
+ */
+#ifdef CONFIG_HPET
+
+static void hpet_reserve_msi_timers(struct hpet_data *hd);
+
+/*
+ * Describe the HPET block to the /dev/hpet driver. @id is the value of the
+ * HPET_ID register. Timer 0 (and timer 1 with RTC emulation) plus all
+ * timers claimed for per-cpu use are reserved before hpet_alloc() is called.
+ */
+static void hpet_reserve_platform_timers(unsigned int id)
+{
+	struct hpet __iomem *regs = hpet_virt_address;
+	struct hpet_timer __iomem *t;
+	unsigned int count, n;
+	struct hpet_data hd;
+
+	count = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+	memset(&hd, 0, sizeof(hd));
+	hd.hd_phys_address = hpet_address;
+	hd.hd_address = regs;
+	hd.hd_nirqs = count;
+	hpet_reserve_timer(&hd, 0);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+	hpet_reserve_timer(&hd, 1);
+#endif
+
+	/*
+	 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
+	 * is wrong for i8259!) not the output IRQ.  Many BIOS writers
+	 * don't bother configuring *any* comparator interrupts.
+	 */
+	hd.hd_irq[0] = HPET_LEGACY_8254;
+	hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+	/* The remaining timers report the routing found in their config */
+	t = &regs->hpet_timers[2];
+	for (n = 2; n < count; n++, t++) {
+		hd.hd_irq[n] = (readl(&t->hpet_config) &
+			Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
+	}
+
+	hpet_reserve_msi_timers(&hd);
+
+	hpet_alloc(&hd);
+}
+#else
+static void hpet_reserve_platform_timers(unsigned int id) { }
+#endif
+
+/*
+ * Common hpet info
+ */
+/* Counter frequency; computed in hpet_enable() from the period register */
+static unsigned long hpet_freq;
+
+/* Forward declaration; the initializer follows the legacy callbacks below */
+static struct clock_event_device hpet_clockevent;
+
+/* Halt the main counter by clearing HPET_CFG_ENABLE. */
+static void hpet_stop_counter(void)
+{
+	u32 cfg;
+
+	cfg = hpet_readl(HPET_CFG) & ~HPET_CFG_ENABLE;
+	hpet_writel(cfg, HPET_CFG);
+}
+
+/* Zero the main counter: low 32 bits first, then the high 32 bits. */
+static void hpet_reset_counter(void)
+{
+ hpet_writel(0, HPET_COUNTER);
+ hpet_writel(0, HPET_COUNTER + 4);
+}
+
+/* Let the main counter run by setting HPET_CFG_ENABLE. */
+static void hpet_start_counter(void)
+{
+	unsigned int cfg;
+
+	cfg = hpet_readl(HPET_CFG) | HPET_CFG_ENABLE;
+	hpet_writel(cfg, HPET_CFG);
+}
+
+/* Restart the main counter from zero: stop, clear, start. */
+static void hpet_restart_counter(void)
+{
+ hpet_stop_counter();
+ hpet_reset_counter();
+ hpet_start_counter();
+}
+
+/*
+ * Resume hook for the HPET block itself. force_hpet_resume() is defined
+ * outside this file; presumably it re-enables an HPET that had to be
+ * force-enabled -- confirm at its definition.
+ */
+static void hpet_resume_device(void)
+{
+ force_hpet_resume();
+}
+
+/*
+ * clocksource ->resume callback: resume the device, then restart the
+ * main counter from zero. @cs is unused.
+ */
+static void hpet_resume_counter(struct clocksource *cs)
+{
+ hpet_resume_device();
+ hpet_restart_counter();
+}
+
+/*
+ * Set HPET_CFG_LEGACY in the global configuration and record that the
+ * legacy interrupt is now in use (see is_hpet_enabled()).
+ */
+static void hpet_enable_legacy_int(void)
+{
+	unsigned int cfg;
+
+	cfg = hpet_readl(HPET_CFG) | HPET_CFG_LEGACY;
+	hpet_writel(cfg, HPET_CFG);
+	hpet_legacy_int_enabled = true;
+}
+
+/*
+ * Enable the legacy interrupt, register hpet_clockevent and publish it as
+ * global_clock_event.
+ */
+static void hpet_legacy_clockevent_register(void)
+{
+	struct clock_event_device *evt = &hpet_clockevent;
+
+	/* Start HPET legacy interrupts */
+	hpet_enable_legacy_int();
+
+	/*
+	 * Start hpet with the boot cpu mask and make it
+	 * global after the IO_APIC has been initialized.
+	 */
+	evt->cpumask = cpumask_of(boot_cpu_data.cpu_index);
+	clockevents_config_and_register(evt, hpet_freq,
+					HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
+	global_clock_event = evt;
+	printk(KERN_DEBUG "hpet clockevent registered\n");
+}
+
+/*
+ * Switch comparator @timer to periodic mode with one interrupt per tick
+ * (NSEC_PER_SEC / HZ nanoseconds). The main counter is stopped while the
+ * comparator is programmed and restarted afterwards. Always returns 0.
+ */
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg, cmp, now;
+ uint64_t delta;
+
+ hpet_stop_counter();
+ /* Tick length in counter cycles, via the device's mult/shift pair */
+ delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+ delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned int)delta;
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+ /* First write: value at which the first interrupt is due */
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
+ udelay(1);
+ /*
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
+ */
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+ hpet_start_counter();
+ hpet_print_config();
+
+ return 0;
+}
+
+/*
+ * Switch comparator @timer to one-shot operation: periodic mode off,
+ * comparator enabled in 32-bit mode. @evt is unused. Always returns 0.
+ */
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg = hpet_readl(HPET_Tn_CFG(timer));
+
+	cfg = (cfg & ~HPET_TN_PERIODIC) | HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+/* Disable comparator @timer. @evt is unused. Always returns 0. */
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg = hpet_readl(HPET_Tn_CFG(timer));
+
+	hpet_writel(cfg & ~HPET_TN_ENABLE, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+/*
+ * Resume helper: switch the legacy interrupt routing back on and dump the
+ * configuration when verbose. @evt is unused. Always returns 0.
+ */
+static int hpet_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
+ hpet_print_config();
+ return 0;
+}
+
+/*
+ * Program comparator @timer to fire @delta counter cycles from now.
+ *
+ * Returns 0 on success, or -ETIME when, after the comparator write, the
+ * counter is closer than HPET_MIN_CYCLES to the target or already past it.
+ * @evt is unused.
+ */
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt, int timer)
+{
+ u32 cnt;
+ s32 res;
+
+ cnt = hpet_readl(HPET_COUNTER);
+ cnt += (u32) delta;
+ hpet_writel(cnt, HPET_Tn_CMP(timer));
+
+ /*
+ * HPETs are a complete disaster. The compare register is
+ * based on an equal comparison and neither provides a less
+ * than or equal functionality (which would require to take
+ * the wraparound into account) nor a simple count down event
+ * mode. Further the write to the comparator register is
+ * delayed internally up to two HPET clock cycles in certain
+ * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
+ * longer delays. We worked around that by reading back the
+ * compare register, but that required another workaround for
+ * ICH9,10 chips where the first readout after write can
+ * return the old stale value. We already had a minimum
+ * programming delta of 5us enforced, but a NMI or SMI hitting
+ * between the counter readout and the comparator write can
+ * move us behind that point easily. Now instead of reading
+ * the compare register back several times, we make the ETIME
+ * decision based on the following: Return ETIME if the
+ * counter value after the write is less than HPET_MIN_CYCLES
+ * away from the event or if the counter is already ahead of
+ * the event. The minimum programming delta for the generic
+ * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
+ */
+ /* Signed 32-bit difference, so counter wraparound is handled */
+ res = (s32)(cnt - hpet_readl(HPET_COUNTER));
+
+ return res < HPET_MIN_CYCLES ? -ETIME : 0;
+}
+
+/* ->set_state_shutdown of hpet_clockevent: the legacy device is timer 0 */
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
+{
+ return hpet_shutdown(evt, 0);
+}
+
+/* ->set_state_oneshot of hpet_clockevent: the legacy device is timer 0 */
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+ return hpet_set_oneshot(evt, 0);
+}
+
+/* ->set_state_periodic of hpet_clockevent: the legacy device is timer 0 */
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
+{
+ return hpet_set_periodic(evt, 0);
+}
+
+/* ->tick_resume of hpet_clockevent: re-enable the legacy interrupt */
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+ return hpet_resume(evt);
+}
+
+/* ->set_next_event of hpet_clockevent: the legacy device is timer 0 */
+static int hpet_legacy_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ return hpet_next_event(delta, evt, 0);
+}
+
+/*
+ * The hpet clock event device, driven by comparator 0 through the
+ * hpet_legacy_* callbacks. It supports periodic and one-shot operation.
+ */
+static struct clock_event_device hpet_clockevent = {
+	.name			= "hpet",
+	.rating			= 50,
+	.irq			= 0,
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT,
+	.set_next_event		= hpet_legacy_next_event,
+	.set_state_shutdown	= hpet_legacy_shutdown,
+	.set_state_oneshot	= hpet_legacy_set_oneshot,
+	.set_state_periodic	= hpet_legacy_set_periodic,
+	.tick_resume		= hpet_legacy_resume,
+};
+
+/*
+ * HPET MSI Support
+ */
+#ifdef CONFIG_PCI_MSI
+
+/* Timer bound to each CPU; NULL while the CPU has none */
+static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
+/* Array allocated in hpet_msi_capability_lookup(); NULL if MSI is unused */
+static struct hpet_dev *hpet_devs;
+/* Interrupt domain the per-timer interrupts are assigned from */
+static struct irq_domain *hpet_domain;
+
+/*
+ * Unmask the timer interrupt: enable the comparator and its FSB (MSI)
+ * delivery. The hpet_dev is taken from the interrupt's handler data.
+ */
+void hpet_msi_unmask(struct irq_data *data)
+{
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
+	unsigned int cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+
+	cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
+	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+/*
+ * Mask the timer interrupt: disable the comparator and its FSB (MSI)
+ * delivery. The hpet_dev is taken from the interrupt's handler data.
+ */
+void hpet_msi_mask(struct irq_data *data)
+{
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
+	unsigned int cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+
+	cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
+	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+/*
+ * Store an MSI message in the timer's route register pair: data in the
+ * first word, the low address word in the second.
+ */
+void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
+{
+	unsigned int route = HPET_Tn_ROUTE(hdev->num);
+
+	hpet_writel(msg->data, route);
+	hpet_writel(msg->address_lo, route + 4);
+}
+
+/*
+ * Read the MSI message back from the timer's route register pair. The
+ * high address word is always reported as 0.
+ */
+void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
+{
+	unsigned int route = HPET_Tn_ROUTE(hdev->num);
+
+	msg->data = hpet_readl(route);
+	msg->address_lo = hpet_readl(route + 4);
+	msg->address_hi = 0;
+}
+
+/* ->set_state_shutdown for a per-cpu MSI timer */
+static int hpet_msi_shutdown(struct clock_event_device *evt)
+{
+	return hpet_shutdown(evt, EVT_TO_HPET_DEV(evt)->num);
+}
+
+/* ->set_state_oneshot for a per-cpu MSI timer */
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+	return hpet_set_oneshot(evt, EVT_TO_HPET_DEV(evt)->num);
+}
+
+/* ->set_state_periodic for a per-cpu MSI timer */
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
+{
+	return hpet_set_periodic(evt, EVT_TO_HPET_DEV(evt)->num);
+}
+
+/*
+ * ->tick_resume for a per-cpu MSI timer: recompose the MSI message, write
+ * it back to the timer and unmask the interrupt. Always returns 0.
+ */
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+	struct hpet_dev *timer = EVT_TO_HPET_DEV(evt);
+	struct irq_data *irqd = irq_get_irq_data(timer->irq);
+	struct msi_msg restored;
+
+	/* Restore the MSI msg and unmask the interrupt */
+	irq_chip_compose_msi_msg(irqd, &restored);
+	hpet_msi_write(timer, &restored);
+	hpet_msi_unmask(irqd);
+
+	return 0;
+}
+
+/* ->set_next_event for a per-cpu MSI timer */
+static int hpet_msi_next_event(unsigned long delta,
+				struct clock_event_device *evt)
+{
+	return hpet_next_event(delta, evt, EVT_TO_HPET_DEV(evt)->num);
+}
+
+/*
+ * Interrupt handler of a per-cpu MSI timer. @data is the hpet_dev passed
+ * to request_irq(). The event is forwarded to the clock event handler; if
+ * none is installed the interrupt is logged as spurious. Either way the
+ * interrupt is reported as handled.
+ */
+static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+{
+	struct hpet_dev *hdev = (struct hpet_dev *)data;
+	struct clock_event_device *evt = &hdev->evt;
+
+	if (evt->event_handler)
+		evt->event_handler(evt);
+	else
+		printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
+				hdev->num);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Request the timer's interrupt and pin it to the timer's CPU.
+ * Returns 0 on success and -1 when the interrupt could not be requested.
+ */
+static int hpet_setup_irq(struct hpet_dev *dev)
+{
+	int err;
+
+	err = request_irq(dev->irq, hpet_interrupt_handler,
+			  IRQF_TIMER | IRQF_NOBALANCING,
+			  dev->name, dev);
+	if (err)
+		return -1;
+
+	/* The affinity is changed while the interrupt is disabled */
+	disable_irq(dev->irq);
+	irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
+	enable_irq(dev->irq);
+
+	printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
+			 dev->name, dev->irq);
+
+	return 0;
+}
+
+/*
+ * Bind @hdev to @cpu and register it as a clock event device for that CPU.
+ *
+ * This should be called in specific @cpu (a mismatch triggers WARN_ON).
+ * Timers without HPET_DEV_VALID are ignored.
+ *
+ * If the interrupt cannot be set up, the timer is released again (its
+ * HPET_DEV_USED bit is cleared) and nothing is registered: a clock event
+ * device whose interrupt has no handler would never deliver an event, and
+ * hpet_cpuhp_dead() would later free an interrupt that was never requested.
+ */
+static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+{
+	struct clock_event_device *evt = &hdev->evt;
+
+	WARN_ON(cpu != smp_processor_id());
+	if (!(hdev->flags & HPET_DEV_VALID))
+		return;
+
+	hdev->cpu = cpu;
+	evt->name = hdev->name;
+	if (hpet_setup_irq(hdev)) {
+		pr_warn("hpet: could not set up irq %u for %s\n",
+			hdev->irq, hdev->name);
+		hdev->flags &= ~HPET_DEV_USED;
+		return;
+	}
+
+	/* Publish the binding only once the interrupt is in place */
+	per_cpu(cpu_hpet_dev, cpu) = hdev;
+	evt->irq = hdev->irq;
+
+	evt->rating = 110;
+	evt->features = CLOCK_EVT_FEAT_ONESHOT;
+	if (hdev->flags & HPET_DEV_PERI_CAP) {
+		evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+		evt->set_state_periodic = hpet_msi_set_periodic;
+	}
+
+	evt->set_state_shutdown = hpet_msi_shutdown;
+	evt->set_state_oneshot = hpet_msi_set_oneshot;
+	evt->tick_resume = hpet_msi_resume;
+	evt->set_next_event = hpet_msi_next_event;
+	evt->cpumask = cpumask_of(hdev->cpu);
+
+	clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+					0x7FFFFFFF);
+}
+
+#ifdef CONFIG_HPET
+/* Reserve at least one timer for userspace (/dev/hpet) */
+#define RESERVE_TIMERS 1
+#else
+#define RESERVE_TIMERS 0
+#endif
+
+/*
+ * Find the comparators, starting at @start_timer, that can deliver their
+ * interrupt as MSI and set them up for use as per-cpu timers.
+ *
+ * Nothing is done when MSI use is disabled or the CPU has X86_FEATURE_ARAT.
+ * At most num_possible_cpus() timers are taken, and the last
+ * RESERVE_TIMERS comparators are never considered.
+ */
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+	unsigned int hpet_id, total, used = 0;
+	int t, virq;
+
+	if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
+		return;
+
+	hpet_id = hpet_readl(HPET_ID);
+
+	/* Value read out starts from 0 */
+	total = ((hpet_id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+	hpet_print_config();
+
+	hpet_domain = hpet_create_irq_domain(hpet_blockid);
+	if (!hpet_domain)
+		return;
+
+	hpet_devs = kcalloc(total, sizeof(struct hpet_dev), GFP_KERNEL);
+	if (!hpet_devs)
+		return;
+
+	hpet_num_timers = total;
+
+	for (t = start_timer; t < total - RESERVE_TIMERS; t++) {
+		/* A slot is only consumed once the timer proved usable */
+		struct hpet_dev *hdev = &hpet_devs[used];
+		unsigned int cfg = hpet_readl(HPET_Tn_CFG(t));
+
+		/* Only consider HPET timer with MSI support */
+		if (!(cfg & HPET_TN_FSB_CAP))
+			continue;
+
+		hdev->flags = (cfg & HPET_TN_PERIODIC_CAP) ?
+				HPET_DEV_PERI_CAP : 0;
+		sprintf(hdev->name, "hpet%d", t);
+		hdev->num = t;
+
+		virq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+		if (virq <= 0)
+			continue;
+
+		hdev->irq = virq;
+		hdev->flags |= HPET_DEV_FSB_CAP | HPET_DEV_VALID;
+		if (++used == num_possible_cpus())
+			break;
+	}
+
+	printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
+		total, used);
+}
+
+#ifdef CONFIG_HPET
+/*
+ * Tell the /dev/hpet driver about every timer claimed for per-cpu use:
+ * record its interrupt in @hd and mark the timer reserved.
+ */
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+	struct hpet_dev *hdev;
+
+	if (!hpet_devs)
+		return;
+
+	for (hdev = hpet_devs; hdev < hpet_devs + hpet_num_timers; hdev++) {
+		if (hdev->flags & HPET_DEV_VALID) {
+			hd->hd_irq[hdev->num] = hdev->irq;
+			hpet_reserve_timer(hd, hdev->num);
+		}
+	}
+}
+#endif
+
+/*
+ * Claim the first valid timer that is not in use yet by atomically setting
+ * its HPET_DEV_USED bit. Returns NULL when no timer is available.
+ */
+static struct hpet_dev *hpet_get_unused_timer(void)
+{
+	struct hpet_dev *hdev;
+	int idx = 0;
+
+	if (!hpet_devs)
+		return NULL;
+
+	while (idx < hpet_num_timers) {
+		hdev = &hpet_devs[idx++];
+
+		if ((hdev->flags & HPET_DEV_VALID) &&
+		    !test_and_set_bit(HPET_DEV_USED_BIT,
+				      (unsigned long *)&hdev->flags))
+			return hdev;
+	}
+	return NULL;
+}
+
+/*
+ * Work item used by hpet_cpuhp_online(): hpet_work() runs as the delayed
+ * work and signals @complete when it is done.
+ */
+struct hpet_work_struct {
+ struct delayed_work work;
+ struct completion complete;
+};
+
+/*
+ * Work function queued by hpet_cpuhp_online(): claim an unused timer, if
+ * there is one, bind it to the CPU this runs on and wake up the waiter.
+ */
+static void hpet_work(struct work_struct *w)
+{
+	int cpu = smp_processor_id();
+	struct hpet_work_struct *hw;
+	struct hpet_dev *timer;
+
+	hw = container_of(w, struct hpet_work_struct, work.work);
+
+	timer = hpet_get_unused_timer();
+	if (timer)
+		init_one_hpet_msi_clockevent(timer, cpu);
+
+	complete(&hw->complete);
+}
+
+/*
+ * CPU hotplug online callback: run hpet_work() on @cpu through an on-stack
+ * delayed work item and wait for it to finish. Always returns 0.
+ */
+static int hpet_cpuhp_online(unsigned int cpu)
+{
+	struct hpet_work_struct hw;
+
+	INIT_DELAYED_WORK_ONSTACK(&hw.work, hpet_work);
+	init_completion(&hw.complete);
+
+	/* FIXME: add schedule_work_on() */
+	schedule_delayed_work_on(cpu, &hw.work, 0);
+	wait_for_completion(&hw.complete);
+
+	destroy_delayed_work_on_stack(&hw.work);
+	return 0;
+}
+
+/*
+ * CPU hotplug dead callback: if @cpu had a timer, free its interrupt, mark
+ * the timer unused and clear the per-cpu binding. Always returns 0.
+ */
+static int hpet_cpuhp_dead(unsigned int cpu)
+{
+	struct hpet_dev *timer = per_cpu(cpu_hpet_dev, cpu);
+
+	if (timer) {
+		free_irq(timer->irq, timer);
+		timer->flags &= ~HPET_DEV_USED;
+		per_cpu(cpu_hpet_dev, cpu) = NULL;
+	}
+	return 0;
+}
+#else
+
+/* Without CONFIG_PCI_MSI there are no per-cpu MSI timers: empty stubs. */
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ return;
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ return;
+}
+#endif
+
+/* hpet_late_init() passes these to cpuhp_setup_state(); NULL = no callback */
+#define hpet_cpuhp_online NULL
+#define hpet_cpuhp_dead NULL
+
+#endif
+
+/*
+ * Clock source related code
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+/*
+ * Lock and cached counter value overlaid with one 64-bit word, so that
+ * both can be sampled with a single read of @lockval. read_hpet() checks
+ * at build time that the union is exactly 8 bytes.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
+
+/* The single instance, cacheline aligned and initially unlocked */
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+/*
+ * clocksource ->read callback (SMP, 64-bit). @cs is unused.
+ *
+ * The CPU that gets the lock reads the hardware counter and publishes the
+ * value in hpet.value; CPUs that find the lock taken wait until the
+ * published value changes or the lock is released and return that value
+ * instead of touching the hardware. In NMI context the hardware is read
+ * directly without taking the lock.
+ */
+static u64 read_hpet(struct clocksource *cs)
+{
+ unsigned long flags;
+ union hpet_lock old, new;
+
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (u64)hpet_readl(HPET_COUNTER);
+
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
+
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
+
+ /* Interrupts stay off for as long as the lock may be held */
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (u64)new.value;
+ }
+ local_irq_restore(flags);
+
+contended:
+ /*
+ * Contended case
+ * --------------
+ * Wait until the HPET value change or the lock is free to indicate
+ * its value is up-to-date.
+ *
+ * It is possible that old.value has already contained the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+ return (u64)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ *
+ * clocksource ->read callback: return the low 32 bits of the hardware
+ * counter, zero extended. @cs is unused.
+ */
+static u64 read_hpet(struct clocksource *cs)
+{
+ return (u64)hpet_readl(HPET_COUNTER);
+}
+#endif
+
+/*
+ * The HPET clocksource: a continuous 32-bit counter read through
+ * read_hpet() and restarted from zero on resume.
+ */
+static struct clocksource clocksource_hpet = {
+	.name	= "hpet",
+	.rating	= 250,
+	.mask	= HPET_MASK,
+	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
+	.read	= read_hpet,
+	.resume	= hpet_resume_counter,
+};
+
+/*
+ * Restart the counter, check that it actually advances and register the
+ * HPET clocksource. Returns 0 on success, -ENODEV if the counter is stuck.
+ */
+static int hpet_clocksource_register(void)
+{
+	u64 tsc_start, tsc_now;
+	u64 first;
+
+	/* Start the counter */
+	hpet_restart_counter();
+
+	/* Verify whether hpet counter works */
+	first = hpet_readl(HPET_COUNTER);
+	tsc_start = rdtsc();
+
+	/*
+	 * We don't know the TSC frequency yet, but waiting for
+	 * 200000 TSC cycles is safe:
+	 * 4 GHz == 50us
+	 * 1 GHz == 200us
+	 */
+	for (;;) {
+		rep_nop();
+		tsc_now = rdtsc();
+		if ((tsc_now - tsc_start) >= 200000UL)
+			break;
+	}
+
+	if (first == hpet_readl(HPET_COUNTER)) {
+		printk(KERN_WARNING
+		       "HPET counter not counting. HPET disabled\n");
+		return -ENODEV;
+	}
+
+	clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
+	return 0;
+}
+
+static u32 *hpet_boot_cfg;
+
+/**
+ * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
+ *
+ * Maps the HPET, validates the period, saves the boot time configuration
+ * in hpet_boot_cfg (slot 0: global config, slot n + 1: timer n), disables
+ * all comparators and registers the clocksource. The legacy clock event
+ * device is registered only when the HPET supports legacy replacement
+ * routing.
+ *
+ * Returns 1 when the legacy clock event device was registered and 0 in
+ * all other cases. On failure after the mapping succeeded the mapping is
+ * dropped, the saved configuration is freed and hpet_address is cleared.
+ */
+int __init hpet_enable(void)
+{
+	u32 hpet_period, cfg, id;
+	u64 freq;
+	unsigned int i, last;
+
+	if (!is_hpet_capable())
+		return 0;
+
+	hpet_set_mapping();
+	/* Every register access below goes through the mapping */
+	if (!hpet_virt_address)
+		return 0;
+
+	/*
+	 * Read the period and check for a sane value:
+	 */
+	hpet_period = hpet_readl(HPET_PERIOD);
+
+	/*
+	 * AMD SB700 based systems with spread spectrum enabled use a
+	 * SMM based HPET emulation to provide proper frequency
+	 * setting. The SMM code is initialized with the first HPET
+	 * register access and takes some time to complete. During
+	 * this time the config register reads 0xffffffff. We check
+	 * for max. 1000 loops whether the config register reads a non
+	 * 0xffffffff value to make sure that HPET is up and running
+	 * before we go further. A counting loop is safe, as the HPET
+	 * access takes thousands of CPU cycles. On non SB700 based
+	 * machines this check is only done once and has no side
+	 * effects.
+	 */
+	for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
+		if (i == 1000) {
+			printk(KERN_WARNING
+			       "HPET config register value = 0xFFFFFFFF. "
+			       "Disabling HPET\n");
+			goto out_nohpet;
+		}
+	}
+
+	if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
+		goto out_nohpet;
+
+	/*
+	 * The period is a femto seconds value. Convert it to a
+	 * frequency.
+	 */
+	freq = FSEC_PER_SEC;
+	do_div(freq, hpet_period);
+	hpet_freq = freq;
+
+	/*
+	 * Read the HPET ID register to retrieve the IRQ routing
+	 * information and the number of channels
+	 */
+	id = hpet_readl(HPET_ID);
+	hpet_print_config();
+
+	last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+	/*
+	 * The legacy routing mode needs at least two channels, tick timer
+	 * and the rtc emulation channel.
+	 */
+	if (!last)
+		goto out_nohpet;
+#endif
+
+	cfg = hpet_readl(HPET_CFG);
+	/* One slot for the global config plus one per timer (0..last) */
+	hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg),
+				      GFP_KERNEL);
+	if (hpet_boot_cfg)
+		*hpet_boot_cfg = cfg;
+	else
+		pr_warn("HPET initial state will not be saved\n");
+	cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+	hpet_writel(cfg, HPET_CFG);
+	if (cfg)
+		pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);
+
+	for (i = 0; i <= last; ++i) {
+		cfg = hpet_readl(HPET_Tn_CFG(i));
+		if (hpet_boot_cfg)
+			hpet_boot_cfg[i + 1] = cfg;
+		cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+		hpet_writel(cfg, HPET_Tn_CFG(i));
+		/* Whatever is left after masking the known bits is reported */
+		cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+			 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+			 | HPET_TN_FSB | HPET_TN_FSB_CAP);
+		if (cfg)
+			pr_warn("Unrecognized bits %#x set in cfg#%u\n",
+				cfg, i);
+	}
+	hpet_print_config();
+
+	if (hpet_clocksource_register())
+		goto out_nohpet;
+
+	if (id & HPET_ID_LEGSUP) {
+		hpet_legacy_clockevent_register();
+		return 1;
+	}
+	return 0;
+
+out_nohpet:
+	/*
+	 * The saved boot configuration is of no use once the HPET is given
+	 * up. Free it so that it neither leaks nor survives as a stale
+	 * pointer if hpet_enable() is invoked again.
+	 */
+	kfree(hpet_boot_cfg);
+	hpet_boot_cfg = NULL;
+	hpet_clear_mapping();
+	hpet_address = 0;
+	return 0;
+}
+
+/*
+ * Needs to be late, as the reserve_timer code calls kmalloc!
+ *
+ * Not a problem on i386 as hpet_enable is called from late_time_init,
+ * but on x86_64 it is necessary!
+ *
+ * Enables a force-enabled HPET if none was set up so far, looks up the
+ * MSI capable timers, reserves the platform timers and installs the CPU
+ * hotplug callbacks for the per-cpu timers. Returns 0 on success and a
+ * negative error code otherwise.
+ */
+static __init int hpet_late_init(void)
+{
+	unsigned int first_timer;
+	int ret;
+
+	if (boot_hpet_disable)
+		return -ENODEV;
+
+	if (!hpet_address) {
+		if (!force_hpet_address)
+			return -ENODEV;
+
+		hpet_address = force_hpet_address;
+		hpet_enable();
+	}
+
+	if (!hpet_virt_address)
+		return -ENODEV;
+
+	/* With legacy replacement support the lookup starts at timer 2 */
+	first_timer = (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) ? 2 : 0;
+	hpet_msi_capability_lookup(first_timer);
+
+	hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+	hpet_print_config();
+
+	if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
+		return 0;
+
+	/* This notifier should be called after workqueue is ready */
+	ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
+				hpet_cpuhp_online, NULL);
+	if (ret)
+		return ret;
+
+	ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL,
+				hpet_cpuhp_dead);
+	/* Undo the first registration if the second one failed */
+	if (ret)
+		cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE);
+
+	return ret;
+}
+fs_initcall(hpet_late_init);
+
+/*
+ * Stop the HPET. If the boot time configuration was saved it is written
+ * back (global config first with the counter stopped, then every timer,
+ * and finally the enable bit if it was set at boot). Otherwise only the
+ * legacy routing and the enable bit are cleared.
+ */
+void hpet_disable(void)
+{
+	unsigned int cfg, last, ch;
+
+	if (!is_hpet_capable() || !hpet_virt_address)
+		return;
+
+	cfg = hpet_readl(HPET_CFG);
+	if (hpet_boot_cfg) {
+		cfg = *hpet_boot_cfg;
+	} else if (hpet_legacy_int_enabled) {
+		cfg &= ~HPET_CFG_LEGACY;
+		hpet_legacy_int_enabled = false;
+	}
+	cfg &= ~HPET_CFG_ENABLE;
+	hpet_writel(cfg, HPET_CFG);
+
+	if (!hpet_boot_cfg)
+		return;
+
+	last = (hpet_readl(HPET_ID) & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
+	for (ch = 0; ch <= last; ++ch)
+		hpet_writel(hpet_boot_cfg[ch + 1], HPET_Tn_CFG(ch));
+
+	if (*hpet_boot_cfg & HPET_CFG_ENABLE)
+		hpet_writel(*hpet_boot_cfg, HPET_CFG);
+}
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+/*
+ * HPET in LegacyReplacement Mode eats up the RTC interrupt line. When HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ *    is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ *    2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or the user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/rtc.h>
+
+/* Default polling rate: 64 Hz == 1 << DEFAULT_RTC_SHIFT */
+#define DEFAULT_RTC_INT_FREQ 64
+#define DEFAULT_RTC_SHIFT 6
+/* Interrupt count reported to the handler in bits 8 and up of its flags */
+#define RTC_NUM_INTS 1
+
+/* Enabled emulated interrupt kinds (RTC_UIE / RTC_AIE / RTC_PIE) */
+static unsigned long hpet_rtc_flags;
+/* Seconds value seen at the last update check; -1 right after enabling UIE */
+static int hpet_prev_update_sec;
+/* Alarm time set through hpet_set_alarm_time() */
+static struct rtc_time hpet_alarm_time;
+/* Timer interrupts counted since the last periodic event was reported */
+static unsigned long hpet_pie_count;
+/* Last value programmed into the timer 1 comparator */
+static u32 hpet_t1_cmp;
+/* Comparator increment for the default 64 Hz rate */
+static u32 hpet_default_delta;
+/* Comparator increment for periodic rates above 64 Hz */
+static u32 hpet_pie_delta;
+/* Timer interrupts per periodic event for rates up to 64 Hz, else 0 */
+static unsigned long hpet_pie_limit;
+
+/* Handler installed through hpet_register_irq_handler(); NULL if none */
+static rtc_irq_handler irq_handler;
+
+/*
+ * Check that the hpet counter c1 is ahead of the c2. The comparison is
+ * done on the signed 32-bit difference, so it survives counter wraparound.
+ */
+static inline int hpet_cnt_ahead(u32 c1, u32 c2)
+{
+	s32 diff = (s32)(c2 - c1);
+
+	return diff < 0;
+}
+
+/*
+ * Registers an IRQ handler.
+ *
+ * Returns 0 on success, -ENODEV when the HPET is not enabled and -EBUSY
+ * when a handler is already registered.
+ */
+int hpet_register_irq_handler(rtc_irq_handler handler)
+{
+	int ret = 0;
+
+	if (!is_hpet_enabled())
+		ret = -ENODEV;
+	else if (irq_handler)
+		ret = -EBUSY;
+	else
+		irq_handler = handler;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hpet_register_irq_handler);
+
+/*
+ * Deregisters the IRQ handler registered with hpet_register_irq_handler()
+ * and does cleanup: all emulated RTC interrupt kinds are switched off.
+ * @handler is not compared against the registered handler.
+ */
+void hpet_unregister_irq_handler(rtc_irq_handler handler)
+{
+	if (is_hpet_enabled()) {
+		irq_handler = NULL;
+		hpet_rtc_flags = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
+
+/*
+ * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for timer 1.
+ *
+ * hpet_rtc_timer_init() is called when the rtc is initialized.
+ *
+ * Returns 0 if the HPET is not enabled, otherwise arms timer 1 for the
+ * next emulated RTC interrupt and returns 1.
+ */
+int hpet_rtc_timer_init(void)
+{
+	unsigned int cfg, target, step;
+	unsigned long flags;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	/* Computed once: counter cycles per 1/64 second */
+	if (!hpet_default_delta) {
+		uint64_t clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+
+		clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+		hpet_default_delta = clc;
+	}
+
+	/* The faster rate only applies to periodic interrupts above 64 Hz */
+	if ((hpet_rtc_flags & RTC_PIE) && !hpet_pie_limit)
+		step = hpet_pie_delta;
+	else
+		step = hpet_default_delta;
+
+	local_irq_save(flags);
+
+	target = step + hpet_readl(HPET_COUNTER);
+	hpet_writel(target, HPET_T1_CMP);
+	hpet_t1_cmp = target;
+
+	cfg = hpet_readl(HPET_T1_CFG);
+	cfg = (cfg & ~HPET_TN_PERIODIC) | HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_T1_CFG);
+
+	local_irq_restore(flags);
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
+
+/* Disable timer 1, the comparator used for RTC emulation. */
+static void hpet_disable_rtc_channel(void)
+{
+	u32 cfg;
+
+	cfg = hpet_readl(HPET_T1_CFG) & ~HPET_TN_ENABLE;
+	hpet_writel(cfg, HPET_T1_CFG);
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+
+/*
+ * Switch off the emulated interrupt kinds in @bit_mask; the RTC channel is
+ * disabled once no kind is left enabled.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+	if (!is_hpet_enabled())
+		return 0;
+
+	hpet_rtc_flags &= ~bit_mask;
+	if (unlikely(hpet_rtc_flags == 0))
+		hpet_disable_rtc_channel();
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
+
+/*
+ * Switch on the emulated interrupt kinds in @bit_mask. Returns 0 if the
+ * HPET is not enabled, 1 otherwise. Timer 1 is armed when this enables
+ * the first kind.
+ */
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+	unsigned long before = hpet_rtc_flags;
+	bool uie_newly_set;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	hpet_rtc_flags |= bit_mask;
+
+	/* Forget the last seen second when update interrupts get enabled */
+	uie_newly_set = (bit_mask & RTC_UIE) && !(before & RTC_UIE);
+	if (uie_newly_set)
+		hpet_prev_update_sec = -1;
+
+	if (before == 0)
+		hpet_rtc_timer_init();
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
+
+/*
+ * Remember the time of day at which the emulated alarm interrupt is due.
+ * Returns 0 if the HPET is not enabled, 1 otherwise.
+ */
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+			unsigned char sec)
+{
+	struct rtc_time *alarm = &hpet_alarm_time;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	alarm->tm_hour = hrs;
+	alarm->tm_min = min;
+	alarm->tm_sec = sec;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_set_alarm_time);
+
+/*
+ * Set the frequency of the emulated periodic interrupt.
+ *
+ * Up to DEFAULT_RTC_INT_FREQ the timer keeps its default rate and a
+ * periodic event is reported every DEFAULT_RTC_INT_FREQ / @freq timer
+ * interrupts; above it the comparator increment is derived from @freq.
+ * Returns 0 if the HPET is not enabled, 1 otherwise.
+ *
+ * NOTE(review): @freq is used as a divisor without a check, so callers
+ * are assumed to pass a non-zero value -- confirm against the callers.
+ */
+int hpet_set_periodic_freq(unsigned long freq)
+{
+	uint64_t clc;
+
+	if (!is_hpet_enabled())
+		return 0;
+
+	if (freq > DEFAULT_RTC_INT_FREQ) {
+		clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+		do_div(clc, freq);
+		clc >>= hpet_clockevent.shift;
+		hpet_pie_delta = clc;
+		hpet_pie_limit = 0;
+	} else {
+		hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
+	}
+	return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
+
+/* Returns non-zero when the HPET is enabled, 0 otherwise. */
+int hpet_rtc_dropped_irq(void)
+{
+ return is_hpet_enabled();
+}
+EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
+
+/*
+ * Re-arm timer 1 for the next emulated RTC interrupt and account for
+ * interrupts that were missed because the counter already passed one or
+ * more comparator values.
+ */
+static void hpet_rtc_timer_reinit(void)
+{
+ unsigned int delta;
+ /* Starts at -1 so that the first, regular increment counts as 0 lost */
+ int lost_ints = -1;
+
+ /* Note: the comparator is still advanced below after disabling */
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ /*
+ * Increment the comparator value until we are ahead of the
+ * current count.
+ */
+ do {
+ hpet_t1_cmp += delta;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+ lost_ints++;
+ } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER)));
+
+ if (lost_ints) {
+ /* Missed periodic interrupts still count towards the next event */
+ if (hpet_rtc_flags & RTC_PIE)
+ hpet_pie_count += lost_ints;
+ if (printk_ratelimit())
+ printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
+ lost_ints);
+ }
+}
+
+/*
+ * Interrupt handler for the emulated RTC interrupt (timer 1).
+ *
+ * Re-arms the timer, works out which of the enabled RTC events (update,
+ * periodic, alarm) are due and, if any is, calls the registered handler
+ * with the matching RTC_UF / RTC_PF / RTC_AF flags. Always returns
+ * IRQ_HANDLED.
+ */
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+ struct rtc_time curr_time;
+ unsigned long rtc_int_flag = 0;
+
+ hpet_rtc_timer_reinit();
+ memset(&curr_time, 0, sizeof(struct rtc_time));
+
+ /* The current time is only needed for update and alarm events */
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
+ mc146818_get_time(&curr_time);
+
+ if (hpet_rtc_flags & RTC_UIE &&
+ curr_time.tm_sec != hpet_prev_update_sec) {
+ /* A negative value means no second has been recorded yet */
+ if (hpet_prev_update_sec >= 0)
+ rtc_int_flag = RTC_UF;
+ hpet_prev_update_sec = curr_time.tm_sec;
+ }
+
+ if (hpet_rtc_flags & RTC_PIE &&
+ ++hpet_pie_count >= hpet_pie_limit) {
+ rtc_int_flag |= RTC_PF;
+ hpet_pie_count = 0;
+ }
+
+ if (hpet_rtc_flags & RTC_AIE &&
+ (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
+ (curr_time.tm_min == hpet_alarm_time.tm_min) &&
+ (curr_time.tm_hour == hpet_alarm_time.tm_hour))
+ rtc_int_flag |= RTC_AF;
+
+ if (rtc_int_flag) {
+ rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+ if (irq_handler)
+ irq_handler(rtc_int_flag, dev_id);
+ }
+ return IRQ_HANDLED;
+}
+EXPORT_SYMBOL_GPL(hpet_rtc_interrupt);
+#endif
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 0000000..34a5c17
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,551 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <asm/user.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register, for each CPU
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+/*
+ * Build the DR7 bits for breakpoint register @drnum: the low four bits of
+ * (@len | @type) placed in that register's control field, plus its global
+ * enable bit.  DR_GLOBAL_SLOWDOWN is not included.
+ */
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+	unsigned long ctrl = (len | type) & 0xf;
+	unsigned long enable = DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE);
+
+	ctrl <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+
+	return ctrl | enable;
+}
+
+/*
+ * Encode the length, type and enable bits of breakpoint @drnum in debug
+ * register 7 layout, with DR_GLOBAL_SLOWDOWN set as well.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+	unsigned long bits = __encode_dr7(drnum, len, type);
+
+	return bits | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Extract the length and type of breakpoint @bpnum from a DR7 value.
+ *
+ * *@len receives control bits 2-3 with 0x40 OR-ed in, *@type receives
+ * control bits 0-1 with 0x80 OR-ed in.  Returns the breakpoint's two
+ * enable bits.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+	int ctrl = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+	unsigned long enable = dr7 >> (bpnum * DR_ENABLE_SIZE);
+
+	*len = 0x40 | (ctrl & 0xc);
+	*type = 0x80 | (ctrl & 0x3);
+
+	return enable & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * Takes the first free per-cpu slot in bp_per_reg[], loads the matching
+ * debug address register and then enables the breakpoint in DR7.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ *
+ * Returns 0 on success, -EBUSY when every slot is already in use.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	/* Claim the first unused slot on this cpu. */
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
+
+		if (*slot)
+			continue;
+
+		*slot = bp;
+		break;
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return -EBUSY;
+
+	/* Address register first, mirrored in the per-cpu copy. */
+	set_debugreg(info->address, i);
+	__this_cpu_write(cpu_debugreg[i], info->address);
+
+	/* Then the control bits, in the per-cpu copy and in the real DR7. */
+	dr7 = this_cpu_ptr(&cpu_dr7);
+	*dr7 |= encode_dr7(i, info->len, info->type);
+	set_debugreg(*dr7, 7);
+
+	if (info->mask)
+		set_dr_addr_mask(info->mask, i);
+
+	return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * Finds the per-cpu slot that holds @bp, releases it and clears the
+ * breakpoint's bits in DR7.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+	unsigned long *dr7;
+	int i;
+
+	/* Release the slot that currently holds this breakpoint. */
+	for (i = 0; i < HBP_NUM; i++) {
+		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
+
+		if (*slot != bp)
+			continue;
+
+		*slot = NULL;
+		break;
+	}
+
+	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+		return;
+
+	/* DR_GLOBAL_SLOWDOWN is deliberately not part of the cleared bits. */
+	dr7 = this_cpu_ptr(&cpu_dr7);
+	*dr7 &= ~__encode_dr7(i, info->len, info->type);
+	set_debugreg(*dr7, 7);
+
+	if (info->mask)
+		set_dr_addr_mask(0, i);
+}
+
+/*
+ * Translate an X86_BREAKPOINT_LEN_* encoding into the matching generic
+ * HW_BREAKPOINT_LEN_* value.  Returns -EINVAL for any other input.
+ */
+static int arch_bp_generic_len(int x86_len)
+{
+	if (x86_len == X86_BREAKPOINT_LEN_1)
+		return HW_BREAKPOINT_LEN_1;
+	if (x86_len == X86_BREAKPOINT_LEN_2)
+		return HW_BREAKPOINT_LEN_2;
+	if (x86_len == X86_BREAKPOINT_LEN_4)
+		return HW_BREAKPOINT_LEN_4;
+#ifdef CONFIG_X86_64
+	if (x86_len == X86_BREAKPOINT_LEN_8)
+		return HW_BREAKPOINT_LEN_8;
+#endif
+	return -EINVAL;
+}
+
+/*
+ * Convert an x86 length/type pair into the generic breakpoint fields.
+ *
+ * Execute breakpoints must carry X86_BREAKPOINT_LEN_X and are reported
+ * with a generic length of sizeof(long).  For write and read/write
+ * breakpoints *@gen_type is stored before the length is validated.
+ *
+ * Returns 0 on success, -EINVAL for an unknown type or length.
+ */
+int arch_bp_generic_fields(int x86_len, int x86_type,
+			   int *gen_len, int *gen_type)
+{
+	int len;
+
+	/* Instruction breakpoints are fully resolved here. */
+	if (x86_type == X86_BREAKPOINT_EXECUTE) {
+		if (x86_len != X86_BREAKPOINT_LEN_X)
+			return -EINVAL;
+
+		*gen_type = HW_BREAKPOINT_X;
+		*gen_len = sizeof(long);
+		return 0;
+	}
+
+	/* Type */
+	if (x86_type == X86_BREAKPOINT_WRITE)
+		*gen_type = HW_BREAKPOINT_W;
+	else if (x86_type == X86_BREAKPOINT_RW)
+		*gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+	else
+		return -EINVAL;
+
+	/* Len */
+	len = arch_bp_generic_len(x86_len);
+	if (len < 0)
+		return -EINVAL;
+
+	*gen_len = len;
+	return 0;
+}
+
+/*
+ * Check for virtual address in kernel space.
+ *
+ * Returns non-zero when the first or the last byte covered by the
+ * breakpoint lies at or above TASK_SIZE_MAX.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
+{
+	unsigned long va = hw->address;
+	int len = arch_bp_generic_len(hw->len);
+
+	WARN_ON_ONCE(len < 0);
+
+	if (va >= TASK_SIZE_MAX)
+		return 1;
+
+	/*
+	 * We don't need to worry about va + len - 1 overflowing:
+	 * we already require that va is aligned to a multiple of len.
+	 */
+	return (va + len - 1) >= TASK_SIZE_MAX;
+}
+
+/*
+ * Fill @hw (address, mask, type, len) from the generic breakpoint
+ * description in @attr.
+ *
+ * Returns 0 on success, -EINVAL for an unsupported type, length or
+ * alignment, and -EOPNOTSUPP when a range breakpoint is requested but the
+ * CPU lacks X86_FEATURE_BPEXT.
+ */
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ hw->address = attr->bp_addr;
+ hw->mask = 0;
+
+ /* Type */
+ switch (attr->bp_type) {
+ case HW_BREAKPOINT_W:
+ hw->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ hw->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ /*
+ * We don't allow kernel breakpoints in places that are not
+ * acceptable for kprobes. On non-kprobes kernels, we don't
+ * allow kernel breakpoints at all.
+ */
+ if (attr->bp_addr >= TASK_SIZE_MAX) {
+#ifdef CONFIG_KPROBES
+ if (within_kprobe_blacklist(attr->bp_addr))
+ return -EINVAL;
+#else
+ return -EINVAL;
+#endif
+ }
+
+ hw->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 inst breakpoints need to have a specific undefined len.
+ * But we still need to check userspace is not trying to setup
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (attr->bp_len == sizeof(long)) {
+ hw->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ /* fall through - any other execute length is rejected below */
+ default:
+ return -EINVAL;
+ }
+
+ /* Len (only reached for write and read/write breakpoints) */
+ switch (attr->bp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ hw->len = X86_BREAKPOINT_LEN_1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ hw->len = X86_BREAKPOINT_LEN_2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ hw->len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ hw->len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ /* AMD range breakpoint */
+ if (!is_power_of_2(attr->bp_len))
+ return -EINVAL;
+ /* The address must be aligned to the (power of two) length */
+ if (attr->bp_addr & (attr->bp_len - 1))
+ return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return -EOPNOTSUPP;
+
+ /*
+ * It's impossible to use a range breakpoint to fake out
+ * user vs kernel detection because bp_len - 1 can't
+ * have the high bit set. If we ever allow range instruction
+ * breakpoints, then we'll have to check for kprobe-blacklisted
+ * addresses anywhere in the range.
+ */
+ hw->mask = attr->bp_len - 1;
+ hw->len = X86_BREAKPOINT_LEN_1;
+ }
+
+ return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings.
+ *
+ * Fills @hw from @attr via arch_build_bp_info() and then checks that
+ * hw->address is aligned as required by hw->len (or by hw->mask when a
+ * range mask has been set).
+ *
+ * Returns 0 on success, the error reported by arch_build_bp_info(), or
+ * -EINVAL when the address is misaligned or hw->len is not one of the
+ * encodings handled below.
+ */
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+			     const struct perf_event_attr *attr,
+			     struct arch_hw_breakpoint *hw)
+{
+	unsigned int align;
+	int ret;
+
+	ret = arch_build_bp_info(bp, attr, hw);
+	if (ret)
+		return ret;
+
+	switch (hw->len) {
+	case X86_BREAKPOINT_LEN_1:
+		/* A range breakpoint must be aligned to its mask. */
+		align = 0;
+		if (hw->mask)
+			align = hw->mask;
+		break;
+	case X86_BREAKPOINT_LEN_2:
+		align = 1;
+		break;
+	case X86_BREAKPOINT_LEN_4:
+		align = 3;
+		break;
+#ifdef CONFIG_X86_64
+	case X86_BREAKPOINT_LEN_8:
+		align = 7;
+		break;
+#endif
+	default:
+		/*
+		 * Unknown length encoding: bail out instead of testing the
+		 * address against an uninitialized 'align' below.
+		 */
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check that the low-order bits of the address are appropriate
+	 * for the alignment implied by len.
+	 */
+	if (hw->address & align)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per cpu values because it
+ * may contain cpu wide breakpoint, something that
+ * doesn't belong to the current task.
+ *
+ * For each ptrace breakpoint slot of the current task, u_debugreg[i]
+ * receives the breakpoint address (0 if the slot is empty or the
+ * breakpoint is disabled).  u_debugreg[4] and [5] are zeroed, [6] is the
+ * task's saved debugreg6 and [7] is a DR7 value rebuilt from the enabled
+ * ptrace breakpoints.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+	int i;
+	int dr7 = 0;
+	struct perf_event *bp;
+	struct arch_hw_breakpoint *info;
+	struct thread_struct *thread = &current->thread;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		bp = thread->ptrace_bps[i];
+
+		if (bp && !bp->attr.disabled) {
+			dump->u_debugreg[i] = bp->attr.bp_addr;
+			info = counter_arch_bp(bp);
+			dr7 |= encode_dr7(i, info->len, info->type);
+		} else {
+			dump->u_debugreg[i] = 0;
+		}
+	}
+
+	dump->u_debugreg[4] = 0;
+	dump->u_debugreg[5] = 0;
+	dump->u_debugreg[6] = thread->debugreg6;
+
+	dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace: unregister every entry of
+ * the task's ptrace_bps[] and reset its saved debugreg6 and ptrace_dr7.
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+	struct thread_struct *thread = &tsk->thread;
+	int slot;
+
+	for (slot = 0; slot < HBP_NUM; slot++) {
+		unregister_hw_breakpoint(thread->ptrace_bps[slot]);
+		thread->ptrace_bps[slot] = NULL;
+	}
+
+	thread->debugreg6 = 0;
+	thread->ptrace_dr7 = 0;
+}
+
+/*
+ * Reload the debug registers of this cpu: DR0-DR3 from cpu_debugreg[],
+ * DR6 from the current task's saved debugreg6 and DR7 from cpu_dr7.
+ */
+void hw_breakpoint_restore(void)
+{
+	int regno;
+
+	/* DR0..DR3, in ascending order */
+	for (regno = 0; regno <= 3; regno++)
+		set_debugreg(__this_cpu_read(cpu_debugreg[regno]), regno);
+
+	set_debugreg(current->thread.debugreg6, 6);
+	set_debugreg(__this_cpu_read(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * @args: die notifier arguments; args->err carries a pointer to the DR6
+ * value and args->regs the interrupted register state.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int hw_breakpoint_handler(struct die_args *args)
+{
+ int i, cpu, rc = NOTIFY_STOP;
+ struct perf_event *bp;
+ unsigned long dr7, dr6;
+ unsigned long *dr6_p;
+
+ /* The DR6 value is pointed by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* If it's a single step, TRAP bits are random */
+ if (dr6 & DR_STEP)
+ return NOTIFY_DONE;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ /* Save DR7 so it can be written back once handling is finished */
+ get_debugreg(dr7, 7);
+ /* Disable breakpoints during exception handling */
+ set_debugreg(0UL, 7);
+ /*
+ * Assert that local interrupts are disabled
+ * Reset the DRn bits in the virtualized register value.
+ * The ptrace trigger routine will add in whatever is needed.
+ *
+ * NOTE(review): no check of the interrupt state is actually made in
+ * this function; the first sentence above states an expectation only.
+ */
+ current->thread.debugreg6 &= ~DR_TRAP_BITS;
+ cpu = get_cpu();
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+
+ /*
+ * The counter may be concurrently released but that can only
+ * occur from a call_rcu() path. We can then safely fetch
+ * the breakpoint, use its callback, touch its counter
+ * while we are in an rcu_read_lock() path.
+ */
+ rcu_read_lock();
+
+ bp = per_cpu(bp_per_reg[i], cpu);
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+ /*
+ * bp can be NULL due to lazy debug register switching
+ * or due to concurrent perf counter removing.
+ *
+ * Note that this leaves the loop: trap bits for higher-numbered
+ * registers are not examined after an empty slot is hit.
+ */
+ if (!bp) {
+ rcu_read_unlock();
+ break;
+ }
+
+ perf_bp_event(bp, args->regs);
+
+ /*
+ * Set up resume flag to avoid breakpoint recursion when
+ * returning back to origin.
+ */
+ if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+ args->regs->flags |= X86_EFLAGS_RF;
+
+ rcu_read_unlock();
+ }
+ /*
+ * Further processing in do_debug() is needed for a) user-space
+ * breakpoints (to generate signals) and b) when the system has
+ * taken exception due to multiple causes
+ */
+ if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
+ (dr6 & (~DR_TRAP_BITS)))
+ rc = NOTIFY_DONE;
+
+ /* Re-enable the breakpoints with the DR7 value saved on entry */
+ set_debugreg(dr7, 7);
+ put_cpu();
+
+ return rc;
+}
+
+/*
+ * Die-notifier entry point: DIE_DEBUG events are handed to
+ * hw_breakpoint_handler(), every other event is answered with
+ * NOTIFY_DONE.
+ */
+int hw_breakpoint_exceptions_notify(
+		struct notifier_block *unused, unsigned long val, void *data)
+{
+	return (val == DIE_DEBUG) ? hw_breakpoint_handler(data) : NOTIFY_DONE;
+}
+
+/*
+ * Read callback for breakpoint events; currently an empty placeholder
+ * that does nothing with @bp.
+ */
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+ /* TODO */
+}
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644
index 0000000..0a3e70f
--- /dev/null
+++ b/arch/x86/kernel/i8237.c
@@ -0,0 +1,80 @@
+/*
+ * 8237A DMA controller suspend functions.
+ *
+ * Written by Pierre Ossman, 2005.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/dma.h>
+#include <asm/x86_init.h>
+
+/*
+ * This module just handles suspend/resume issues with the
+ * 8237A DMA controller (used for ISA and LPC).
+ * Allocation is handled in kernel/dma.c and normal usage is
+ * in asm/dma.h.
+ */
+
+/*
+ * Syscore resume hook: with the DMA lock held, write 0 to both reset
+ * registers, give all eight channels a zero address and a count of 1,
+ * then enable channel 4.
+ */
+static void i8237A_resume(void)
+{
+	unsigned long dma_flags = claim_dma_lock();
+	int chan;
+
+	dma_outb(0, DMA1_RESET_REG);
+	dma_outb(0, DMA2_RESET_REG);
+
+	for (chan = 0; chan < 8; chan++) {
+		set_dma_addr(chan, 0x000000);
+		/* DMA count is a bit weird so this is not 0 */
+		set_dma_count(chan, 1);
+	}
+
+	/* Enable cascade DMA or channel 0-3 won't work */
+	enable_dma(4);
+
+	release_dma_lock(dma_flags);
+}
+
+/* Syscore callbacks for the 8237A; only a resume hook is provided. */
+static struct syscore_ops i8237_syscore_ops = {
+ .resume = i8237A_resume,
+};
+
+/*
+ * Register the 8237A resume hook unless the controller looks absent.
+ *
+ * Returns 0 once the syscore ops are registered, -ENODEV when either
+ * presence check below fails.
+ */
+static int __init i8237A_init_ops(void)
+{
+	/*
+	 * From SKL PCH onwards the legacy DMA device is gone, together with
+	 * its I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh).  A removed port must
+	 * return 0xff for an inb() request.
+	 *
+	 * Note: DMA_PAGE_2 (port 0x81) should not be used for detecting the
+	 * DMA device since the BIOS may use it to decode LPC traffic for
+	 * POST codes.  Original LPC only decodes one byte of port 0x80 but
+	 * some BIOS may choose to enhance PCH LPC port 0x8x decoding.
+	 */
+	if (dma_inb(DMA_PAGE_0) == 0xFF)
+		return -ENODEV;
+
+	/*
+	 * Newer SoCs may not support 8237 DMA or bus mastering from LPC, so
+	 * loading this driver is not required there.  Platform firmware
+	 * must announce support for such legacy devices via the
+	 * ACPI_FADT_LEGACY_DEVICES field in the FADT table.
+	 */
+	if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017)
+		return -ENODEV;
+
+	register_syscore_ops(&i8237_syscore_ops);
+
+	return 0;
+}
+device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
new file mode 100644
index 0000000..0d307a6
--- /dev/null
+++ b/arch/x86/kernel/i8253.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * 8253/PIT functions
+ *
+ */
+#include <linux/clockchips.h>
+#include <linux/init.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
+
+#include <asm/hpet.h>
+#include <asm/time.h>
+#include <asm/smp.h>
+
+/*
+ * HPET replaces the PIT, when enabled. So we need to know, which of
+ * the two timers is used
+ */
+struct clock_event_device *global_clock_event;
+
+/*
+ * Initialize the i8253 clock event device via clockevent_i8253_init(true)
+ * and make it the global clock event device.
+ */
+void __init setup_pit_timer(void)
+{
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
+}
+
+#ifndef CONFIG_X86_64
+/*
+ * Register the PIT as a clocksource, except in the cases listed below
+ * where 0 is returned without registering anything.  Otherwise returns
+ * the result of clocksource_i8253_init().
+ */
+static int __init init_pit_clocksource(void)
+{
+	/*
+	 * Several reasons not to register PIT as a clocksource:
+	 *
+	 * - On SMP PIT does not scale due to i8253_lock
+	 * - when HPET is enabled
+	 * - when local APIC timer is active (PIT is switched off)
+	 */
+	if (num_possible_cpus() > 1)
+		return 0;
+	if (is_hpet_enabled())
+		return 0;
+	if (!clockevent_state_periodic(&i8253_clockevent))
+		return 0;
+
+	return clocksource_i8253_init();
+}
+arch_initcall(init_pit_clocksource);
+#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
new file mode 100644
index 0000000..519649d
--- /dev/null
+++ b/arch/x86/kernel/i8259.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/timex.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/syscore_ops.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+
+#include <linux/atomic.h>
+#include <asm/timer.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ */
+static void init_8259A(int auto_eoi);
+
+static int i8259A_auto_eoi;
+DEFINE_RAW_SPINLOCK(i8259A_lock);
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers.
+ */
+unsigned int cached_irq_mask = 0xffff;
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+/*
+ * Mask @irq: set its bit in cached_irq_mask and write the affected half
+ * of the cached mask to the slave (bit 3 of @irq set) or master IMR.
+ */
+static void mask_8259A_irq(unsigned int irq)
+{
+	unsigned int irq_bit = 1 << irq;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+	cached_irq_mask |= irq_bit;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+
+	raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+/* irq_chip adapter: masks the interrupt number stored in @data. */
+static void disable_8259A_irq(struct irq_data *data)
+{
+	unsigned int irq = data->irq;
+
+	mask_8259A_irq(irq);
+}
+
+/*
+ * Unmask @irq: clear its bit in cached_irq_mask and write the affected
+ * half of the cached mask to the slave (bit 3 of @irq set) or master IMR.
+ */
+static void unmask_8259A_irq(unsigned int irq)
+{
+	unsigned int irq_bit = 1 << irq;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+	cached_irq_mask &= ~irq_bit;
+	if (irq & 8)
+		outb(cached_slave_mask, PIC_SLAVE_IMR);
+	else
+		outb(cached_master_mask, PIC_MASTER_IMR);
+
+	raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+/* irq_chip adapter: unmasks the interrupt number stored in @data. */
+static void enable_8259A_irq(struct irq_data *data)
+{
+	unsigned int irq = data->irq;
+
+	unmask_8259A_irq(irq);
+}
+
+/*
+ * Read the command port of the PIC that owns @irq (master for 0-7, slave
+ * otherwise) under i8259A_lock and return the bit belonging to @irq;
+ * the result is non-zero when that bit is set.
+ */
+static int i8259A_irq_pending(unsigned int irq)
+{
+	unsigned int bit = 1 << irq;
+	int port = PIC_MASTER_CMD;
+	unsigned long flags;
+	int pending;
+
+	if (irq >= 8) {
+		/* Slave lines live in the low byte of the slave's register. */
+		port = PIC_SLAVE_CMD;
+		bit >>= 8;
+	}
+
+	raw_spin_lock_irqsave(&i8259A_lock, flags);
+	pending = inb(port) & bit;
+	raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	return pending;
+}
+
+/*
+ * Hand @irq over to the 8259A: clear its bit in io_apic_irqs and install
+ * i8259A_chip with the level handler, with the interrupt disabled while
+ * the switch is made, then assign the legacy vector.
+ */
+static void make_8259A_irq(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+ /* A cleared bit means the IRQ is no longer routed through the IO-APIC */
+ io_apic_irqs &= ~(1<<irq);
+ irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ enable_irq(irq);
+ lapic_assign_legacy_vector(irq, true);
+}
+
+/*
+ * Check the in-service register of the PIC that owns @irq and return a
+ * non-zero value when @irq is marked in service.
+ *
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+	int irqmask = 1 << irq;
+	int port = PIC_MASTER_CMD;
+	int in_service;
+
+	if (irq >= 8) {
+		port = PIC_SLAVE_CMD;
+		irqmask >>= 8;
+	}
+
+	outb(0x0B, port);		/* ISR register */
+	in_service = inb(port) & irqmask;
+	outb(0x0A, port);		/* back to the IRR register */
+
+	return in_service;
+}
+
+/*
+ * Mask the interrupt in @data and acknowledge it with a specific EOI.
+ *
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(struct irq_data *data)
+{
+ unsigned int irq = data->irq;
+ unsigned int irqmask = 1 << irq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+ /*
+ * Lightweight spurious IRQ detection. We do not want
+ * to overdo spurious IRQ handling - it's usually a sign
+ * of hardware problems, so we only do the checks we can
+ * do without slowing down good hardware unnecessarily.
+ *
+ * Note that IRQ7 and IRQ15 (the two spurious IRQs
+ * usually resulting from the 8259A-1|2 PICs) occur
+ * even if the IRQ is masked in the 8259A. Thus we
+ * can check spurious 8259A IRQs without doing the
+ * quite slow i8259A_irq_real() call for every IRQ.
+ * This does not cover 100% of spurious interrupts,
+ * but should be enough to warn the user that there
+ * is something bad going on ...
+ */
+ /* An IRQ that is already masked in the cache is suspicious */
+ if (cached_irq_mask & irqmask)
+ goto spurious_8259A_irq;
+ cached_irq_mask |= irqmask;
+
+handle_real_irq:
+ if (irq & 8) {
+ inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ /* 'Specific EOI' to slave */
+ outb(0x60+(irq&7), PIC_SLAVE_CMD);
+ /* 'Specific EOI' to master-IRQ2 */
+ outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
+ } else {
+ inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
+ }
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+
+spurious_8259A_irq:
+ /*
+ * this is the slow path - should happen rarely.
+ */
+ if (i8259A_irq_real(irq))
+ /*
+ * oops, the IRQ _is_ in service according to the
+ * 8259A - not spurious, go handle it.
+ */
+ goto handle_real_irq;
+
+ {
+ /* Bitmask of IRQs whose spurious occurrence was already logged */
+ static int spurious_irq_mask;
+ /*
+ * At this point we can be sure the IRQ is spurious,
+ * lets ACK and report it. [once per IRQ]
+ */
+ if (!(spurious_irq_mask & irqmask)) {
+ printk(KERN_DEBUG
+ "spurious 8259A interrupt: IRQ%d.\n", irq);
+ spurious_irq_mask |= irqmask;
+ }
+ /* Counted on every spurious occurrence, not only the first */
+ atomic_inc(&irq_err_count);
+ /*
+ * Theoretically we do not have to handle this IRQ,
+ * but in Linux this does not cause problems and is
+ * simpler for us.
+ */
+ goto handle_real_irq;
+ }
+}
+
+/*
+ * irq_chip for interrupts handled by the 8259A pair.  init_8259A()
+ * rewrites .irq_mask_ack depending on whether auto-EOI is in use.
+ */
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
+static char irq_trigger[2];
+/*
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ *
+ * Write the two saved bytes in @trigger back to ports 0x4d0 and 0x4d1.
+ */
+static void restore_ELCR(char *trigger)
+{
+ outb(trigger[0], 0x4d0);
+ outb(trigger[1], 0x4d1);
+}
+
+/*
+ * Read ports 0x4d0 and 0x4d1 into @trigger[0] and @trigger[1], with the
+ * reserved bits masked off.
+ */
+static void save_ELCR(char *trigger)
+{
+	/* IRQ 0,1,2,8,13 are marked as reserved */
+	const unsigned char elcr1_valid = 0xF8;
+	const unsigned char elcr2_valid = 0xDE;
+
+	trigger[0] = inb(0x4d0) & elcr1_valid;
+	trigger[1] = inb(0x4d1) & elcr2_valid;
+}
+
+/*
+ * Syscore resume hook: re-initialize the PICs with the auto-EOI setting
+ * recorded by the last init_8259A() call, then restore the saved ELCR.
+ */
+static void i8259A_resume(void)
+{
+ init_8259A(i8259A_auto_eoi);
+ restore_ELCR(irq_trigger);
+}
+
+/* Syscore suspend hook: save the ELCR into irq_trigger; always returns 0. */
+static int i8259A_suspend(void)
+{
+ save_ELCR(irq_trigger);
+ return 0;
+}
+
+/*
+ * Syscore shutdown hook: mask every line on both PICs.  The writes are
+ * done without taking i8259A_lock and do not touch cached_irq_mask.
+ */
+static void i8259A_shutdown(void)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+}
+
+/* Syscore callbacks for the 8259A; registered by i8259A_init_ops(). */
+static struct syscore_ops i8259_syscore_ops = {
+ .suspend = i8259A_suspend,
+ .resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
+};
+
+/*
+ * Mask every line on both PICs in hardware.  cached_irq_mask is left
+ * untouched, so unmask_8259A() can bring the previous state back.
+ */
+static void mask_8259A(void)
+{
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&i8259A_lock, irqflags);
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
+	raw_spin_unlock_irqrestore(&i8259A_lock, irqflags);
+}
+
+/* Write the cached master and slave masks back to the PIC IMRs. */
+static void unmask_8259A(void)
+{
+	unsigned long irqflags;
+
+	raw_spin_lock_irqsave(&i8259A_lock, irqflags);
+	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
+	raw_spin_unlock_irqrestore(&i8259A_lock, irqflags);
+}
+
+/*
+ * Detect whether a PIC is present and switch legacy_pic to
+ * null_legacy_pic when it is not.  Returns nr_legacy_irqs() as evaluated
+ * after that decision.
+ */
+static int probe_8259A(void)
+{
+	unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
+	unsigned char readback;
+	unsigned long flags;
+
+	/*
+	 * Check to see if we have a PIC.
+	 * Mask all except the cascade and read
+	 * back the value we just wrote. If we don't
+	 * have a PIC, we will read 0xff as opposed to the
+	 * value we wrote.
+	 */
+	raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
+	outb(probe_val, PIC_MASTER_IMR);
+	readback = inb(PIC_MASTER_IMR);
+
+	if (readback != probe_val) {
+		printk(KERN_INFO "Using NULL legacy PIC\n");
+		legacy_pic = &null_legacy_pic;
+	}
+
+	raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+	return nr_legacy_irqs();
+}
+
+/*
+ * Program both 8259A controllers with their initialization words and
+ * restore the cached interrupt masks.
+ *
+ * @auto_eoi: non-zero puts the master into auto-EOI mode.  The value is
+ * remembered in i8259A_auto_eoi (used again by i8259A_resume()) and also
+ * selects which irq_mask_ack callback i8259A_chip uses.
+ */
+static void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ i8259A_auto_eoi = auto_eoi;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+
+ /*
+ * outb_pic - this has to work on a wide range of PC hardware.
+ */
+ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
+
+ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
+
+ /* 8259A-1 (the master) has a slave on IR2 */
+ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
+
+ if (auto_eoi) /* master does Auto EOI */
+ outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+ else /* master expects normal EOI */
+ outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+ outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
+
+ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
+ /* 8259A-2 is a slave on master's IR2 */
+ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+ /* (slave's support for AEOI in flat mode is to be investigated) */
+ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
+ if (auto_eoi)
+ /*
+ * In AEOI mode we just have to mask the interrupt
+ * when acking.
+ */
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
+ else
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
+
+ udelay(100); /* wait for 8259A to initialize */
+
+ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+/*
+ * make i8259 a driver so that we can select pic functions at run time. the goal
+ * is to make x86 binary compatible among pc compatible and non-pc compatible
+ * platforms, such as x86 MID.
+ */
+
+/* Do-nothing callbacks used to populate null_legacy_pic. */
+static void legacy_pic_noop(void) { }
+static void legacy_pic_uint_noop(unsigned int unused) { }
+static void legacy_pic_int_noop(int unused) { }
+
+/* No interrupt is ever reported as pending. */
+static int legacy_pic_irq_pending_noop(unsigned int irq)
+{
+	return 0;
+}
+
+/* Probe stub: always returns 0. */
+static int legacy_pic_probe(void)
+{
+	return 0;
+}
+
+/*
+ * Legacy PIC description with zero legacy IRQs and no-op callbacks;
+ * probe_8259A() switches legacy_pic to this when the IMR readback test
+ * fails.
+ */
+struct legacy_pic null_legacy_pic = {
+ .nr_legacy_irqs = 0,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
+ .mask_all = legacy_pic_noop,
+ .restore_mask = legacy_pic_noop,
+ .init = legacy_pic_int_noop,
+ .probe = legacy_pic_probe,
+ .irq_pending = legacy_pic_irq_pending_noop,
+ .make_irq = legacy_pic_uint_noop,
+};
+
+/* Legacy PIC description backed by the real 8259A routines in this file. */
+struct legacy_pic default_legacy_pic = {
+ .nr_legacy_irqs = NR_IRQS_LEGACY,
+ .chip = &i8259A_chip,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
+ .restore_mask = unmask_8259A,
+ .init = init_8259A,
+ .probe = probe_8259A,
+ .irq_pending = i8259A_irq_pending,
+ .make_irq = make_8259A_irq,
+};
+
+/*
+ * The legacy PIC implementation in use.  Starts out as the 8259A driver;
+ * probe_8259A() may replace it with null_legacy_pic.
+ */
+struct legacy_pic *legacy_pic = &default_legacy_pic;
+EXPORT_SYMBOL(legacy_pic);
+
+/*
+ * Register the 8259A syscore callbacks, but only while the default
+ * (8259A) legacy PIC is the one in use.  Always returns 0.
+ */
+static int __init i8259A_init_ops(void)
+{
+	if (legacy_pic != &default_legacy_pic)
+		return 0;
+
+	register_syscore_ops(&i8259_syscore_ops);
+	return 0;
+}
+
+device_initcall(i8259A_init_ops);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 0000000..01adea2
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,368 @@
+/*
+ * Interrupt descriptor table related code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/interrupt.h>
+
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+
+struct idt_data { /* template for one IDT gate; consumed by idt_init_desc() and idt_setup_from_table() */
+ unsigned int vector; /* IDT index the gate is written to */
+ unsigned int segment; /* segment selector; truncated to u16 when copied into the gate */
+ struct idt_bits bits; /* ist/type/dpl/p bits, copied verbatim into the gate */
+ const void *addr; /* handler address, split across the gate's offset fields */
+};
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Interrupt gate with interrupt stack */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate with interrupt stack */
+#define SISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+ SYSG(X86_TRAP_BP, int3),
+#ifdef CONFIG_X86_32
+ INTG(X86_TRAP_PF, page_fault),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initconst struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, divide_error),
+ INTG(X86_TRAP_NMI, nmi), /* 64-bit: ist_idts installs this again with NMI_STACK */
+ INTG(X86_TRAP_BR, bounds),
+ INTG(X86_TRAP_UD, invalid_op),
+ INTG(X86_TRAP_NM, device_not_available),
+ INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
+ INTG(X86_TRAP_TS, invalid_TSS),
+ INTG(X86_TRAP_NP, segment_not_present),
+ INTG(X86_TRAP_SS, stack_segment),
+ INTG(X86_TRAP_GP, general_protection),
+ INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, coprocessor_error),
+ INTG(X86_TRAP_AC, alignment_check),
+ INTG(X86_TRAP_XF, simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), /* 32-bit: a task gate, not an interrupt gate */
+#else
+ INTG(X86_TRAP_DF, double_fault), /* ist_idts installs this again with DOUBLEFAULT_STACK */
+#endif
+ INTG(X86_TRAP_DB, debug), /* 64-bit: ist_idts installs this again with DEBUG_STACK */
+
+#ifdef CONFIG_X86_MCE
+ INTG(X86_TRAP_MC, &machine_check), /* 64-bit: ist_idts installs this again with MCE_STACK */
+#endif
+
+ SYSG(X86_TRAP_OF, overflow), /* SYSG: DPL3 gate */
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initconst struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
+ INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
+ INTG(REBOOT_VECTOR, reboot_interrupt),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+ INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
+# endif
+#ifdef CONFIG_X86_UV
+ INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
+#endif
+ INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
+ INTG(ERROR_APIC_VECTOR, error_interrupt),
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, page_fault),
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+static const __initconst struct idt_data dbg_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+};
+#endif
+
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+ .address = (unsigned long) idt_table,
+};
+
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initconst struct idt_data ist_idts[] = {
+ ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
+ ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
+ ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK),
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, &machine_check, MCE_STACK),
+#endif
+};
+
+/*
+ * Descriptor for the debug_idt override (debug_idt_table). That table is the
+ * same as the default, but with the #DB interrupt stack set to DEFAULT_STACK (0).
+ * Required for NMI trap handling.
+ */
+const struct desc_ptr debug_idt_descr = {
+ .size = IDT_ENTRIES * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
+#endif
+
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) /* fill *gate from the template *d */
+{
+ unsigned long addr = (unsigned long) d->addr;
+
+ gate->offset_low = (u16) addr; /* handler address bits 0-15 */
+ gate->segment = (u16) d->segment;
+ gate->bits = d->bits;
+ gate->offset_middle = (u16) (addr >> 16); /* handler address bits 16-31 */
+#ifdef CONFIG_X86_64
+ gate->offset_high = (u32) (addr >> 32); /* handler address bits 32-63 */
+ gate->reserved = 0;
+#endif
+}
+
+static void /* write 'size' entries of table 't' into 'idt'; with 'sys' set, also mark each vector in system_vectors */
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc; /* scratch gate, rebuilt for every table entry */
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, system_vectors);
+ }
+}
+
+static void set_intr_gate(unsigned int n, const void *addr) /* install an interrupt gate for vector n in idt_table */
+{
+ struct idt_data data;
+
+ BUG_ON(n > 0xFF); /* the vector must fit in 8 bits */
+
+ memset(&data, 0, sizeof(data)); /* fields not assigned below (bits.ist, bits.dpl) stay 0 */
+ data.vector = n;
+ data.addr = addr;
+ data.segment = __KERNEL_CS;
+ data.bits.type = GATE_INTERRUPT;
+ data.bits.p = 1;
+
+ idt_setup_from_table(idt_table, &data, 1, false); /* false: system_vectors is not touched here */
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On x86-64 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true); /* true: also mark these vectors in system_vectors */
+ load_idt(&idt_descr); /* idt_descr is the descriptor whose address is idt_table */
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); /* true: mark the vectors in system_vectors; no load_idt() here */
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On x86-64 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * FIXME: Why is 32bit and 64bit installing the PF handler at different
+ * places in the early setup code?
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true); /* the table holds only the X86_TRAP_PF gate */
+}
+
+/**
+ * idt_setup_ist_traps - Initialize the idt table with traps using IST
+ */
+void __init idt_setup_ist_traps(void)
+{
+ idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); /* overwrites the gates for these vectors with their IST variants */
+}
+
+/**
+ * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
+ */
+void __init idt_setup_debugidt_traps(void)
+{
+ memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); /* start from a copy of idt_table; 16 bytes per gate is hard-coded here */
+
+ idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); /* then replace #DB with the DEFAULT_STACK gate; system_vectors untouched */
+}
+#endif
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR; /* used by both loops and not reset in between; presumably the second loop resumes where the first stopped */
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); /* claim the APIC/SMP vectors first */
+
+ for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { /* vectors below FIRST_SYSTEM_VECTOR not marked in system_vectors */
+ entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); /* assumes one 8-byte entry stub per vector */
+ set_intr_gate(i, entry);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { /* vectors still unmarked, up to NR_VECTORS */
+ set_bit(i, system_vectors); /* mark the vector, then point it at spurious_interrupt */
+ set_intr_gate(i, spurious_interrupt);
+ }
+#endif
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) /* one early_idt_handler_array entry per exception vector */
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++) /* 32-bit only: every remaining vector goes to early_ignore_irq */
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ * @addr: The virtual address of the 'invalid' IDT
+ */
+void idt_invalidate(void *addr)
+{
+ struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; /* descriptor at addr with its size field set to 0 */
+
+ load_idt(&idt);
+}
+
+void __init update_intr_gate(unsigned int n, const void *addr) /* replace the handler of a vector already marked in system_vectors */
+{
+ if (WARN_ON_ONCE(!test_bit(n, system_vectors))) /* vector not marked: warn once and change nothing */
+ return;
+ set_intr_gate(n, addr);
+}
+
+void alloc_intr_gate(unsigned int n, const void *addr) /* mark vector n in system_vectors and install addr for it */
+{
+ BUG_ON(n < FIRST_SYSTEM_VECTOR); /* only vectors at or above FIRST_SYSTEM_VECTOR are accepted */
+ if (!test_and_set_bit(n, system_vectors)) /* a vector that was already marked keeps its current gate */
+ set_intr_gate(n, addr);
+}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644
index 0000000..805b7a3
--- /dev/null
+++ b/arch/x86/kernel/io_delay.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O delay strategies for inb_p/outb_p
+ *
+ * Allow for a DMI based override of port 0x80, needed for certain HP laptops
+ * and possibly other systems. Also allow for the gradual elimination of
+ * outb_p/inb_p API uses.
+ */
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+
+int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
+
+static int __initdata io_delay_override;
+
+/*
+ * Paravirt wants native_io_delay to be a constant.
+ */
+void native_io_delay(void) /* perform one I/O delay using the strategy selected by io_delay_type */
+{
+ switch (io_delay_type) {
+ default: /* unknown values are handled like the 0x80 port strategy */
+ case CONFIG_IO_DELAY_TYPE_0X80:
+ asm volatile ("outb %al, $0x80");
+ break;
+ case CONFIG_IO_DELAY_TYPE_0XED:
+ asm volatile ("outb %al, $0xed");
+ break;
+ case CONFIG_IO_DELAY_TYPE_UDELAY:
+ /*
+ * 2 usecs is an upper-bound for the outb delay but
+ * note that udelay doesn't have the bus-level
+ * side-effects that outb does, nor does udelay() have
+ * precise timings during very early bootup (the delays
+ * are shorter until calibrated):
+ */
+ udelay(2); /* fall through - the NONE case below only breaks */
+ case CONFIG_IO_DELAY_TYPE_NONE:
+ break;
+ }
+}
+EXPORT_SYMBOL(native_io_delay);
+
+static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) /* callback of the DMI quirk table below: switch the delay port from 0x80 to 0xed */
+{
+ if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { /* any other delay type is left as it is */
+ pr_notice("%s: using 0xed I/O delay port\n", id->ident);
+ io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ }
+
+ return 0;
+}
+
+/*
+ * Quirk table for systems that misbehave (lock up, etc.) if port
+ * 0x80 is used:
+ */
+static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = {
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "Compaq Presario V6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B7")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion dv9000z",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B9")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion dv6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B8")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion tx1000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30BF")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "Presario F700",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30D3")
+ }
+ },
+ { }
+};
+
+void __init io_delay_init(void) /* run the DMI quirk table unless io_delay_param() recorded an override */
+{
+ if (!io_delay_override) /* set to 1 by io_delay_param() on a valid "io_delay" value */
+ dmi_check_system(io_delay_0xed_port_dmi_table);
+}
+
+static int __init io_delay_param(char *s) /* parse "io_delay": one of "0x80", "0xed", "udelay", "none" */
+{
+ if (!s) /* parameter given without a value */
+ return -EINVAL;
+
+ if (!strcmp(s, "0x80"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
+ else if (!strcmp(s, "0xed"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ else if (!strcmp(s, "udelay"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
+ else if (!strcmp(s, "none"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
+ else
+ return -EINVAL; /* unknown keyword: io_delay_type and the override flag stay unchanged */
+
+ io_delay_override = 1; /* makes io_delay_init() skip the DMI quirk table */
+ return 0;
+}
+
+early_param("io_delay", io_delay_param);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
new file mode 100644
index 0000000..0fe1c87
--- /dev/null
+++ b/arch/x86/kernel/ioport.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus. 32/64 bits code unification by Miguel Botón.
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+#include <linux/bitmap.h>
+#include <asm/syscalls.h>
+#include <asm/desc.h>
+
+/*
+ * Grant (turn_on != 0) or revoke access to I/O ports [from, from + num) in the io permissions bitmap of the current task.
+ */
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) /* returns 0, -EINVAL, -EPERM or -ENOMEM */
+{
+ struct thread_struct *t = &current->thread;
+ struct tss_struct *tss;
+ unsigned int i, max_long, bytes, bytes_updated;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) /* rejects num == 0, wrap-around and spans past IO_BITMAP_BITS */
+ return -EINVAL;
+ if (turn_on && !capable(CAP_SYS_RAWIO)) /* only granting access requires the capability */
+ return -EPERM;
+
+ /*
+ * If it's the first ioperm() call in this thread's lifetime, set the
+ * IO bitmap up. ioperm() is much less timing critical than clone(),
+ * this is why we delay this operation until now:
+ */
+ if (!t->io_bitmap_ptr) {
+ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+
+ if (!bitmap)
+ return -ENOMEM;
+
+ memset(bitmap, 0xff, IO_BITMAP_BYTES); /* all bits set: a fresh bitmap grants no port */
+ t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
+
+ /*
+ * Now that we have an IO bitmap, we need our TSS limit to be
+ * correct. It's fine if we are preempted after doing this:
+ * with TIF_IO_BITMAP set, context switches will keep our TSS
+ * limit correct.
+ */
+ preempt_disable();
+ refresh_tss_limit();
+ preempt_enable();
+ }
+
+ /*
+ * do it in the per-thread copy and in the TSS ...
+ *
+ * Disable preemption via get_cpu() - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+ tss = &per_cpu(cpu_tss_rw, get_cpu());
+
+ if (turn_on)
+ bitmap_clear(t->io_bitmap_ptr, from, num); /* a cleared bit means the port is accessible */
+ else
+ bitmap_set(t->io_bitmap_ptr, from, num);
+
+ /*
+ * Search for a (possibly new) maximum. This is simple and stupid,
+ * to keep it obviously correct:
+ */
+ max_long = 0;
+ for (i = 0; i < IO_BITMAP_LONGS; i++)
+ if (t->io_bitmap_ptr[i] != ~0UL) /* word with at least one cleared bit */
+ max_long = i;
+
+ bytes = (max_long + 1) * sizeof(unsigned long);
+ bytes_updated = max(bytes, t->io_bitmap_max); /* copy enough to cover both the old and the new extent */
+
+ t->io_bitmap_max = bytes;
+
+ /* Update the TSS: */
+ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+
+ put_cpu(); /* pairs with get_cpu() above */
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) /* syscall entry: forwards unchanged to ksys_ioperm() */
+{
+ return ksys_ioperm(from, num, turn_on);
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the flags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+SYSCALL_DEFINE1(iopl, unsigned int, level) /* set the I/O privilege level (0-3); returns 0, -EINVAL or -EPERM */
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct thread_struct *t = &current->thread;
+
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT; /* current level comes from the thread copy, not from regs->flags */
+
+ if (level > 3)
+ return -EINVAL;
+ /* Trying to gain more privileges? */
+ if (level > old) { /* keeping or lowering the level needs no capability */
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ }
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT; /* stored already shifted into the EFLAGS bit position */
+ set_iopl_mask(t->iopl);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 0000000..59b5f2e
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,388 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/of.h>
+#include <linux/seq_file.h>
+#include <linux/smp.h>
+#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/irq.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/irq_vectors.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
+atomic_t irq_err_count;
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq) /* log (rate limited) and acknowledge an unexpected vector */
+{
+ if (printk_ratelimit())
+ pr_err("unexpected IRQ trap at vector %02x\n", irq);
+
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ ack_APIC_irq(); /* NOTE(review): called unconditionally here; any "APIC enabled" check would have to be inside ack_APIC_irq() - confirm */
+}
+
+#define irq_stats(x) (&per_cpu(irq_stat, x))
+/*
+ * /proc/interrupts printing for arch specific interrupts
+ */
+int arch_show_interrupts(struct seq_file *p, int prec) /* prec: field width used for the row labels; always returns 0 */
+{
+ int j; /* CPU iterator for the per-CPU counter columns */
+
+ seq_printf(p, "%*s: ", prec, "NMI"); /* row pattern: label, one %10u column per online CPU, description */
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+ seq_puts(p, " Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "%*s: ", prec, "LOC");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+ seq_puts(p, " Local timer interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "SPU");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_puts(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PMI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_puts(p, " Performance monitoring interrupts\n");
+ seq_printf(p, "%*s: ", prec, "IWI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_puts(p, " IRQ work interrupts\n");
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_puts(p, " APIC ICR read retries\n");
+ if (x86_platform_ipi_callback) { /* PLT row only while a platform IPI callback is installed */
+ seq_printf(p, "%*s: ", prec, "PLT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
+ seq_puts(p, " Platform interrupts\n");
+ }
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "%*s: ", prec, "RES");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+ seq_puts(p, " Rescheduling interrupts\n");
+ seq_printf(p, "%*s: ", prec, "CAL");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_puts(p, " Function call interrupts\n");
+ seq_printf(p, "%*s: ", prec, "TLB");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+ seq_puts(p, " TLB shootdowns\n");
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ seq_printf(p, "%*s: ", prec, "TRM");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+ seq_puts(p, " Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ seq_printf(p, "%*s: ", prec, "THR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+ seq_puts(p, " Threshold APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "%*s: ", prec, "MCE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); /* separate per-CPU variable, not a field of irq_stat */
+ seq_puts(p, " Machine check exceptions\n");
+ seq_printf(p, "%*s: ", prec, "MCP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+ seq_puts(p, " Machine check polls\n");
+#endif
+#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
+ if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { /* row only when the vector is marked in system_vectors */
+ seq_printf(p, "%*s: ", prec, "HYP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_callback_count);
+ seq_puts(p, " Hypervisor callback interrupts\n");
+ }
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HRE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_reenlightenment_count);
+ seq_puts(p, " Hyper-V reenlightenment interrupts\n");
+ }
+ if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HVS");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->hyperv_stimer0_count);
+ seq_puts(p, " Hyper-V stimer0 interrupts\n");
+ }
+#endif
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); /* single global counter, no per-CPU columns */
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); /* single global counter as well */
+#endif
+#ifdef CONFIG_HAVE_KVM
+ seq_printf(p, "%*s: ", prec, "PIN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+ seq_puts(p, " Posted-interrupt notification event\n");
+
+ seq_printf(p, "%*s: ", prec, "NPI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_nested_ipis);
+ seq_puts(p, " Nested posted-interrupt event\n");
+
+ seq_printf(p, "%*s: ", prec, "PIW");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+ seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
+ return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu) /* sum of the arch-specific interrupt counters of one CPU */
+{
+ u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
+ if (x86_platform_ipi_callback) /* same condition under which arch_show_interrupts() prints the PLT row */
+ sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
+#ifdef CONFIG_SMP
+ sum += irq_stats(cpu)->irq_resched_count;
+ sum += irq_stats(cpu)->irq_call_count; /* NOTE(review): irq_tlb_count is printed by arch_show_interrupts() but not added here - confirm intended */
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ sum += irq_stats(cpu)->irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ sum += irq_stats(cpu)->irq_threshold_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += per_cpu(mce_exception_count, cpu);
+ sum += per_cpu(mce_poll_count, cpu);
+#endif
+ return sum; /* NOTE(review): the DFR, HYP/HRE/HVS and PIN/NPI/PIW counters shown by arch_show_interrupts() are not part of this sum - confirm intended */
+}
+
+u64 arch_irq_stat(void) /* global (not per-CPU) part of the statistics */
+{
+ u64 sum = atomic_read(&irq_err_count); /* only irq_err_count contributes */
+ return sum;
+}
+
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs); /* keep the value returned here so it can be handed back on exit */
+ struct irq_desc * desc;
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax; /* the vector is recovered by complementing orig_ax */
+
+ entering_irq();
+
+ /* entering_irq() tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+ desc = __this_cpu_read(vector_irq[vector]); /* this CPU's vector_irq slot for the vector */
+
+ if (!handle_irq(desc, regs)) { /* handle_irq() reported that it did not handle the vector */
+ ack_APIC_irq();
+
+ if (desc != VECTOR_RETRIGGERED) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
+ __func__, smp_processor_id(),
+ vector);
+ } else {
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); /* retrigger marker seen: reset the slot without logging */
+ }
+ }
+
+ exiting_irq();
+
+ set_irq_regs(old_regs);
+ return 1; /* constant on every path */
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
+/*
+ * Handler for X86_PLATFORM_IPI_VECTOR.
+ */
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs); /* handed back to set_irq_regs() at the end */
+
+ entering_ack_irq();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(x86_platform_ipis); /* counted even when no callback is installed */
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {} /* placeholder so the handler pointer below is never NULL */
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) /* install handler; NULL restores dummy_handler */
+{
+ if (handler)
+ kvm_posted_intr_wakeup_handler = handler;
+ else
+ kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
+/*
+ * Handler for POSTED_INTR_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_ipis); /* counting is the only work done in this handler */
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+/*
+ * Handler for POSTED_INTR_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+ kvm_posted_intr_wakeup_handler(); /* dummy_handler unless kvm_set_posted_intr_wakeup_handler() installed another one */
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+/*
+ * Handler for POSTED_INTR_NESTED_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_nested_ipis); /* counting is the only work done in this handler */
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+#endif
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void) /* migrate irqs away from this CPU, then retrigger or release every vector it still holds */
+{
+ unsigned int irr, vector;
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+
+ irq_migrate_all_off_this_cpu();
+
+ /*
+ * We can remove mdelay() and then send spurious interrupts to
+ * new cpu targets for all the irqs that were handled previously by
+ * this cpu. While it works, I have seen spurious interrupt messages
+ * (nothing wrong but still...).
+ *
+ * So for now, retain mdelay(1) and check the IRR and then send those
+ * interrupts to new targets as this cpu is already offlined...
+ */
+ mdelay(1);
+
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) /* skip slots holding NULL or an error-encoded marker */
+ continue;
+
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); /* 32 vectors per IRR register, registers 0x10 apart */
+ if (irr & (1U << (vector % 32))) { /* unsigned constant: bit 31 must not be shifted into a signed int's sign bit */
+ desc = __this_cpu_read(vector_irq[vector]);
+
+ raw_spin_lock(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_retrigger) { /* the hook is optional */
+ chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
+ raw_spin_unlock(&desc->lock);
+ }
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) /* not retriggered: release the slot */
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ }
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 0000000..95600a9
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/percpu.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+#include <asm/nospec-branch.h>
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
+/* Debugging check for stack overflow: is there less than 1KB free? */
+static int check_stack_overflow(void) /* non-zero when the stack offset is below thread_info + STACK_WARN */
+{
+ long sp;
+
+ __asm__ __volatile__("andl %%esp,%0" : /* sp = %esp & (THREAD_SIZE - 1): offset inside the current stack */
+ "=r" (sp) : "0" (THREAD_SIZE - 1));
+
+ return sp < (sizeof(struct thread_info) + STACK_WARN);
+}
+
+static void print_stack_overflow(void) /* warn, dump the stack, and panic if the sysctl asks for it */
+{
+ printk(KERN_WARNING "low stack detected by irq handler\n");
+ dump_stack();
+ if (sysctl_panic_on_stackoverflow) /* defaults to 0 unless set elsewhere */
+ panic("low stack detected by irq handler - check messages\n");
+}
+
+#else
+static inline int check_stack_overflow(void) { return 0; } /* !CONFIG_DEBUG_STACKOVERFLOW: never reports an overflow */
+static inline void print_stack_overflow(void) { } /* !CONFIG_DEBUG_STACKOVERFLOW: nothing to print */
+#endif
+
+DEFINE_PER_CPU(struct irq_stack *, hardirq_stack);
+DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
+
+static void call_on_stack(void *func, void *stack) /* call func with %esp switched to 'stack', then switch back */
+{
+ asm volatile("xchgl %%ebx,%%esp \n" /* %ebx holds 'stack' on entry; after the swap it holds the old %esp */
+ CALL_NOSPEC
+ "movl %%ebx,%%esp \n" /* restore the original stack pointer */
+ : "=b" (stack)
+ : "0" (stack),
+ [thunk_target] "D"(func)
+ : "memory", "cc", "edx", "ecx", "eax");
+}
+
+static inline void *current_stack(void) /* current stack pointer rounded down to a THREAD_SIZE boundary */
+{
+ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* returns 1 if the handler ran on the hardirq stack, 0 if the caller must run it */
+{
+ struct irq_stack *curstk, *irqstk;
+ u32 *isp, *prev_esp, arg1;
+
+ curstk = (struct irq_stack *) current_stack();
+ irqstk = __this_cpu_read(hardirq_stack);
+
+ /*
+ * this is where we switch to the IRQ stack. However, if we are
+ * already using the IRQ stack (because we interrupted a hardirq
+ * handler) we can't do that and just have to keep using the
+ * current stack (which is the irq stack already after all)
+ */
+ if (unlikely(curstk == irqstk))
+ return 0;
+
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); /* one past the end of the irq stack object: the initial stack pointer */
+
+ /* Save the next esp at the bottom of the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
+
+ if (unlikely(overflow))
+ call_on_stack(print_stack_overflow, isp); /* report the overflow from the irq stack, not the low task stack */
+
+ asm volatile("xchgl %%ebx,%%esp \n"
+ CALL_NOSPEC
+ "movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=b" (isp)
+ : "0" (desc), "1" (isp),
+ [thunk_target] "D" (desc->handle_irq)
+ : "memory", "cc", "ecx");
+ return 1;
+}
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+ struct irq_stack *irqstk;
+
+ if (per_cpu(hardirq_stack, cpu)) /* stacks for this CPU already set up: nothing to do */
+ return;
+
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), /* NOTE(review): the allocation result is not checked in this function */
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
+ per_cpu(hardirq_stack, cpu) = irqstk;
+
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), /* second, separate allocation for the softirq stack */
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
+ per_cpu(softirq_stack, cpu) = irqstk;
+
+ printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+ cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
+}
+
+void do_softirq_own_stack(void) /* run __do_softirq() on this CPU's softirq stack */
+{
+ struct irq_stack *irqstk;
+ u32 *isp, *prev_esp;
+
+ irqstk = __this_cpu_read(softirq_stack);
+
+ /* build the stack frame on the softirq stack */
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); /* one past the end of the stack object */
+
+ /* Push the previous esp onto the stack */
+ prev_esp = (u32 *)irqstk; /* stored in the first word of the stack object */
+ *prev_esp = current_stack_pointer;
+
+ call_on_stack(__do_softirq, isp);
+}
+
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) /* false when desc is NULL or an error marker, true once the handler was invoked */
+{
+ int overflow = check_stack_overflow(); /* sampled before the descriptor is validated */
+
+ if (IS_ERR_OR_NULL(desc))
+ return false;
+
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { /* user-mode interrupt, or already on the irq stack: handle on the current stack */
+ if (unlikely(overflow))
+ print_stack_overflow();
+ generic_handle_irq_desc(desc);
+ }
+
+ return true;
+}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
new file mode 100644
index 0000000..0469cd0
--- /dev/null
+++ b/arch/x86/kernel/irq_64.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86_64-specific interrupt
+ * entry and irq statistics code. All the remaining irq logic is
+ * done by the generic kernel/irq/ code and in the
+ * x86_64-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/delay.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/smp.h>
+#include <linux/sched/task_stack.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+
+int sysctl_panic_on_stackoverflow;
+
+/*
+ * Probabilistic stack overflow check:
+ *
+ * Checking every stack access reliably is too expensive, so the check is
+ * only made from interrupts. The interrupted sp is accepted when it lies
+ * on the task stack, the irq stack or the exception stacks.
+ */
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN 128
+ struct orig_ist *oist;
+ u64 irq_stack_top, irq_stack_bottom;
+ u64 estack_top, estack_bottom;
+ u64 curbase = (u64)task_stack_page(current);
+
+ if (user_mode(regs))
+ return;
+
+ if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ regs->sp <= curbase + THREAD_SIZE)
+ return;
+
+ irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) +
+ STACK_TOP_MARGIN;
+ irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr);
+ if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+ return;
+
+ oist = this_cpu_ptr(&orig_ist);
+ estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
+ estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+ if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+ return;
+
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
+ current->comm, curbase, regs->sp,
+ irq_stack_top, irq_stack_bottom,
+ estack_top, estack_bottom, (void *)regs->ip);
+
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
+#endif
+}
+
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+{
+	bool valid = !IS_ERR_OR_NULL(desc);
+
+	/* The overflow check runs even when there is no usable descriptor. */
+	stack_overflow_check(regs);
+	if (valid)
+		generic_handle_irq_desc(desc);
+	return valid;
+}
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 0000000..80bee76
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+#include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ ipi_entering_ack_irq();
+ trace_irq_work_entry(IRQ_WORK_VECTOR);
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ trace_irq_work_exit(IRQ_WORK_VECTOR);
+ exiting_irq();
+}
+
+void arch_irq_work_raise(void)
+{
+	/* Nothing is sent unless arch_irq_work_has_interrupt() says so. */
+	if (arch_irq_work_has_interrupt()) {
+		apic->send_IPI_self(IRQ_WORK_VECTOR);
+		apic_wait_icr_idle();
+	}
+}
+#endif
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
new file mode 100644
index 0000000..ddeeaac
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <asm/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+ENTRY(native_save_fl)
+ pushf
+ pop %_ASM_AX
+ ret
+ENDPROC(native_save_fl)
+EXPORT_SYMBOL(native_save_fl)
+
+/*
+ * void native_restore_fl(unsigned long flags)
+ * %eax/%rdi: flags
+ */
+ENTRY(native_restore_fl)
+ push %_ASM_ARG1
+ popf
+ ret
+ENDPROC(native_restore_fl)
+EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
new file mode 100644
index 0000000..a0693b71c
--- /dev/null
+++ b/arch/x86/kernel/irqinit.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/timex.h>
+#include <linux/random.h>
+#include <linux/kprobes.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/device.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+
+#include <linux/atomic.h>
+#include <asm/timer.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/traps.h>
+#include <asm/prom.h>
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+ .handler = no_action,
+ .name = "cascade",
+ .flags = IRQF_NO_THREAD,
+};
+
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
+};
+
+void __init init_ISA_irqs(void)
+{
+	struct irq_chip *pic_chip = legacy_pic->chip;
+	int irq;
+
+	/*
+	 * Bring up the through-local-APIC virtual wire mode before the
+	 * legacy PIC is initialised.
+	 *
+	 * Some 32-bit UP machines whose APIC was disabled by the BIOS and
+	 * re-enabled with "lapic" hang at boot time without this.
+	 */
+	init_bsp_APIC();
+	legacy_pic->init(0);
+
+	for (irq = 0; irq < nr_legacy_irqs(); irq++)
+		irq_set_chip_and_handler(irq, pic_chip, handle_level_irq);
+}
+
+void __init init_IRQ(void)
+{
+ int i;
+
+ /*
+ * On cpu 0, assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
+ * If these IRQs are handled by legacy interrupt controllers like PIC,
+ * then this configuration will likely be static after the boot. If
+ * these IRQs are handled by more modern controllers like IO-APIC,
+ * then this vector space can be freed and re-used dynamically as the
+ * IRQs migrate etc.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++)
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
+
+ x86_init.irqs.intr_init();
+}
+
+void __init native_init_IRQ(void)
+{
+ /* Execute any quirks before the call gates are initialised: */
+ x86_init.irqs.pre_vector_init();
+
+ idt_setup_apic_and_irq_gates();
+ lapic_assign_system_vectors();
+ /* Claim the cascade IRQ2 only if neither acpi_ioapic nor of_ioapic is set */
+ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
+ setup_irq(2, &irq2);
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 0000000..d177940
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,212 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores that can be boosted to a
+ * higher frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	unsigned int prev;
+	int ret = -EINVAL;
+
+	mutex_lock(&itmt_update_mutex);
+
+	/* The knob is rejected while the platform is not ITMT capable. */
+	if (!sched_itmt_capable)
+		goto out_unlock;
+
+	prev = sysctl_sched_itmt_enabled;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write || prev == sysctl_sched_itmt_enabled)
+		goto out_unlock;
+
+	/* The setting really changed: have the sched domains rebuilt. */
+	x86_topology_update = true;
+	rebuild_sched_domains();
+
+out_unlock:
+	mutex_unlock(&itmt_update_mutex);
+	return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+ {
+ .procname = "sched_itmt_enabled",
+ .data = &sysctl_sched_itmt_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_itmt_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {}
+};
+
+static struct ctl_table itmt_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = itmt_kern_table,
+ },
+ {}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * In the current scheme the pstate driver detects whether the system
+ * is ITMT capable and calls sched_set_itmt_support().
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success, -ENOMEM if the sysctl table cannot be registered
+ */
+int sched_set_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return 0;
+ }
+
+ itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ if (!itmt_sysctl_header) {
+ mutex_unlock(&itmt_update_mutex);
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ x86_topology_update = true;
+ rebuild_sched_domains();
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * Tells the scheduler that ITMT support has been revoked: the sysctl
+ * knob is unregistered and ITMT scheduling, if enabled, is turned off.
+ *
+ * It must not be called with the cpu hotplug lock held, because
+ * that lock has to be acquired to rebuild the sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable)
+		goto unlock;
+
+	sched_itmt_capable = false;
+
+	if (itmt_sysctl_header) {
+		unregister_sysctl_table(itmt_sysctl_header);
+		itmt_sysctl_header = NULL;
+	}
+
+	/* Nothing to rebuild unless ITMT scheduling was actually on. */
+	if (!sysctl_sched_itmt_enabled)
+		goto unlock;
+
+	sysctl_sched_itmt_enabled = 0;
+	x86_topology_update = true;
+	rebuild_sched_domains();
+unlock:
+	mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio:	Priority of cpu core
+ * @core_cpu:	The cpu number associated with the core
+ *
+ * The pstate driver determines the max boost frequency of a core and
+ * calls this function with a priority proportional to it, so that
+ * CPUs with a higher boost frequency receive a higher priority.
+ * Every SMT sibling of @core_cpu is assigned a priority here.
+ *
+ * The sched domains do not depend on the CPU priorities, so
+ * there is no need to rebuild them after the priorities have
+ * been updated.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+	int sibling, pos = 0;
+
+	/*
+	 * The n-th sibling visited gets prio * smp_num_siblings / n.
+	 * This ensures that the siblings are moved to the end of the
+	 * priority chain and only used when all other high priority
+	 * cpus are out of capacity.
+	 */
+	for_each_cpu(sibling, topology_sibling_cpumask(core_cpu)) {
+		int smt_prio;
+
+		smt_prio = prio * smp_num_siblings / ++pos;
+		per_cpu(sched_core_priority, sibling) = smt_prio;
+	}
+}
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
new file mode 100644
index 0000000..108c48d
--- /dev/null
+++ b/arch/x86/kernel/jailhouse.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Jailhouse paravirt_ops implementation
+ *
+ * Copyright (c) Siemens AG, 2015-2017
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ */
+
+#include <linux/acpi_pmtmr.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/hypervisor.h>
+#include <asm/i8259.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+
+static __initdata struct jailhouse_setup_data setup_data;
+static unsigned int precalibrated_tsc_khz;
+
+static uint32_t jailhouse_cpuid_base(void)
+{
+	/* Only probe when CPUID is usable and the hypervisor bit is set. */
+	if (boot_cpu_data.cpuid_level >= 0 &&
+	    boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return hypervisor_cpuid_base("Jailhouse\0\0\0", 0);
+	return 0;
+}
+
+static uint32_t __init jailhouse_detect(void)
+{
+ return jailhouse_cpuid_base();
+}
+
+static void jailhouse_get_wallclock(struct timespec64 *now)
+{
+ memset(now, 0, sizeof(*now));
+}
+
+static void __init jailhouse_timer_init(void)
+{
+ lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ);
+}
+
+static unsigned long jailhouse_get_tsc(void)
+{
+ return precalibrated_tsc_khz;
+}
+
+static void __init jailhouse_x2apic_init(void)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (!x2apic_enabled())
+ return;
+ /*
+ * We do not have access to IR inside Jailhouse non-root cells. So
+ * we have to run in physical mode.
+ */
+ x2apic_phys = 1;
+ /*
+ * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs
+ * ensure that only this APIC driver picks up the call.
+ */
+ default_acpi_madt_oem_check("", "");
+#endif
+}
+
+static void __init jailhouse_get_smp_config(unsigned int early)
+{
+	struct ioapic_domain_cfg ioapic_cfg = {
+		.type = IOAPIC_DOMAIN_STRICT,
+		.ops = &mp_ioapic_irqdomain_ops,
+	};
+	struct mpc_intsrc mp_irq = {
+		.type = MP_INTSRC,
+		.irqtype = mp_INT,
+		.irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
+	};
+	unsigned int i, uart_irq;
+
+	jailhouse_x2apic_init();
+	register_lapic_address(0xfee00000);
+
+	/* Register every CPU id that is listed in the setup data. */
+	for (i = 0; i < setup_data.num_cpus; i++)
+		generic_processor_info(setup_data.cpu_ids[i],
+				       boot_cpu_apic_version);
+
+	smp_found_config = 1;
+
+	if (!setup_data.standard_ioapic)
+		return;
+
+	mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
+
+	/* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
+	for (uart_irq = 3; uart_irq <= 4; uart_irq++) {
+		mp_irq.srcbusirq = uart_irq;
+		mp_irq.dstirq = uart_irq;
+		mp_save_irq(&mp_irq);
+	}
+}
+
+static void jailhouse_no_restart(void)
+{
+ pr_notice("Jailhouse: Restart not supported, halting\n");
+ machine_halt();
+}
+
+static int __init jailhouse_pci_arch_init(void)
+{
+ pci_direct_init(1);
+
+ /*
+ * There are no bridges on the virtual PCI root bus under Jailhouse,
+ * thus no other way to discover all devices than a full scan.
+ * Respect any overrides via the command line, though.
+ */
+ if (pcibios_last_bus < 0)
+ pcibios_last_bus = 0xff;
+
+#ifdef CONFIG_PCI_MMCONFIG
+ if (setup_data.pci_mmconfig_base) {
+ pci_mmconfig_add(0, 0, pcibios_last_bus,
+ setup_data.pci_mmconfig_base);
+ pci_mmcfg_arch_init();
+ }
+#endif
+
+ return 0;
+}
+
+static void __init jailhouse_init_platform(void)
+{
+ u64 pa_data = boot_params.hdr.setup_data;
+ struct setup_data header;
+ void *mapping;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = jailhouse_timer_init;
+ x86_init.mpparse.get_smp_config = jailhouse_get_smp_config;
+ x86_init.pci.arch_init = jailhouse_pci_arch_init;
+
+ x86_platform.calibrate_cpu = jailhouse_get_tsc;
+ x86_platform.calibrate_tsc = jailhouse_get_tsc;
+ x86_platform.get_wallclock = jailhouse_get_wallclock;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+
+ legacy_pic = &null_legacy_pic;
+
+ machine_ops.emergency_restart = jailhouse_no_restart;
+ /* Scan the setup_data list for a large enough SETUP_JAILHOUSE entry */
+ while (pa_data) {
+ mapping = early_memremap(pa_data, sizeof(header));
+ memcpy(&header, mapping, sizeof(header));
+ early_memunmap(mapping, sizeof(header));
+
+ if (header.type == SETUP_JAILHOUSE &&
+ header.len >= sizeof(setup_data)) {
+ pa_data += offsetof(struct setup_data, data);
+
+ mapping = early_memremap(pa_data, sizeof(setup_data));
+ memcpy(&setup_data, mapping, sizeof(setup_data));
+ early_memunmap(mapping, sizeof(setup_data));
+
+ break;
+ }
+
+ pa_data = header.next;
+ }
+ /* pa_data is only 0 here if the list ended without a match */
+ if (!pa_data)
+ panic("Jailhouse: No valid setup data found");
+
+ if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION)
+ panic("Jailhouse: Unsupported setup data structure");
+
+ pmtmr_ioport = setup_data.pm_timer_address;
+ pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
+
+ precalibrated_tsc_khz = setup_data.tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ pci_probe = 0;
+
+ /*
+ * Avoid that the kernel complains about missing ACPI tables - there
+ * are none in a non-root cell.
+ */
+ disable_acpi();
+}
+
+bool jailhouse_paravirt(void)
+{
+ return jailhouse_cpuid_base() != 0;
+}
+
+static bool jailhouse_x2apic_available(void)
+{
+ /*
+ * The x2APIC is only available if the root cell enabled it. Jailhouse
+ * does not support switching between xAPIC and x2APIC.
+ */
+ return x2apic_enabled();
+}
+
+const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
+ .name = "Jailhouse",
+ .detect = jailhouse_detect,
+ .init.init_platform = jailhouse_init_platform,
+ .init.x2apic_available = jailhouse_x2apic_available,
+};
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 0000000..eeea935
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+#include <asm/text-patching.h>
+
+#ifdef HAVE_JUMP_LABEL
+
+union jump_code_union {
+ char code[JUMP_LABEL_NOP_SIZE];
+ struct {
+ char jump;
+ int offset;
+ } __attribute__((packed));
+};
+
+static void bug_at(unsigned char *ip, int line)
+{
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
+ BUG();
+}
+
+static void __ref __jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ void *(*poker)(void *, const void *, size_t),
+ int init)
+{
+ union jump_code_union code;
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+
+ if (early_boot_irqs_disabled)
+ poker = text_poke_early;
+
+ if (type == JUMP_LABEL_JMP) {
+ if (init) {
+ /*
+ * Jump label is enabled for the first time.
+ * So we expect a default_nop...
+ */
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ /*
+ * ...otherwise expect an ideal_nop. Anything
+ * else means something went horribly wrong.
+ */
+ if (unlikely(memcmp((void *)entry->code, ideal_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
+
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ } else {
+ /*
+ * We are disabling this jump label. If it is not what
+ * we think it is, then something must have gone wrong.
+ * If this is the first initialization call, then we
+ * are converting the default nop to the ideal nop.
+ */
+ if (init) {
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
+ memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
+ }
+
+ /*
+ * text_poke_bp() is the fallback when no poker was passed in.
+ *
+ * Its last argument is the address right behind the patched
+ * instruction: whether this is a nop -> jump or a jump -> nop
+ * transition is ignored and the nop is always assumed to be the
+ * 'currently valid' instruction.
+ */
+ if (poker)
+ (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ else
+ text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
+ (void *)entry->code + JUMP_LABEL_NOP_SIZE);
+}
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ mutex_lock(&text_mutex);
+ __jump_label_transform(entry, type, NULL, 0);
+ mutex_unlock(&text_mutex);
+}
+
+static enum {
+ JL_STATE_START,
+ JL_STATE_NO_UPDATE,
+ JL_STATE_UPDATE,
+} jlstate __initdata_or_module = JL_STATE_START;
+
+__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
+				      enum jump_label_type type)
+{
+	/*
+	 * Called at boot and when a module is first loaded. The first call
+	 * decides whether the nop that was inserted at compile time already
+	 * is the ideal nop; only when it is not do the jump label sites
+	 * have to be rewritten.
+	 */
+	if (jlstate == JL_STATE_START) {
+		const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+		const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+		bool same = !memcmp(ideal_nop, default_nop, 5);
+
+		jlstate = same ? JL_STATE_NO_UPDATE : JL_STATE_UPDATE;
+	}
+
+	if (jlstate != JL_STATE_UPDATE)
+		return;
+
+	__jump_label_transform(entry, type, text_poke_early, 1);
+}
+
+#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
new file mode 100644
index 0000000..fd6f8fb
--- /dev/null
+++ b/arch/x86/kernel/kdebugfs.c
@@ -0,0 +1,202 @@
+/*
+ * Architecture specific debugfs files
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/setup.h>
+
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
+#ifdef CONFIG_DEBUG_BOOT_PARAMS
+struct setup_data_node {
+ u64 paddr;
+ u32 type;
+ u32 len;
+};
+
+static ssize_t setup_data_read(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
+{
+	struct setup_data_node *node = file->private_data;
+	loff_t off = *ppos;
+	unsigned long uncopied;
+	void *map;
+
+	if (off < 0)
+		return -EINVAL;
+
+	/* Reading at or past the end of the payload is EOF. */
+	if (off >= node->len)
+		return 0;
+
+	/* Never hand out more than what is left of the payload. */
+	if (count > node->len - off)
+		count = node->len - off;
+
+	/* The payload starts right behind the struct setup_data header. */
+	map = memremap(node->paddr + sizeof(struct setup_data) + off,
+		       count, MEMREMAP_WB);
+	if (!map)
+		return -ENOMEM;
+
+	uncopied = copy_to_user(user_buf, map, count);
+	memunmap(map);
+	if (uncopied)
+		return -EFAULT;
+
+	*ppos = off + count;
+
+	return count;
+}
+
+static const struct file_operations fops_setup_data = {
+ .read = setup_data_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static int __init
+create_setup_data_node(struct dentry *parent, int no,
+		       struct setup_data_node *node)
+{
+	struct dentry *dir, *type_file;
+	char name[16];
+
+	/* The directory is simply named after the entry's index. */
+	sprintf(name, "%d", no);
+	dir = debugfs_create_dir(name, parent);
+	if (!dir)
+		return -ENOMEM;
+
+	type_file = debugfs_create_x32("type", S_IRUGO, dir, &node->type);
+	if (!type_file)
+		goto err_dir;
+
+	if (!debugfs_create_file("data", S_IRUGO, dir, node,
+				 &fops_setup_data))
+		goto err_type;
+	return 0;
+
+err_type:
+	debugfs_remove(type_file);
+err_dir:
+	debugfs_remove(dir);
+	return -ENOMEM;
+}
+
+/*
+ * Create one numbered debugfs directory per boot_params setup_data entry.
+ * On failure the debugfs entries created here are removed again.
+ */
+static int __init create_setup_data_nodes(struct dentry *parent)
+{
+	u64 pa_data = boot_params.hdr.setup_data;
+	struct setup_data_node *node;
+	struct setup_data *data;
+	int error, no = 0;
+	struct dentry *d;
+
+	d = debugfs_create_dir("setup_data", parent);
+	if (!d)
+		return -ENOMEM;
+
+	while (pa_data) {
+		node = kmalloc(sizeof(*node), GFP_KERNEL);
+		if (!node) {
+			error = -ENOMEM;
+			goto err_dir;
+		}
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+		if (!data) {
+			kfree(node);
+			error = -ENOMEM;
+			goto err_dir;
+		}
+		node->paddr = pa_data;
+		node->type = data->type;
+		node->len = data->len;
+		pa_data = data->next;
+		memunmap(data);
+
+		error = create_setup_data_node(d, no++, node);
+		if (error) {
+			/* Its debugfs entries are gone, so node is unused. */
+			kfree(node);
+			goto err_dir;
+		}
+	}
+	return 0;
+
+err_dir:
+	debugfs_remove_recursive(d);
+	return error;
+}
+
+static struct debugfs_blob_wrapper boot_params_blob = {
+ .data = &boot_params,
+ .size = sizeof(boot_params),
+};
+
+static int __init boot_params_kdebugfs_init(void)
+{
+ struct dentry *dbp, *version, *data;
+ int error = -ENOMEM;
+
+ dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
+ if (!dbp)
+ return -ENOMEM;
+
+ version = debugfs_create_x16("version", S_IRUGO, dbp,
+ &boot_params.hdr.version);
+ if (!version)
+ goto err_dir;
+
+ data = debugfs_create_blob("data", S_IRUGO, dbp,
+ &boot_params_blob);
+ if (!data)
+ goto err_version;
+
+ error = create_setup_data_nodes(dbp);
+ if (error)
+ goto err_data;
+
+ return 0;
+
+err_data:
+ debugfs_remove(data);
+err_version:
+ debugfs_remove(version);
+err_dir:
+ debugfs_remove(dbp);
+ return error;
+}
+#endif /* CONFIG_DEBUG_BOOT_PARAMS */
+
+static int __init arch_kdebugfs_init(void)
+{
+	struct dentry *root = debugfs_create_dir("x86", NULL);
+
+	/* Stored even when creation failed, like a direct assignment. */
+	arch_debugfs_dir = root;
+	if (!root)
+		return -ENOMEM;
+#ifdef CONFIG_DEBUG_BOOT_PARAMS
+	return boot_params_kdebugfs_init();
+#else
+	return 0;
+#endif
+}
+arch_initcall(arch_kdebugfs_init);
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 0000000..278cd07
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,547 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verification.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+#include <asm/e820/api.h>
+#include <asm/kexec-bzimage64.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these so I am retaining it. It can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * Container for loader-specific data that is allocated by bzImage64_load()
+ * and released later by bzImage64_cleanup(). It currently has a single
+ * field but can grow as needed.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer holding the bootparams segment contents
+ * (boot_params, command line and EFI data). It is freed once the
+ * bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+/*
+ * Record where the initrd was loaded and how large it is.
+ *
+ * The low 32 bits of the address and length go into the setup header
+ * (ramdisk_image / ramdisk_size); the bits above 32 go into the
+ * ext_ramdisk_* fields of @params. Always returns 0.
+ */
+static int setup_initrd(struct boot_params *params,
+		unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+	struct setup_header *hdr = &params->hdr;
+
+	/* Upper halves. */
+	params->ext_ramdisk_image = initrd_load_addr >> 32;
+	params->ext_ramdisk_size = initrd_len >> 32;
+
+	/* Lower halves, stored in the 32-bit header fields. */
+	hdr->ramdisk_image = initrd_load_addr & 0xffffffffUL;
+	hdr->ramdisk_size = initrd_len & 0xffffffffUL;
+
+	return 0;
+}
+
+/*
+ * Build the final kernel command line inside the bootparams buffer and
+ * record its physical address in the boot parameters.
+ *
+ * @image:                kexec image; for KEXEC_TYPE_CRASH images an
+ *                        "elfcorehdr=0x<addr> " prefix is written first.
+ * @params:               start of the bootparams buffer.
+ * @bootparams_load_addr: physical address the buffer is loaded at.
+ * @cmdline_offset:       offset of the command line area inside @params.
+ * @cmdline:              command line supplied by the caller.
+ * @cmdline_len:          number of bytes in @cmdline (may be 0); the last
+ *                        byte of the combined string is forced to NUL.
+ *
+ * The caller must have sized the command line area to hold the optional
+ * prefix plus @cmdline_len bytes. Always returns 0.
+ */
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+			 unsigned long bootparams_load_addr,
+			 unsigned long cmdline_offset, char *cmdline,
+			 unsigned long cmdline_len)
+{
+	char *cmdline_ptr = ((char *)params) + cmdline_offset;
+	unsigned long cmdline_ptr_phys, len = 0;
+	uint32_t cmdline_low_32, cmdline_ext_32;
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		len = sprintf(cmdline_ptr,
+			"elfcorehdr=0x%lx ", image->arch.elf_load_addr);
+	}
+	if (cmdline_len)
+		memcpy(cmdline_ptr + len, cmdline, cmdline_len);
+	cmdline_len += len;
+
+	/*
+	 * Force NUL termination. With an empty command line and no prefix
+	 * the total length is 0, and indexing with "cmdline_len - 1" would
+	 * wrap around and clobber the byte in front of the command line
+	 * area; terminate at offset 0 instead.
+	 */
+	if (cmdline_len)
+		cmdline_ptr[cmdline_len - 1] = '\0';
+	else
+		cmdline_ptr[0] = '\0';
+
+	pr_debug("Final command line is: %s\n", cmdline_ptr);
+	cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+	cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+	cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+	params->hdr.cmd_line_ptr = cmdline_low_32;
+	/* The extended field is only written when the address needs it. */
+	if (cmdline_ext_32)
+		params->ext_cmd_line_ptr = cmdline_ext_32;
+
+	return 0;
+}
+
+/*
+ * Copy the kexec e820 table into the boot parameters.
+ *
+ * At most E820_MAX_ENTRIES_ZEROPAGE entries are copied; any further
+ * entries are dropped. Always returns 0.
+ */
+static int setup_e820_entries(struct boot_params *params)
+{
+	unsigned int nr_e820_entries;
+
+	nr_e820_entries = e820_table_kexec->nr_entries;
+
+	/* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */
+	if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE)
+		nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE;
+
+	params->e820_entries = nr_e820_entries;
+	memcpy(&params->e820_table, &e820_table_kexec->entries,
+	       nr_e820_entries * sizeof(struct e820_entry));
+
+	return 0;
+}
+
+#ifdef CONFIG_EFI
+/*
+ * Copy the EFI runtime memory map into the bootparams buffer and point
+ * efi_info at it.
+ *
+ * @params:           start of the bootparams buffer.
+ * @params_load_addr: physical address the buffer is loaded at.
+ * @efi_map_offset:   offset of the map area inside @params.
+ * @efi_map_sz:       size of the map in bytes; nothing is done when 0.
+ *
+ * Always returns 0.
+ */
+static int setup_efi_info_memmap(struct boot_params *params,
+				  unsigned long params_load_addr,
+				  unsigned int efi_map_offset,
+				  unsigned int efi_map_sz)
+{
+	void *efi_map = (void *)params + efi_map_offset;
+	unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+	struct efi_info *ei = &params->efi_info;
+
+	if (!efi_map_sz)
+		return 0;
+
+	efi_runtime_map_copy(efi_map, efi_map_sz);
+
+	/* The physical address is split into low and high 32-bit halves. */
+	ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+	ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+	ei->efi_memmap_size = efi_map_sz;
+
+	return 0;
+}
+
+/*
+ * Fill in a SETUP_EFI setup_data node located at @efi_setup_data_offset
+ * inside the bootparams buffer and push it onto the front of the
+ * hdr.setup_data list. The node's payload, a struct efi_setup_data, sits
+ * directly after the setup_data header. Always returns 0.
+ */
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+		       unsigned long params_load_addr,
+		       unsigned int efi_setup_data_offset)
+{
+	struct setup_data *node = (void *)params + efi_setup_data_offset;
+	struct efi_setup_data *payload = (void *)node + sizeof(*node);
+	unsigned long node_phys = params_load_addr + efi_setup_data_offset;
+
+	/* Node header. */
+	node->type = SETUP_EFI;
+	node->len = sizeof(*payload);
+
+	/* Payload: values taken from the running kernel's efi state. */
+	payload->fw_vendor = efi.fw_vendor;
+	payload->runtime = efi.runtime;
+	payload->tables = efi.config_table;
+	payload->smbios = efi.smbios;
+
+	/* Link the node in as the new head of the setup_data list. */
+	node->next = params->hdr.setup_data;
+	params->hdr.setup_data = node_phys;
+
+	return 0;
+}
+
+/*
+ * Pass the EFI state of the running kernel on to the kexec'ed kernel.
+ *
+ * Nothing is done when the current boot_params carry no EFI memory map or
+ * when EFI_OLD_MEMMAP is enabled. Otherwise the loader signature, system
+ * table address and descriptor info are copied, followed by the runtime
+ * memory map and a SETUP_EFI setup_data node. Always returns 0.
+ */
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+		unsigned int efi_map_offset, unsigned int efi_map_sz,
+		unsigned int efi_setup_data_offset)
+{
+	struct efi_info *current_ei = &boot_params.efi_info;
+	struct efi_info *ei = &params->efi_info;
+
+	if (!current_ei->efi_memmap_size)
+		return 0;
+
+	/*
+	 * If 1:1 mapping is not enabled, second kernel can not setup EFI
+	 * and use EFI run time services. User space will have to pass
+	 * acpi_rsdp=<addr> on kernel command line to make second kernel boot
+	 * without efi.
+	 */
+	if (efi_enabled(EFI_OLD_MEMMAP))
+		return 0;
+
+	ei->efi_loader_signature = current_ei->efi_loader_signature;
+	ei->efi_systab = current_ei->efi_systab;
+	ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+	ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+	ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+	/* Both helpers always return 0, so their results are not checked. */
+	setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+			      efi_map_sz);
+	prepare_add_efi_setup_data(params, params_load_addr,
+				   efi_setup_data_offset);
+	return 0;
+}
+#endif /* CONFIG_EFI */
+
+/*
+ * Fill in the parts of @params that are derived from the running system:
+ * hardware subarch, screen info, memory map, memory size fields, EFI state
+ * (with CONFIG_EFI) and EDD data.
+ *
+ * Returns 0 on success or the error from crash_setup_memmap_entries() for
+ * crash images.
+ */
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+		      unsigned long params_load_addr,
+		      unsigned int efi_map_offset, unsigned int efi_map_sz,
+		      unsigned int efi_setup_data_offset)
+{
+	unsigned int nr_e820_entries;
+	unsigned long long mem_k, start, end;
+	int i, ret = 0;
+
+	/* Get subarch from existing bootparams */
+	params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+	/* Copying screen_info will do? */
+	memcpy(&params->screen_info, &boot_params.screen_info,
+	       sizeof(struct screen_info));
+
+	/* Fill in memsize later */
+	params->screen_info.ext_mem_k = 0;
+	params->alt_mem_k = 0;
+
+	/* Default APM info */
+	memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+	/* Default drive info */
+	memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+	memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		ret = crash_setup_memmap_entries(image, params);
+		if (ret)
+			return ret;
+	} else
+		setup_e820_entries(params);
+
+	nr_e820_entries = params->e820_entries;
+
+	/*
+	 * Derive ext_mem_k / alt_mem_k (in KiB above 1 MiB) from the RAM
+	 * range that spans the 1 MiB mark, clamping each to the largest
+	 * value used for its field.
+	 */
+	for (i = 0; i < nr_e820_entries; i++) {
+		if (params->e820_table[i].type != E820_TYPE_RAM)
+			continue;
+		start = params->e820_table[i].addr;
+		end = params->e820_table[i].addr + params->e820_table[i].size - 1;
+
+		if ((start <= 0x100000) && end > 0x100000) {
+			mem_k = (end >> 10) - (0x100000 >> 10);
+			params->screen_info.ext_mem_k = mem_k;
+			params->alt_mem_k = mem_k;
+			if (mem_k > 0xfc00)
+				params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+			if (mem_k > 0xffffffff)
+				params->alt_mem_k = 0xffffffff;
+		}
+	}
+
+#ifdef CONFIG_EFI
+	/* Setup EFI state */
+	setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+			efi_setup_data_offset);
+#endif
+
+	/* Setup EDD info */
+	memcpy(params->eddbuf, boot_params.eddbuf,
+	       EDDMAXNR * sizeof(struct edd_info));
+	params->eddbuf_entries = boot_params.eddbuf_entries;
+
+	memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+	       EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+	return ret;
+}
+
+/*
+ * Decide whether @buf (of @len bytes) is a 64-bit relocatable bzImage this
+ * loader can handle.
+ *
+ * Returns 0 when every check passes and -ENOEXEC otherwise.
+ */
+static int bzImage64_probe(const char *buf, unsigned long len)
+{
+	const struct setup_header *hdr;
+
+	/* kernel should be at least two sectors long */
+	if (len < 2 * 512) {
+		pr_err("File is too short to be a bzImage\n");
+		return -ENOEXEC;
+	}
+
+	hdr = (const struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+
+	if (memcmp(&hdr->header, "HdrS", 4) != 0) {
+		pr_err("Not a bzImage\n");
+		return -ENOEXEC;
+	}
+
+	if (hdr->boot_flag != 0xAA55) {
+		pr_err("No x86 boot sector present\n");
+		return -ENOEXEC;
+	}
+
+	if (hdr->version < 0x020C) {
+		pr_err("Must be at least protocol version 2.12\n");
+		return -ENOEXEC;
+	}
+
+	if (!(hdr->loadflags & LOADED_HIGH)) {
+		pr_err("zImage not a bzImage\n");
+		return -ENOEXEC;
+	}
+
+	if (!(hdr->xloadflags & XLF_KERNEL_64)) {
+		pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+		return -ENOEXEC;
+	}
+
+	if (!(hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+		pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+		return -ENOEXEC;
+	}
+
+	/*
+	 * A 32-bit EFI environment does not allow loading the kernel above
+	 * 4G, so such systems are left to the 32-bit bzImage loader.
+	 */
+	if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+		pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+		return -ENOEXEC;
+	}
+
+	/* All checks passed: this is a bzImage we can load. */
+	pr_debug("It's a relocatable bzImage64\n");
+	return 0;
+}
+
+/*
+ * Load a 64-bit bzImage for kexec.
+ *
+ * Sets up, in order: the crash segments (crash images only), purgatory,
+ * one combined segment holding boot_params + command line + EFI data, the
+ * kernel proper, and the optional initrd. It then programs purgatory's
+ * entry64_regs and fills in the boot parameters.
+ *
+ * @image:       kexec image being assembled.
+ * @kernel:      bzImage file contents (@kernel_len bytes).
+ * @initrd:      initrd contents (@initrd_len bytes), or NULL.
+ * @cmdline:     command line (@cmdline_len bytes).
+ *
+ * Returns a struct bzimage64_data pointer that owns the temporary
+ * bootparams buffer (released by bzImage64_cleanup()), or an ERR_PTR()
+ * on failure.
+ */
+static void *bzImage64_load(struct kimage *image, char *kernel,
+			    unsigned long kernel_len, char *initrd,
+			    unsigned long initrd_len, char *cmdline,
+			    unsigned long cmdline_len)
+{
+	struct setup_header *header;
+	int setup_sects, kern16_size, ret = 0;
+	unsigned long setup_header_size, params_cmdline_sz;
+	struct boot_params *params;
+	unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+	struct bzimage64_data *ldata;
+	struct kexec_entry64_regs regs64;
+	void *stack;
+	unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+	unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+	struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+				  .top_down = true };
+	struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR,
+				  .buf_max = ULONG_MAX, .top_down = true };
+
+	header = (struct setup_header *)(kernel + setup_hdr_offset);
+	setup_sects = header->setup_sects;
+	/* A setup_sects value of 0 is treated as 4. */
+	if (setup_sects == 0)
+		setup_sects = 4;
+
+	/* Size of the real-mode part: boot sector plus setup sectors. */
+	kern16_size = (setup_sects + 1) * 512;
+	if (kernel_len < kern16_size) {
+		pr_err("bzImage truncated\n");
+		return ERR_PTR(-ENOEXEC);
+	}
+
+	if (cmdline_len > header->cmdline_size) {
+		pr_err("Kernel command line too long\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * In case of crash dump, we will append elfcorehdr=<addr> to
+	 * command line. Make sure it does not overflow
+	 */
+	if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+		pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Allocate and load backup region */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		ret = crash_load_segments(image);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	/*
+	 * Load purgatory. For 64bit entry point, purgatory code can be
+	 * anywhere.
+	 */
+	ret = kexec_load_purgatory(image, &pbuf);
+	if (ret) {
+		pr_err("Loading purgatory failed\n");
+		return ERR_PTR(ret);
+	}
+
+	pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+
+	/*
+	 * Load Bootparams and cmdline and space for efi stuff.
+	 *
+	 * Allocate memory together for multiple data structures so
+	 * that they all can go in a single area/segment and we don't
+	 * have to create a separate segment for each. This keeps things
+	 * a little bit simpler.
+	 */
+	efi_map_sz = efi_get_runtime_map_size();
+	params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+				MAX_ELFCOREHDR_STR_LEN;
+	params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+	kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
+				sizeof(struct setup_data) +
+				sizeof(struct efi_setup_data);
+
+	params = kzalloc(kbuf.bufsz, GFP_KERNEL);
+	if (!params)
+		return ERR_PTR(-ENOMEM);
+	efi_map_offset = params_cmdline_sz;
+	efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
+
+	/*
+	 * Copy setup header onto bootparams. Documentation/x86/boot.txt
+	 *
+	 * The byte at 0x201 is added to 0x202 to find the end of the
+	 * header. It must be read as unsigned: through a plain (signed)
+	 * char a value >= 0x80 would turn negative and wrap the size into
+	 * a huge memcpy length.
+	 */
+	setup_header_size = 0x0202 + (unsigned char)kernel[0x0201] -
+				setup_hdr_offset;
+
+	/* Is there a limit on setup header size? */
+	memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+	kbuf.buffer = params;
+	kbuf.memsz = kbuf.bufsz;
+	kbuf.buf_align = 16;
+	kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+	ret = kexec_add_buffer(&kbuf);
+	if (ret)
+		goto out_free_params;
+	bootparam_load_addr = kbuf.mem;
+	pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		 bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
+
+	/* Load kernel */
+	kbuf.buffer = kernel + kern16_size;
+	kbuf.bufsz = kernel_len - kern16_size;
+	kbuf.memsz = PAGE_ALIGN(header->init_size);
+	kbuf.buf_align = header->kernel_alignment;
+	kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+	ret = kexec_add_buffer(&kbuf);
+	if (ret)
+		goto out_free_params;
+	kernel_load_addr = kbuf.mem;
+
+	pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		 kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+
+	/* Load initrd high */
+	if (initrd) {
+		kbuf.buffer = initrd;
+		kbuf.bufsz = kbuf.memsz = initrd_len;
+		kbuf.buf_align = PAGE_SIZE;
+		kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+		ret = kexec_add_buffer(&kbuf);
+		if (ret)
+			goto out_free_params;
+		initrd_load_addr = kbuf.mem;
+
+		pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+				initrd_load_addr, initrd_len, initrd_len);
+
+		setup_initrd(params, initrd_load_addr, initrd_len);
+	}
+
+	/* The command line area starts right after struct boot_params. */
+	setup_cmdline(image, params, bootparam_load_addr,
+		      sizeof(struct boot_params), cmdline, cmdline_len);
+
+	/* bootloader info. Do we need a separate ID for kexec kernel loader? */
+	params->hdr.type_of_loader = 0x0D << 4;
+	params->hdr.loadflags = 0;
+
+	/* Setup purgatory regs for entry: read, modify, write back. */
+	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+					     sizeof(regs64), 1);
+	if (ret)
+		goto out_free_params;
+
+	regs64.rbx = 0; /* Bootstrap Processor */
+	regs64.rsi = bootparam_load_addr;
+	regs64.rip = kernel_load_addr + 0x200;
+	stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+	if (IS_ERR(stack)) {
+		pr_err("Could not find address of symbol stack_end\n");
+		ret = -EINVAL;
+		goto out_free_params;
+	}
+
+	regs64.rsp = (unsigned long)stack;
+	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+					     sizeof(regs64), 0);
+	if (ret)
+		goto out_free_params;
+
+	ret = setup_boot_parameters(image, params, bootparam_load_addr,
+				    efi_map_offset, efi_map_sz,
+				    efi_setup_data_offset);
+	if (ret)
+		goto out_free_params;
+
+	/* Allocate loader specific data */
+	ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+	if (!ldata) {
+		ret = -ENOMEM;
+		goto out_free_params;
+	}
+
+	/*
+	 * Store the pointer to params so that it can be freed once the
+	 * params segment has been loaded and its contents have been copied
+	 * somewhere else.
+	 */
+	ldata->bootparams_buf = params;
+	return ldata;
+
+out_free_params:
+	kfree(params);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Cleanup hook, called after the various segments have been loaded.
+ * Releases the temporary bootparams buffer recorded in @loader_data and
+ * clears the pointer; a NULL @loader_data is accepted. Always returns 0.
+ */
+static int bzImage64_cleanup(void *loader_data)
+{
+	struct bzimage64_data *priv = loader_data;
+
+	if (priv) {
+		kfree(priv->bootparams_buf);
+		priv->bootparams_buf = NULL;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+/*
+ * Signature check for the kernel image: hands the @kernel_len bytes at
+ * @kernel to verify_pefile_signature(), using the secondary keyring and
+ * the kexec PE signature usage, and returns that function's result.
+ */
+static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ return verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+}
+#endif
+
+/*
+ * Operations table of the 64-bit bzImage loader. The verify_sig hook is
+ * only present when CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is enabled.
+ */
+const struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
new file mode 100644
index 0000000..8e36f24
--- /dev/null
+++ b/arch/x86/kernel/kgdb.c
@@ -0,0 +1,816 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+/*
+ * Copyright (C) 2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2007-2008 Jason Wessel, Wind River Systems, Inc.
+ */
+/****************************************************************************
+ * Contributor: Lake Stevens Instrument Division$
+ * Written by: Glenn Engel $
+ * Updated by: Amit Kale<akale@veritas.com>
+ * Updated by: Tom Rini <trini@kernel.crashing.org>
+ * Updated by: Jason Wessel <jason.wessel@windriver.com>
+ * Modified for 386 by Jim Kingdon, Cygnus Support.
+ * Original kgdb, compatibility with 2.1.xx kernel by
+ * David Grothe <dave@gcom.com>
+ * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
+ * X86_64 changes from Andi Kleen's patch merged by Jim Houston
+ */
+#include <linux/spinlock.h>
+#include <linux/kdebug.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
+
+#include <asm/text-patching.h>
+#include <asm/debugreg.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/switch_to.h>
+
+/*
+ * Register table used by dbg_set_reg() and dbg_get_reg(). Each entry lists
+ * the register name, its size in bytes and its offset inside struct
+ * pt_regs. An offset of -1 marks a register that has no pt_regs slot; the
+ * accessors skip the copy for those entries. The 32-bit and 64-bit layouts
+ * differ, while "fs" and "gs" are common to both.
+ */
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
+{
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, si) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
+#endif
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+};
+
+/*
+ * Store the value at @mem into register @regno of @regs.
+ *
+ * Writes to GDB_SP and GDB_ORIG_AX (and, on 32-bit, GDB_SS, GDB_FS and
+ * GDB_GS) are silently ignored, as are registers whose table entry has no
+ * pt_regs offset. @regno is not range-checked here. Always returns 0.
+ */
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	const struct dbg_reg_def_t *def;
+
+	if (regno == GDB_SP || regno == GDB_ORIG_AX)
+		return 0;
+#ifdef CONFIG_X86_32
+	if (regno == GDB_SS || regno == GDB_FS || regno == GDB_GS)
+		return 0;
+#endif
+
+	def = &dbg_reg_def[regno];
+	if (def->offset != -1)
+		memcpy((void *)regs + def->offset, mem, def->size);
+
+	return 0;
+}
+
+/*
+ * Copy register @regno of @regs into @mem and return the register's name.
+ *
+ * GDB_ORIG_AX is served straight from regs->orig_ax. Any other @regno
+ * outside [0, DBG_MAX_REG_NUM) yields NULL. Registers without a pt_regs
+ * offset leave @mem untouched, except on 32-bit where FS/GS report 0xFFFF
+ * and, for non-user-mode @regs, SS/SP are replaced by __KERNEL_DS and the
+ * kernel stack pointer.
+ */
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno == GDB_ORIG_AX) {
+		memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+		return "orig_ax";
+	}
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+
+#ifdef CONFIG_X86_32
+	switch (regno) {
+	case GDB_SS:
+		if (!user_mode(regs))
+			*(unsigned long *)mem = __KERNEL_DS;
+		break;
+	case GDB_SP:
+		if (!user_mode(regs))
+			*(unsigned long *)mem = kernel_stack_pointer(regs);
+		break;
+	case GDB_GS:
+	case GDB_FS:
+		*(unsigned long *)mem = 0xFFFF;
+		break;
+	}
+#endif
+	return dbg_reg_def[regno].name;
+}
+
+/**
+ * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs
+ * @gdb_regs: A pointer to hold the registers in the order GDB wants.
+ * @p: The &struct task_struct of the desired process.
+ *
+ * Convert the register values of the sleeping process in @p to
+ * the format that GDB expects.
+ * This function is called when kgdb does not have access to the
+ * &struct pt_regs and therefore it should fill the gdb registers
+ * @gdb_regs with what has been saved in &struct thread_struct
+ * thread field during switch_to.
+ */
+void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
+{
+#ifndef CONFIG_X86_32
+ /*
+ * 32-bit view of the same buffer, used below for PS/CS/SS.
+ * NOTE(review): both views alias one buffer, so the order of the
+ * stores below may matter - do not reorder without checking the
+ * GDB_* index values.
+ */
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
+ /* General purpose registers are not available here; report them as 0. */
+ gdb_regs[GDB_AX] = 0;
+ gdb_regs[GDB_BX] = 0;
+ gdb_regs[GDB_CX] = 0;
+ gdb_regs[GDB_DX] = 0;
+ gdb_regs[GDB_SI] = 0;
+ gdb_regs[GDB_DI] = 0;
+ /* The frame pointer is read from the inactive task frame at thread.sp. */
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
+#ifdef CONFIG_X86_32
+ gdb_regs[GDB_DS] = __KERNEL_DS;
+ gdb_regs[GDB_ES] = __KERNEL_DS;
+ gdb_regs[GDB_PS] = 0;
+ gdb_regs[GDB_CS] = __KERNEL_CS;
+ gdb_regs[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_FS] = 0xFFFF;
+ gdb_regs[GDB_GS] = 0xFFFF;
+#else
+ gdb_regs32[GDB_PS] = 0;
+ gdb_regs32[GDB_CS] = __KERNEL_CS;
+ gdb_regs32[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_R8] = 0;
+ gdb_regs[GDB_R9] = 0;
+ gdb_regs[GDB_R10] = 0;
+ gdb_regs[GDB_R11] = 0;
+ gdb_regs[GDB_R12] = 0;
+ gdb_regs[GDB_R13] = 0;
+ gdb_regs[GDB_R14] = 0;
+ gdb_regs[GDB_R15] = 0;
+#endif
+ /* PC is reported as 0; SP is the saved thread stack pointer. */
+ gdb_regs[GDB_PC] = 0;
+ gdb_regs[GDB_SP] = p->thread.sp;
+}
+
+/*
+ * Bookkeeping for kgdb hardware breakpoints, one entry per slot
+ * (HBP_NUM slots in total).
+ */
+static struct hw_breakpoint {
+ /* Non-zero while the slot holds an active breakpoint. */
+ unsigned enabled;
+ /* Address the breakpoint is set on. */
+ unsigned long addr;
+ /* Length encoding (X86_BREAKPOINT_LEN_*). */
+ int len;
+ /* Access type (X86_BREAKPOINT_EXECUTE / _WRITE / _RW). */
+ int type;
+ /* Per-CPU perf events backing this slot. */
+ struct perf_event * __percpu *pev;
+} breakinfo[HBP_NUM];
+
+/* DR7 value maintained by hand while dbg_is_early is set. */
+static unsigned long early_dr7;
+
+/*
+ * Install every enabled kgdb hardware breakpoint on the current CPU.
+ *
+ * While dbg_is_early is set the debug registers are programmed directly
+ * and early_dr7 is updated. Otherwise the per-CPU perf event of each slot
+ * is refreshed from breakinfo[] and installed if it is currently disabled,
+ * and hw_breakpoint_restore() is called at the end.
+ */
+static void kgdb_correct_hw_break(void)
+{
+	int slot;
+
+	for (slot = 0; slot < HBP_NUM; slot++) {
+		struct hw_breakpoint *bi = &breakinfo[slot];
+		int cpu = raw_smp_processor_id();
+		struct arch_hw_breakpoint *hw;
+		struct perf_event *event;
+
+		if (!bi->enabled)
+			continue;
+
+		if (dbg_is_early) {
+			set_debugreg(bi->addr, slot);
+			early_dr7 |= encode_dr7(slot, bi->len, bi->type);
+			set_debugreg(early_dr7, 7);
+			continue;
+		}
+
+		event = *per_cpu_ptr(bi->pev, cpu);
+		/* Only events that are currently disabled get (re)installed. */
+		if (event->attr.disabled != 1)
+			continue;
+
+		hw = counter_arch_bp(event);
+
+		event->attr.bp_addr = bi->addr;
+		event->attr.bp_len = bi->len;
+		event->attr.bp_type = bi->type;
+		hw->address = bi->addr;
+		hw->len = bi->len;
+		hw->type = bi->type;
+
+		if (!arch_install_hw_breakpoint(event))
+			event->attr.disabled = 0;
+	}
+
+	if (!dbg_is_early)
+		hw_breakpoint_restore();
+}
+
+/*
+ * Reserve breakpoint slot @breakno on every online CPU.
+ *
+ * Nothing is reserved while dbg_is_early is set. If a reservation fails,
+ * the slots reserved so far are released again.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int hw_break_reserve_slot(int breakno)
+{
+	struct perf_event **pevent;
+	int reserved = 0;
+	int cpu;
+
+	if (dbg_is_early)
+		return 0;
+
+	for_each_online_cpu(cpu) {
+		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+		if (dbg_reserve_bp_slot(*pevent))
+			goto rollback;
+		reserved++;
+	}
+
+	return 0;
+
+rollback:
+	/* Undo the reservations made before the failing CPU. */
+	for_each_online_cpu(cpu) {
+		if (!reserved--)
+			break;
+		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+		dbg_release_bp_slot(*pevent);
+	}
+	return -1;
+}
+
+/*
+ * Release breakpoint slot @breakno on every online CPU.
+ *
+ * Nothing is done while dbg_is_early is set. The walk stops at the first
+ * CPU whose release fails; the debugger is responsible for handling the
+ * retry in that case.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int hw_break_release_slot(int breakno)
+{
+	int cpu;
+
+	if (dbg_is_early)
+		return 0;
+
+	for_each_online_cpu(cpu) {
+		struct perf_event **pevent;
+
+		pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+		if (dbg_release_bp_slot(*pevent))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove the enabled hardware breakpoint set at @addr.
+ *
+ * Only the address is used for the lookup; @len and @bptype are ignored.
+ * Returns 0 on success, -1 if no enabled breakpoint matches or its slot
+ * cannot be released.
+ */
+static int
+kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+	int slot = 0;
+
+	/* Find the first enabled slot holding @addr. */
+	while (slot < HBP_NUM &&
+	       !(breakinfo[slot].addr == addr && breakinfo[slot].enabled))
+		slot++;
+
+	if (slot == HBP_NUM)
+		return -1;
+
+	if (hw_break_release_slot(slot)) {
+		printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
+		return -1;
+	}
+
+	breakinfo[slot].enabled = 0;
+	return 0;
+}
+
+/*
+ * Tear down all enabled kgdb hardware breakpoints.
+ *
+ * A slot whose perf event on this CPU is still installed is only
+ * uninstalled and marked disabled; it stays enabled in breakinfo[].
+ * Otherwise the slot is dropped: early_dr7 is updated in early mode, or
+ * the reserved slot is released, and the entry is marked unused.
+ */
+static void kgdb_remove_all_hw_break(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct perf_event *event;
+	int slot;
+
+	for (slot = 0; slot < HBP_NUM; slot++) {
+		struct hw_breakpoint *bi = &breakinfo[slot];
+
+		if (!bi->enabled)
+			continue;
+
+		event = *per_cpu_ptr(bi->pev, cpu);
+		if (!event->attr.disabled) {
+			arch_uninstall_hw_breakpoint(event);
+			event->attr.disabled = 1;
+			continue;
+		}
+
+		if (dbg_is_early) {
+			early_dr7 &= ~encode_dr7(slot, bi->len, bi->type);
+		} else if (hw_break_release_slot(slot)) {
+			printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
+			       bi->addr);
+		}
+
+		bi->enabled = 0;
+	}
+}
+
+/*
+ * Claim a free hardware breakpoint slot for @addr.
+ *
+ * @bptype selects the access type; for BP_HARDWARE_BREAKPOINT the length
+ * is forced to 1. @len must be 1, 2 or 4 (or 8 on 64-bit).
+ *
+ * Returns 0 on success, -1 if no slot is free, the type or length is not
+ * supported, or the slot cannot be reserved.
+ */
+static int
+kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+	struct hw_breakpoint *bi;
+	int slot = 0;
+
+	/* Pick the first slot that is not in use. */
+	while (slot < HBP_NUM && breakinfo[slot].enabled)
+		slot++;
+	if (slot == HBP_NUM)
+		return -1;
+	bi = &breakinfo[slot];
+
+	if (bptype == BP_HARDWARE_BREAKPOINT) {
+		len = 1;
+		bi->type = X86_BREAKPOINT_EXECUTE;
+	} else if (bptype == BP_WRITE_WATCHPOINT) {
+		bi->type = X86_BREAKPOINT_WRITE;
+	} else if (bptype == BP_ACCESS_WATCHPOINT) {
+		bi->type = X86_BREAKPOINT_RW;
+	} else {
+		return -1;
+	}
+
+	switch (len) {
+	case 1:
+		bi->len = X86_BREAKPOINT_LEN_1;
+		break;
+	case 2:
+		bi->len = X86_BREAKPOINT_LEN_2;
+		break;
+	case 4:
+		bi->len = X86_BREAKPOINT_LEN_4;
+		break;
+#ifdef CONFIG_X86_64
+	case 8:
+		bi->len = X86_BREAKPOINT_LEN_8;
+		break;
+#endif
+	default:
+		return -1;
+	}
+
+	bi->addr = addr;
+	if (hw_break_reserve_slot(slot)) {
+		/* Reservation failed: forget the address again. */
+		bi->addr = 0;
+		return -1;
+	}
+
+	bi->enabled = 1;
+	return 0;
+}
+
+/**
+ * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
+ * @regs: Current &struct pt_regs.
+ *
+ * This function will be called if the particular architecture must
+ * disable hardware debugging while it is processing gdb packets or
+ * handling exception.
+ */
+static void kgdb_disable_hw_debug(struct pt_regs *regs)
+{
+	int cpu = raw_smp_processor_id();
+	struct perf_event *event;
+	int slot;
+
+	/* Clear DR7 so no hardware breakpoint fires while kgdb is active. */
+	set_debugreg(0UL, 7);
+
+	for (slot = 0; slot < HBP_NUM; slot++) {
+		struct hw_breakpoint *bi = &breakinfo[slot];
+
+		if (!bi->enabled)
+			continue;
+
+		if (dbg_is_early) {
+			/* Early mode: only the shadow DR7 value is updated. */
+			early_dr7 &= ~encode_dr7(slot, bi->len, bi->type);
+			continue;
+		}
+
+		event = *per_cpu_ptr(bi->pev, cpu);
+		if (event->attr.disabled != 1) {
+			arch_uninstall_hw_breakpoint(event);
+			event->attr.disabled = 1;
+		}
+	}
+}
+
+#ifdef CONFIG_SMP
+/**
+ * kgdb_roundup_cpus - Get other CPUs into a holding pattern
+ * @flags: Current IRQ state
+ *
+ * On SMP systems, we need to get the attention of the other CPUs
+ * and get them to be in a known state. This should do what is needed
+ * to get the other CPUs to call kgdb_wait(). Note that on some arches,
+ * the NMI approach is not used for rounding up all the CPUs. For example,
+ * in case of MIPS, smp_call_function() is used to roundup CPUs. In
+ * this case, we have to make sure that interrupts are enabled before
+ * calling smp_call_function(). The argument to this function is
+ * the flags that will be used when restoring the interrupts. There is
+ * local_irq_save() call before kgdb_roundup_cpus().
+ *
+ * On non-SMP systems, this is not called.
+ */
+void kgdb_roundup_cpus(unsigned long flags)
+{
+ /*
+ * @flags is not used by this implementation: an APIC_DM_NMI IPI is
+ * sent through the APIC driver's send_IPI_allbutself() hook.
+ */
+ apic->send_IPI_allbutself(APIC_DM_NMI);
+}
+#endif
+
+/**
+ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
+ * @e_vector: The error vector of the exception that happened.
+ * @signo: The signal number of the exception that happened.
+ * @err_code: The error code of the exception that happened.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
+ *
+ * This function MUST handle the 'c' and 's' command packets,
+ * as well as packets to set / remove a hardware breakpoint, if used.
+ * If there are additional packets which the hardware needs to handle,
+ * they are handled here. The code should return -1 if it wants to
+ * process more packets, and a %0 or %1 if it wants to exit from the
+ * kgdb callback.
+ */
+int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
+			       char *remcomInBuffer, char *remcomOutBuffer,
+			       struct pt_regs *linux_regs)
+{
+	const char cmd = remcomInBuffer[0];
+	unsigned long addr;
+	char *ptr;
+
+	/* Anything but 'c', 's', 'D' or 'k' keeps us inside the handler. */
+	if (cmd != 'c' && cmd != 's' && cmd != 'D' && cmd != 'k')
+		return -1;
+
+	if (cmd == 'c' || cmd == 's') {
+		/* try to read optional parameter, pc unchanged if no parm */
+		ptr = &remcomInBuffer[1];
+		if (kgdb_hex2long(&ptr, &addr))
+			linux_regs->ip = addr;
+	}
+
+	/* Common tail for all four packets: clear the trace bit ... */
+	linux_regs->flags &= ~X86_EFLAGS_TF;
+	atomic_set(&kgdb_cpu_doing_single_step, -1);
+
+	/* ... and set it again, recording this CPU, if we're stepping. */
+	if (cmd == 's') {
+		linux_regs->flags |= X86_EFLAGS_TF;
+		atomic_set(&kgdb_cpu_doing_single_step,
+			   raw_smp_processor_id());
+	}
+
+	return 0;
+}
+
+/*
+ * Handle a single-step exception that crossed from kernel space into user
+ * space: swallow it and let the process continue, as if a 'c' packet had
+ * been received. Returns NOTIFY_STOP.
+ */
+static inline int
+single_step_cont(struct pt_regs *regs, struct die_args *args)
+{
+	unsigned long *dr6;
+
+	printk(KERN_ERR "KGDB: trap/step from kernel to user space, "
+			"resuming...\n");
+	kgdb_arch_handle_exception(args->trapnr, args->signr,
+				   args->err, "c", "", regs);
+
+	/*
+	 * args->err carries a pointer to the dr6 value; clear its BS
+	 * (DR_STEP) bit to denote completion of processing.
+	 */
+	dr6 = (unsigned long *)ERR_PTR(args->err);
+	*dr6 &= ~DR_STEP;
+
+	return NOTIFY_STOP;
+}
+
+/* One bit per CPU, set when that CPU was rounded up through the kgdb NMI. */
+static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS);
+
+/*
+ * NMI handler registered for NMI_LOCAL and NMI_UNKNOWN.
+ *
+ * NMI_LOCAL: when kgdb is active, run the roundup callback for this CPU,
+ * remember that in was_in_debug_nmi and claim the NMI.
+ * NMI_UNKNOWN: claim the NMI if this CPU's bit was set, clearing it.
+ * Everything else is reported as NMI_DONE.
+ */
+static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+	int cpu;
+
+	if (cmd == NMI_LOCAL) {
+		if (atomic_read(&kgdb_active) == -1)
+			return NMI_DONE;
+
+		/* KGDB CPU roundup */
+		cpu = raw_smp_processor_id();
+		kgdb_nmicallback(cpu, regs);
+		set_bit(cpu, was_in_debug_nmi);
+		touch_nmi_watchdog();
+
+		return NMI_HANDLED;
+	}
+
+	if (cmd == NMI_UNKNOWN) {
+		cpu = raw_smp_processor_id();
+
+		if (__test_and_clear_bit(cpu, was_in_debug_nmi))
+			return NMI_HANDLED;
+	}
+
+	return NMI_DONE;
+}
+
+/*
+ * Core of the kgdb die notifier.
+ *
+ * Decides whether the event described by @args / @cmd belongs to kgdb and,
+ * if so, passes it to kgdb_handle_exception(). Returns NOTIFY_STOP when
+ * kgdb handled the event (or a kernel-to-user single step was swallowed)
+ * and NOTIFY_DONE otherwise.
+ */
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+	struct pt_regs *regs = args->regs;
+
+	if (cmd == DIE_DEBUG &&
+	    atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+		/* kgdb itself is single stepping. */
+		if (user_mode(regs))
+			return single_step_cont(regs, args);
+	} else {
+		/*
+		 * A user thread single stepping a system call should be
+		 * ignored.
+		 */
+		if (cmd == DIE_DEBUG && test_thread_flag(TIF_SINGLESTEP))
+			return NOTIFY_DONE;
+
+		if (user_mode(regs))
+			return NOTIFY_DONE;
+	}
+
+	if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
+		return NOTIFY_DONE;
+
+	/* Must touch watchdog before return to normal operation */
+	touch_nmi_watchdog();
+	return NOTIFY_STOP;
+}
+
+/*
+ * Low-level trap entry: package the arguments into a struct die_args and
+ * run them through __kgdb_notify(). Returns NOTIFY_DONE right away when no
+ * kgdb I/O module is registered.
+ */
+int kgdb_ll_trap(int cmd, const char *str,
+		 struct pt_regs *regs, long err, int trap, int sig)
+{
+	struct die_args args = {
+		.regs	= regs,
+		.str	= str,
+		.err	= err,
+		.trapnr	= trap,
+		.signr	= sig,
+	};
+
+	return kgdb_io_module_registered ? __kgdb_notify(&args, cmd)
+					 : NOTIFY_DONE;
+}
+
+/*
+ * Notifier callback installed through kgdb_notifier: runs __kgdb_notify()
+ * on @ptr and @cmd with local interrupts disabled and returns its result.
+ * @self is not used.
+ */
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __kgdb_notify(ptr, cmd);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/* Die-chain notifier block, registered by kgdb_arch_init(). */
+static struct notifier_block kgdb_notifier = {
+ .notifier_call = kgdb_notify,
+};
+
+/**
+ * kgdb_arch_init - Perform any architecture specific initialization.
+ *
+ * Registers kgdb_notifier on the die notifier chain and installs
+ * kgdb_nmi_handler() for both NMI_LOCAL and NMI_UNKNOWN. Registrations
+ * that already succeeded are undone if a later one fails.
+ *
+ * Return: 0 on success, otherwise the error of the failing registration.
+ */
+int kgdb_arch_init(void)
+{
+	int err;
+
+	err = register_die_notifier(&kgdb_notifier);
+	if (err)
+		return err;
+
+	err = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler, 0, "kgdb");
+	if (err)
+		goto undo_die_notifier;
+
+	err = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler, 0, "kgdb");
+	if (err)
+		goto undo_local_nmi;
+
+	return 0;
+
+undo_local_nmi:
+	unregister_nmi_handler(NMI_LOCAL, "kgdb");
+undo_die_notifier:
+	unregister_die_notifier(&kgdb_notifier);
+	return err;
+}
+
+/*
+ * perf overflow callback installed on the KGDB hardware breakpoints by
+ * kgdb_arch_late(). For every breakinfo[] slot that is enabled, set the
+ * matching DR_TRAPn bit in the current task's saved debugreg6.
+ *
+ * The loop bound is HBP_NUM, the same bound kgdb_arch_late() uses for
+ * breakinfo[], rather than a bare literal.
+ */
+static void kgdb_hw_overflow_handler(struct perf_event *event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++)
+		if (breakinfo[i].enabled)
+			tsk->thread.debugreg6 |= (DR_TRAP0 << i);
+}
+
+/*
+ * Pre-allocate one wide (per-CPU) hardware breakpoint event for every
+ * breakinfo[] slot that does not have one yet.
+ *
+ * The events are created disabled, as 1-byte write breakpoints on the
+ * address of kgdb_arch_init. If an allocation fails, an error is logged
+ * and kgdb_arch_exit() is called to tear everything down.
+ */
+void kgdb_arch_late(void)
+{
+	int i, cpu;
+	struct perf_event_attr attr;
+	struct perf_event **pevent;
+
+	/*
+	 * Pre-allocate the hw breakpoint structures in the non-atomic
+	 * portion of kgdb because this operation requires mutexes to
+	 * complete.
+	 */
+	hw_breakpoint_init(&attr);
+	attr.bp_addr = (unsigned long)kgdb_arch_init;
+	attr.bp_len = HW_BREAKPOINT_LEN_1;
+	attr.bp_type = HW_BREAKPOINT_W;
+	attr.disabled = 1;
+	for (i = 0; i < HBP_NUM; i++) {
+		if (breakinfo[i].pev)
+			continue;
+		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
+		if (IS_ERR((void * __force)breakinfo[i].pev)) {
+			/* The two literals are concatenated; keep the space after "hw". */
+			printk(KERN_ERR "kgdb: Could not allocate hw "
+			       "breakpoints\nDisabling the kernel debugger\n");
+			breakinfo[i].pev = NULL;
+			kgdb_arch_exit();
+			return;
+		}
+		for_each_online_cpu(cpu) {
+			pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+			pevent[0]->hw.sample_period = 1;
+			pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
+			/*
+			 * Drop the destroy callback and release the slot.
+			 * NOTE(review): presumably so these placeholder events
+			 * do not hold a breakpoint slot until KGDB actually
+			 * uses them; confirm against the hw_breakpoint code.
+			 */
+			if (pevent[0]->destroy != NULL) {
+				pevent[0]->destroy = NULL;
+				release_bp_slot(*pevent);
+			}
+		}
+	}
+}
+
+/**
+ * kgdb_arch_exit - Perform any architecture specific uninitialization.
+ *
+ * This function will handle the uninitialization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ *
+ * It unregisters every hardware breakpoint event still recorded in
+ * breakinfo[], then removes both NMI handlers and the die notifier. The
+ * loop bound is HBP_NUM, matching the allocation loop in kgdb_arch_late().
+ */
+void kgdb_arch_exit(void)
+{
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		if (breakinfo[i].pev) {
+			unregister_wide_hw_breakpoint(breakinfo[i].pev);
+			breakinfo[i].pev = NULL;
+		}
+	}
+	unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
+	unregister_nmi_handler(NMI_LOCAL, "kgdb");
+	unregister_die_notifier(&kgdb_notifier);
+}
+
+/**
+ * kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ * @exception: Exception vector number
+ * @regs: Current &struct pt_regs.
+ *
+ * An int3 (vector 3) may still fire after its breakpoint has been removed.
+ * When the byte just before the reported ip belongs to such a removed
+ * breakpoint, rewind ip by one (undoing the increment caused by the int3)
+ * and tell the caller to skip the exception.
+ *
+ * Return: 1 if the exception should be skipped, 0 otherwise.
+ */
+int kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+	if (exception != 3)
+		return 0;
+
+	if (!kgdb_isremovedbreak(regs->ip - 1))
+		return 0;
+
+	regs->ip -= 1;
+	return 1;
+}
+
+/*
+ * Report the program counter for @exception: for vector 3 (int3) this is
+ * the instruction pointer minus one, otherwise the instruction pointer
+ * unchanged.
+ */
+unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	return (exception == 3) ? pc - 1 : pc;
+}
+
+/* Overwrite the saved instruction pointer in @regs with @ip. */
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+/*
+ * Install the breakpoint instruction at bpt->bpt_addr.
+ *
+ * The original bytes are saved in bpt->saved_instr first. A plain
+ * probe_kernel_write() is tried; only if that fails does the code fall
+ * back to text_poke() and verify the result by reading the bytes back.
+ * bpt->type records which method was used (BP_BREAKPOINT or
+ * BP_POKE_BREAKPOINT).
+ *
+ * Returns 0 on success, -EBUSY if text_mutex is held, -EINVAL if the
+ * read-back does not match, or the error from probe_kernel_read().
+ */
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ bpt->type = BP_BREAKPOINT;
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ /* The direct write worked: done, type stays BP_BREAKPOINT. */
+ if (!err)
+ return err;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ return -EBUSY;
+ text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ /* Read back what is now at the address to confirm the poke landed. */
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
+ return -EINVAL;
+ bpt->type = BP_POKE_BREAKPOINT;
+
+ return err;
+}
+
+/*
+ * Restore the saved instruction bytes at bpt->bpt_addr.
+ *
+ * Breakpoints of type BP_POKE_BREAKPOINT are restored with text_poke() and
+ * verified by read-back. In every other case, or when text_mutex is held
+ * or the verification fails, probe_kernel_write() is used and its result
+ * returned.
+ */
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ if (bpt->type != BP_POKE_BREAKPOINT)
+ goto knl_write;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ goto knl_write;
+ text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ /* Fall back to a direct write if the poke cannot be confirmed. */
+ if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
+ goto knl_write;
+ return err;
+
+knl_write:
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+/*
+ * x86 KGDB operations table: the breakpoint opcode (0xcc), the
+ * KGDB_HW_BREAKPOINT flag, and the hardware breakpoint callbacks.
+ */
+struct kgdb_arch arch_kgdb_ops = {
+ /* Breakpoint instruction: */
+ .gdb_bpt_instr = { 0xcc },
+ .flags = KGDB_HW_BREAKPOINT,
+ .set_hw_breakpoint = kgdb_set_hw_break,
+ .remove_hw_breakpoint = kgdb_remove_hw_break,
+ .disable_hw_break = kgdb_disable_hw_debug,
+ .remove_all_hw_break = kgdb_remove_all_hw_break,
+ .correct_hw_break = kgdb_correct_hw_break,
+};
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 0000000..0d33169
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for kernel probes
+#
+
+obj-$(CONFIG_KPROBES) += core.o
+obj-$(CONFIG_OPTPROBES) += opt.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
new file mode 100644
index 0000000..2b949f4
--- /dev/null
+++ b/arch/x86/kernel/kprobes/common.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#include <asm/asm.h>
+
+/*
+ * Assembly fragment that saves the frame-pointer register. With
+ * CONFIG_FRAME_POINTER it additionally copies the stack pointer into the
+ * frame pointer after pushing it; without it only the push is emitted.
+ */
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
+ " mov %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
+#endif
+
+/*
+ * SAVE_REGS_STRING / RESTORE_REGS_STRING: assembly fragments that push and
+ * pop the general-purpose registers around a C call.
+ *
+ * 64-bit: 24 bytes are reserved for cs, ip and orig_ax, then fifteen
+ * registers are pushed; the restore pops them in reverse order and drops
+ * the same 24 bytes.
+ *
+ * 32-bit: 16 bytes are reserved for cs, ip, orig_ax and gs before the
+ * pushes; the restore pops only ebx..eax and then discards 24 bytes
+ * (ds, es, fs, gs, orig_ax, ip), leaving the cs slot on the stack.
+ */
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ SAVE_RBP_STRING \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+/* Mirror image of the 64-bit SAVE_REGS_STRING. */
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %es\n" \
+ " pushl %ds\n" \
+ " pushl %eax\n" \
+ SAVE_RBP_STRING \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+/* Not symmetric with the 32-bit save: segment slots are skipped, not popped. */
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
+/* Check whether the instruction can be boosted; non-zero means it can */
+extern int can_boost(struct insn *insn, void *orig_addr);
+/*
+ * Recover the original instruction if the given address is probed.
+ * Returns 0 if the instruction could not be recovered.
+ */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+ unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ * Returns the length of the copied instruction, or 0 on error.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn);
+
+/*
+ * Generate a relative-jump/call instruction into @dest, with the
+ * displacement computed as if the instruction were located at @from.
+ */
+extern void synthesize_reljump(void *dest, void *from, void *to);
+extern void synthesize_relcall(void *dest, void *from, void *to);
+
+#ifdef CONFIG_OPTPROBES
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else /* !CONFIG_OPTPROBES */
+/* Without optprobes there is never a detour: always report "not set up". */
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ return 0;
+}
+/* Without optprobes nothing is recovered: hand back @addr unchanged. */
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ return addr;
+}
+#endif
+
+#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
new file mode 100644
index 0000000..b0d1e81
--- /dev/null
+++ b/arch/x86/kernel/kprobes/core.c
@@ -0,0 +1,1091 @@
+/*
+ * Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation ( includes contributions from
+ * Rusty Russell).
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> adapted for x86_64 from i386.
+ * 2005-Mar Roland McGrath <roland@redhat.com>
+ * Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com>
+ * Added function return probes functionality
+ * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
+ * kprobe-booster and kretprobe-booster for i386.
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
+ * and kretprobe-booster for x86-64
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
+ * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ * unified x86 kprobes code.
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/sched/debug.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+#include <linux/frame.h>
+#include <linux/kasan.h>
+#include <linux/moduleloader.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/set_memory.h>
+
+#include "common.h"
+
+/* Per-CPU pointer to the kprobe currently being handled (NULL if none). */
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+/* Per-CPU kprobe control block (status and saved flags). */
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+/* Kernel stack pointer of @regs, typed as a pointer to unsigned long. */
+#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
+
+/*
+ * W() packs sixteen 0/1 flags for one opcode row into a 16-bit field and
+ * shifts it to bit (row % 32). Two consecutive rows are OR-ed together to
+ * form each 32-bit element of the table below.
+ */
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+ /*
+ * Bitmap indexed by the second opcode byte of a two-byte opcode; a set
+ * bit means the instruction can be boosted (tested in can_boost()).
+ *
+ * Undefined/reserved opcodes, conditional jump, Opcode Extension
+ * Groups, and some special opcodes can not boost.
+ * This is non-const and volatile to keep gcc from statically
+ * optimizing it out, as variable_test_bit makes gcc think only
+ * *(unsigned long*) is used.
+ */
+static volatile u32 twobyte_is_boostable[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
+ W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
+ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
+ W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
+ W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
+ /* ----------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+/* NULL-terminated list of function names listed as off-limits for kretprobes. */
+struct kretprobe_blackpoint kretprobe_blacklist[] = {
+ {"__switch_to", }, /* This function switches only current task, but
+ doesn't switch kernel stack.*/
+ {NULL, NULL} /* Terminator */
+};
+
+/* Number of entries in kretprobe_blacklist, including the terminator. */
+const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
+
+/*
+ * Write a 5-byte "opcode + rel32" instruction into @dest. The displacement
+ * is computed as if the instruction were located at @from, i.e. relative
+ * to the address right after it (@from + 5), so that it targets @to.
+ */
+static nokprobe_inline void
+__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
+{
+	struct __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} __packed *rel = (struct __arch_relative_insn *)dest;
+	long next_ip = (long)(from) + 5;
+
+	rel->raddr = (s32)((long)(to) - next_ip);
+	rel->op = op;
+}
+
+/*
+ * Write into 'dest' a relative jump instruction that, when placed at
+ * address 'from', jumps to address 'to'.
+ */
+void synthesize_reljump(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_reljump);
+
+/*
+ * Write into 'dest' a relative call instruction that, when placed at
+ * address 'from', calls address 'to'.
+ */
+void synthesize_relcall(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_relcall);
+
+/*
+ * Step over any legacy prefix bytes at @insn (and, on x86-64, one REX
+ * prefix that follows them) and return a pointer to the next byte.
+ */
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
+{
+	insn_attr_t attr;
+
+	for (;;) {
+		attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+		if (!inat_is_legacy_prefix(attr))
+			break;
+		insn++;
+	}
+#ifdef CONFIG_X86_64
+	if (inat_is_rex_prefix(attr))
+		insn++;
+#endif
+	return insn;
+}
+NOKPROBE_SYMBOL(skip_prefixes);
+
+/*
+ * Returns non-zero if INSN is boostable.
+ * RIP relative instructions are adjusted at copying time in 64 bits mode
+ *
+ * @insn: decoded instruction to classify.
+ * @addr: address looked up in the exception tables; an address listed
+ *        there is never boostable.
+ */
+int can_boost(struct insn *insn, void *addr)
+{
+ kprobe_opcode_t opcode;
+
+ if (search_exception_tables((unsigned long)addr))
+ return 0; /* Page fault may occur on this address. */
+
+ /* 2nd-byte opcode */
+ if (insn->opcode.nbytes == 2)
+ return test_bit(insn->opcode.bytes[1],
+ (unsigned long *)twobyte_is_boostable);
+
+ /* Opcodes longer than two bytes are never boosted. */
+ if (insn->opcode.nbytes != 1)
+ return 0;
+
+ /* Can't boost Address-size override prefix */
+ if (unlikely(inat_is_address_size_prefix(insn->attr)))
+ return 0;
+
+ opcode = insn->opcode.bytes[0];
+
+ /* Classify one-byte opcodes by their high nibble. */
+ switch (opcode & 0xf0) {
+ case 0x60:
+ /* can't boost "bound" */
+ return (opcode != 0x62);
+ case 0x70:
+ return 0; /* can't boost conditional jump */
+ case 0x90:
+ return opcode != 0x9a; /* can't boost call far */
+ case 0xc0:
+ /* can't boost software-interruptions */
+ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
+ case 0xd0:
+ /* can boost AA* and XLAT */
+ return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
+ case 0xe0:
+ /* can boost in/out and absolute jmps */
+ return ((opcode & 0x04) || opcode == 0xea);
+ case 0xf0:
+ /* clear and set flags are boostable */
+ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+ default:
+ /*
+ * CS override prefix and call are not boostable.
+ * (0x9a is already handled by the 0x90 case above, so that
+ * comparison is always true here.)
+ */
+ return (opcode != 0x2e && opcode != 0x9a);
+ }
+}
+
+/*
+ * Reconstruct the original instruction bytes at @addr when they may have
+ * been altered by a kprobe or by ftrace.
+ *
+ * Returns @addr itself when neither applies, the address of @buf when the
+ * recovered bytes were placed there, or 0 on failure.
+ */
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ unsigned long faddr;
+
+ kp = get_kprobe((void *)addr);
+ faddr = ftrace_location(addr);
+ /*
+ * Addresses inside the ftrace location are refused by
+ * arch_check_ftrace_location(). Something went terribly wrong
+ * if such an address is checked here.
+ */
+ if (WARN_ON(faddr && faddr != addr))
+ return 0UL;
+ /*
+ * Use the current code if it is not modified by Kprobe
+ * and it cannot be modified by ftrace.
+ */
+ if (!kp && !faddr)
+ return addr;
+
+ /*
+ * Basically, kp->ainsn.insn has an original instruction.
+ * However, RIP-relative instruction can not do single-stepping
+ * at different place, __copy_instruction() tweaks the displacement of
+ * that instruction. In that case, we can't recover the instruction
+ * from the kp->ainsn.insn.
+ *
+ * On the other hand, in case on normal Kprobe, kp->opcode has a copy
+ * of the first byte of the probed instruction, which is overwritten
+ * by int3. And the instruction at kp->addr is not modified by kprobes
+ * except for the first byte, we can recover the original instruction
+ * from it and kp->opcode.
+ *
+ * In case of Kprobes using ftrace, we do not have a copy of
+ * the original instruction. In fact, the ftrace location might
+ * be modified at anytime and even could be in an inconsistent state.
+ * Fortunately, we know that the original code is the ideal 5-byte
+ * long NOP.
+ */
+ if (probe_kernel_read(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ /* Patch the copy: either the 5-byte NOP or the saved first byte. */
+ if (faddr)
+ memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+ else
+ buf[0] = kp->opcode;
+ return (unsigned long)buf;
+}
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ *
+ * The optprobe recovery is attempted first; only when it hands back @addr
+ * unchanged is the regular kprobe/ftrace recovery tried.
+ *
+ * Caller must hold kprobe_mutex, or disable preemption, so that the
+ * kprobes being referenced cannot be released meanwhile.
+ * Returns zero if the instruction can not get recovered (or access failed).
+ */
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+	unsigned long recovered = __recover_optprobed_insn(buf, addr);
+
+	if (recovered == addr)
+		recovered = __recover_probed_insn(buf, addr);
+
+	return recovered;
+}
+
+/*
+ * Check if paddr is at an instruction boundary.
+ *
+ * Instructions are decoded one by one from the start of the symbol that
+ * contains paddr. Returns non-zero only if the decoding lands exactly on
+ * paddr; returns 0 if the symbol lookup fails, an instruction cannot be
+ * recovered, or a breakpoint instruction is found on the way.
+ */
+static int can_probe(unsigned long paddr)
+{
+ unsigned long addr, __addr, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case we replace the breakpoint by the
+ * original instruction in our buffer.
+ * Also, jump optimization will change the breakpoint to
+ * relative-jump. Since the relative-jump itself is
+ * normally used, we just go through if there is no kprobe.
+ */
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return 0;
+ kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+
+ /*
+ * Another debugging subsystem might insert this breakpoint.
+ * In that case, we can't recover it.
+ */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+ addr += insn.length;
+ }
+
+ return (addr == paddr);
+}
+
+/*
+ * Returns non-zero if the instruction at @insn, once its prefixes are
+ * skipped, is one of the opcodes that modify the interrupt flag.
+ */
+static int is_IF_modifier(kprobe_opcode_t *insn)
+{
+	kprobe_opcode_t opcode = *skip_prefixes(insn);
+
+	return opcode == 0xfa ||	/* cli */
+	       opcode == 0xfb ||	/* sti */
+	       opcode == 0xcf ||	/* iret/iretd */
+	       opcode == 0x9d;		/* popf/popfd */
+}
+
+/*
+ * Copy the instruction at @src into @dest, first recovering the original
+ * bytes if kprobes modified them, and adjust the displacement if the
+ * instruction uses the %rip-relative addressing mode. Since @real is the
+ * final location of the copied instruction, the displacement must be
+ * adjusted relative to @real, not @dest.
+ *
+ * @insn receives the decoded copy. Returns the length of the copied
+ * instruction, or 0 on error.
+ */
+int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
+{
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ unsigned long recovered_insn =
+ recover_probed_instruction(buf, (unsigned long)src);
+
+ if (!recovered_insn || !insn)
+ return 0;
+
+ /* This can access kernel text if given address is not recovered */
+ if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
+ return 0;
+
+ /* Decode the copy in @dest, not the original location. */
+ kernel_insn_init(insn, dest, MAX_INSN_SIZE);
+ insn_get_length(insn);
+
+ /* Another subsystem puts a breakpoint, failed to recover */
+ if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return 0;
+
+#ifdef CONFIG_X86_64
+ /* Only x86_64 has RIP relative instructions */
+ if (insn_rip_relative(insn)) {
+ s64 newdisp;
+ u8 *disp;
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) src + (s64) insn->displacement.value
+ - (u8 *) real;
+ /* Reject the copy if the new displacement overflows 32 bits. */
+ if ((s64) (s32) newdisp != newdisp) {
+ pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
+ return 0;
+ }
+ disp = (u8 *) dest + insn_offset_displacement(insn);
+ *(s32 *) disp = (s32) newdisp;
+ }
+#endif
+ return insn->length;
+}
+
+/*
+ * Append a relative jump back to the instruction following the probe
+ * point when the copied instruction is boostable and @buf has room for
+ * it. Records the decision in p->ainsn.boostable and returns the number
+ * of bytes of @buf now in use.
+ */
+static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
+			 struct insn *insn)
+{
+	int len = insn->length;
+	bool boost = can_boost(insn, p->addr) &&
+		     MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE;
+
+	if (boost) {
+		/*
+		 * These instructions can be executed directly if it
+		 * jumps back to correct address.
+		 */
+		synthesize_reljump(buf + len, p->ainsn.insn + len,
+				   p->addr + insn->length);
+		len += RELATIVEJUMP_SIZE;
+	}
+	p->ainsn.boostable = boost;
+
+	return len;
+}
+
+/*
+ * Allocate one page for instruction slots and switch it to read-only.
+ * Returns NULL if the allocation fails.
+ */
+void *alloc_insn_page(void)
+{
+	void *page = module_alloc(PAGE_SIZE);
+
+	if (!page)
+		return NULL;
+
+	set_memory_ro((unsigned long)page & PAGE_MASK, 1);
+	return page;
+}
+
+/* Make the page non-executable and writable again, then release it. */
+void free_insn_page(void *page)
+{
+	unsigned long base = (unsigned long)page & PAGE_MASK;
+
+	set_memory_nx(base, 1);
+	set_memory_rw(base, 1);
+	module_memfree(page);
+}
+
+/*
+ * Build the out-of-line copy of the probed instruction in a local buffer
+ * and write it into the instruction slot p->ainsn.insn.
+ *
+ * Also fills in p->ainsn.boostable (via prepare_boost()),
+ * p->ainsn.if_modifier and p->opcode. Returns 0 on success or -EINVAL if
+ * the instruction could not be copied.
+ */
+static int arch_copy_kprobe(struct kprobe *p)
+{
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int len;
+
+ /* Copy an instruction with recovering if other optprobe modifies it.*/
+ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
+ if (!len)
+ return -EINVAL;
+
+ /*
+ * __copy_instruction can modify the displacement of the instruction,
+ * but it doesn't affect boostable check.
+ */
+ len = prepare_boost(buf, p, &insn);
+
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = is_IF_modifier(buf);
+
+ /* Also, displacement change doesn't affect the first byte */
+ p->opcode = buf[0];
+
+ /* OK, write back the instruction(s) into ROX insn buffer */
+ text_poke(p->ainsn.insn, buf, len);
+
+ return 0;
+}
+
+/*
+ * Validate the probe address, obtain an instruction slot and copy the
+ * probed instruction into it.
+ *
+ * Returns 0 on success, -EINVAL if the address lies in text reserved for
+ * alternatives (or the copy fails), -EILSEQ if it is not on an instruction
+ * boundary, or -ENOMEM if no slot is available. The slot is released again
+ * when the copy fails.
+ */
+int arch_prepare_kprobe(struct kprobe *p)
+{
+	int err;
+
+	if (alternatives_text_reserved(p->addr, p->addr))
+		return -EINVAL;
+	if (!can_probe((unsigned long)p->addr))
+		return -EILSEQ;
+
+	/* insn: must be on special executable page on x86. */
+	p->ainsn.insn = get_insn_slot();
+	if (!p->ainsn.insn)
+		return -ENOMEM;
+
+	err = arch_copy_kprobe(p);
+	if (!err)
+		return 0;
+
+	free_insn_slot(p->ainsn.insn, 0);
+	p->ainsn.insn = NULL;
+	return err;
+}
+
+/* Arm the probe: overwrite the first byte at p->addr with a breakpoint. */
+void arch_arm_kprobe(struct kprobe *p)
+{
+	unsigned char int3[] = { BREAKPOINT_INSTRUCTION };
+
+	text_poke(p->addr, int3, sizeof(int3));
+}
+
+/* Disarm the probe: put the saved first byte (p->opcode) back at p->addr. */
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ text_poke(p->addr, &p->opcode, 1);
+}
+
+/* Release the probe's instruction slot, if it still has one. */
+void arch_remove_kprobe(struct kprobe *p)
+{
+	if (!p->ainsn.insn)
+		return;
+
+	free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
+	p->ainsn.insn = NULL;
+}
+
+/*
+ * Stash the currently running kprobe together with its status and flag
+ * state in kcb->prev_kprobe, so restore_previous_kprobe() can bring them
+ * back later.
+ */
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
+ kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
+}
+
+/*
+ * Reinstate the kprobe, status and flag state recorded in
+ * kcb->prev_kprobe as the current ones for this CPU.
+ */
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
+ kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
+}
+
+/*
+ * Make @p the current kprobe on this CPU and record the TF/IF bits of the
+ * interrupted context. For probes on IF-modifying instructions the IF bit
+ * is dropped from kprobe_saved_flags (but kept in kprobe_old_flags).
+ */
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+		   struct kprobe_ctlblk *kcb)
+{
+	unsigned long tf_if;
+
+	__this_cpu_write(current_kprobe, p);
+
+	tf_if = regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF);
+	kcb->kprobe_old_flags = tf_if;
+	kcb->kprobe_saved_flags = tf_if;
+
+	if (p->ainsn.if_modifier)
+		kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * If the current thread has TIF_BLOCKSTEP set, clear the BTF bit in the
+ * DEBUGCTL MSR; otherwise do nothing.
+ */
+static nokprobe_inline void clear_btf(void)
+{
+	unsigned long debugctl;
+
+	if (!test_thread_flag(TIF_BLOCKSTEP))
+		return;
+
+	debugctl = get_debugctlmsr();
+	update_debugctlmsr(debugctl & ~DEBUGCTLMSR_BTF);
+}
+
+/*
+ * If the current thread has TIF_BLOCKSTEP set, set the BTF bit in the
+ * DEBUGCTL MSR again; otherwise do nothing.
+ */
+static nokprobe_inline void restore_btf(void)
+{
+	unsigned long debugctl;
+
+	if (!test_thread_flag(TIF_BLOCKSTEP))
+		return;
+
+	debugctl = get_debugctlmsr();
+	update_debugctlmsr(debugctl | DEBUGCTLMSR_BTF);
+}
+
+/*
+ * Hijack the return address found at the top of the probed context's
+ * stack: remember the real one in ri->ret_addr and replace it with the
+ * address of kretprobe_trampoline.
+ */
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+	unsigned long *ret_slot = stack_addr(regs);
+
+	ri->ret_addr = (kprobe_opcode_t *)*ret_slot;
+
+	/* Replace the return addr with trampoline addr */
+	*ret_slot = (unsigned long)&kretprobe_trampoline;
+}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
+
+/*
+ * Arrange for the probed instruction of @p to be executed after the
+ * handler returns. Three strategies are tried in order:
+ *  - detour execution (setup_detour_execution() returned non-zero),
+ *  - boosting: jump straight into the copied instruction (only built
+ *    when CONFIG_PREEMPT is off, and only without a post_handler),
+ *  - real single-stepping with TF set and IF cleared.
+ * @reenter is non-zero when called for a probe hit inside another probe.
+ */
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
+{
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
+#if !defined(CONFIG_PREEMPT)
+ if (p->ainsn.boostable && !p->post_handler) {
+ /* Boost up -- we can execute copied instructions directly */
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
+ regs->ip = (unsigned long)p->ainsn.insn;
+ return;
+ }
+#endif
+ if (reenter) {
+ /* Keep the outer probe's state so it can be restored later. */
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ /* Prepare real single stepping */
+ clear_btf();
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+ /* single step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->ip = (unsigned long)p->addr;
+ else
+ regs->ip = (unsigned long)p->ainsn.insn;
+}
+NOKPROBE_SYMBOL(setup_singlestep);
+
+/*
+ * We have reentered the kprobe_handler(), since another probe was hit while
+ * within the handler. We save the original kprobes variables and just single
+ * step on the instruction of the new probe without calling any user handlers.
+ *
+ * Returns 1 when the reentry was set up, 0 for an unexpected status.
+ * A hit while already in KPROBE_REENTER state is fatal (BUG()).
+ */
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SSDONE:
+ case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
+ /* Count the hit as missed: no user handlers run for it. */
+ kprobes_inc_nmissed_count(p);
+ setup_singlestep(p, regs, kcb, 1);
+ break;
+ case KPROBE_REENTER:
+ /* A probe has been hit in the codepath leading up to, or just
+ * after, single-stepping of a probed instruction. This entire
+ * codepath should strictly reside in .kprobes.text section.
+ * Raise a BUG or we'll continue in an endless reentering loop
+ * and eventually a stack overflow.
+ */
+ pr_err("Unrecoverable kprobe detected.\n");
+ dump_kprobe(p);
+ BUG();
+ default:
+ /* impossible cases */
+ WARN_ON(1);
+ return 0;
+ }
+
+ return 1;
+}
+NOKPROBE_SYMBOL(reenter_kprobe);
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
+ *
+ * Returns 1 when the breakpoint was consumed here (a kprobe was handled,
+ * or the int3 had already been removed and ip was rewound), 0 when it is
+ * not a kprobe's breakpoint and must be handled elsewhere.
+ */
+int kprobe_int3_handler(struct pt_regs *regs)
+{
+ kprobe_opcode_t *addr;
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ /* Breakpoints hit in user mode are never handled here. */
+ if (user_mode(regs))
+ return 0;
+
+ /* ip points just past the int3; step back to the probe address. */
+ addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
+ /*
+ * We don't want to be preempted for the entire duration of kprobe
+ * processing. Since int3 and debug trap disables irqs and we clear
+ * IF while singlestepping, it must be no preemptible.
+ */
+
+ kcb = get_kprobe_ctlblk();
+ p = get_kprobe(addr);
+
+ if (p) {
+ if (kprobe_running()) {
+ if (reenter_kprobe(p, regs, kcb))
+ return 1;
+ } else {
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ /*
+ * If we have no pre-handler or it returned 0, we
+ * continue with normal processing. If we have a
+ * pre-handler and it returned non-zero, that means
+ * user handler setup registers to exit to another
+ * instruction, we must skip the single stepping.
+ */
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ setup_singlestep(p, regs, kcb, 0);
+ else
+ reset_current_kprobe();
+ return 1;
+ }
+ } else if (*addr != BREAKPOINT_INSTRUCTION) {
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+ * Back up over the (now missing) int3 and run
+ * the original instruction.
+ */
+ regs->ip = (unsigned long)addr;
+ return 1;
+ } /* else: not a kprobe fault; let the kernel handle it */
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
+
+/*
+ * When a retprobed function returns, this code saves the registers and
+ * calls trampoline_handler(), which calls the kretprobe's handler. The
+ * value returned by trampoline_handler() is written into the stack slot
+ * that the final "ret" pops, so it is used as the return address.
+ */
+asm(
+ ".global kretprobe_trampoline\n"
+ ".type kretprobe_trampoline, @function\n"
+ "kretprobe_trampoline:\n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ /* First argument: pointer to the register frame just built. */
+ " movq %rsp, %rdi\n"
+ " call trampoline_handler\n"
+ /*
+ * Replace saved sp with true return address.
+ * 152 = 15 pushed registers * 8 + 24 skipped bytes + 8 for flags.
+ */
+ " movq %rax, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ " popfq\n"
+#else
+ " pushf\n"
+ SAVE_REGS_STRING
+ /* Argument is passed in %eax. */
+ " movl %esp, %eax\n"
+ " call trampoline_handler\n"
+ /* Move flags to cs */
+ " movl 56(%esp), %edx\n"
+ " movl %edx, 52(%esp)\n"
+ /* Replace saved flags with true return address. */
+ " movl %eax, 56(%esp)\n"
+ RESTORE_REGS_STRING
+ " popf\n"
+#endif
+ " ret\n"
+ ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
+);
+NOKPROBE_SYMBOL(kretprobe_trampoline);
+STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
+
+/*
+ * Called from kretprobe_trampoline
+ *
+ * @regs: register frame saved by the trampoline. cs, ip and orig_ax (plus
+ * gs on 32-bit) are filled in here before any handler sees it.
+ *
+ * Walks the current task's kretprobe instances, runs the registered return
+ * handlers, recycles the instances that belong to this return and gives
+ * back the real return address, which the trampoline then returns to.
+ */
+__visible __used void *trampoline_handler(struct pt_regs *regs)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *tmp;
+ unsigned long flags, orig_ret_address = 0;
+ unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+ kprobe_opcode_t *correct_ret_addr = NULL;
+
+ INIT_HLIST_HEAD(&empty_rp);
+ kretprobe_hash_lock(current, &head, &flags);
+ /* fixup registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = trampoline_address;
+ regs->orig_ax = ~0UL;
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because multiple functions in the call path have
+ * return probes installed on them, and/or more than one
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always pushed into the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the (chronologically) first instance's ret_addr
+ * will be the real return address, and all the rest will
+ * point to kretprobe_trampoline.
+ */
+ /* First pass: only locate the real return address, modify nothing. */
+ hlist_for_each_entry(ri, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ /*
+ * Second pass: run the handlers. Every instance is shown the real
+ * return address found above, even if it recorded the trampoline.
+ */
+ correct_ret_addr = ri->ret_addr;
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ if (ri->rp && ri->rp->handler) {
+ /* Mark this kretprobe's kprobe as current while its handler runs */
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
+ get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
+ ri->ret_addr = correct_ret_addr;
+ ri->rp->handler(ri, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+
+ recycle_rp_inst(ri, &empty_rp);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_hash_unlock(current, &flags);
+
+ /* Free whatever ended up on empty_rp, now that the hash lock is dropped. */
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+ return (void *)orig_ret_address;
+}
+NOKPROBE_SYMBOL(trampoline_handler);
+
+/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt. We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new ip is relative to the copied instruction. We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed flags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ *
+ * If this is the first time we've single-stepped the instruction at
+ * this probepoint, and the instruction is boostable, boost it: add a
+ * jump instruction after the copied instruction, that jumps to the next
+ * instruction after the probepoint.
+ *
+ * Cases that jump to no_change leave regs->ip as the CPU set it; every
+ * other case falls out of the switch and has regs->ip rebased from the
+ * copy onto the original instruction.
+ */
+static void resume_execution(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ unsigned long *tos = stack_addr(regs);
+ unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_ip = (unsigned long)p->addr;
+ kprobe_opcode_t *insn = p->ainsn.insn;
+
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
+
+ regs->flags &= ~X86_EFLAGS_TF;
+ switch (*insn) {
+ case 0x9c: /* pushfl */
+ *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
+ *tos |= kcb->kprobe_old_flags;
+ break;
+ case 0xc2: /* iret/ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xcf:
+ case 0xea: /* jmp absolute -- ip is correct */
+ /* ip is already adjusted, no more changes required */
+ p->ainsn.boostable = true;
+ goto no_change;
+ case 0xe8: /* call relative - Fix return addr */
+ *tos = orig_ip + (*tos - copy_ip);
+ break;
+#ifdef CONFIG_X86_32
+ case 0x9a: /* call absolute -- handled like call absolute, indirect */
+ *tos = orig_ip + (*tos - copy_ip);
+ goto no_change;
+#endif
+ case 0xff:
+ if ((insn[1] & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; ip is correct.
+ * But this is not boostable
+ */
+ *tos = orig_ip + (*tos - copy_ip);
+ goto no_change;
+ } else if (((insn[1] & 0x31) == 0x20) ||
+ ((insn[1] & 0x31) == 0x21)) {
+ /*
+ * jmp near and far, absolute indirect
+ * ip is correct. And this is boostable
+ */
+ p->ainsn.boostable = true;
+ goto no_change;
+ }
+ /* any other 0xff form: fall through to the generic ip fixup */
+ default:
+ break;
+ }
+
+ /* Rebase ip from the copied instruction onto the original one */
+ regs->ip += orig_ip - copy_ip;
+
+no_change:
+ restore_btf();
+}
+NOKPROBE_SYMBOL(resume_execution);
+
+/*
+ * Debug-trap (single-step completion) half of kprobe handling.
+ *
+ * Interrupts are already off on entry, because trap1 is delivered through
+ * an interrupt gate, and nothing in here turns them back on.
+ *
+ * Returns 0 when no kprobe is running on this CPU, or when TF is still set
+ * in the saved flags afterwards; returns 1 when the trap was fully consumed
+ * here.
+ */
+int kprobe_debug_handler(struct pt_regs *regs)
+{
+	struct kprobe *probe = kprobe_running();
+	struct kprobe_ctlblk *ctl = get_kprobe_ctlblk();
+
+	if (!probe)
+		return 0;
+
+	/* Undo the out-of-line single-step, then put the saved flags back. */
+	resume_execution(probe, regs, ctl);
+	regs->flags |= ctl->kprobe_saved_flags;
+
+	/* A reentered probe does not get its post_handler invoked. */
+	if ((ctl->kprobe_status != KPROBE_REENTER) && probe->post_handler) {
+		ctl->kprobe_status = KPROBE_HIT_SSDONE;
+		probe->post_handler(probe, regs, 0);
+	}
+
+	/* Restore the previously saved kprobe state, or clear it entirely. */
+	if (ctl->kprobe_status == KPROBE_REENTER)
+		restore_previous_kprobe(ctl);
+	else
+		reset_current_kprobe();
+
+	/*
+	 * If somebody else is single-stepping across a probe point, the flags
+	 * still carry TF. In that case report "not handled" so the remaining
+	 * do_debug processing runs as if this were not a probe hit.
+	 */
+	return (regs->flags & X86_EFLAGS_TF) ? 0 : 1;
+}
+NOKPROBE_SYMBOL(kprobe_debug_handler);
+
+/*
+ * Handle a fault that was raised while a kprobe is active on this CPU.
+ *
+ * @regs: register state at the time of the fault
+ * @trapnr: trap number, passed on to the probe's fault_handler and to
+ * fixup_exception()
+ *
+ * The running kprobe is dereferenced without a NULL check, so callers must
+ * only call this while kprobe_running() is non-NULL.
+ *
+ * Returns 1 if the fault was dealt with here (by the probe's fault_handler
+ * or by fixup_exception()), 0 if normal fault handling should continue.
+ */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
+ /* This must happen on single-stepping */
+ WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
+ kcb->kprobe_status != KPROBE_REENTER);
+ /*
+ * We are here because the instruction being single
+ * stepped caused a page fault. We reset the current
+ * kprobe and the ip points back to the probe address
+ * and allow the page fault handler to continue as a
+ * normal page fault.
+ */
+ regs->ip = (unsigned long)cur->addr;
+ /*
+ * Trap flag (TF) has been set here because this fault
+ * happened where the single stepping will be done.
+ * So clear it by resetting the current kprobe:
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ /*
+ * If the TF flag was set before the kprobe hit,
+ * don't touch it:
+ */
+ regs->flags |= kcb->kprobe_old_flags;
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+ /* This branch ends at "return 0": the normal fault path still runs */
+ } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
+ kcb->kprobe_status == KPROBE_HIT_SSDONE) {
+ /*
+ * We increment the nmissed count for accounting,
+ * we can also use npre/npostfault count for accounting
+ * these specific fault cases.
+ */
+ kprobes_inc_nmissed_count(cur);
+
+ /*
+ * We come here because instructions in the pre/post
+ * handler caused the page_fault, this could happen
+ * if handler tries to access user space by
+ * copy_from_user(), get_user() etc. Let the
+ * user-specified handler try to fix it first.
+ */
+ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+ return 1;
+
+ /*
+ * In case the user-specified fault handler returned
+ * zero, try to fix up.
+ */
+ if (fixup_exception(regs, trapnr))
+ return 1;
+
+ /*
+ * fixup routine could not handle it,
+ * Let do_page_fault() fix it.
+ */
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
+
+/*
+ * Die-notifier wrapper around kprobe_fault_handler().
+ *
+ * Only DIE_GPF events that did not come from user mode are looked at.
+ * Returns NOTIFY_STOP when kprobe_fault_handler() handled the fault and
+ * NOTIFY_DONE in every other case.
+ */
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+			     void *data)
+{
+	struct die_args *die = data;
+
+	if (die->regs && user_mode(die->regs))
+		return NOTIFY_DONE;
+
+	if (val != DIE_GPF)
+		return NOTIFY_DONE;
+
+	/*
+	 * To be potentially processing a kprobe fault and to trust the
+	 * result from kprobe_running(), we have to be non-preemptible.
+	 */
+	if (preemptible() || !kprobe_running())
+		return NOTIFY_DONE;
+
+	if (kprobe_fault_handler(die->regs, die->trapnr))
+		return NOTIFY_STOP;
+
+	return NOTIFY_DONE;
+}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
+
+/*
+ * Tell whether @addr lies in a text range that must never be probed:
+ * the kprobes text section, the entry text section and, on 64-bit only,
+ * the entry trampoline section. Each range is half-open: [start, end).
+ */
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+	if (addr >= (unsigned long)__kprobes_text_start &&
+	    addr < (unsigned long)__kprobes_text_end)
+		return true;
+
+	if (addr >= (unsigned long)__entry_text_start &&
+	    addr < (unsigned long)__entry_text_end)
+		return true;
+
+#ifdef CONFIG_X86_64
+	if (addr >= (unsigned long)__entry_trampoline_start &&
+	    addr < (unsigned long)__entry_trampoline_end)
+		return true;
+#endif
+
+	return false;
+}
+
+/* Arch-level kprobes init hook: does no work here and always returns 0. */
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
+
+/* Always returns 0 here, whatever @p is; @p is not examined. */
+int arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 0000000..ef819e1
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,76 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+#include "common.h"
+
+/*
+ * Ftrace callback handler for kprobes -- called with preemption disabled.
+ *
+ * @ip: address the callback fired for; used to look up the kprobe
+ * @parent_ip: not used here
+ * @ops: not used here
+ * @regs: register state; regs->ip is temporarily rewritten so the handlers
+ * see the values they would see after a breakpoint hit
+ *
+ * If another kprobe is already running on this CPU the hit is only counted
+ * as missed; otherwise pre_handler and (optionally) post_handler are run.
+ */
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ /* Preempt is disabled by ftrace */
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ return;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ /* Remembered so regs->ip can be put back after the emulation below */
+ unsigned long orig_ip = regs->ip;
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ regs->ip = ip + sizeof(kprobe_opcode_t);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ regs->ip = orig_ip;
+ }
+ /*
+ * If pre_handler returns !0, it changes regs->ip. We have to
+ * skip emulating post_handler.
+ */
+ __this_cpu_write(current_kprobe, NULL);
+ }
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+/*
+ * Prepare the arch-specific part of an ftrace-based kprobe: no instruction
+ * copy is set up (insn = NULL) and the probe is marked not boostable.
+ * Always returns 0.
+ */
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = false;
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
new file mode 100644
index 0000000..6adf6e6
--- /dev/null
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -0,0 +1,498 @@
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+#include <linux/frame.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/set_memory.h>
+#include <asm/sections.h>
+#include <asm/nospec-branch.h>
+
+#include "common.h"
+
+/*
+ * Recover the original instruction bytes at @addr when they were
+ * overwritten by the relative jump of a jump-optimized kprobe.
+ *
+ * @buf: scratch buffer; MAX_INSN_SIZE * sizeof(kprobe_opcode_t) bytes are
+ * read into it, so it must be at least that large
+ * @addr: kernel text address to inspect
+ *
+ * Returns @addr unchanged if no optimized kprobe with an empty op->list
+ * starts within the RELATIVEJUMP_SIZE bytes ending at @addr, the address
+ * of @buf (holding the recovered bytes) if one does, or 0 if reading
+ * kernel memory failed.
+ */
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct optimized_kprobe *op;
+ struct kprobe *kp;
+ long offs;
+ int i;
+
+ /* Look for a probe at @addr or up to RELATIVEJUMP_SIZE - 1 bytes before it */
+ for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ kp = get_kprobe((void *)addr - i);
+ /* This function only handles jump-optimized kprobe */
+ if (kp && kprobe_optimized(kp)) {
+ op = container_of(kp, struct optimized_kprobe, kp);
+ /* If op->list is not empty, op is under optimizing */
+ if (list_empty(&op->list))
+ goto found;
+ }
+ }
+
+ return addr;
+found:
+ /*
+ * If the kprobe is optimized, the original bytes may have been
+ * overwritten by the jump destination address. In this case, the
+ * original bytes must be recovered from the op->optinsn.copied_insn
+ * buffer.
+ */
+ if (probe_kernel_read(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (addr == (unsigned long)kp->addr) {
+ /* Probe address itself: saved opcode byte, then the saved operand bytes */
+ buf[0] = kp->opcode;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ } else {
+ /* Inside the jump operand: patch in only the tail of the saved bytes */
+ offs = addr - (unsigned long)kp->addr - 1;
+ memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ }
+
+ return (unsigned long)buf;
+}
+
+/*
+ * Insert a move instruction which sets a pointer to eax/rdi (1st arg).
+ *
+ * @addr: where to write the instruction bytes
+ * @val: immediate value to load
+ *
+ * Writes the opcode bytes (0x48 0xbf on 64-bit, 0xb8 on 32-bit) followed
+ * by @val as an unsigned long immediate.
+ */
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+/*
+ * Template for the out-of-line code of an optimized probe.
+ *
+ * The bytes between optprobe_template_entry and optprobe_template_end are
+ * copied per probe by arch_prepare_optimized_kprobe(). The NOP runs at
+ * optprobe_template_val and optprobe_template_call are placeholders that
+ * are overwritten there with the "set first argument" move and the call to
+ * optimized_callback().
+ */
+asm (
+ "optprobe_template_func:\n"
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ /* Second argument: pointer to the saved register frame */
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ /* Two 5-byte NOPs: room for the 10-byte 64-bit immediate move */
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ /* One 5-byte NOP: room for the relative call */
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ /* Second argument: pointer to the saved register frame */
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+/* C-visible declaration of the asm symbol, needed by the annotation below */
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
+
+/*
+ * Byte offsets of the template labels, measured from
+ * optprobe_template_entry: the argument-move placeholder, the call
+ * placeholder, and the end of the template.
+ */
+#define TMPL_MOVE_IDX \
+ ((long)optprobe_template_val - (long)optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)optprobe_template_call - (long)optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)optprobe_template_end - (long)optprobe_template_entry)
+
+/* Size of one kprobe opcode unit, i.e. of the breakpoint instruction */
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/*
+ * Optimized kprobe call back function: called from optinsn
+ *
+ * @op: the optimized probe that was hit
+ * @regs: register frame saved by the template code; the fields the template
+ * does not save (cs, ip, orig_ax, and gs on 32-bit) are filled in here
+ *
+ * Does nothing for a disabled probe. If another kprobe is already running
+ * on this CPU the hit is only counted as missed.
+ */
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ /* Same ip a breakpoint hit at the probe address would report */
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __this_cpu_write(current_kprobe, &op->kp);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+/*
+ * Copy whole instructions from @src to @dest until at least
+ * RELATIVEJUMP_SIZE bytes have been covered.
+ *
+ * @dest: buffer receiving the copied instructions
+ * @src: original instruction address
+ * @real: passed through to __copy_instruction() at the same offset;
+ * presumably the address the copy will finally execute from
+ * (the caller passes the slot address) -- confirm against
+ * __copy_instruction()
+ *
+ * Returns the number of bytes copied, -EINVAL if an instruction could not
+ * be copied or is not boostable, or -EBUSY if the covered range is
+ * reserved by ftrace, alternatives or jump labels.
+ */
+static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
+{
+ struct insn insn;
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, real + len, &insn);
+ if (!ret || !can_boost(&insn, src + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/*
+ * Check whether insn is indirect jump
+ *
+ * True for opcode 0xff whose ModRM reg field is 4 or 5 (the "& 6) == 4"
+ * test), and for opcode 0xea. Only the first opcode byte is examined.
+ */
+static int __insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/*
+ * Check whether insn jumps into specified address range
+ *
+ * @insn: decoded instruction; next_byte and immediate must be valid
+ * @start: first address of the range
+ * @len: length of the range
+ *
+ * Only relative jumps are considered (loop/loopne/loope, jcxz, jmp
+ * near/short, jcc near/short); any other instruction yields 0.
+ * Returns non-zero if the jump target lies in [start, start + len],
+ * both ends included.
+ */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ /* Relative target: address after the instruction plus the displacement */
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+/*
+ * Decide whether @insn has to be treated as an indirect jump.
+ *
+ * Returns non-zero for a real indirect jump and, with CONFIG_RETPOLINE,
+ * also for a relative jump that lands inside the indirect thunk area.
+ */
+static int insn_is_indirect_jump(struct insn *insn)
+{
+	int indirect = __insn_is_indirect_jump(insn);
+
+#ifdef CONFIG_RETPOLINE
+	/*
+	 * Jump to x86_indirect_thunk_* is treated as an indirect jump.
+	 * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
+	 * older gcc may use indirect jump. So we add this check instead of
+	 * replace indirect-jump check.
+	 */
+	if (!indirect) {
+		unsigned long thunk_start =
+			(unsigned long)__indirect_thunk_start;
+		unsigned long thunk_len =
+			(unsigned long)__indirect_thunk_end - thunk_start;
+
+		indirect = insn_jump_into_range(insn, thunk_start, thunk_len);
+	}
+#endif
+	return indirect;
+}
+
+/*
+ * Decode whole function to ensure any instructions don't jump into target
+ *
+ * @paddr: address of the probe that is a candidate for jump optimization
+ *
+ * Returns 1 if the probe may be optimized, 0 otherwise. Optimization is
+ * refused when the symbol lookup fails, the probe is in entry or irqentry
+ * text, fewer than RELATIVEJUMP_SIZE bytes remain in the function, or any
+ * instruction of the function has an exception-table entry, is a
+ * breakpoint, is an indirect jump, or jumps into the bytes the relative
+ * jump operand would occupy.
+ */
+static int can_optimize(unsigned long paddr)
+{
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+ return 0;
+
+ /*
+ * Do not optimize in the entry code due to the unstable
+ * stack handling and registers setup.
+ */
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)) ||
+ ((paddr >= (unsigned long)__irqentry_text_start) &&
+ (paddr < (unsigned long)__irqentry_text_end)))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ unsigned long recovered_insn;
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jump into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ /* Decode the original bytes, not ones patched by other probes */
+ recovered_insn = recover_probed_instruction(buf, addr);
+ if (!recovered_insn)
+ return 0;
+ kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+ /* Another subsystem puts a breakpoint */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+ /*
+ * Recover address: the decode may have run on buf, so point the
+ * insn back at its real location before computing jump targets.
+ */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/*
+ * Check that @op can really be optimized: no enabled kprobe may sit on any
+ * byte after the first within the op->optinsn.size bytes that would be
+ * replaced. Returns 0 if the range is clear, -EEXIST if such a probe exists.
+ */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	struct kprobe *other;
+	int off = 1;
+
+	while (off < op->optinsn.size) {
+		other = get_kprobe(op->kp.addr + off);
+		if (other && !kprobe_disabled(other))
+			return -EEXIST;
+		off++;
+	}
+
+	return 0;
+}
+
+/*
+ * Report whether @addr falls inside the instructions replaced by @op,
+ * i.e. in [op->kp.addr, op->kp.addr + op->optinsn.size).
+ */
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+				 unsigned long addr)
+{
+	unsigned long first = (unsigned long)op->kp.addr;
+	unsigned long past_end = first + op->optinsn.size;
+
+	return (first <= addr && past_end > addr);
+}
+
+/*
+ * Release the optimized instruction slot of @op, if it has one, and clear
+ * the slot pointer and size. @dirty is forwarded to free_optinsn_slot().
+ */
+static
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+	if (!op->optinsn.insn)
+		return;
+
+	free_optinsn_slot(op->optinsn.insn, dirty);
+	op->optinsn.insn = NULL;
+	op->optinsn.size = 0;
+}
+
+/* Free the optimized instruction slot of @op, passing dirty = 1. */
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ * This is called when new aggr(opt)probe is allocated or reused.
+ *
+ * @op: optimized probe to prepare; on success op->optinsn.insn points to
+ *      the filled-in out-of-line slot and op->optinsn.size holds the number
+ *      of original bytes that were copied
+ * @__unused: not used
+ *
+ * Returns 0 on success, -EILSEQ if the probe address cannot be optimized,
+ * -ENOMEM if the scratch buffer or the slot cannot be allocated, -ERANGE if
+ * the slot is not reachable with a 32-bit relative jump, or the negative
+ * error from copy_optimized_instructions(). On the -ERANGE and copy-failure
+ * paths the slot is released again.
+ */
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+				  struct kprobe *__unused)
+{
+	u8 *buf = NULL, *slot;
+	int ret, len;
+	long rel;
+
+	if (!can_optimize((unsigned long)op->kp.addr))
+		return -EILSEQ;
+
+	/* Build the code in a scratch buffer; the slot itself is read-only */
+	buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	op->optinsn.insn = slot = get_optinsn_slot();
+	if (!slot) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Verify if the address gap is in 2GB range, because this uses
+	 * a relative jump.
+	 *
+	 * The displacement is measured from the end of the jump instruction,
+	 * exactly as arch_optimize_kprobes() encodes it:
+	 * slot - (addr + RELATIVEJUMP_SIZE).
+	 */
+	rel = (long)slot - ((long)op->kp.addr + RELATIVEJUMP_SIZE);
+	if (abs(rel) > 0x7fffffff) {
+		ret = -ERANGE;
+		goto err;
+	}
+
+	/* Copy arch-dep-instance from template */
+	memcpy(buf, optprobe_template_entry, TMPL_END_IDX);
+
+	/* Copy instructions into the out-of-line buffer */
+	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
+					  slot + TMPL_END_IDX);
+	if (ret < 0)
+		goto err;
+	op->optinsn.size = ret;
+	len = TMPL_END_IDX + op->optinsn.size;
+
+	/* Set probe information */
+	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+	/* Set probe function call */
+	synthesize_relcall(buf + TMPL_CALL_IDX,
+			   slot + TMPL_CALL_IDX, optimized_callback);
+
+	/* Set returning jmp instruction at the tail of out-of-line buffer */
+	synthesize_reljump(buf + len, slot + len,
+			   (u8 *)op->kp.addr + op->optinsn.size);
+	len += RELATIVEJUMP_SIZE;
+
+	/* We have to use text_poke for instruction buffer because it is RO */
+	text_poke(slot, buf, len);
+	ret = 0;
+out:
+	kfree(buf);
+	return ret;
+
+err:
+	/* Give the slot back (not dirty: nothing was written to it) */
+	__arch_remove_optimized_kprobe(op, 0);
+	goto out;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ *
+ * @oplist: list of optimized probes to patch; each entry is removed from
+ * the list once its jump has been written.
+ */
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ /* Displacement from the end of the jump to the out-of-line slot */
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ WARN_ON(kprobe_disabled(&op->kp));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+
+ list_del_init(&op->list);
+ }
+}
+
+/*
+ * Replace a relative jump with a breakpoint (int3).
+ *
+ * The bytes after the int3 are restored from op->optinsn.copied_insn,
+ * the backup taken when the jump was installed.
+ */
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ *
+ * @oplist: probes to unoptimize
+ * @done_list: list each probe is moved to once it has been unoptimized
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ arch_unoptimize_kprobe(op);
+ list_move(&op->list, done_list);
+ }
+}
+
+/*
+ * If @p carries KPROBE_FLAG_OPTIMIZED, redirect execution to the copied
+ * instructions that follow the template in its out-of-line buffer.
+ *
+ * Returns 1 when regs->ip was redirected (the current kprobe is also reset
+ * unless @reenter is set), 0 when @p is not optimized and nothing was done.
+ */
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+	struct optimized_kprobe *op;
+
+	if (!(p->flags & KPROBE_FLAG_OPTIMIZED))
+		return 0;
+
+	/* This kprobe is really able to run optimized path. */
+	op = container_of(p, struct optimized_kprobe, kp);
+
+	/* Detour through copied instructions */
+	regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+	if (!reenter)
+		reset_current_kprobe();
+
+	return 1;
+}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 0000000..163ae70
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013, 2013 Red Hat, Inc.
+ * Dave Young <dyoung@redhat.com>
+ *
+ * This file is released under the GPLv2
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+
+#include <asm/setup.h>
+
+/*
+ * sysfs show callback for the "version" attribute: prints
+ * boot_params.hdr.version as a 4-digit hex number followed by a newline.
+ * Returns the number of characters written to @buf.
+ */
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+/* Read-only "version" attribute backed by version_show() */
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+/*
+ * Binary read callback for the boot_params "data" attribute: copies @count
+ * bytes of boot_params starting at offset @off into @buf and returns @count.
+ *
+ * No bounds check is done here.
+ * NOTE(review): presumably the sysfs core clamps @off/@count to the
+ * attribute's .size (sizeof(boot_params)) before calling -- confirm.
+ */
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, (void *)&boot_params + off, count);
+ return count;
+}
+
+/* World-readable binary "data" attribute exposing all of boot_params */
+static struct bin_attribute boot_params_data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = boot_params_data_read,
+ .size = sizeof(boot_params),
+};
+
+/* NULL-terminated list of text attributes in the boot_params group */
+static struct attribute *boot_params_version_attrs[] = {
+ &boot_params_version_attr.attr,
+ NULL,
+};
+
+/* NULL-terminated list of binary attributes in the boot_params group */
+static struct bin_attribute *boot_params_data_attrs[] = {
+ &boot_params_data_attr,
+ NULL,
+};
+
+/* Attribute group ("version" + "data") created by boot_params_ksysfs_init() */
+static const struct attribute_group boot_params_attr_group = {
+ .attrs = boot_params_version_attrs,
+ .bin_attrs = boot_params_data_attrs,
+};
+
+/*
+ * Parse the name of @kobj as a decimal integer and store it in @nr.
+ * Returns whatever kstrtoint() returns: 0 on success, negative on failure.
+ */
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+	return kstrtoint(kobject_name(kobj), 10, nr);
+}
+
+/*
+ * Find the physical address of the @nr-th (0-based) entry of the
+ * setup_data linked list that starts at boot_params.hdr.setup_data.
+ *
+ * Returns 0 and stores the address in @paddr on success, -ENOMEM if an
+ * entry header could not be mapped, -EINVAL if the list has no such entry.
+ */
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+	u64 pa = boot_params.hdr.setup_data;
+	int idx;
+
+	for (idx = 0; pa; idx++) {
+		struct setup_data *hdr;
+
+		if (idx == nr) {
+			*paddr = pa;
+			return 0;
+		}
+
+		/* Map just the header to read the link to the next entry */
+		hdr = memremap(pa, sizeof(*hdr), MEMREMAP_WB);
+		if (!hdr)
+			return -ENOMEM;
+
+		pa = hdr->next;
+		memunmap(hdr);
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Look up the payload length (the len field) of the @nr-th (0-based)
+ * setup_data entry.
+ *
+ * Returns 0 and stores the length in @size on success, -ENOMEM if an entry
+ * header could not be mapped, -EINVAL if the list has no such entry.
+ */
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+	u64 pa = boot_params.hdr.setup_data;
+	int idx;
+
+	for (idx = 0; pa; idx++) {
+		struct setup_data *hdr;
+
+		hdr = memremap(pa, sizeof(*hdr), MEMREMAP_WB);
+		if (!hdr)
+			return -ENOMEM;
+
+		if (idx == nr) {
+			*size = hdr->len;
+			memunmap(hdr);
+			return 0;
+		}
+
+		pa = hdr->next;
+		memunmap(hdr);
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * sysfs show callback for a setup_data node's "type" attribute.
+ *
+ * The node index is parsed from the kobject name, the matching setup_data
+ * header is mapped, and its type field is printed in hex.
+ * Returns the number of characters written, or a negative error from the
+ * name parse, the list walk, or -ENOMEM if the header cannot be mapped.
+ */
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int nr, ret;
+ u64 paddr;
+ struct setup_data *data;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ ret = sprintf(buf, "0x%x\n", data->type);
+ memunmap(data);
+ return ret;
+}
+
+/*
+ * Binary read callback for a setup_data node's "data" attribute.
+ *
+ * Copies up to @count bytes of the entry's payload (the bytes that follow
+ * the setup_data header), starting at payload offset @off, into @buf.
+ *
+ * Returns the number of bytes copied (0 when @off equals the payload
+ * length), -EINVAL if @off is past the payload, -ENOMEM if a mapping
+ * fails, or the error from the name parse / list walk.
+ */
+static ssize_t setup_data_data_read(struct file *fp,
+ struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf,
+ loff_t off, size_t count)
+{
+ int nr, ret = 0;
+ u64 paddr;
+ struct setup_data *data;
+ void *p;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ /* Map only the header first, to learn the payload length */
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (off > data->len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Clamp the request to what is left of the payload */
+ if (count > data->len - off)
+ count = data->len - off;
+
+ /* ret is still 0 here, so an empty read returns 0 */
+ if (!count)
+ goto out;
+
+ ret = count;
+ /* Map the whole payload, which starts right after the header */
+ p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(buf, p + off, count);
+ memunmap(p);
+out:
+ memunmap(data);
+ return ret;
+}
+
+/* Read-only "type" attribute backed by type_show() */
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+/*
+ * World-readable binary "data" attribute for setup_data nodes. No .size is
+ * given here; create_setup_data_node() assigns data_attr.size before it
+ * creates each node's group.
+ */
+static struct bin_attribute data_attr __ro_after_init = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = setup_data_data_read,
+};
+
+/* NULL-terminated list of text attributes in a setup_data node */
+static struct attribute *setup_data_type_attrs[] = {
+ &type_attr.attr,
+ NULL,
+};
+
+/* NULL-terminated list of binary attributes in a setup_data node */
+static struct bin_attribute *setup_data_data_attrs[] = {
+ &data_attr,
+ NULL,
+};
+
+/* Attribute group ("type" + "data") attached to every setup_data node */
+static const struct attribute_group setup_data_attr_group = {
+ .attrs = setup_data_type_attrs,
+ .bin_attrs = setup_data_data_attrs,
+};
+
+/*
+ * Create the sysfs directory for the @nr-th setup_data entry under @parent
+ * and populate it with the setup_data attribute group.
+ *
+ * @parent: kobject the new node is added under
+ * @kobjp: receives the new kobject on success
+ * @nr: 0-based index of the entry; also used (in decimal) as the node name
+ *
+ * Returns 0 on success, -ENOMEM if the kobject cannot be created, or the
+ * error from get_setup_data_size() / sysfs_create_group(). On failure the
+ * kobject reference is dropped again.
+ */
+static int __init create_setup_data_node(struct kobject *parent,
+ struct kobject **kobjp, int nr)
+{
+ int ret = 0;
+ size_t size;
+ struct kobject *kobj;
+ char name[16]; /* should be enough for setup_data nodes numbers */
+ snprintf(name, 16, "%d", nr);
+
+ kobj = kobject_create_and_add(name, parent);
+ if (!kobj)
+ return -ENOMEM;
+
+ ret = get_setup_data_size(nr, &size);
+ if (ret)
+ goto out_kobj;
+
+ /* data_attr is shared by all nodes: set this entry's size before use */
+ data_attr.size = size;
+ ret = sysfs_create_group(kobj, &setup_data_attr_group);
+ if (ret)
+ goto out_kobj;
+ *kobjp = kobj;
+
+ return 0;
+out_kobj:
+ kobject_put(kobj);
+ return ret;
+}
+
+/*
+ * Undo create_setup_data_node(): remove the attribute group from @kobj and
+ * drop the kobject reference.
+ */
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+ sysfs_remove_group(kobj, &setup_data_attr_group);
+ kobject_put(kobj);
+}
+
+/*
+ * Count the entries of the setup_data list that starts at physical address
+ * @pa_data.
+ *
+ * The count is accumulated directly in @nr: an entry is counted before its
+ * header is mapped, so on failure @nr already includes the entry that could
+ * not be mapped.
+ *
+ * Returns 0 on success, -ENOMEM if an entry header could not be mapped.
+ */
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+	struct setup_data *hdr;
+
+	for (*nr = 0; pa_data; ) {
+		*nr += 1;
+
+		hdr = memremap(pa_data, sizeof(*hdr), MEMREMAP_WB);
+		if (!hdr)
+			return -ENOMEM;
+
+		pa_data = hdr->next;
+		memunmap(hdr);
+	}
+
+	return 0;
+}
+
+/*
+ * Create the "setup_data" directory under @parent with one numbered child
+ * node per setup_data entry.
+ *
+ * Returns 0 on success, and also when boot_params carries no setup_data
+ * (nothing is created then). On error every node created so far and the
+ * "setup_data" directory itself are torn down and a negative errno is
+ * returned.
+ */
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+ struct kobject *setup_data_kobj, **kobjp;
+ u64 pa_data;
+ int i, j, nr, ret = 0;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return 0;
+
+ setup_data_kobj = kobject_create_and_add("setup_data", parent);
+ if (!setup_data_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_setup_data_total_num(pa_data, &nr);
+ if (ret)
+ goto out_setup_data_kobj;
+
+ /* Temporary array of the created nodes, needed only for error cleanup */
+ kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL);
+ if (!kobjp) {
+ ret = -ENOMEM;
+ goto out_setup_data_kobj;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+ if (ret)
+ goto out_clean_nodes;
+ }
+
+ kfree(kobjp);
+ return 0;
+
+out_clean_nodes:
+ /* Node i failed and cleaned up after itself; undo nodes i-1 .. 0 */
+ for (j = i - 1; j >= 0; j--)
+ cleanup_setup_data_node(*(kobjp + j));
+ kfree(kobjp);
+out_setup_data_kobj:
+ kobject_put(setup_data_kobj);
+out:
+ return ret;
+}
+
+/*
+ * Initcall that creates the "boot_params" directory under kernel_kobj,
+ * with the "version" and "data" attributes and the setup_data nodes.
+ *
+ * Returns 0 on success or a negative errno; on failure everything created
+ * so far is removed again.
+ */
+static int __init boot_params_ksysfs_init(void)
+{
+ int ret;
+ struct kobject *boot_params_kobj;
+
+ boot_params_kobj = kobject_create_and_add("boot_params",
+ kernel_kobj);
+ if (!boot_params_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+ if (ret)
+ goto out_boot_params_kobj;
+
+ ret = create_setup_data_nodes(boot_params_kobj);
+ if (ret)
+ goto out_create_group;
+
+ return 0;
+out_create_group:
+ sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+ kobject_put(boot_params_kobj);
+out:
+ return ret;
+}
+
+/* Run boot_params_ksysfs_init() at the arch initcall level */
+arch_initcall(boot_params_ksysfs_init);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 0000000..d9b7192
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,864 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ * Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
+#include <linux/debugfs.h>
+#include <linux/nmi.h>
+#include <linux/swait.h>
+#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
+#include <asm/tlb.h>
+
+/* Non-zero: kvm_guest_cpu_init() may enable async page faults. */
+static int kvmapf = 1;
+
+/* Handler for the "no-kvmapf" early parameter: clears kvmapf; @arg is unused. */
+static int __init parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
+/* Non-zero: activate_jump_labels() also enables paravirt_steal_rq_enabled. */
+static int steal_acc = 1;
+/* Handler for the "no-steal-acc" early parameter: clears steal_acc; @arg is unused. */
+static int __init parse_no_stealacc(char *arg)
+{
+ steal_acc = 0;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
+/*
+ * Per-CPU areas whose physical addresses are written to KVM MSRs by
+ * kvm_guest_cpu_init() and kvm_register_steal_time().
+ */
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
+/* Set by kvm_guest_init() when KVM_FEATURE_STEAL_TIME is advertised. */
+static int has_steal_clock = 0;
+
+/*
+ * No need for any "IO delay" on KVM: this empty function is installed as
+ * pv_cpu_ops.io_delay by paravirt_ops_setup() when the host advertises
+ * KVM_FEATURE_NOP_IO_DELAY.
+ */
+static void kvm_io_delay(void)
+{
+}
+
+/* The async-PF sleeper table has 2^KVM_TASK_SLEEP_HASHBITS buckets. */
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+/* One entry per async page fault token, linked into async_pf_sleepers[]. */
+struct kvm_task_sleep_node {
+ struct hlist_node link; /* linkage in kvm_task_sleep_head.list */
+ struct swait_queue_head wq; /* waited on in kvm_async_pf_task_wait() */
+ u32 token; /* lookup key used by _find_apf_task() */
+ int cpu; /* CPU that created the entry */
+ bool halted; /* waiter halts instead of calling schedule() */
+};
+
+/* Hash bucket; the list is modified with @lock held. */
+static struct kvm_task_sleep_head {
+ raw_spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+/*
+ * Look up the sleep node carrying @token in bucket @b.
+ *
+ * Returns the node, or NULL when the bucket has no entry for @token.
+ * The callers in this file hold b->lock around the call.
+ */
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+						  u32 token)
+{
+	struct kvm_task_sleep_node *node;
+
+	hlist_for_each_entry(node, &b->list, link) {
+		if (node->token == token)
+			return node;
+	}
+
+	return NULL;
+}
+
+/*
+ * Wait until the async page fault identified by @token has been completed
+ * by a matching kvm_async_pf_task_wake(), or return at once if that wake-up
+ * already left a dummy entry for @token in the hash table.
+ *
+ * @interrupt_kernel: Is this called from a routine which interrupts the kernel
+ * (other than user space)?
+ */
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node n, *e;
+ DECLARE_SWAITQUEUE(wait);
+
+ rcu_irq_enter();
+
+ raw_spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exist -> wake up was delivered ahead of PF */
+ hlist_del(&e->link);
+ kfree(e);
+ raw_spin_unlock(&b->lock);
+
+ rcu_irq_exit();
+ return;
+ }
+
+ /* Publish an on-stack node so the wake side can find this waiter. */
+ n.token = token;
+ n.cpu = smp_processor_id();
+ /*
+ * Halt instead of scheduling when running the idle task, or when
+ * preempt_count() > 1 or rcu_preempt_depth() is non-zero; without
+ * CONFIG_PREEMPT_COUNT rely on the caller's @interrupt_kernel.
+ */
+ n.halted = is_idle_task(current) ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT)
+ ? preempt_count() > 1 || rcu_preempt_depth()
+ : interrupt_kernel);
+ init_swait_queue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ raw_spin_unlock(&b->lock);
+
+ for (;;) {
+ if (!n.halted)
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ /* apf_task_wake_one() unhashes n.link with hlist_del_init(). */
+ if (hlist_unhashed(&n.link))
+ break;
+
+ rcu_irq_exit();
+
+ if (!n.halted) {
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ } else {
+ /*
+ * We cannot reschedule. So halt.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+
+ rcu_irq_enter();
+ }
+ if (!n.halted)
+ finish_swait(&n.wq, &wait);
+
+ rcu_irq_exit();
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+/*
+ * Release the waiter described by @n: unhash the node, then either send a
+ * reschedule IPI to the CPU recorded in the node (halted waiter) or wake
+ * the task sleeping on the node's swait queue, if there is one.
+ */
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+	/* Unhashing is what the loop in kvm_async_pf_task_wait() tests for. */
+	hlist_del_init(&n->link);
+
+	if (n->halted) {
+		smp_send_reschedule(n->cpu);
+		return;
+	}
+
+	if (swq_has_sleeper(&n->wq))
+		swake_up_one(&n->wq);
+}
+
+/*
+ * Walk every hash bucket and wake all waiters whose node was created on
+ * the CPU this function runs on.
+ */
+static void apf_task_wake_all(void)
+{
+	struct kvm_task_sleep_head *bucket;
+	struct kvm_task_sleep_node *node;
+	struct hlist_node *tmp;
+	int idx;
+
+	for (idx = 0; idx < KVM_TASK_SLEEP_HASHSIZE; idx++) {
+		bucket = &async_pf_sleepers[idx];
+
+		raw_spin_lock(&bucket->lock);
+		/* _safe variant: apf_task_wake_one() unlinks the node. */
+		hlist_for_each_entry_safe(node, tmp, &bucket->list, link) {
+			if (node->cpu != smp_processor_id())
+				continue;
+			apf_task_wake_one(node);
+		}
+		raw_spin_unlock(&bucket->lock);
+	}
+}
+
+/*
+ * Complete the async page fault identified by @token.
+ *
+ * A @token of ~0 wakes every waiter that belongs to this CPU.  Otherwise
+ * the matching waiter is woken; if no waiter has registered yet, a dummy
+ * node is queued so that a later kvm_async_pf_task_wait() for the same
+ * token returns immediately.
+ */
+void kvm_async_pf_task_wake(u32 token)
+{
+	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+	struct kvm_task_sleep_node *n;
+
+	if (token == ~0) {
+		apf_task_wake_all();
+		return;
+	}
+
+	for (;;) {
+		raw_spin_lock(&b->lock);
+
+		n = _find_apf_task(b, token);
+		if (n) {
+			apf_task_wake_one(n);
+			break;
+		}
+
+		/*
+		 * async PF was not yet handled.
+		 * Add dummy entry for the token.
+		 */
+		n = kzalloc(sizeof(*n), GFP_ATOMIC);
+		if (n) {
+			n->token = token;
+			n->cpu = smp_processor_id();
+			init_swait_queue_head(&n->wq);
+			hlist_add_head(&n->link, &b->list);
+			break;
+		}
+
+		/*
+		 * Allocation failed! Busy wait while other cpu
+		 * handles async PF.
+		 */
+		raw_spin_unlock(&b->lock);
+		cpu_relax();
+	}
+
+	raw_spin_unlock(&b->lock);
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+/*
+ * Fetch this CPU's pending async page fault reason and reset it to 0.
+ * Returns 0 when async page faults are not enabled on this CPU.
+ */
+u32 kvm_read_and_reset_pf_reason(void)
+{
+	u32 reason;
+
+	if (!__this_cpu_read(apf_reason.enabled))
+		return 0;
+
+	reason = __this_cpu_read(apf_reason.reason);
+	__this_cpu_write(apf_reason.reason, 0);
+
+	return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
+
+/*
+ * Page fault entry installed by kvm_apf_trap_init().  Dispatches on the
+ * async page fault reason: "page not present" waits for the token in CR2,
+ * "page ready" wakes the waiter for the token in CR2, and any other
+ * reason is handed to do_page_fault().
+ */
+dotraplinkage void
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	u32 reason = kvm_read_and_reset_pf_reason();
+	enum ctx_state prev_state;
+
+	switch (reason) {
+	case KVM_PV_REASON_PAGE_NOT_PRESENT:
+		/* page is swapped out by the host. */
+		prev_state = exception_enter();
+		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+		exception_exit(prev_state);
+		break;
+	case KVM_PV_REASON_PAGE_READY:
+		rcu_irq_enter();
+		kvm_async_pf_task_wake((u32)read_cr2());
+		rcu_irq_exit();
+		break;
+	default:
+		do_page_fault(regs, error_code);
+		break;
+	}
+}
+NOKPROBE_SYMBOL(do_async_page_fault);
+
+/*
+ * Name the paravirt backend "KVM", install the no-op I/O delay when the
+ * host advertises KVM_FEATURE_NOP_IO_DELAY and, with CONFIG_X86_IO_APIC,
+ * set no_timer_check.
+ */
+static void __init paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_cpu_ops.io_delay = kvm_io_delay;
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+}
+
+/*
+ * Point MSR_KVM_STEAL_TIME at this CPU's steal_time area, with the enable
+ * bit set.  Does nothing when steal time is not in use.
+ */
+static void kvm_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	phys_addr_t st_pa;
+
+	if (!has_steal_clock)
+		return;
+
+	st_pa = slow_virt_to_phys(&per_cpu(steal_time, cpu));
+
+	wrmsrl(MSR_KVM_STEAL_TIME, (st_pa | KVM_MSR_ENABLED));
+	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
+		cpu, (unsigned long long) st_pa);
+}
+
+/* Per-CPU PV EOI word; its address is written to MSR_KVM_PV_EOI_EN. */
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+
+/*
+ * EOI write hook: if KVM_PV_EOI_BIT was set in this CPU's kvm_apic_eoi
+ * word, clearing it is all that is done; otherwise perform the native
+ * APIC EOI write.  @reg and @val are not used.
+ */
+static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+{
+	/*
+	 * This relies on __test_and_clear_bit to modify the memory
+	 * in a way that is atomic with respect to the local CPU.
+	 * The hypervisor only accesses this memory from the local CPU so
+	 * there's no need for lock or memory barriers.
+	 * An optimization barrier is implied in apic write.
+	 */
+	if (!__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
+		apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
+}
+
+/*
+ * Enable the KVM paravirt features for the CPU this runs on: async page
+ * faults, PV EOI and steal time, each only if advertised by the host.
+ * Does nothing when not running on KVM.
+ */
+static void kvm_guest_cpu_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+ /* The MSR value is the area's physical address plus flag bits. */
+ u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+
+#ifdef CONFIG_PREEMPT
+ pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
+ pa |= KVM_ASYNC_PF_ENABLED;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
+ pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
+ __this_cpu_write(apf_reason.enabled, 1);
+ printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+ smp_processor_id());
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
+ unsigned long pa;
+ /* Size alignment is implied but just to make it explicit. */
+ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
+ /* Start from a cleared word before handing it to the host. */
+ __this_cpu_write(kvm_apic_eoi, 0);
+ pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
+ wrmsrl(MSR_KVM_PV_EOI_EN, pa);
+ }
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
+}
+
+/*
+ * Turn async page faults off on this CPU: clear MSR_KVM_ASYNC_PF_EN and
+ * the per-CPU enabled flag.  Does nothing if they were not enabled.
+ */
+static void kvm_pv_disable_apf(void)
+{
+	if (__this_cpu_read(apf_reason.enabled)) {
+		wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+		__this_cpu_write(apf_reason.enabled, 0);
+
+		printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+		       smp_processor_id());
+	}
+}
+
+/*
+ * Per-CPU reboot callback (run through on_each_cpu() by
+ * kvm_pv_reboot_notify()): disable PV EOI, async page faults and steal
+ * time on the calling CPU.  @unused is ignored.
+ */
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+ /*
+ * We disable PV EOI before we load a new kernel by kexec,
+ * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
+ * New kernel can re-enable when it boots.
+ */
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
+ kvm_disable_steal_time();
+}
+
+/*
+ * Reboot notifier: on SYS_RESTART run kvm_pv_guest_cpu_reboot() on every
+ * CPU and wait for completion.  Always returns NOTIFY_DONE.
+ */
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+				unsigned long code, void *unused)
+{
+	if (code != SYS_RESTART)
+		return NOTIFY_DONE;
+
+	on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+	.notifier_call = kvm_pv_reboot_notify,
+};
+
+/*
+ * Return the steal value published in @cpu's steal_time area.
+ *
+ * The read is retried while the version is odd or differs before and
+ * after the read (presumably the host bumps the version around updates --
+ * confirm against the KVM steal-time ABI).
+ */
+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ virt_rmb();
+ steal = src->steal;
+ virt_rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
+/* Write 0 to MSR_KVM_STEAL_TIME on this CPU if steal time is in use. */
+void kvm_disable_steal_time(void)
+{
+	if (has_steal_clock)
+		wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
+/* Mark the @size bytes starting at @ptr as decrypted. */
+static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
+}
+
+/*
+ * Iterate through all possible CPUs and map the memory region pointed
+ * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged will have their per-cpu variable already mapped as
+ * decrypted.
+ *
+ * Nothing is done unless sev_active() reports true.
+ */
+static void __init sev_map_percpu_data(void)
+{
+ int cpu;
+
+ if (!sev_active())
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
+}
+
+#ifdef CONFIG_SMP
+/* Number of APIC IDs covered by the bitmap of one KVM_HC_SEND_IPI call. */
+#define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)
+
+/*
+ * Send @vector to every CPU in @mask through KVM_HC_SEND_IPI hypercalls.
+ *
+ * Target APIC IDs are collected into a bitmap of KVM_IPI_CLUSTER_SIZE bits
+ * whose bit 0 stands for APIC ID @min.  When an APIC ID does not fit into
+ * the current window, the bitmap gathered so far is sent and a new window
+ * is started.  NMI_VECTOR is sent with NMI delivery mode, every other
+ * vector with fixed delivery mode.
+ *
+ * Runs with local interrupts disabled.  A negative hypercall return value
+ * is reported once through WARN_ONCE() rather than being dropped silently.
+ */
+static void __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	unsigned long flags;
+	int cpu, apic_id, icr;
+	int min = 0, max = 0;
+#ifdef CONFIG_X86_64
+	__uint128_t ipi_bitmap = 0;
+#else
+	u64 ipi_bitmap = 0;
+#endif
+	long ret;
+
+	if (cpumask_empty(mask))
+		return;
+
+	local_irq_save(flags);
+
+	switch (vector) {
+	default:
+		icr = APIC_DM_FIXED | vector;
+		break;
+	case NMI_VECTOR:
+		icr = APIC_DM_NMI;
+		break;
+	}
+
+	for_each_cpu(cpu, mask) {
+		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
+		if (!ipi_bitmap) {
+			/* First target of a new window. */
+			min = max = apic_id;
+		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
+			/* Extend the window downwards: rebase the bitmap. */
+			ipi_bitmap <<= min - apic_id;
+			min = apic_id;
+		} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+			max = apic_id < max ? max : apic_id;
+		} else {
+			/* Does not fit: flush the window and start a new one. */
+			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+			WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld\n", ret);
+			min = max = apic_id;
+			ipi_bitmap = 0;
+		}
+		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
+	}
+
+	if (ipi_bitmap) {
+		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+		WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld\n", ret);
+	}
+
+	local_irq_restore(flags);
+}
+
+/* apic->send_IPI_mask hook: send @vector to all CPUs in @mask. */
+static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ __send_ipi_mask(mask, vector);
+}
+
+/*
+ * apic->send_IPI_mask_allbutself hook: send @vector to every CPU in @mask
+ * except the one this runs on.
+ *
+ * NOTE(review): the working copy is a struct cpumask on the stack; its
+ * size depends on the configured number of CPUs -- confirm this is
+ * acceptable for large NR_CPUS builds.
+ */
+static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+	struct cpumask targets;
+
+	cpumask_copy(&targets, mask);
+	cpumask_clear_cpu(smp_processor_id(), &targets);
+
+	__send_ipi_mask(&targets, vector);
+}
+
+/* apic->send_IPI_allbutself hook: send @vector to all online CPUs but this one. */
+static void kvm_send_ipi_allbutself(int vector)
+{
+ kvm_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+/* apic->send_IPI_all hook: send @vector to all online CPUs. */
+static void kvm_send_ipi_all(int vector)
+{
+ __send_ipi_mask(cpu_online_mask, vector);
+}
+
+/*
+ * Set the IPI entry points: redirect the four send_IPI_* callbacks of the
+ * current apic driver to the hypercall based implementations above.
+ */
+static void kvm_setup_pv_ipi(void)
+{
+ apic->send_IPI_mask = kvm_send_ipi_mask;
+ apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
+ apic->send_IPI_allbutself = kvm_send_ipi_allbutself;
+ apic->send_IPI_all = kvm_send_ipi_all;
+ pr_info("KVM setup pv IPIs\n");
+}
+
+/*
+ * smp_ops.smp_prepare_cpus hook: run the native preparation, then disable
+ * virt_spin_lock_key when the host gives the KVM_HINTS_REALTIME hint.
+ */
+static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+/*
+ * smp_ops.smp_prepare_boot_cpu hook: set up the KVM paravirt features on
+ * the boot CPU around the native preparation, then set up PV spinlocks.
+ */
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+ kvm_spinlock_init();
+}
+
+/*
+ * Undo kvm_guest_cpu_init() on the calling CPU: disable steal time, PV EOI
+ * and async page faults, then wake every async-PF waiter of this CPU.
+ */
+static void kvm_guest_cpu_offline(void)
+{
+ kvm_disable_steal_time();
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
+ apf_task_wake_all();
+}
+
+/*
+ * CPU hotplug online callback: run kvm_guest_cpu_init() with local
+ * interrupts disabled.  @cpu is not used; always returns 0.
+ */
+static int kvm_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_init();
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: run kvm_guest_cpu_offline() with local
+ * interrupts disabled.  @cpu is not used; always returns 0.
+ */
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_offline();
+ local_irq_enable();
+ return 0;
+}
+#endif
+
+/* trap_init hook: route the page fault vector to async_page_fault. */
+static void __init kvm_apf_trap_init(void)
+{
+ update_intr_gate(X86_TRAP_PF, async_page_fault);
+}
+
+/* Per-CPU scratch mask, allocated by kvm_setup_pv_tlb_flush(). */
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+/*
+ * pv_mmu_ops.flush_tlb_others hook.
+ *
+ * For every target vCPU that is marked preempted, try to set
+ * KVM_VCPU_FLUSH_TLB in its steal_time area; if that succeeds the vCPU is
+ * dropped from the mask.  The native flush is then issued for the CPUs
+ * that remain.
+ */
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+			const struct flush_tlb_info *info)
+{
+	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+	struct kvm_steal_time *src;
+	int cpu;
+	u8 state;
+
+	cpumask_copy(flushmask, cpumask);
+	/*
+	 * We have to call flush only on online vCPUs. And
+	 * queue flush_on_enter for pre-empted vCPUs
+	 */
+	for_each_cpu(cpu, flushmask) {
+		src = &per_cpu(steal_time, cpu);
+		state = READ_ONCE(src->preempted);
+
+		if (!(state & KVM_VCPU_PREEMPTED))
+			continue;
+
+		if (try_cmpxchg(&src->preempted, &state,
+				state | KVM_VCPU_FLUSH_TLB))
+			__cpumask_clear_cpu(cpu, flushmask);
+	}
+
+	native_flush_tlb_others(flushmask, info);
+}
+
+/*
+ * guest_late_init hook of x86_hyper_kvm: wire up the KVM paravirt
+ * features advertised by the host.  Does nothing when not running on KVM.
+ */
+static void __init kvm_guest_init(void)
+{
+ int i;
+
+ if (!kvm_para_available())
+ return;
+
+ paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ /* The bucket locks must be usable before any async PF is handled. */
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ raw_spin_lock_init(&async_pf_sleepers[i].lock);
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+ x86_init.irqs.trap_init = kvm_apf_trap_init;
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ pv_time_ops.steal_clock = kvm_steal_clock;
+ }
+
+ /* Same condition as in kvm_setup_pv_tlb_flush(), which allocates the masks. */
+ if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+ pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ apic_set_eoi_write(kvm_guest_apic_eoi_write);
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ /* Without SMP there is no prepare_boot_cpu hook; init the CPU here. */
+ sev_map_percpu_data();
+ kvm_guest_cpu_init();
+#endif
+
+ /*
+ * Hard lockup detection is enabled by default. Disable it, as guests
+ * can get false positives too easily, for example if the host is
+ * overcommitted.
+ */
+ hardlockup_detector_disable();
+}
+
+/*
+ * Find the CPUID base leaf carrying the KVM signature.
+ * Returns 0 when CPUID is unavailable or the hypervisor bit is not set.
+ */
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+	/* cpuid_level < 0: so we don't blow up on old processors */
+	if (boot_cpu_data.cpuid_level < 0 ||
+	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return 0;
+
+	return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+}
+
+/*
+ * Return the KVM CPUID base leaf, computing it on first use and serving
+ * the cached value afterwards (-1 marks "not computed yet").
+ */
+static inline uint32_t kvm_cpuid_base(void)
+{
+	static int cached_base = -1;
+
+	if (cached_base != -1)
+		return cached_base;
+
+	cached_base = __kvm_cpuid_base();
+	return cached_base;
+}
+
+/* True when a KVM CPUID base leaf was found, i.e. kvm_cpuid_base() != 0. */
+bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+/* Return EAX of the KVM_CPUID_FEATURES leaf relative to the KVM base. */
+unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+/* Return EDX of the KVM_CPUID_FEATURES leaf relative to the KVM base. */
+unsigned int kvm_arch_para_hints(void)
+{
+ return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+/* .detect hook of x86_hyper_kvm: the KVM CPUID base, 0 if not on KVM. */
+static uint32_t __init kvm_detect(void)
+{
+ return kvm_cpuid_base();
+}
+
+/*
+ * apic_post_init hook: on SMP builds switch to PV IPIs when the host
+ * advertises KVM_FEATURE_PV_SEND_IPI.
+ */
+static void __init kvm_apic_init(void)
+{
+#if defined(CONFIG_SMP)
+ if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+ kvm_setup_pv_ipi();
+#endif
+}
+
+/* init_platform hook: initialise kvmclock and register kvm_apic_init(). */
+static void __init kvm_init_platform(void)
+{
+ kvmclock_init();
+ x86_platform.apic_post_init = kvm_apic_init;
+}
+
+/* Hypervisor descriptor for KVM guests; ties the hooks above together. */
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
+ .name = "KVM",
+ .detect = kvm_detect,
+ .type = X86_HYPER_KVM,
+ .init.guest_late_init = kvm_guest_init,
+ .init.x2apic_available = kvm_para_available,
+ .init.init_platform = kvm_init_platform,
+};
+
+/*
+ * arch_initcall: when steal time is in use, enable the
+ * paravirt_steal_enabled static key and, unless "no-steal-acc" was given,
+ * paravirt_steal_rq_enabled as well.  Always returns 0.
+ */
+static __init int activate_jump_labels(void)
+{
+	if (has_steal_clock) {
+		static_key_slow_inc(&paravirt_steal_enabled);
+		if (steal_acc)
+			static_key_slow_inc(&paravirt_steal_rq_enabled);
+	}
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
+
+/*
+ * arch_initcall: allocate the per-CPU __pv_tlb_mask for every possible CPU
+ * when PV TLB flush is going to be used (same condition as in
+ * kvm_guest_init()).  Always returns 0.
+ *
+ * NOTE(review): the result of zalloc_cpumask_var_node() is not checked --
+ * confirm whether an allocation failure needs handling here.
+ */
+static __init int kvm_setup_pv_tlb_flush(void)
+{
+	int cpu;
+
+	if (!kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) ||
+	    kvm_para_has_hint(KVM_HINTS_REALTIME) ||
+	    !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+		return 0;
+
+	for_each_possible_cpu(cpu) {
+		zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+			GFP_KERNEL, cpu_to_node(cpu));
+	}
+	pr_info("KVM setup pv remote TLB flush\n");
+
+	return 0;
+}
+arch_initcall(kvm_setup_pv_tlb_flush);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+/*
+ * Kick a cpu by its apicid. Used to wake up a halted vcpu.
+ * Issues KVM_HC_KICK_CPU with a flags argument of 0.
+ */
+static void kvm_kick_cpu(int cpu)
+{
+	unsigned long flags = 0;
+	int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
+	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+
+#include <asm/qspinlock.h>
+
+/*
+ * pv_lock_ops.wait hook: halt this vCPU once if *@ptr still equals @val.
+ * Returns immediately when called in NMI context or when the value has
+ * already changed.
+ */
+static void kvm_wait(u8 *ptr, u8 val)
+{
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ /* Re-check with interrupts off before committing to the halt. */
+ if (READ_ONCE(*ptr) != val)
+ goto out;
+
+ /*
+ * halt until it's our turn and kicked. Note that we do safe halt
+ * for irq enabled case to avoid hang when lock info is overwritten
+ * in irq spinlock slowpath and no spurious interrupt occur to save us.
+ */
+ if (arch_irqs_disabled_flags(flags))
+ halt();
+ else
+ safe_halt();
+
+out:
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * 32-bit variant: true when KVM_VCPU_PREEMPTED is set in the preempted
+ * field of @cpu's steal_time area.
+ */
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ *
+ * Loads __per_cpu_offset[cpu] (cpu in %rdi) into %rax, compares the
+ * preempted byte of that CPU's steal_time against 0 and returns the
+ * result in %al; %rax is the only register written.
+ *
+ * NOTE(review): this tests the whole byte for non-zero, while the 32-bit C
+ * variant masks with KVM_VCPU_PREEMPTED -- confirm the two are meant to
+ * agree when other bits of the byte are set.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+"ret;"
+".popsection");
+
+#endif
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ *
+ * The native lock ops are left untouched when not running on KVM, when
+ * the host lacks KVM_FEATURE_PV_UNHALT, when it gives the
+ * KVM_HINTS_REALTIME hint, or when only one CPU is possible.
+ */
+void __init kvm_spinlock_init(void)
+{
+	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+	if (!kvm_para_available() ||
+	    !kvm_para_has_feature(KVM_FEATURE_PV_UNHALT) ||
+	    kvm_para_has_hint(KVM_HINTS_REALTIME))
+		return;
+
+	/* Don't use the pvqspinlock code if there is only 1 vCPU. */
+	if (num_possible_cpus() == 1)
+		return;
+
+	__pv_init_lock_hash();
+	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_lock_ops.wait = kvm_wait;
+	pv_lock_ops.kick = kvm_kick_cpu;
+
+	/* The preemption test reads steal_time, so it needs that feature. */
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+		pv_lock_ops.vcpu_is_preempted =
+			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+}
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 0000000..013fe3d
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,373 @@
+/* KVM paravirtual clock driver. A clocksource implementation
+ Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/pvclock.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/cpuhotplug.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/set_memory.h>
+
+#include <asm/hypervisor.h>
+#include <asm/mem_encrypt.h>
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+#include <asm/kvmclock.h>
+
+/* Cleared by the "no-kvmclock" / "no-kvmclock-vsyscall" early parameters. */
+static int kvmclock __initdata = 1;
+static int kvmclock_vsyscall __initdata = 1;
+/* MSR numbers in use; kvmclock_init() may switch them to the _NEW variants. */
+static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+/* Clock value subtracted by kvm_sched_clock_read(). */
+static u64 kvm_sched_clock_offset __ro_after_init;
+
+/* Handler for the "no-kvmclock" early parameter: clears kvmclock; @arg is unused. */
+static int __init parse_no_kvmclock(char *arg)
+{
+ kvmclock = 0;
+ return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* Handler for "no-kvmclock-vsyscall": clears kvmclock_vsyscall; @arg is unused. */
+static int __init parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
+#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
+/* Number of time-info slots that fit into the single static boot page. */
+#define HVC_BOOT_ARRAY_SIZE \
+ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
+
+/* Static page of time-info slots used for the first HVC_BOOT_ARRAY_SIZE CPUs. */
+static struct pvclock_vsyscall_time_info
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
+/* Per-CPU pointer to the slot assigned to that CPU. */
+static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+/* Slots for the remaining CPUs, allocated by kvmclock_init_mem(). */
+static struct pvclock_vsyscall_time_info *hvclock_mem;
+
+/* Address of the pvti member of this CPU's time-info slot. */
+static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+ return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+/* This CPU's time-info slot pointer as stored in hv_clock_per_cpu. */
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+ return this_cpu_read(hv_clock_per_cpu);
+}
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+static void kvm_get_wallclock(struct timespec64 *now)
+{
+ /* Hand the physical address of wall_clock to the host via the MSR. */
+ wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
+ /* Preemption is disabled around the per-CPU pvti access. */
+ preempt_disable();
+ pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
+ preempt_enable();
+}
+
+/* set_wallclock hook: setting is not supported, always returns -ENODEV. */
+static int kvm_set_wallclock(const struct timespec64 *now)
+{
+ return -ENODEV;
+}
+
+/*
+ * Read the pvclock of the current CPU.  Preemption is disabled (notrace
+ * variant) around the per-CPU access.
+ */
+static u64 kvm_clock_read(void)
+{
+ u64 ret;
+
+ preempt_disable_notrace();
+ ret = pvclock_clocksource_read(this_cpu_pvti());
+ preempt_enable_notrace();
+ return ret;
+}
+
+/* clocksource ->read callback of kvm_clock; @cs is not used. */
+static u64 kvm_clock_get_cycles(struct clocksource *cs)
+{
+ return kvm_clock_read();
+}
+
+/* sched_clock variant: clock value relative to kvm_sched_clock_offset. */
+static u64 kvm_sched_clock_read(void)
+{
+ return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+/*
+ * Install the paravirt sched_clock.
+ *
+ * @stable: whether the clock carries PVCLOCK_TSC_STABLE_BIT (as passed by
+ *	    kvmclock_init()).
+ *
+ * An unstable clock is used directly and sched_clock is marked unstable.
+ * A stable clock is used relative to its current value, which is recorded
+ * in kvm_sched_clock_offset.
+ */
+static inline void kvm_sched_clock_init(bool stable)
+{
+	if (!stable) {
+		pv_time_ops.sched_clock = kvm_clock_read;
+		clear_sched_clock_stable();
+		return;
+	}
+
+	kvm_sched_clock_offset = kvm_clock_read();
+	pv_time_ops.sched_clock = kvm_sched_clock_read;
+
+	/* Terminate the message so it is emitted as a complete log line. */
+	pr_info("kvm-clock: using sched offset of %llu cycles\n",
+		kvm_sched_clock_offset);
+
+	/* The offset must fit the type of pvti's system_time field. */
+	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristic is subject to fail, because ultimately, a large
+ * pool of guests can be running and trouble each other. So we preset
+ * lpj here (see kvm_get_preset_lpj()).
+ */
+/* Force X86_FEATURE_TSC_KNOWN_FREQ and return the TSC rate from this CPU's pvti. */
+static unsigned long kvm_get_tsc_khz(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ return pvclock_tsc_khz(this_cpu_pvti());
+}
+
+/* Set preset_lpj to khz * 1000 / HZ, with khz taken from kvm_get_tsc_khz(). */
+static void __init kvm_get_preset_lpj(void)
+{
+ unsigned long khz;
+ u64 lpj;
+
+ khz = kvm_get_tsc_khz();
+
+ /* Widen before multiplying; do_div() divides the u64 in place. */
+ lpj = ((u64)khz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+}
+
+/*
+ * If PVCLOCK_GUEST_STOPPED is set in this CPU's pvti flags, clear it,
+ * touch the watchdogs and return true.  Returns false when the flag is
+ * not set or this CPU has no time-info slot.
+ */
+bool kvm_check_and_clear_guest_paused(void)
+{
+	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+
+	if (!src)
+		return false;
+
+	if (!(src->pvti.flags & PVCLOCK_GUEST_STOPPED))
+		return false;
+
+	src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
+	pvclock_touch_watchdogs();
+
+	return true;
+}
+
+/* The kvm-clock clocksource; registered by kvmclock_init(). */
+struct clocksource kvm_clock = {
+ .name = "kvm-clock",
+ .read = kvm_clock_get_cycles,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+EXPORT_SYMBOL_GPL(kvm_clock);
+
+/*
+ * Register this CPU's pvti area with the host by writing its physical
+ * address, with bit 0 set, to the system-time MSR.
+ *
+ * @txt: text appended to the log message.
+ *
+ * Does nothing when this CPU has no time-info slot.
+ */
+static void kvm_register_clock(char *txt)
+{
+	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+	u64 pa;
+
+	if (!src)
+		return;
+
+	pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+	wrmsrl(msr_kvm_system_time, pa);
+	/* Terminate the message so it is emitted as a complete log line. */
+	pr_info("kvm-clock: cpu %d, msr %llx, %s\n", smp_processor_id(), pa, txt);
+}
+
+/* save_sched_clock_state hook: intentionally empty. */
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+/* restore_sched_clock_state hook: register this CPU's clock again. */
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* early_percpu_clock_init hook: register the clock of the calling CPU. */
+static void kvm_setup_secondary_clock(void)
+{
+ kvm_register_clock("secondary cpu clock");
+}
+#endif
+
+/*
+ * After the clock is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will be kept being written. So before any
+ * kind of shutdown from our side, we unregister the clock by writing anything
+ * that does not have the 'enable' bit set in the msr
+ */
+#ifdef CONFIG_KEXEC_CORE
+/* crash_shutdown hook: unregister clock and steal time, then do the native crash shutdown. */
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
+ native_machine_crash_shutdown(regs);
+}
+#endif
+
+/* shutdown hook: unregister clock and steal time, then do the native shutdown. */
+static void kvm_shutdown(void)
+{
+ native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
+ native_machine_shutdown();
+}
+
+/*
+ * Allocate hvclock_mem: zeroed time-info slots for the possible CPUs that
+ * do not fit into the static hv_clock_boot[] page.
+ *
+ * Nothing is allocated when hv_clock_boot[] already covers every possible
+ * CPU.  On failure hvclock_mem stays NULL and a warning is logged.
+ */
+static void __init kvmclock_init_mem(void)
+{
+	unsigned long ncpus;
+	unsigned int order;
+	struct page *p;
+	int r;
+
+	if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+		return;
+
+	ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+	order = get_order(ncpus * sizeof(*hvclock_mem));
+
+	p = alloc_pages(GFP_KERNEL, order);
+	if (!p) {
+		/* The page count is unsigned; print it as such. */
+		pr_warn("%s: failed to alloc %u pages\n", __func__, (1U << order));
+		return;
+	}
+
+	hvclock_mem = page_address(p);
+
+	/*
+	 * hvclock is shared between the guest and the hypervisor, must
+	 * be mapped decrypted.
+	 */
+	if (sev_active()) {
+		r = set_memory_decrypted((unsigned long) hvclock_mem,
+					 1UL << order);
+		if (r) {
+			__free_pages(p, order);
+			hvclock_mem = NULL;
+			pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+			return;
+		}
+	}
+
+	memset(hvclock_mem, 0, PAGE_SIZE << order);
+}
+
+/*
+ * early_initcall: allocate the time-info slots for CPUs beyond
+ * hv_clock_boot[] and, on x86-64, select VCLOCK_PVCLOCK when the vsyscall
+ * clock is allowed and the boot CPU's clock has PVCLOCK_TSC_STABLE_BIT.
+ *
+ * The allocation must not depend on the vsyscall decision:
+ * kvmclock_setup_percpu() returns -ENOMEM for any CPU at or above
+ * HVC_BOOT_ARRAY_SIZE when hvclock_mem is missing, regardless of whether
+ * the vsyscall clock is used.  It is therefore done first, and skipped
+ * only when kvmclock itself is not in use.
+ *
+ * Always returns 0.
+ */
+static int __init kvm_setup_vsyscall_timeinfo(void)
+{
+	/* CPU0's slot pointer is only set by kvmclock_init() on success. */
+	if (!per_cpu(hv_clock_per_cpu, 0))
+		return 0;
+
+	kvmclock_init_mem();
+
+#ifdef CONFIG_X86_64
+	if (kvmclock_vsyscall) {
+		u8 flags;
+
+		flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+		if (flags & PVCLOCK_TSC_STABLE_BIT)
+			kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+	}
+#endif
+
+	return 0;
+}
+early_initcall(kvm_setup_vsyscall_timeinfo);
+
+/*
+ * CPU hotplug prepare callback: assign a time-info slot to @cpu.
+ *
+ * Returns 0 when @cpu is CPU0, already has its own slot, or a slot could
+ * be assigned; -ENOMEM when @cpu needs a slot from hvclock_mem but that
+ * memory was not allocated.
+ */
+static int kvmclock_setup_percpu(unsigned int cpu)
+{
+	struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);
+
+	/* CPU0 has been set up in init already. */
+	if (cpu == 0)
+		return 0;
+
+	/*
+	 * The per cpu area setup replicates CPU0 data to all cpu
+	 * pointers. So carefully check: a non-NULL pointer that differs
+	 * from CPU0's means this CPU already has its own slot.
+	 */
+	if (p && p != per_cpu(hv_clock_per_cpu, 0))
+		return 0;
+
+	/* Use the static page for the first CPUs, allocate otherwise */
+	if (cpu < HVC_BOOT_ARRAY_SIZE)
+		p = &hv_clock_boot[cpu];
+	else if (!hvclock_mem)
+		return -ENOMEM;
+	else
+		p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
+
+	per_cpu(hv_clock_per_cpu, cpu) = p;
+	return p ? 0 : -ENOMEM;
+}
+
+/*
+ * Set up kvmclock on the boot CPU.
+ *
+ * Returns early when not running on KVM, when "no-kvmclock" was given,
+ * when the host advertises neither clocksource feature, or when the CPU
+ * hotplug callback cannot be installed.  Otherwise registers the boot
+ * CPU's clock with the host, installs the platform, sched_clock and
+ * shutdown hooks, presets lpj and registers the kvm-clock clocksource.
+ */
+void __init kvmclock_init(void)
+{
+	u8 flags;
+
+	if (!kvm_para_available() || !kvmclock)
+		return;
+
+	/* Prefer the new MSR pair when the host offers it. */
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+		return;
+	}
+
+	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
+			      kvmclock_setup_percpu, NULL) < 0) {
+		return;
+	}
+
+	/* Terminate the message so it is emitted as a complete log line. */
+	pr_info("kvm-clock: Using msrs %x and %x\n",
+		msr_kvm_system_time, msr_kvm_wall_clock);
+
+	/* The boot CPU uses slot 0 of the static page. */
+	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
+	kvm_register_clock("primary cpu clock");
+	pvclock_set_pvti_cpu0_va(hv_clock_boot);
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+
+	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
+	x86_platform.get_wallclock = kvm_get_wallclock;
+	x86_platform.set_wallclock = kvm_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
+#endif
+	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
+	machine_ops.shutdown = kvm_shutdown;
+#ifdef CONFIG_KEXEC_CORE
+	machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+	kvm_get_preset_lpj();
+	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
+	pv_info.name = "KVM";
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
new file mode 100644
index 0000000..65590ee
--- /dev/null
+++ b/arch/x86/kernel/ldt.c
@@ -0,0 +1,582 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ * context.ldt_usr_sem
+ * mmap_sem
+ * context.lock
+ */
+
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+
+#include <asm/ldt.h>
+#include <asm/tlb.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+#include <asm/syscalls.h>
+
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+	unsigned short seg;
+
+	/*
+	 * DS and ES cache their descriptors; reload whichever of them
+	 * currently selects the LDT so the cache matches the updated table.
+	 */
+	savesegment(ds, seg);
+	if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT)
+		loadsegment(ds, seg);
+
+	savesegment(es, seg);
+	if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT)
+		loadsegment(es, seg);
+#endif
+}
+
+/* The task which issued the smp function call holds context.lock */
+static void flush_ldt(void *__mm)
+{
+	struct mm_struct *target = __mm;
+
+	/* Only a CPU that currently has this mm loaded needs the new LDT. */
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == target) {
+		load_mm_ldt(target);
+
+		refresh_ldt_segments();
+	}
+}
+
+/* Allocates a zeroed LDT. The caller must call finalize_ldt_struct on it. */
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
+{
+	struct ldt_struct *ldt;
+	unsigned int bytes;
+
+	if (num_entries > LDT_ENTRIES)
+		return NULL;
+
+	ldt = kmalloc(sizeof(*ldt), GFP_KERNEL);
+	if (!ldt)
+		return NULL;
+
+	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+	bytes = num_entries * LDT_ENTRY_SIZE;
+
+	/*
+	 * Xen insists on a page-aligned LDT with no nonzero bytes trailing
+	 * the descriptors in any page that holds them. The simple answer:
+	 * always hand out zeroed memory, and never less than one full
+	 * page.
+	 */
+	if (bytes <= PAGE_SIZE)
+		ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
+	else
+		ldt->entries = vzalloc(bytes);
+
+	if (!ldt->entries) {
+		kfree(ldt);
+		return NULL;
+	}
+
+	/* Not aliased for PTI yet, so no slot has been assigned. */
+	ldt->slot = -1;
+
+	ldt->nr_entries = num_entries;
+	return ldt;
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+static void do_sanity_check(struct mm_struct *mm,
+ bool had_kernel_mapping,
+ bool had_user_mapping)
+{
+ if (mm->context.ldt) {
+ /*
+ * An LDT is already installed, so warn if the top-level
+ * entry is missing from the kernel tables or, with PTI,
+ * from the usermode tables.
+ */
+ WARN_ON(!had_kernel_mapping);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!had_user_mapping);
+ } else {
+ /*
+ * First LDT for this mm: warn if the kernel tables (or, with
+ * PTI, the usermode tables) already map the LDT range.
+ */
+ WARN_ON(had_kernel_mapping);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(had_user_mapping);
+ }
+}
+
+#ifdef CONFIG_X86_PAE
+
+static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ /* Walk pgd -> p4d -> pud for va; NULL if any of those levels is empty. */
+ if (pgd->pgd == 0)
+ return NULL;
+
+ p4d = p4d_offset(pgd, va);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, va);
+ if (pud_none(*pud))
+ return NULL;
+
+ return pmd_offset(pud, va);
+}
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ pmd_t *k_pmd, *u_pmd;
+ /* Find the PMD entries covering LDT_BASE_ADDR in both page tables. */
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+ /* First LDT under PTI: copy the kernel PMD entry to the user table. */
+ if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pmd(u_pmd, *k_pmd); /* NOTE(review): walks not NULL-checked */
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+	bool had_kernel, had_user;
+	pmd_t *k_pmd, *u_pmd;
+
+	k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+	u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+	/* pgd_to_pmd_walk() returns NULL for an empty level: not mapped. */
+	had_kernel = k_pmd && k_pmd->pmd != 0;
+	had_user = u_pmd && u_pmd->pmd != 0;
+	do_sanity_check(mm, had_kernel, had_user);
+}
+
+#else /* !CONFIG_X86_PAE */
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+	if (!mm->context.ldt && static_cpu_has(X86_FEATURE_PTI))
+		set_pgd(kernel_to_user_pgdp(k_pgd), *k_pgd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+	pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+
+	/* A nonzero top-level entry means the LDT range is already mapped. */
+	do_sanity_check(mm, k_pgd->pgd != 0, u_pgd->pgd != 0);
+}
+
+#endif /* CONFIG_X86_PAE */
+
+/*
+ * If PTI is enabled, map the LDT into the kernelmode and usermode tables
+ * of @mm at @slot. Returns 0 on success (or without PTI), else -ENOMEM.
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ unsigned long va;
+ bool is_vmalloc;
+ spinlock_t *ptl;
+ int i, nr_pages;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /* Check if the current mappings are sane */
+ sanity_check_ldt_mapping(mm);
+ /* alloc_ldt_struct() uses vzalloc() for LDTs larger than one page. */
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+ /* Number of pages occupied by the descriptors. */
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn;
+ pgprot_t pte_prot;
+ pte_t pte, *ptep;
+
+ va = (unsigned long)ldt_slot_va(slot) + offset;
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ /*
+ * Map it RO so the easy to find address is not a primary
+ * target via some kernel interface which misses a
+ * permission check.
+ */
+ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(pte_prot) &= __supported_pte_mask;
+ pte = pfn_pte(pfn, pte_prot);
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ /* Propagate LDT mapping to the user page-table */
+ map_ldt_struct_to_user(mm);
+ /* Record the slot; unmap_ldt_struct() uses it to locate the alias. */
+ ldt->slot = slot;
+ return 0;
+}
+
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+	unsigned long va;
+	int i, nr_pages;
+
+	/* Nothing to do without an LDT; map/unmap is only required for PTI. */
+	if (!ldt || !static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		spinlock_t *ptl;
+		pte_t *ptep;
+
+		va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
+		ptep = get_locked_pte(mm, va, &ptl);
+		/* get_locked_pte() can fail; never hand a NULL pte onwards. */
+		if (WARN_ON_ONCE(!ptep))
+			continue;
+		pte_clear(mm, va, ptep);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	va = (unsigned long)ldt_slot_va(ldt->slot);
+	flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, 0);
+}
+
+#else /* !CONFIG_PAGE_TABLE_ISOLATION */
+/* Without PTI support built in there is no LDT alias to map or unmap. */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ return 0; /* always succeeds: nothing to map */
+}
+
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = LDT_END_ADDR;
+ /* map_ldt_struct() only maps the LDT alias when PTI is active. */
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /* Free the page tables covering LDT_BASE_ADDR..LDT_END_ADDR. */
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
+/* Passes the LDT to paravirt_alloc_ldt(); after this it is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
+}
+
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+ mutex_lock(&mm->context.lock);
+ /* context.lock is held across the store and the cross-CPU call. */
+ /* Synchronizes with READ_ONCE in load_mm_ldt. */
+ smp_store_release(&mm->context.ldt, ldt);
+
+ /* Activate the LDT on every CPU in this mm's cpumask. */
+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+ mutex_unlock(&mm->context.lock);
+}
+
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+	if (likely(!ldt))
+		return;
+	/* Counterpart of finalize_ldt_struct(), then free by allocation size. */
+	paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+	if (ldt->nr_entries * LDT_ENTRY_SIZE <= PAGE_SIZE)
+		free_page((unsigned long)ldt->entries);
+	else
+		vfree_atomic(ldt->entries);
+	kfree(ldt);
+}
+
+/*
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
+ * the new task is not running, so nothing can be installed.
+ */
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
+{
+ struct ldt_struct *new_ldt;
+ int retval = 0;
+
+ if (!old_mm)
+ return 0;
+ /* Hold the parent's context.lock while its LDT is examined and copied. */
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt)
+ goto out_unlock;
+ /* The parent has an LDT: allocate a copy with the same entry count. */
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+ /* Map the copy into slot 0; on failure undo any partial page tables. */
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval) {
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
+ return retval;
+}
+
+/*
+ * Free the mm's LDT and clear the pointer. No need to lock the MM as we
+ * are the last user.
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context_ldt(struct mm_struct *mm)
+{
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
+}
+
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm); /* returns early unless PTI is active */
+}
+
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long ldt_bytes;
+	int retval = 0;
+
+	down_read(&mm->context.ldt_usr_sem);
+
+	/* Without an LDT there is nothing to read: report zero bytes. */
+	if (!mm->context.ldt)
+		goto out_unlock;
+
+	/*
+	 * Clamp the request to the size of a maximally populated LDT.
+	 */
+	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+	ldt_bytes = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+	if (ldt_bytes > bytecount)
+		ldt_bytes = bytecount;
+
+	retval = -EFAULT;
+	if (copy_to_user(ptr, mm->context.ldt->entries, ldt_bytes))
+		goto out_unlock;
+
+	if (ldt_bytes != bytecount) {
+		/* Zero-fill the rest and pretend we read bytecount bytes. */
+		if (clear_user(ptr + ldt_bytes, bytecount - ldt_bytes))
+			goto out_unlock;
+	}
+
+	retval = bytecount;
+
+out_unlock:
+	up_read(&mm->context.ldt_usr_sem);
+	return retval;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+	/* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+	const unsigned long limit = 5 * sizeof(struct desc_struct);
+#else
+	const unsigned long limit = 128;
+#endif
+	/* The default LDT reads back as zeroes, capped at 'limit' bytes. */
+	if (bytecount > limit)
+		bytecount = limit;
+
+	return clear_user(ptr, bytecount) ? -EFAULT : (int)bytecount;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+ struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int old_nr_entries, new_nr_entries;
+ struct user_desc ldt_info;
+ struct desc_struct ldt;
+ int error;
+ /* Install one user-supplied descriptor; oldmode selects legacy rules. */
+ error = -EINVAL;
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ error = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+ goto out;
+
+ error = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (oldmode)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
+ }
+ /* Excludes concurrent read_ldt()/write_ldt() callers on this mm. */
+ if (down_write_killable(&mm->context.ldt_usr_sem))
+ return -EINTR;
+
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(new_nr_entries);
+ if (!new_ldt)
+ goto out_unlock;
+ /* Start from the old entries, then overwrite the requested one. */
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
+
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ /*
+ * This only can fail for the first LDT setup. If an LDT is
+ * already installed then the PTE page is already
+ * populated. Mop up a half populated page table.
+ */
+ if (!WARN_ON_ONCE(old_ldt))
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+ /* Switch all CPUs to the new LDT, then tear down the old one. */
+ install_ldt(mm, new_ldt);
+ unmap_ldt_struct(mm, old_ldt);
+ free_ldt_struct(old_ldt);
+ error = 0;
+
+out_unlock:
+ up_write(&mm->context.ldt_usr_sem);
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
+{
+ int ret = -ENOSYS; /* returned for any func not handled below */
+
+ switch (func) {
+ case 0: /* read the LDT */
+ ret = read_ldt(ptr, bytecount);
+ break;
+ case 1: /* write one entry, oldmode semantics */
+ ret = write_ldt(ptr, bytecount, 1);
+ break;
+ case 2: /* read the default LDT */
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ case 0x11: /* write one entry, non-oldmode semantics */
+ ret = write_ldt(ptr, bytecount, 0);
+ break;
+ }
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but the ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
+}
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 0000000..e9d252d
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,65 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+#include <asm/text-patching.h>
+
+/* Apply per-object alternatives/paravirt. Based on x86 module_finalize() */
+void arch_klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ int cnt;
+ struct klp_modinfo *info;
+ Elf_Shdr *s, *alt = NULL, *para = NULL;
+ void *aseg, *pseg;
+ const char *objname;
+ char sec_objname[MODULE_NAME_LEN];
+ char secname[KSYM_NAME_LEN];
+ /* Section names are matched as .klp.arch.<objname>.<secname>. */
+ info = patch->mod->klp_info;
+ objname = obj->name ? obj->name : "vmlinux";
+
+ /* See livepatch core code for BUILD_BUG_ON() explanation */
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+ for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+ /* Apply per-object .klp.arch sections */
+ cnt = sscanf(info->secstrings + s->sh_name,
+ ".klp.arch.%55[^.].%127s",
+ sec_objname, secname);
+ if (cnt != 2)
+ continue;
+ if (strcmp(sec_objname, objname))
+ continue;
+ if (!strcmp(".altinstructions", secname))
+ alt = s;
+ if (!strcmp(".parainstructions", secname))
+ para = s;
+ }
+ /* Patch whichever of the two sections was found for this object. */
+ if (alt) {
+ aseg = (void *) alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+
+ if (para) {
+ pseg = (void *) para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644
index 0000000..5409c28
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -0,0 +1,266 @@
+/*
+ * handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/numa.h>
+#include <linux/ftrace.h>
+#include <linux/suspend.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/cpufeature.h>
+#include <asm/desc.h>
+#include <asm/set_memory.h>
+#include <asm/debugreg.h>
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+	/* ia32 supports unaligned loads & stores */
+	struct desc_ptr gdt_desc = {
+		.size    = limit,
+		.address = (unsigned long)newgdt,
+	};
+
+	load_gdt(&gdt_desc);
+}
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+ /* Far-jump to reload CS, then load __KERNEL_DS into DS, ES and SS. */
+ __asm__ __volatile__ (
+ "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+ "\t1:\n"
+ "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+ "\tmovl %%eax,%%ds\n"
+ "\tmovl %%eax,%%es\n"
+ "\tmovl %%eax,%%ss\n"
+ : : : "eax", "memory");
+#undef STR
+#undef __STR
+}
+
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+ free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
+ image->arch.pgd = NULL; /* each pointer is cleared once freed */
+#ifdef CONFIG_X86_PAE
+ free_page((unsigned long)image->arch.pmd0); /* PMD pages: PAE only */
+ image->arch.pmd0 = NULL;
+ free_page((unsigned long)image->arch.pmd1);
+ image->arch.pmd1 = NULL;
+#endif
+ free_page((unsigned long)image->arch.pte0);
+ image->arch.pte0 = NULL;
+ free_page((unsigned long)image->arch.pte1);
+ image->arch.pte1 = NULL;
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+ image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ PGD_ALLOCATION_ORDER);
+#ifdef CONFIG_X86_PAE
+ image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+ image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!image->arch.pgd || /* all allocations are checked together */
+#ifdef CONFIG_X86_PAE
+ !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+ !image->arch.pte0 || !image->arch.pte1) {
+ return -ENOMEM; /* partial allocations stay in image->arch */
+ }
+ return 0;
+}
+
+static void machine_kexec_page_table_set_one(
+ pgd_t *pgd, pmd_t *pmd, pte_t *pte,
+ unsigned long vaddr, unsigned long paddr)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ /* Map vaddr -> paddr in pgd, hooking in pmd/pte where not present. */
+ pgd += pgd_index(vaddr);
+#ifdef CONFIG_X86_PAE
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
+#endif
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT))
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+static void machine_kexec_prepare_page_tables(struct kimage *image)
+{
+ void *control_page;
+ pmd_t *pmd = NULL;
+ /* Map the control page at its virtual and at its physical address. */
+ control_page = page_address(image->control_code_page);
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd0;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte0,
+ (unsigned long)control_page, __pa(control_page));
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd1;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte1,
+ __pa(control_page), __pa(control_page));
+}
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on the image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later. Returns 0 or the page table allocation error.
+ *
+ * - Make control page executable.
+ * - Allocate page tables
+ * - Setup page tables
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+ int error;
+
+ set_pages_x(image->control_code_page, 1);
+ error = machine_kexec_alloc_page_tables(image);
+ if (error)
+ return error;
+ machine_kexec_prepare_page_tables(image);
+ return 0;
+}
+
+/*
+ * Undo anything left over by machine_kexec_prepare() when an image is
+ * freed: make the control page non-executable and free the page tables.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+ set_pages_nx(image->control_code_page, 1);
+ machine_kexec_free_page_tables(image);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
+ int save_ftrace_enabled;
+ asmlinkage unsigned long
+ (*relocate_kernel_ptr)(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+ hw_breakpoint_disable();
+
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or other. kexec jump path also need one.
+ */
+ clear_IO_APIC();
+ restore_boot_irq_mode();
+#endif
+ }
+ /* Copy the relocation code into the control page; it is called there. */
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ /* Addresses handed to the relocation code through page_list. */
+ relocate_kernel_ptr = control_page;
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
+ page_list[PA_PGD] = __pa(image->arch.pgd);
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+ * from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /*
+ * The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ idt_invalidate(phys_to_virt(0));
+ set_gdt(phys_to_virt(0), 0);
+
+ /* now call it */
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ boot_cpu_has(X86_FEATURE_PAE),
+ image->preserve_context);
+ /* NOTE(review): presumably only reached when a kexec jump returns. */
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data); /* exported on NUMA builds only */
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_X86_PAE
+ VMCOREINFO_CONFIG(X86_PAE); /* recorded on PAE builds only */
+#endif
+}
+
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
new file mode 100644
index 0000000..4c8acdf
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -0,0 +1,576 @@
+/*
+ * handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/reboot.h>
+#include <linux/numa.h>
+#include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
+#include <linux/vmalloc.h>
+
+#include <asm/init.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io_apic.h>
+#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+#include <asm/setup.h>
+#include <asm/set_memory.h>
+
+#ifdef CONFIG_KEXEC_FILE
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+ &kexec_bzImage64_ops, /* the only loader listed here */
+ NULL /* end-of-list marker */
+};
+#endif
+
+static void free_transition_pgtable(struct kimage *image)
+{
+ free_page((unsigned long)image->arch.p4d); /* one page per level */
+ image->arch.p4d = NULL; /* each pointer is cleared once freed */
+ free_page((unsigned long)image->arch.pud);
+ image->arch.pud = NULL;
+ free_page((unsigned long)image->arch.pmd);
+ image->arch.pmd = NULL;
+ free_page((unsigned long)image->arch.pte);
+ image->arch.pte = NULL;
+}
+
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long vaddr, paddr;
+ int result = -ENOMEM;
+ /* Map relocate_kernel's address to the page after the control page. */
+ vaddr = (unsigned long)relocate_kernel;
+ paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+ if (!p4d)
+ goto err;
+ image->arch.p4d = p4d; /* kept for free_transition_pgtable() */
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pud)
+ goto err;
+ image->arch.pud = pud;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pmd)
+ goto err;
+ image->arch.pmd = pmd;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pte)
+ goto err;
+ image->arch.pte = pte;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
+ return 0;
+err:
+ return result; /* -ENOMEM; earlier pages stay in image->arch */
+}
+
+static void *alloc_pgt_page(void *data)
+{
+	struct kimage *image = data;
+	struct page *pg = kimage_alloc_control_pages(image, 0);
+	void *va;
+
+	/* No control page available: report the failure as NULL. */
+	if (!pg)
+		return NULL;
+
+	va = page_address(pg);
+	clear_page(va);
+	return va;
+}
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ unsigned long mstart, mend;
+ pgd_t *level4p;
+ int result;
+ int i;
+ /* The page at start_pgtable becomes the (cleared) top-level table. */
+ level4p = (pgd_t *)__va(start_pgtable);
+ clear_page(level4p);
+
+ if (direct_gbpages)
+ info.direct_gbpages = true;
+ /* Identity-map every range recorded in pfn_mapped[]. */
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+ if (result)
+ return result;
+ }
+
+ /*
+ * The segments' memory ranges could be outside 0 ~ max_pfn,
+ * for example when jumping back to the original kernel from a kexeced
+ * kernel, or when the first kernel is booted with a user mem map and
+ * the second kernel is loaded outside of that range.
+ */
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+
+ if (result)
+ return result;
+ }
+
+ return init_transition_pgtable(image, level4p);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+	struct desc_ptr curidt;
+
+	/* Load IDTR; x86-64 supports unaligned loads & stores */
+	curidt.size = limit;
+	curidt.address = (unsigned long)newidt;
+
+	__asm__ __volatile__ (
+		"lidtq %0\n"
+		: : "m" (curidt)
+		);
+}
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+	struct desc_ptr curgdt;
+
+	/* Load GDTR; x86-64 supports unaligned loads & stores */
+	curgdt.size = limit;
+	curgdt.address = (unsigned long)newgdt;
+
+	__asm__ __volatile__ (
+		"lgdtq %0\n"
+		: : "m" (curgdt)
+		);
+}
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ ( /* load __KERNEL_DS into DS, ES, SS, FS and GS */
+ "\tmovl %0,%%ds\n"
+ "\tmovl %0,%%es\n"
+ "\tmovl %0,%%ss\n"
+ "\tmovl %0,%%fs\n"
+ "\tmovl %0,%%gs\n"
+ : : "a" (__KERNEL_DS) : "memory"
+ );
+}
+
+#ifdef CONFIG_KEXEC_FILE
+/* Update purgatory as needed after various image segments have been prepared */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+ /* Nothing to update unless the image has file_mode set. */
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+#else /* !CONFIG_KEXEC_FILE */
+static inline int arch_update_purgatory(struct kimage *image)
+{
+ return 0; /* nothing to update without CONFIG_KEXEC_FILE */
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+int machine_kexec_prepare(struct kimage *image)
+{
+	unsigned long pgt_phys;
+	int err;
+
+	/* Physical address of the first control page (the top-level table) */
+	pgt_phys = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+
+	/* Build the identity mapped 64bit page table there */
+	err = init_pgtable(image, pgt_phys);
+	if (err)
+		return err;
+
+	/* Then update purgatory as needed */
+	err = arch_update_purgatory(image);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ free_transition_pgtable(image); /* pages from init_transition_pgtable() */
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
+ int save_ftrace_enabled;
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+ hw_breakpoint_disable();
+
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or other. kexec jump path also need one.
+ */
+ clear_IO_APIC();
+ restore_boot_irq_mode();
+#endif
+ }
+ /* The relocation code goes into the page after the page table page. */
+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ /* Addresses handed to relocate_kernel() through page_list. */
+ page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+ * from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /*
+ * The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
+
+ /* now call it */
+ image->start = relocate_kernel((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context,
+ sme_active());
+ /* NOTE(review): presumably only reached when a kexec jump returns. */
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
+}
+
+/*
+ * Append the x86-64 specific entries to vmcoreinfo: phys_base,
+ * init_top_pgt, whether 5-level paging is enabled, the node_data array
+ * (CONFIG_NUMA only), the KASLR offset and KERNEL_IMAGE_SIZE.
+ */
+void arch_crash_save_vmcoreinfo(void)
+{
+ VMCOREINFO_NUMBER(phys_base);
+ VMCOREINFO_SYMBOL(init_top_pgt);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
+
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
+ kaslr_offset());
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+}
+
+/* arch-dependent functionality related to kexec file-based syscall */
+
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * Load the kernel for a file-based kexec through the image-format loader
+ * attached to @image.
+ *
+ * Any previously stored image->arch.elf_headers buffer is freed and the
+ * pointer cleared first. Returns ERR_PTR(-ENOEXEC) when @image has no fops
+ * or no ->load callback; otherwise returns whatever ->load() returns.
+ */
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+	/* Drop stale ELF headers before a new load. */
+	vfree(image->arch.elf_headers);
+	image->arch.elf_headers = NULL;
+
+	if (image->fops && image->fops->load)
+		return image->fops->load(image,
+					 image->kernel_buf,
+					 image->kernel_buf_len,
+					 image->initrd_buf,
+					 image->initrd_buf_len,
+					 image->cmdline_buf,
+					 image->cmdline_buf_len);
+
+	return ERR_PTR(-ENOEXEC);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtabsec: Corresponding symtab.
+ *
+ * Returns 0 on success and -ENOEXEC when a relocation references an
+ * undefined, common or out-of-range symbol section, has an unknown type,
+ * or overflows a 32-bit field (R_X86_64_32 / R_X86_64_32S).
+ *
+ * TODO: Some of the code belongs to generic code. Move that in kexec.c.
+ */
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+		Elf_Shdr *section, const Elf_Shdr *relsec,
+		const Elf_Shdr *symtabsec)
+{
+	unsigned int i;
+	Elf64_Rela *rel;
+	Elf64_Sym *sym;
+	void *location;
+	unsigned long address, sec_base, value;
+	const char *strtab, *name, *shstrtab;
+	const Elf_Shdr *sechdrs;
+
+	/* String & section header string table */
+	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+	strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
+	shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
+
+	rel = (void *)pi->ehdr + relsec->sh_offset;
+
+	pr_debug("Applying relocate section %s to %u\n",
+		 shstrtab + relsec->sh_name, relsec->sh_info);
+
+	for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {
+
+		/*
+		 * rel[i].r_offset contains byte offset from beginning
+		 * of section to the storage unit affected.
+		 *
+		 * This is location to update. This is temporary buffer
+		 * where section is currently loaded. This will finally be
+		 * loaded to a different address later, pointed to by
+		 * ->sh_addr. kexec takes care of moving it
+		 * (kexec_load_segment()).
+		 */
+		location = pi->purgatory_buf;
+		location += section->sh_offset;
+		location += rel[i].r_offset;
+
+		/* Final address of the location */
+		address = section->sh_addr + rel[i].r_offset;
+
+		/*
+		 * rel[i].r_info contains information about symbol table index
+		 * w.r.t which relocation must be made and type of relocation
+		 * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+		 * these respectively.
+		 */
+		sym = (void *)pi->ehdr + symtabsec->sh_offset;
+		sym += ELF64_R_SYM(rel[i].r_info);
+
+		/*
+		 * Unnamed symbols are reported by the name of their section.
+		 * Only index the section header table when st_shndx is a real
+		 * section index: reserved values (SHN_ABS, SHN_COMMON, ...)
+		 * and corrupt indices would otherwise read past the table
+		 * before the validity checks below get a chance to run.
+		 */
+		if (sym->st_name)
+			name = strtab + sym->st_name;
+		else if (sym->st_shndx < pi->ehdr->e_shnum)
+			name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+		else
+			name = "<unnamed>";
+
+		pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+			 name, sym->st_info, sym->st_shndx, sym->st_value,
+			 sym->st_size);
+
+		if (sym->st_shndx == SHN_UNDEF) {
+			pr_err("Undefined symbol: %s\n", name);
+			return -ENOEXEC;
+		}
+
+		if (sym->st_shndx == SHN_COMMON) {
+			pr_err("symbol '%s' in common section\n", name);
+			return -ENOEXEC;
+		}
+
+		if (sym->st_shndx == SHN_ABS)
+			sec_base = 0;
+		else if (sym->st_shndx >= pi->ehdr->e_shnum) {
+			pr_err("Invalid section %d for symbol %s\n",
+			       sym->st_shndx, name);
+			return -ENOEXEC;
+		} else
+			sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
+
+		/* S + A: symbol value (section-relative) plus addend. */
+		value = sym->st_value;
+		value += sec_base;
+		value += rel[i].r_addend;
+
+		switch (ELF64_R_TYPE(rel[i].r_info)) {
+		case R_X86_64_NONE:
+			break;
+		case R_X86_64_64:
+			*(u64 *)location = value;
+			break;
+		case R_X86_64_32:
+			/* Must survive zero-extension back to 64 bits. */
+			*(u32 *)location = value;
+			if (value != *(u32 *)location)
+				goto overflow;
+			break;
+		case R_X86_64_32S:
+			/* Must survive sign-extension back to 64 bits. */
+			*(s32 *)location = value;
+			if ((s64)value != *(s32 *)location)
+				goto overflow;
+			break;
+		case R_X86_64_PC32:
+		case R_X86_64_PLT32:
+			/* PC-relative: subtract the final address of the field. */
+			value -= (u64)address;
+			*(u32 *)location = value;
+			break;
+		default:
+			pr_err("Unknown rela relocation: %llu\n",
+			       ELF64_R_TYPE(rel[i].r_info));
+			return -ENOEXEC;
+		}
+	}
+	return 0;
+
+overflow:
+	pr_err("Overflow in relocation type %d value 0x%lx\n",
+	       (int)ELF64_R_TYPE(rel[i].r_info), value);
+	return -ENOEXEC;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+/*
+ * Switch the pages backing the inclusive physical range [start, end] to
+ * read-only (@protect true) or read-write (@protect false).
+ *
+ * Returns 0 without touching anything for an empty or unassigned range,
+ * otherwise the result of set_pages_ro() / set_pages_rw().
+ */
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+	unsigned long first_pfn, last_pfn;
+	unsigned int count;
+
+	/*
+	 * An unassigned crashk resource has a zero-valued "end" member;
+	 * skip it, along with any inverted range.
+	 */
+	if (end == 0 || end < start)
+		return 0;
+
+	first_pfn = start >> PAGE_SHIFT;
+	last_pfn = end >> PAGE_SHIFT;
+	count = last_pfn - first_pfn + 1;
+
+	return protect ? set_pages_ro(pfn_to_page(first_pfn), count)
+		       : set_pages_rw(pfn_to_page(first_pfn), count);
+}
+
+/*
+ * Apply @protect to the crash kernel reservations (crashk_low_res and
+ * crashk_res) while leaving the control code page used in crash_kexec()
+ * untouched.
+ */
+static void kexec_mark_crashkres(bool protect)
+{
+	unsigned long ctrl_start, ctrl_end;
+
+	kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
+
+	ctrl_start = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+	ctrl_end = ctrl_start + KEXEC_CONTROL_PAGE_SIZE;
+
+	/*
+	 * The control code page is located in the 2nd page, so the part
+	 * below it runs up to the end of the first page ...
+	 */
+	kexec_mark_range(crashk_res.start, ctrl_start + PAGE_SIZE - 1, protect);
+	/* ... and the part above it starts right after the control area. */
+	kexec_mark_range(ctrl_end, crashk_res.end, protect);
+}
+
+/* Apply protection to the crash kernel reservations (protect == true). */
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+}
+
+/* Remove protection from the crash kernel reservations (protect == false). */
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_crashkres(false);
+}
+
+/*
+ * Hook for freshly allocated kexec pages: marks @pages pages starting at
+ * @vaddr as decrypted. @gfp is not used here. Returns the result of
+ * set_memory_decrypted().
+ */
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+ /*
+ * If SME is active we need to be sure that kexec pages are
+ * not encrypted because when we boot to the new kernel the
+ * pages won't be accessed encrypted (initially).
+ */
+ return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+/*
+ * Hook for kexec pages about to be freed: marks @pages pages starting at
+ * @vaddr as encrypted again. The result of set_memory_encrypted() is not
+ * checked.
+ */
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+ /*
+ * If SME is active we need to reset the pages back to being
+ * an encrypted mapping before freeing them.
+ */
+ set_memory_encrypted((unsigned long)vaddr, pages);
+}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
new file mode 100644
index 0000000..b5cb49e
--- /dev/null
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Family 10h mmconfig enablement
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dmi.h>
+#include <linux/range.h>
+
+#include <asm/pci-direct.h>
+#include <linux/sort.h>
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/acpi.h>
+#include <asm/mmconfig.h>
+#include <asm/pci_x86.h>
+
+/* One PCI location to probe, with the vendor/device ID expected there. */
+struct pci_hostbridge_probe {
+ u32 bus; /* bus number handed to read_pci_config() */
+ u32 slot; /* slot number handed to read_pci_config() */
+ u32 vendor; /* expected vendor ID (low 16 bits of the ID dword) */
+ u32 device; /* expected device ID (high 16 bits of the ID dword) */
+};
+
+/* Chosen MMCONFIG base address; zero while none has been determined. */
+static u64 fam10h_pci_mmconf_base;
+
+/* Locations tried, in order, when looking for the AMD device 0x1200. */
+static struct pci_hostbridge_probe pci_probes[] = {
+ { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
+ { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
+};
+
+/*
+ * sort() comparator ordering struct range entries by ascending start
+ * address.
+ *
+ * The full 64-bit start is compared. Comparing only bits 63:32 would make
+ * every pair of ranges inside the same 4 GiB window compare equal and leave
+ * them in arbitrary order, while the gap search done on the sorted array
+ * relies on ascending start addresses.
+ *
+ * Returns a negative value, zero or a positive value when @x1 starts
+ * below, at, or above @x2 respectively.
+ */
+static int cmp_range(const void *x1, const void *x2)
+{
+	const struct range *r1 = x1;
+	const struct range *r2 = x2;
+
+	if (r1->start < r2->start)
+		return -1;
+	if (r1->start > r2->start)
+		return 1;
+	return 0;
+}
+
+/* Granularity of the MMCONFIG base address field. */
+#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
+/* Mask that rounds an address down to a MMCONF_UNIT boundary. */
+#define MMCONF_MASK (~(MMCONF_UNIT - 1))
+/* Size of the MMCONFIG window: 256 units. */
+#define MMCONF_SIZE (MMCONF_UNIT << 8)
+/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
+#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
+/* True if a window at b ends at or below 0xfd<<32, or starts at/above 1<<40. */
+#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
+/*
+ * Pick a base address for the MMCONFIG window and store it in
+ * fam10h_pci_mmconf_base.
+ *
+ * Does nothing if a base is already set, if early PCI access is not
+ * allowed, or if none of the pci_probes[] locations holds the expected
+ * device. Otherwise starts from FAM10H_PCI_MMCONF_BASE (moved above TOP_MEM2
+ * if necessary) and, when that would collide with one of the up to eight
+ * enabled MMIO ranges read from function 1 config space, looks for a free
+ * window below, above or between those ranges. When no suitable window is
+ * found, fam10h_pci_mmconf_base is left unchanged.
+ */
+static void get_fam10h_pci_mmconf_base(void)
+{
+ int i;
+ unsigned bus;
+ unsigned slot;
+ int found;
+
+ u64 val;
+ u32 address;
+ u64 tom2;
+ u64 base = FAM10H_PCI_MMCONF_BASE;
+
+ int hi_mmio_num;
+ struct range range[8];
+
+ /* only try to get setting from BSP */
+ if (fam10h_pci_mmconf_base)
+ return;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* Find the first probe location whose vendor/device ID matches. */
+ found = 0;
+ for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
+ u32 id;
+ u16 device;
+ u16 vendor;
+
+ bus = pci_probes[i].bus;
+ slot = pci_probes[i].slot;
+ id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+
+ vendor = id & 0xffff;
+ device = (id>>16) & 0xffff;
+ if (pci_probes[i].vendor == vendor &&
+ pci_probes[i].device == device) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return;
+
+ /* SYS_CFG */
+ address = MSR_K8_SYSCFG;
+ rdmsrl(address, val);
+
+ /* TOP_MEM2 is not enabled? */
+ if (!(val & (1<<21))) {
+ tom2 = 1ULL << 32;
+ } else {
+ /* TOP_MEM2 */
+ address = MSR_K8_TOP_MEM2;
+ rdmsrl(address, val);
+ tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
+ }
+
+ /* Keep the default base above tom2, rounded to a unit boundary. */
+ if (base <= tom2)
+ base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
+
+ /*
+ * need to check if the range is in the high mmio range that is
+ * above 4G
+ */
+ hi_mmio_num = 0;
+ for (i = 0; i < 8; i++) {
+ u32 reg;
+ u64 start;
+ u64 end;
+ reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3));
+ /* Skip ranges with both of the two low bits clear. */
+ if (!(reg & 3))
+ continue;
+
+ start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
+ reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
+ end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
+
+ /* Ranges ending below tom2 cannot conflict with the window. */
+ if (end < tom2)
+ continue;
+
+ range[hi_mmio_num].start = start;
+ range[hi_mmio_num].end = end;
+ hi_mmio_num++;
+ }
+
+ /* No high MMIO range at all: the current base is usable as is. */
+ if (!hi_mmio_num)
+ goto out;
+
+ /* sort the range */
+ sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL);
+
+ /* The current base lies entirely above or below all ranges. */
+ if (range[hi_mmio_num - 1].end < base)
+ goto out;
+ if (range[0].start > base + MMCONF_SIZE)
+ goto out;
+
+ /* need to find one window */
+ /* First candidate: one unit below the lowest range. */
+ base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
+ if ((base > tom2) && BASE_VALID(base))
+ goto out;
+ /* Second candidate: the unit boundary following the highest range. */
+ base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ if (BASE_VALID(base))
+ goto out;
+ /* need to find window between ranges */
+ for (i = 1; i < hi_mmio_num; i++) {
+ base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ val = range[i].start & MMCONF_MASK;
+ if (val >= base + MMCONF_SIZE && BASE_VALID(base))
+ goto out;
+ }
+ /* Nothing fits: leave fam10h_pci_mmconf_base untouched. */
+ return;
+
+out:
+ fam10h_pci_mmconf_base = base;
+}
+
+/*
+ * Check the MMIO configuration base MSR and, if needed, enable MMCONFIG.
+ *
+ * Only acts when PCI_CHECK_ENABLE_AMD_MMCONF is set in pci_probe. If the
+ * MSR already has MMCONFIG enabled with an acceptable bus range and its
+ * base either becomes the recorded base or matches it, nothing is written.
+ * Otherwise a base is obtained via get_fam10h_pci_mmconf_base(); on failure
+ * the PCI_CHECK_ENABLE_AMD_MMCONF flag is cleared, on success the MSR is
+ * rewritten with that base, a bus range value of 8 and the enable bit.
+ */
+void fam10h_check_enable_mmcfg(void)
+{
+ u64 val;
+ u32 address;
+
+ if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
+ return;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ rdmsrl(address, val);
+
+ /* try to make sure that AP's setting is identical to BSP setting */
+ if (val & FAM10H_MMIO_CONF_ENABLE) {
+ unsigned busnbits;
+ busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ /* with acpi=off, only trust a setting that handles 256 buses */
+ if (!acpi_pci_disabled || busnbits >= 8) {
+ u64 base = val & MMCONF_MASK;
+
+ /* Adopt the base on first sight; later calls must match it. */
+ if (!fam10h_pci_mmconf_base) {
+ fam10h_pci_mmconf_base = base;
+ return;
+ } else if (fam10h_pci_mmconf_base == base)
+ return;
+ }
+ }
+
+ /*
+ * if it is not enabled, try to enable it and assume only one segment
+ * with 256 buses
+ */
+ get_fam10h_pci_mmconf_base();
+ if (!fam10h_pci_mmconf_base) {
+ /* No usable base: stop checking on subsequent calls. */
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
+ return;
+ }
+
+ printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
+ /* Clear the base and bus range fields, then set the new values. */
+ val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
+ (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
+ val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+ FAM10H_MMIO_CONF_ENABLE;
+ wrmsrl(address, val);
+}
+
+/*
+ * DMI match callback: sets PCI_CHECK_ENABLE_AMD_MMCONF in pci_probe.
+ * @d is unused; always returns 0.
+ */
+static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
+{
+ pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
+ return 0;
+}
+
+/*
+ * DMI systems for which the MMCONFIG check is enabled: any machine whose
+ * system vendor matches "Sun Microsystems". Terminated by an empty entry.
+ */
+static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
+ {
+ .callback = set_check_enable_amd_mmconf,
+ .ident = "Sun Microsystems Machine",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Sun Microsystems"),
+ },
+ },
+ {}
+};
+
+/*
+ * Run the mmconf_dmi_table checks; a match invokes the table's callback.
+ * Called from a non __init function, but only on the BSP.
+ */
+void __ref check_enable_amd_mmconf_dmi(void)
+{
+ dmi_check_system(mmconf_dmi_table);
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
new file mode 100644
index 0000000..f58336a
--- /dev/null
+++ b/arch/x86/kernel/module.c
@@ -0,0 +1,280 @@
+/* Kernel module help for x86.
+ Copyright (C) 2001 Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/kasan.h>
+#include <linux/bug.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/jump_label.h>
+#include <linux/random.h>
+
+#include <asm/text-patching.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/unwind.h>
+
+/*
+ * Debug print helper. Flip the "#if 0" to 1 to get KERN_DEBUG output; the
+ * disabled variant keeps the printk() inside "if (0)" so the format string
+ * and arguments are still seen by the compiler.
+ */
+#if 0
+#define DEBUGP(fmt, ...) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__)
+#else
+#define DEBUGP(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#ifdef CONFIG_RANDOMIZE_BASE
+/* Randomized offset added to MODULES_VADDR; zero until first computed. */
+static unsigned long module_load_offset;
+
+/* Serializes the one-time computation of module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+/*
+ * Return the offset to add to the start of the module area.
+ *
+ * With KASLR enabled the offset is picked on the first call as a random
+ * number of pages in [1, 1024] and then stays the same until reboot.
+ * Without KASLR the stored value is returned unchanged.
+ */
+static unsigned long int get_module_load_offset(void)
+{
+	if (!kaslr_enabled())
+		return module_load_offset;
+
+	mutex_lock(&module_kaslr_mutex);
+	if (!module_load_offset)
+		module_load_offset = (get_random_int() % 1024 + 1) * PAGE_SIZE;
+	mutex_unlock(&module_kaslr_mutex);
+
+	return module_load_offset;
+}
+#else
+/* No base randomization configured: modules load without an offset. */
+static unsigned long int get_module_load_offset(void)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Allocate @size bytes of executable memory for a module.
+ *
+ * The allocation comes from the range between MODULES_VADDR (plus the
+ * module load offset) and MODULES_END. Returns NULL when the page-aligned
+ * size exceeds MODULES_LEN, when the allocation fails, or when
+ * kasan_module_alloc() fails (the memory is freed again in that case).
+ */
+void *module_alloc(unsigned long size)
+{
+	void *region;
+
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+
+	region = __vmalloc_node_range(size, MODULE_ALIGN,
+				      MODULES_VADDR + get_module_load_offset(),
+				      MODULES_END, GFP_KERNEL,
+				      PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				      __builtin_return_address(0));
+	if (!region)
+		return NULL;
+
+	if (kasan_module_alloc(region, size) < 0) {
+		vfree(region);
+		return NULL;
+	}
+
+	return region;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Apply the REL relocations of section @relsec (32-bit build).
+ *
+ * @sechdrs:  section header array; sh_addr fields are used as addresses
+ * @strtab:   string table (unused here)
+ * @symindex: index of the symbol table section
+ * @relsec:   index of the relocation section; its sh_info names the
+ *            section being patched
+ * @me:       module being loaded, used for error reporting
+ *
+ * Handles R_386_32 and R_386_PC32. Returns 0 on success and -ENOEXEC on an
+ * unknown relocation type.
+ */
+int apply_relocate(Elf32_Shdr *sechdrs,
+		   const char *strtab,
+		   unsigned int symindex,
+		   unsigned int relsec,
+		   struct module *me)
+{
+	Elf32_Rel *rels = (void *)sechdrs[relsec].sh_addr;
+	Elf32_Sym *symtab = (Elf32_Sym *)sechdrs[symindex].sh_addr;
+	void *target_base = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr;
+	unsigned int nr_rels = sechdrs[relsec].sh_size / sizeof(*rels);
+	unsigned int idx;
+
+	DEBUGP("Applying relocate section %u to %u\n",
+	       relsec, sechdrs[relsec].sh_info);
+
+	for (idx = 0; idx < nr_rels; idx++) {
+		Elf32_Rel *r = &rels[idx];
+		/* Word to patch inside the target section. */
+		uint32_t *location = target_base + r->r_offset;
+		/*
+		 * Symbol the relocation refers to. Note that all undefined
+		 * symbols have been resolved.
+		 */
+		Elf32_Sym *sym = symtab + ELF32_R_SYM(r->r_info);
+		unsigned int type = ELF32_R_TYPE(r->r_info);
+
+		if (type == R_386_32) {
+			/* Absolute: add the symbol value in place. */
+			*location += sym->st_value;
+		} else if (type == R_386_PC32) {
+			/* PC-relative: add the value, subtract the position. */
+			*location += sym->st_value - (uint32_t)location;
+		} else {
+			pr_err("%s: Unknown relocation: %u\n",
+			       me->name, ELF32_R_TYPE(r->r_info));
+			return -ENOEXEC;
+		}
+	}
+
+	return 0;
+}
+#else /*X86_64*/
+/*
+ * Apply the RELA relocations of section @relsec (64-bit build).
+ *
+ * @sechdrs:  section header array; sh_addr fields are used as addresses
+ * @strtab:   string table (unused here)
+ * @symindex: index of the symbol table section
+ * @relsec:   index of the relocation section; its sh_info names the
+ *            section being patched
+ * @me:       module being loaded, used for error reporting
+ *
+ * Every non-NONE relocation requires the target field to be zero before it
+ * is written. Returns 0 on success and -ENOEXEC for an unknown relocation
+ * type, a nonzero target field, or a 32-bit overflow (R_X86_64_32 and
+ * R_X86_64_32S only; the PC32 overflow check is compiled out).
+ */
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf64_Sym *sym;
+ void *loc;
+ u64 val;
+
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+ + ELF64_R_SYM(rel[i].r_info);
+
+ DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
+
+ /* Symbol value plus addend. */
+ val = sym->st_value + rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ if (*(u64 *)loc != 0)
+ goto invalid_relocation;
+ *(u64 *)loc = val;
+ break;
+ case R_X86_64_32:
+ if (*(u32 *)loc != 0)
+ goto invalid_relocation;
+ *(u32 *)loc = val;
+ /* The value must survive zero-extension back to 64 bits. */
+ if (val != *(u32 *)loc)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ if (*(s32 *)loc != 0)
+ goto invalid_relocation;
+ *(s32 *)loc = val;
+ /* The value must survive sign-extension back to 64 bits. */
+ if ((s64)val != *(s32 *)loc)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ case R_X86_64_PLT32:
+ if (*(u32 *)loc != 0)
+ goto invalid_relocation;
+ /* PC-relative: subtract the address of the field itself. */
+ val -= (u64)loc;
+ *(u32 *)loc = val;
+#if 0
+ if ((s64)val != *(s32 *)loc)
+ goto overflow;
+#endif
+ break;
+ default:
+ pr_err("%s: Unknown rela relocation: %llu\n",
+ me->name, ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+invalid_relocation:
+ pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ return -ENOEXEC;
+
+overflow:
+ pr_err("overflow in relocation type %d val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val);
+ pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
+ me->name);
+ return -ENOEXEC;
+}
+#endif
+
+/*
+ * Arch-specific final step of loading module @me.
+ *
+ * Locates the .text, .altinstructions, .smp_locks, .parainstructions,
+ * .orc_unwind and .orc_unwind_ip sections by name and, for those present,
+ * applies alternatives, registers the SMP lock sites, applies paravirt
+ * patches and initializes the ORC unwinder data. Jump label nops are
+ * always applied. Always returns 0.
+ */
+int module_finalize(const Elf_Ehdr *hdr,
+		    const Elf_Shdr *sechdrs,
+		    struct module *me)
+{
+	const Elf_Shdr *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *para = NULL, *orc = NULL, *orc_ip = NULL;
+	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+	unsigned int idx;
+
+	/* Remember the last section carrying each name of interest. */
+	for (idx = 0; idx < hdr->e_shnum; idx++) {
+		const Elf_Shdr *s = &sechdrs[idx];
+		const char *secname = secstrings + s->sh_name;
+
+		if (strcmp(".text", secname) == 0)
+			text = s;
+		else if (strcmp(".altinstructions", secname) == 0)
+			alt = s;
+		else if (strcmp(".smp_locks", secname) == 0)
+			locks = s;
+		else if (strcmp(".parainstructions", secname) == 0)
+			para = s;
+		else if (strcmp(".orc_unwind", secname) == 0)
+			orc = s;
+		else if (strcmp(".orc_unwind_ip", secname) == 0)
+			orc_ip = s;
+	}
+
+	if (alt) {
+		/* patch .altinstructions */
+		void *alt_start = (void *)alt->sh_addr;
+
+		apply_alternatives(alt_start, alt_start + alt->sh_size);
+	}
+
+	if (locks && text) {
+		void *locks_start = (void *)locks->sh_addr;
+		void *text_start = (void *)text->sh_addr;
+
+		alternatives_smp_module_add(me, me->name,
+					    locks_start,
+					    locks_start + locks->sh_size,
+					    text_start,
+					    text_start + text->sh_size);
+	}
+
+	if (para) {
+		void *para_start = (void *)para->sh_addr;
+
+		apply_paravirt(para_start, para_start + para->sh_size);
+	}
+
+	/* make jump label nops */
+	jump_label_apply_nops(me);
+
+	/* ORC data needs both the table and its IP index. */
+	if (orc && orc_ip)
+		unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+				   (void *)orc->sh_addr, orc->sh_size);
+
+	return 0;
+}
+
+/*
+ * Arch-specific module teardown: hands @mod to
+ * alternatives_smp_module_del(), the counterpart of the
+ * alternatives_smp_module_add() call made in module_finalize().
+ */
+void module_arch_cleanup(struct module *mod)
+{
+ alternatives_smp_module_del(mod);
+}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
new file mode 100644
index 0000000..f1c5eb9
--- /dev/null
+++ b/arch/x86/kernel/mpparse.c
@@ -0,0 +1,965 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Multiprocessor Specification 1.1 and 1.4
+ * compliant MP-table parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/smp.h>
+#include <linux/pci.h>
+
+#include <asm/irqdomain.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/bios_ebda.h>
+#include <asm/e820/api.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
+
+#include <asm/apic.h>
+/*
+ * Checksum an MP configuration block.
+ *
+ * Adds up the @len bytes starting at @mp and returns the low 8 bits of
+ * the sum; callers treat a nonzero result as a checksum failure.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+	int total = 0;
+
+	for (; len != 0; len--)
+		total += *mp++;
+
+	return total & 0xFF;
+}
+
+/* Default APIC ID lookup: returns the apicid field of MP CPU entry @m. */
+int __init default_mpc_apic_id(struct mpc_cpu *m)
+{
+ return m->apicid;
+}
+
+/*
+ * Handle one MP table processor entry.
+ *
+ * Entries without CPU_ENABLED only bump disabled_cpus. For enabled entries
+ * the APIC ID is obtained through x86_init.mpparse.mpc_apic_id(), the boot
+ * processor's APIC ID is recorded when the entry is flagged as such, and
+ * the CPU is registered via generic_processor_info().
+ */
+static void __init MP_processor_info(struct mpc_cpu *m)
+{
+	char *suffix;
+	int id;
+
+	if ((m->cpuflag & CPU_ENABLED) == 0) {
+		disabled_cpus++;
+		return;
+	}
+
+	id = x86_init.mpparse.mpc_apic_id(m);
+
+	suffix = "";
+	if (m->cpuflag & CPU_BOOTPROCESSOR) {
+		boot_cpu_physical_apicid = m->apicid;
+		suffix = " (Bootup-CPU)";
+	}
+
+	pr_info("Processor #%d%s\n", m->apicid, suffix);
+	generic_processor_info(id, m->apicver);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Default bus-info hook: copy the 6-byte bus type of @m into @str as a
+ * NUL-terminated string (so @str must hold at least 7 bytes) and log it
+ * at APIC_VERBOSE level.
+ */
+void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
+{
+	memcpy(str, m->bustype, 6);
+	str[6] = '\0';
+
+	apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+}
+
+/*
+ * Handle one MP table bus entry.
+ *
+ * Obtains the bus type string through x86_init.mpparse.mpc_oem_bus_info(),
+ * rejects bus IDs that do not fit (when MAX_MP_BUSSES < 256), and then
+ * maintains the mp_bus_not_pci bitmap: the bit is set for every accepted
+ * bus and cleared again for PCI buses. With CONFIG_EISA the bus type is
+ * additionally recorded in mp_bus_id_to_type[]. Unknown bus types are
+ * only warned about.
+ */
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+ char str[7];
+
+ x86_init.mpparse.mpc_oem_bus_info(m, str);
+
+#if MAX_MP_BUSSES < 256
+ if (m->busid >= MAX_MP_BUSSES) {
+ pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+ m->busid, str, MAX_MP_BUSSES - 1);
+ return;
+ }
+#endif
+
+ /* Assume "not PCI" first; the PCI branch below clears the bit. */
+ set_bit(m->busid, mp_bus_not_pci);
+ /* Only the prefix is compared, so trailing padding in str is ignored. */
+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
+#ifdef CONFIG_EISA
+ mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
+#endif
+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
+ if (x86_init.mpparse.mpc_oem_pci_bus)
+ x86_init.mpparse.mpc_oem_pci_bus(m);
+
+ clear_bit(m->busid, mp_bus_not_pci);
+#ifdef CONFIG_EISA
+ mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
+ mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
+#endif
+ } else
+ pr_warn("Unknown bustype %s - ignoring\n", str);
+}
+
+/*
+ * Handle one MP table I/O APIC entry: entries flagged MPC_APIC_USABLE are
+ * registered with a legacy irqdomain configuration, starting at the current
+ * gsi_top; all others are ignored.
+ */
+static void __init MP_ioapic_info(struct mpc_ioapic *m)
+{
+	struct ioapic_domain_cfg legacy_cfg = {
+		.type = IOAPIC_DOMAIN_LEGACY,
+		.ops = &mp_ioapic_irqdomain_ops,
+	};
+
+	if (!(m->flags & MPC_APIC_USABLE))
+		return;
+
+	mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &legacy_cfg);
+}
+
+/*
+ * Log one MP interrupt source entry at APIC_VERBOSE level. Polarity is
+ * taken from bits 1:0 and trigger mode from bits 3:2 of irqflag.
+ */
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
+{
+	int polarity = mp_irq->irqflag & 3;
+	int trigger = (mp_irq->irqflag >> 2) & 3;
+
+	apic_printk(APIC_VERBOSE,
+		"Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+		mp_irq->irqtype, polarity, trigger, mp_irq->srcbus,
+		mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
+}
+
+#else /* CONFIG_X86_IO_APIC */
+/* Without CONFIG_X86_IO_APIC, bus and I/O APIC entries are ignored. */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * Log one MP local interrupt source entry at APIC_VERBOSE level; nothing
+ * is recorded. Polarity is taken from bits 1:0 and trigger mode from bits
+ * 3:2 of irqflag.
+ */
+static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
+{
+	int polarity = m->irqflag & 3;
+	int trigger = (m->irqflag >> 2) & 3;
+
+	apic_printk(APIC_VERBOSE,
+		"Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+		m->irqtype, polarity, trigger, m->srcbusid,
+		m->srcbusirq, m->destapic, m->destapiclint);
+}
+
+/*
+ * Read/parse the MPC
+ *
+ * Validates the MP configuration table header @mpc: signature, checksum
+ * over mpc->length bytes, spec revision (0x01 or 0x04) and a non-null
+ * local APIC address. On success the OEM ID is copied to @oem (8 bytes
+ * plus NUL) and the product ID to @str (12 bytes plus NUL), and both are
+ * logged. Returns 1 if the table is acceptable, 0 otherwise.
+ */
+static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
+{
+
+ if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
+ pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
+ mpc->signature[0], mpc->signature[1],
+ mpc->signature[2], mpc->signature[3]);
+ return 0;
+ }
+ /* A valid table sums to zero modulo 256. */
+ if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
+ pr_err("MPTABLE: checksum error!\n");
+ return 0;
+ }
+ if (mpc->spec != 0x01 && mpc->spec != 0x04) {
+ pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
+ return 0;
+ }
+ if (!mpc->lapic) {
+ pr_err("MPTABLE: null local APIC address!\n");
+ return 0;
+ }
+ /* The table fields are not NUL-terminated; terminate the copies. */
+ memcpy(oem, mpc->oem, 8);
+ oem[8] = 0;
+ pr_info("MPTABLE: OEM ID: %s\n", oem);
+
+ memcpy(str, mpc->productid, 12);
+ str[12] = 0;
+
+ pr_info("MPTABLE: Product ID: %s\n", str);
+
+ pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+
+ return 1;
+}
+
+/*
+ * Step past one table entry of @size bytes: advances the cursor *@ptr and
+ * the running byte count *@count by the same amount.
+ */
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+	*count = *count + size;
+	*ptr = *ptr + size;
+}
+
+/*
+ * Report a malformed MP table: logs the type byte found at @mpt and
+ * hex-dumps mpc->length bytes of the table starting at @mpc.
+ */
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+ pr_err("Your mptable is wrong, contact your HW vendor!\n");
+ pr_cont("type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->length, 1);
+}
+
+/* Default OEM table hook: intentionally does nothing. */
+void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+
+/*
+ * Parse the MP configuration table @mpc.
+ *
+ * After validating the header, registers the local APIC address (unless
+ * ACPI already provided it). In @early mode that is all and 1 is returned.
+ * Otherwise the entries following the header are walked until mpc->length
+ * bytes have been consumed, dispatching on each entry's type byte. An
+ * unknown type dumps the table and stops the walk.
+ *
+ * Returns 0 if the header check fails, 1 in early mode, and otherwise
+ * num_processors (so 0 if no processor has been registered).
+ */
+static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+
+ /* Entries start right behind the fixed-size header. */
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
+
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
+
+ if (early)
+ return 1;
+
+ if (mpc->oemptr)
+ x86_init.mpparse.smp_read_mpc_oem(mpc);
+
+ /*
+ * Now process the configuration blocks.
+ */
+ x86_init.mpparse.mpc_record(0);
+
+ while (count < mpc->length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info((struct mpc_cpu *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
+ case MP_BUS:
+ MP_bus_info((struct mpc_bus *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
+ case MP_IOAPIC:
+ MP_ioapic_info((struct mpc_ioapic *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
+ case MP_INTSRC:
+ mp_save_irq((struct mpc_intsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
+ case MP_LINTSRC:
+ MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
+ default:
+ /* wrong mptable */
+ smp_dump_mptable(mpc, mpt);
+ /* Entry size is unknown, so end the walk here. */
+ count = mpc->length;
+ break;
+ }
+ x86_init.mpparse.mpc_record(1);
+ }
+
+ if (!num_processors)
+ pr_err("MPTABLE: no processors registered!\n");
+ return num_processors;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+/*
+ * Return the ELCR bit for @irq: reads the byte at I/O port
+ * 0x4d0 + irq / 8 and extracts bit irq % 8. The callers treat a set bit
+ * as "level sensitive".
+ */
+static int __init ELCR_trigger(unsigned int irq)
+{
+	unsigned int elcr_port = 0x4d0 + (irq >> 3);
+	unsigned int bit = irq & 7;
+
+	return (inb(elcr_port) >> bit) & 1;
+}
+
+/*
+ * Synthesize default interrupt source entries for the 16 legacy IRQs and
+ * save them via mp_save_irq(), all routed to the first I/O APIC from bus 0.
+ *
+ * @mpc_default_type selects the quirks: type 2 skips IRQ0 and IRQ13, every
+ * type skips IRQ2, and type 5 tries to use the ELCR to mark level-triggered
+ * interrupts. IRQ0 is routed to pin 2; a final ExtINT entry routes the
+ * 8259A to pin 0.
+ */
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+ struct mpc_intsrc intsrc;
+ int i;
+ int ELCR_fallback = 0;
+
+ intsrc.type = MP_INTSRC;
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
+ intsrc.srcbus = 0;
+ intsrc.dstapic = mpc_ioapic_id(0);
+
+ intsrc.irqtype = mp_INT;
+
+ /*
+ * If true, we have an ISA/PCI system with no IRQ entries
+ * in the MP table. To prevent the PCI interrupts from being set up
+ * incorrectly, we try to use the ELCR. The sanity check to see if
+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+ * never be level sensitive, so we simply see if the ELCR agrees.
+ * If it does, we assume it's valid.
+ */
+ if (mpc_default_type == 5) {
+ pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+
+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+ ELCR_trigger(13))
+ pr_err("ELCR contains invalid data... not using ELCR\n");
+ else {
+ pr_info("Using ELCR to identify PCI interrupts\n");
+ ELCR_fallback = 1;
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ switch (mpc_default_type) {
+ case 2:
+ if (i == 0 || i == 13)
+ continue; /* IRQ0 & IRQ13 not connected */
+ /* fall through */
+ default:
+ if (i == 2)
+ continue; /* IRQ2 is never connected */
+ }
+
+ if (ELCR_fallback) {
+ /*
+ * If the ELCR indicates a level-sensitive interrupt, we
+ * copy that information over to the MP table in the
+ * irqflag field (level sensitive, active high polarity).
+ */
+ if (ELCR_trigger(i)) {
+ intsrc.irqflag = MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_HIGH;
+ } else {
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT |
+ MP_IRQPOL_DEFAULT;
+ }
+ }
+
+ intsrc.srcbusirq = i;
+ intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ mp_save_irq(&intsrc);
+ }
+
+ /* Finally describe the 8259A as an ExtINT source. */
+ intsrc.irqtype = mp_ExtINT;
+ intsrc.srcbusirq = 0;
+ intsrc.dstirq = 0; /* 8259A to INTIN0 */
+ mp_save_irq(&intsrc);
+}
+
+
+/*
+ * Synthesize the bus, I/O APIC and interrupt source entries of a default
+ * MP configuration.
+ *
+ * Bus 0 is ISA for default types 1 and 5 (and, after an error message, for
+ * unknown types) and EISA for types 2, 3 and 6. Types above 4 additionally
+ * get PCI as bus 1. One usable I/O APIC with ID 2 at
+ * IO_APIC_DEFAULT_PHYS_BASE is registered, then the default IRQ routing is
+ * built.
+ *
+ * The bus type field is 6 bytes wide, so every literal copied into it is
+ * space-padded to exactly 6 characters; copying 6 bytes from a shorter
+ * literal would read past its end.
+ */
+static void __init construct_ioapic_table(int mpc_default_type)
+{
+	struct mpc_ioapic ioapic;
+	struct mpc_bus bus;
+
+	bus.type = MP_BUS;
+	bus.busid = 0;
+	switch (mpc_default_type) {
+	default:
+		pr_err("???\nUnknown standard configuration %d\n",
+		       mpc_default_type);
+		/* fall through */
+	case 1:
+	case 5:
+		memcpy(bus.bustype, "ISA   ", 6);
+		break;
+	case 2:
+	case 6:
+	case 3:
+		memcpy(bus.bustype, "EISA  ", 6);
+		break;
+	}
+	MP_bus_info(&bus);
+	if (mpc_default_type > 4) {
+		bus.busid = 1;
+		memcpy(bus.bustype, "PCI   ", 6);
+		MP_bus_info(&bus);
+	}
+
+	ioapic.type = MP_IOAPIC;
+	ioapic.apicid = 2;
+	ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	ioapic.flags = MPC_APIC_USABLE;
+	ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
+	MP_ioapic_info(&ioapic);
+
+	/*
+	 * We set up most of the low 16 IO-APIC pins according to MPS rules.
+	 */
+	construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+/* Without CONFIG_X86_IO_APIC there is no I/O APIC table to construct. */
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
+#endif
+
+/*
+ * Build one of the default MP configurations selected by
+ * @mpc_default_type: sets the default local APIC address, registers two
+ * enabled processors with APIC IDs 0 and 1 (described from boot_cpu_data),
+ * constructs the I/O APIC table, and logs two local interrupt sources
+ * (ExtINT on LINT0, NMI on LINT1) destined for all APICs.
+ */
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_cpu processor;
+ struct mpc_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.cpuflag = CPU_ENABLED;
+ /* Family, model and stepping packed into one signature value. */
+ processor.cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping;
+ processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
+ processor.reserved[0] = 0;
+ processor.reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ construct_ioapic_table(mpc_default_type);
+
+ lintsrc.type = MP_LINTSRC;
+ lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
+ lintsrc.srcbusid = 0;
+ lintsrc.srcbusirq = 0;
+ lintsrc.destapic = MP_APIC_ALL;
+ /* linttypes[i] goes to local interrupt pin i. */
+ for (i = 0; i < 2; i++) {
+ lintsrc.irqtype = linttypes[i];
+ lintsrc.destapiclint = i;
+ MP_lintsrc_info(&lintsrc);
+ }
+}
+
+/*
+ * Physical address of the MP floating pointer structure and whether one
+ * was found; default_get_smp_config() maps mpf_base only when mpf_found is
+ * set. NOTE(review): presumably filled in by smp_scan_config() - confirm.
+ */
+static unsigned long mpf_base;
+static bool mpf_found;
+
+/*
+ * Return the length field of the MP configuration table located at
+ * physical address @physptr. One page is mapped temporarily to read the
+ * header; the resulting address range is logged at APIC_VERBOSE level.
+ */
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+	struct mpc_table *hdr = early_memremap(physptr, PAGE_SIZE);
+	unsigned long len = hdr->length;
+
+	early_memunmap(hdr, PAGE_SIZE);
+
+	apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + len);
+
+	return len;
+}
+
+/*
+ * Map and parse the MP configuration table referenced by mpf->physptr.
+ *
+ * Returns 0 when the table was parsed in non-early mode and the caller
+ * should carry on. Returns -1 when the caller should stop: the table could
+ * not be mapped, parsing failed (SMP support is then disabled), or @early
+ * is set, in which case only the early part of the parse is wanted.
+ *
+ * If the table provided no interrupt source entries (CONFIG_X86_IO_APIC),
+ * an ISA bus 0 and the default IRQ routing are synthesized instead.
+ */
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	size = get_mpc_size(mpf->physptr);
+	mpc = early_memremap(mpf->physptr, size);
+	/* Do not hand an unmapped table to the parser. */
+	if (!mpc) {
+		pr_err("MPTABLE: error mapping MP table\n");
+		return -1;
+	}
+
+	/*
+	 * Read the physical hardware table. Anything here will
+	 * override the defaults.
+	 */
+	if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+		smp_found_config = 0;
+#endif
+		pr_err("BIOS bug, MP table errors detected!...\n");
+		pr_cont("... disabling SMP support. (tell your hw vendor)\n");
+		early_memunmap(mpc, size);
+		return -1;
+	}
+	early_memunmap(mpc, size);
+
+	/* The early pass is done once the table has been read. */
+	if (early)
+		return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * If there are no explicit MP IRQ entries, then we are
+	 * broken. We set up most of the low 16 IO-APIC pins to
+	 * ISA defaults and hope it will work.
+	 */
+	if (!mp_irq_entries) {
+		struct mpc_bus bus;
+
+		pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+		bus.type = MP_BUS;
+		bus.busid = 0;
+		/* Space-padded to the full 6-byte width of the field. */
+		memcpy(bus.bustype, "ISA   ", 6);
+		MP_bus_info(&bus);
+
+		construct_default_ioirq_mptable(0);
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Parse the SMP configuration whose floating pointer structure was located
+ * by smp_scan_config() (mpf_base / mpf_found).
+ *
+ * @early: non-zero for the early pass.  In that pass only the local APIC
+ *         address is set up for a default configuration, and check_physptr()
+ *         is told to do an early read of the table.
+ *
+ * Nothing is done when no configuration was found, or when ACPI already
+ * provides the information (see the acpi_lapic / acpi_ioapic checks).
+ * The temporary mapping of the floating pointer structure is released on
+ * every exit path taken after it has been established.
+ */
+void __init default_get_smp_config(unsigned int early)
+{
+	struct mpf_intel *mpf;
+
+	if (!smp_found_config)
+		return;
+
+	if (!mpf_found)
+		return;
+
+	if (acpi_lapic && early)
+		return;
+
+	/*
+	 * MPS doesn't support hyperthreading, aka only have
+	 * thread 0 apic id in MPS table
+	 */
+	if (acpi_lapic && acpi_ioapic)
+		return;
+
+	mpf = early_memremap(mpf_base, sizeof(*mpf));
+	if (!mpf) {
+		pr_err("MPTABLE: error mapping MP table\n");
+		return;
+	}
+
+	pr_info("Intel MultiProcessor Specification v1.%d\n",
+		mpf->specification);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+	if (mpf->feature2 & (1 << 7)) {
+		pr_info(" IMCR and PIC compatibility mode.\n");
+		pic_mode = 1;
+	} else {
+		pr_info(" Virtual Wire compatibility mode.\n");
+		pic_mode = 0;
+	}
+#endif
+	/*
+	 * Now see if we need to read further.
+	 */
+	if (mpf->feature1) {
+		if (early) {
+			/*
+			 * local APIC has default address
+			 */
+			mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+			/* Leave through 'out' so the mpf mapping is not leaked. */
+			goto out;
+		}
+
+		pr_info("Default MP configuration #%d\n", mpf->feature1);
+		construct_default_ISA_mptable(mpf->feature1);
+
+	} else if (mpf->physptr) {
+		if (check_physptr(mpf, early))
+			goto out;
+	} else
+		BUG();
+
+	if (!early)
+		pr_info("Processors: %d\n", num_processors);
+	/*
+	 * Only use the first configuration found.
+	 */
+out:
+	early_memunmap(mpf, sizeof(*mpf));
+}
+
+/* Reserve the memory occupied by the MP configuration table that @mpf points to. */
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
+{
+	unsigned long table_phys = mpf->physptr;
+
+	memblock_reserve(table_phys, get_mpc_size(table_phys));
+}
+
+/*
+ * Search [@base, @base + @length) in 16 byte steps for an MP floating
+ * pointer structure: matching magic, length field of 1, zero checksum over
+ * 16 bytes and specification revision 1 or 4.
+ *
+ * On the first match the location is recorded in mpf_base/mpf_found, the
+ * structure (and, if present, the table it points to) is reserved in
+ * memblock, and 1 is returned.  Returns 0 when nothing was found.
+ */
+static int __init smp_scan_config(unsigned long base, unsigned long length)
+{
+	struct mpf_intel *mpf;
+	unsigned int *bp;
+	bool hit;
+
+	apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+		    base, base + length - 1);
+	BUILD_BUG_ON(sizeof(*mpf) != 16);
+
+	for (; length > 0; base += 16, length -= 16) {
+		bp = early_memremap(base, length);
+		mpf = (struct mpf_intel *)bp;
+
+		hit = *bp == SMP_MAGIC_IDENT &&
+		      mpf->length == 1 &&
+		      !mpf_checksum((unsigned char *)bp, 16) &&
+		      (mpf->specification == 1 || mpf->specification == 4);
+		if (hit) {
+#ifdef CONFIG_X86_LOCAL_APIC
+			smp_found_config = 1;
+#endif
+			mpf_base = base;
+			mpf_found = true;
+
+			pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+				base, base + sizeof(*mpf) - 1, mpf);
+
+			memblock_reserve(base, sizeof(*mpf));
+			if (mpf->physptr)
+				smp_reserve_memory(mpf);
+		}
+		early_memunmap(bp, length);
+
+		/* Stop at the first structure found. */
+		if (hit)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Look for the MP floating pointer structure in the conventional places,
+ * stopping at the first area in which smp_scan_config() finds one.
+ */
+void __init default_find_smp_config(void)
+{
+	unsigned int ebda;
+
+	/*
+	 * FIXME: Linux assumes you have 640K of base ram..
+	 * this continues the error...
+	 */
+
+	/* 1) Scan the bottom 1K for a signature */
+	if (smp_scan_config(0x0, 0x400))
+		return;
+
+	/* 2) Scan the top 1K of base RAM */
+	if (smp_scan_config(639 * 0x400, 0x400))
+		return;
+
+	/* 3) Scan the 64K of bios */
+	if (smp_scan_config(0xF0000, 0x10000))
+		return;
+
+	/*
+	 * If it is an SMP machine we should know now, unless the
+	 * configuration is in an EISA bus machine with an
+	 * extended bios data area.
+	 *
+	 * There is a real-mode segmented pointer pointing to the
+	 * 4K EBDA area at 0x40E, calculate and scan it here.
+	 *
+	 * NOTE! There are Linux loaders that will corrupt the EBDA
+	 * area, and as such this kind of SMP config may be less
+	 * trustworthy, simply because the SMP table may have been
+	 * stomped on during early boot. These loaders are buggy and
+	 * should be fixed.
+	 *
+	 * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+	 */
+	ebda = get_bios_ebda();
+	if (!ebda)
+		return;
+
+	smp_scan_config(ebda, 0x400);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+/* irq_used[i] is set once mp_irqs[i] has been matched by get_MP_intsrc_index(). */
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
+
+/*
+ * Find the mp_irqs[] entry corresponding to table entry @m.
+ *
+ * Returns:
+ *    0  @m is not a level-triggered, active-low mp_INT entry (legacy)
+ *   -1  no entry with the same source bus / bus irq exists in mp_irqs[]
+ *   -2  the matching mp_irqs[] entry was already claimed earlier
+ *    i  index of the matching entry, which is marked as used
+ */
+static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
+{
+	int idx;
+
+	if (m->irqtype != mp_INT ||
+	    m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))
+		return 0;
+
+	/* not legacy */
+
+	for (idx = 0; idx < mp_irq_entries; idx++) {
+		struct mpc_intsrc *cand = &mp_irqs[idx];
+
+		if (cand->irqtype != mp_INT ||
+		    cand->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW) ||
+		    cand->srcbus != m->srcbus ||
+		    cand->srcbusirq != m->srcbusirq)
+			continue;
+
+		/* already claimed */
+		if (irq_used[idx])
+			return -2;
+
+		irq_used[idx] = 1;
+		return idx;
+	}
+
+	/* not found */
+	return -1;
+}
+
+#define SPARE_SLOT_NUM 20
+
+/* Table entries found invalid by check_irq_src(); their slots get reused later. */
+static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
+
+/*
+ * Examine one MP_INTSRC table entry @m.  If get_MP_intsrc_index() returns
+ * a positive index, @m is overwritten with that mp_irqs[] entry.  A result
+ * of 0 leaves @m untouched.  For a negative result @m is remembered in
+ * m_spare[] (as long as there is room) and *@nr_m_spare is incremented.
+ */
+static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+	int idx;
+
+	apic_printk(APIC_VERBOSE, "OLD ");
+	print_mp_irq_info(m);
+
+	idx = get_MP_intsrc_index(m);
+
+	/* legacy, do nothing */
+	if (idx == 0)
+		return;
+
+	if (idx > 0) {
+		memcpy(m, &mp_irqs[idx], sizeof(*m));
+		apic_printk(APIC_VERBOSE, "NEW ");
+		print_mp_irq_info(&mp_irqs[idx]);
+		return;
+	}
+
+	if (*nr_m_spare >= SPARE_SLOT_NUM)
+		return;
+
+	/*
+	 * not found (-1), or duplicated (-2) are invalid entries,
+	 * we need to use the slot later
+	 */
+	m_spare[*nr_m_spare] = m;
+	*nr_m_spare += 1;
+}
+
+/*
+ * Check whether an MP table that has grown to @count bytes still fits into
+ * the replacement table buffer.
+ *
+ * @mpc_new_phys:   physical address of the replacement buffer, 0 if none
+ * @mpc_new_length: size of the replacement buffer in bytes
+ * @count:          table length in bytes after appending the new entry
+ *
+ * Returns 0 if the entry fits.  Returns -1 (after a WARN) when there is no
+ * replacement buffer or when @count exceeds its size.
+ */
+static int __init
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
+{
+	/*
+	 * Appending is only possible when a replacement buffer exists and
+	 * the grown table does not exceed it.
+	 */
+	if (!mpc_new_phys || (unsigned long)count > mpc_new_length) {
+		WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+		return -1;
+	}
+
+	return 0;
+}
+#else /* CONFIG_X86_IO_APIC */
+/* Without CONFIG_X86_IO_APIC there is nothing to check: empty stub. */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * Walk the MP configuration table @mpc entry by entry and hand every
+ * MP_INTSRC entry to check_irq_src().  With CONFIG_X86_IO_APIC, the
+ * level-triggered/active-low mp_INT entries of mp_irqs[] that were not
+ * matched during the walk are then written either into the spare slots
+ * collected by check_irq_src() or appended to the end of the table.
+ * The table checksum is recomputed before returning.
+ *
+ * @mpc_new_phys and @mpc_new_length are only passed on to check_slot()
+ * when an entry has to be appended.  Always returns 0.
+ */
+static int __init replace_intsrc_all(struct mpc_table *mpc,
+ unsigned long mpc_new_phys,
+ unsigned long mpc_new_length)
+{
+#ifdef CONFIG_X86_IO_APIC
+ int i;
+#endif
+ /* byte offset of the current entry; the entries follow the header */
+ int count = sizeof(*mpc);
+ int nr_m_spare = 0;
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ pr_info("mpc_length %x\n", mpc->length);
+ /* the first byte of every entry is its type */
+ while (count < mpc->length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
+ case MP_BUS:
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
+ case MP_IOAPIC:
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
+ case MP_INTSRC:
+ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
+ case MP_LINTSRC:
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
+ default:
+ /* wrong mptable */
+ smp_dump_mptable(mpc, mpt);
+ goto out;
+ }
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ /* place every mp_irqs[] entry that no table entry claimed */
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (irq_used[i])
+ continue;
+
+ if (mp_irqs[i].irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
+ continue;
+
+ if (nr_m_spare > 0) {
+ /* reuse the slot of an invalid table entry */
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ nr_m_spare--;
+ memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
+ m_spare[nr_m_spare] = NULL;
+ } else {
+ /* no spare slot left: append at the current end of the table */
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
+ count += sizeof(struct mpc_intsrc);
+ if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
+ goto out;
+ memcpy(m, &mp_irqs[i], sizeof(*m));
+ mpc->length = count;
+ mpt += sizeof(struct mpc_intsrc);
+ }
+ print_mp_irq_info(&mp_irqs[i]);
+ }
+#endif
+out:
+ /* update checksum */
+ mpc->checksum = 0;
+ mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
+
+ return 0;
+}
+
+/* Non-zero when the "update_mptable" or "alloc_mptable" parameter was given. */
+int enable_update_mptable;
+
+/*
+ * Handler for the "update_mptable" early parameter: enables the MP table
+ * update and, with CONFIG_PCI, sets pci_routeirq.  @str is not used.
+ */
+static int __init update_mptable_setup(char *str)
+{
+ enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
+ return 0;
+}
+early_param("update_mptable", update_mptable_setup);
+
+/* Physical address of the replacement MP table buffer, 0 if none was allocated. */
+static unsigned long __initdata mpc_new_phys;
+/* Size in bytes of the replacement MP table buffer. */
+static unsigned long mpc_new_length __initdata = 4096;
+
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+
+/*
+ * Handler for the "alloc_mptable" early parameter.  Enables the MP table
+ * update with a separately allocated table; an optional value is parsed
+ * with memparse() and replaces the default buffer size.
+ */
+static int __init parse_alloc_mptable_opt(char *p)
+{
+	alloc_mptable = 1;
+	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
+	if (p)
+		mpc_new_length = memparse(p, &p);
+
+	return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
+
+/*
+ * Allocate the replacement MP table buffer (mpc_new_length bytes), but only
+ * when both the table update and the separate allocation were requested.
+ */
+void __init e820__memblock_alloc_reserved_mpc_new(void)
+{
+	if (!enable_update_mptable || !alloc_mptable)
+		return;
+
+	mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4);
+}
+
+/*
+ * Late initcall that rewrites the interrupt source entries of the BIOS MP
+ * configuration table from mp_irqs[], either in place or in the buffer
+ * allocated for "alloc_mptable".
+ *
+ * Does nothing unless the update was enabled and an MP floating pointer
+ * structure was found, and bails out for default configurations
+ * (feature1 set) or when no table pointer is present.  Always returns 0.
+ */
+static int __init update_mp_table(void)
+{
+ char str[16];
+ char oem[10];
+ struct mpf_intel *mpf;
+ struct mpc_table *mpc, *mpc_new;
+ unsigned long size;
+
+ if (!enable_update_mptable)
+ return 0;
+
+ if (!mpf_found)
+ return 0;
+
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: mpf early_memremap() failed\n");
+ return 0;
+ }
+
+ /*
+ * Now see if we need to go further.
+ */
+ if (mpf->feature1)
+ goto do_unmap_mpf;
+
+ if (!mpf->physptr)
+ goto do_unmap_mpf;
+
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+ if (!mpc) {
+ pr_err("MPTABLE: mpc early_memremap() failed\n");
+ goto do_unmap_mpf;
+ }
+
+ if (!smp_check_mpc(mpc, oem, str))
+ goto do_unmap_mpc;
+
+ pr_info("mpf: %llx\n", (u64)mpf_base);
+ pr_info("physptr: %x\n", mpf->physptr);
+
+ /* the existing table must fit into the replacement buffer */
+ if (mpc_new_phys && mpc->length > mpc_new_length) {
+ mpc_new_phys = 0;
+ pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
+ }
+
+ if (!mpc_new_phys) {
+ unsigned char old, new;
+ /* check if we can change the position */
+ /* if changing the checksum byte leaves the sum unchanged, the write did not stick */
+ mpc->checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->length);
+ mpc->checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->length);
+ if (old == new) {
+ pr_info("mpc is readonly, please try alloc_mptable instead\n");
+ goto do_unmap_mpc;
+ }
+ pr_info("use in-position replacing\n");
+ } else {
+ mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+ if (!mpc_new) {
+ pr_err("MPTABLE: new mpc early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
+ mpf->physptr = mpc_new_phys;
+ memcpy(mpc_new, mpc, mpc->length);
+ early_memunmap(mpc, size);
+ /* from here on mpc/size describe the new table mapping */
+ mpc = mpc_new;
+ size = mpc_new_length;
+ /* check if we can modify that */
+ /* a difference means the physptr store above did not take effect */
+ if (mpc_new_phys - mpf->physptr) {
+ struct mpf_intel *mpf_new;
+ /* steal 16 bytes from [0, 1k) */
+ mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+ if (!mpf_new) {
+ pr_err("MPTABLE: new mpf early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
+ pr_info("mpf new: %x\n", 0x400 - 16);
+ memcpy(mpf_new, mpf, 16);
+ early_memunmap(mpf, sizeof(*mpf));
+ mpf = mpf_new;
+ mpf->physptr = mpc_new_phys;
+ }
+ /* recompute the floating pointer checksum for the new physptr */
+ mpf->checksum = 0;
+ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ pr_info("physptr new: %x\n", mpf->physptr);
+ }
+
+ /*
+ * only replace the one with mp_INT and
+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+ * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+ * may need pci=routeirq for all coverage
+ */
+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+do_unmap_mpc:
+ early_memunmap(mpc, size);
+
+do_unmap_mpf:
+ early_memunmap(mpf, sizeof(*mpf));
+
+ return 0;
+}
+
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644
index 0000000..ef68880
--- /dev/null
+++ b/arch/x86/kernel/msr.c
@@ -0,0 +1,244 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * x86 MSR access device
+ *
+ * This device is accessed by lseek() to the appropriate register number
+ * and then read/write in chunks of 8 bytes. A larger size means multiple
+ * reads or writes of the same register.
+ *
+ * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+/* Device class created in msr_init(); the per-CPU msr devices belong to it. */
+static struct class *msr_class;
+/* Hotplug state returned by cpuhp_setup_state() in msr_init(), removed in msr_exit(). */
+static enum cpuhp_state cpuhp_msr_state;
+
+/*
+ * read() handler: the file position selects the MSR, the device minor
+ * selects the CPU.  @count must be a multiple of 8; for every 8 byte chunk
+ * the same register is read again and the two 32-bit values obtained from
+ * rdmsr_safe_on_cpu() are copied to user space.
+ *
+ * Returns the number of bytes transferred, or, if nothing was transferred,
+ * the error that stopped the loop (0 for a zero-length read).
+ */
+static ssize_t msr_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	u32 __user *uptr = (u32 __user *) buf;
+	u32 msr = *ppos;
+	int cpu = iminor(file_inode(file));
+	ssize_t done = 0;
+	int rc = 0;
+	u32 val[2];
+
+	if (count % 8)
+		return -EINVAL;	/* Invalid chunk size */
+
+	while (count) {
+		rc = rdmsr_safe_on_cpu(cpu, msr, &val[0], &val[1]);
+		if (rc)
+			break;
+		if (copy_to_user(uptr, &val, 8)) {
+			rc = -EFAULT;
+			break;
+		}
+		uptr += 2;
+		done += 8;
+		count -= 8;
+	}
+
+	return done ? done : rc;
+}
+
+/*
+ * write() handler: the file position selects the MSR, the device minor
+ * selects the CPU.  @count must be a multiple of 8; every 8 byte chunk is
+ * fetched from user space and written to the same register with
+ * wrmsr_safe_on_cpu().
+ *
+ * Returns the number of bytes consumed, or, if nothing was consumed, the
+ * error that stopped the loop (0 for a zero-length write).
+ */
+static ssize_t msr_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	const u32 __user *uptr = (const u32 __user *)buf;
+	u32 msr = *ppos;
+	int cpu = iminor(file_inode(file));
+	ssize_t done = 0;
+	int rc = 0;
+	u32 val[2];
+
+	if (count % 8)
+		return -EINVAL;	/* Invalid chunk size */
+
+	while (count) {
+		if (copy_from_user(&val, uptr, 8)) {
+			rc = -EFAULT;
+			break;
+		}
+		rc = wrmsr_safe_on_cpu(cpu, msr, val[0], val[1]);
+		if (rc)
+			break;
+		uptr += 2;
+		done += 8;
+		count -= 8;
+	}
+
+	return done ? done : rc;
+}
+
+/*
+ * ioctl() handler for the register-block MSR accessors.
+ *
+ * @ioc: X86_IOC_RDMSR_REGS or X86_IOC_WRMSR_REGS
+ * @arg: user pointer to an array of eight u32 values; it is copied in,
+ *       handed to rdmsr_safe_regs_on_cpu()/wrmsr_safe_regs_on_cpu() for
+ *       the CPU selected by the device minor, and copied back on success.
+ *
+ * Returns 0 on success, -EBADF if the file was not opened with the
+ * required access mode, -EFAULT on a failed user copy, -ENOTTY for an
+ * unknown command, or the error returned by the MSR accessor.
+ */
+static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
+{
+	u32 __user *uregs = (u32 __user *)arg;
+	u32 regs[8];
+	int cpu = iminor(file_inode(file));
+	int err;
+
+	switch (ioc) {
+	case X86_IOC_RDMSR_REGS:
+		if (!(file->f_mode & FMODE_READ)) {
+			err = -EBADF;
+			break;
+		}
+		if (copy_from_user(&regs, uregs, sizeof regs)) {
+			err = -EFAULT;
+			break;
+		}
+		err = rdmsr_safe_regs_on_cpu(cpu, regs);
+		if (err)
+			break;
+		if (copy_to_user(uregs, &regs, sizeof regs))
+			err = -EFAULT;
+		break;
+
+	case X86_IOC_WRMSR_REGS:
+		if (!(file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			break;
+		}
+		if (copy_from_user(&regs, uregs, sizeof regs)) {
+			err = -EFAULT;
+			break;
+		}
+		err = wrmsr_safe_regs_on_cpu(cpu, regs);
+		if (err)
+			break;
+		if (copy_to_user(uregs, &regs, sizeof regs))
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -ENOTTY;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * open() handler.  Requires CAP_SYS_RAWIO, an online CPU matching the
+ * device minor, and MSR support on that CPU.
+ *
+ * Returns 0 on success, -EPERM, -ENXIO or -EIO otherwise.
+ */
+static int msr_open(struct inode *inode, struct file *file)
+{
+	unsigned int cpu = iminor(file_inode(file));
+	struct cpuinfo_x86 *info;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
+
+	/* No such CPU */
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return -ENXIO;
+
+	info = &cpu_data(cpu);
+
+	/* MSR not supported */
+	return cpu_has(info, X86_FEATURE_MSR) ? 0 : -EIO;
+}
+
+/*
+ * File operations we support.  The same ioctl handler is installed for
+ * both the native and the compat entry point.
+ */
+static const struct file_operations msr_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_seek_end_llseek,
+ .read = msr_read,
+ .write = msr_write,
+ .open = msr_open,
+ .unlocked_ioctl = msr_ioctl,
+ .compat_ioctl = msr_ioctl,
+};
+
+/*
+ * CPU hotplug "online" callback: create the device "msr<cpu>" with minor
+ * number @cpu in msr_class.  Returns 0 or the error from device_create().
+ */
+static int msr_device_create(unsigned int cpu)
+{
+	return PTR_ERR_OR_ZERO(device_create(msr_class, NULL,
+					     MKDEV(MSR_MAJOR, cpu), NULL,
+					     "msr%d", cpu));
+}
+
+/* CPU hotplug teardown callback: remove the msr device of @cpu.  Always returns 0. */
+static int msr_device_destroy(unsigned int cpu)
+{
+ device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
+}
+
+/*
+ * devnode callback of msr_class: returns a newly allocated node name of
+ * the form "cpu/<minor>/msr".  @mode is left untouched.
+ */
+static char *msr_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+}
+
+/*
+ * Module init: register the character device region for MSR_MAJOR, create
+ * the "msr" class and install a CPU hotplug state whose callbacks create
+ * and destroy the per-CPU device.  On failure, everything set up so far is
+ * torn down again in reverse order.
+ *
+ * Returns 0 on success, -EBUSY if the major could not be registered, or
+ * the error from class_create()/cpuhp_setup_state().
+ */
+static int __init msr_init(void)
+{
+ int err;
+
+ if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
+ pr_err("unable to get major %d for msr\n", MSR_MAJOR);
+ return -EBUSY;
+ }
+ msr_class = class_create(THIS_MODULE, "msr");
+ if (IS_ERR(msr_class)) {
+ err = PTR_ERR(msr_class);
+ goto out_chrdev;
+ }
+ msr_class->devnode = msr_devnode;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ /* a non-negative return value is the state to remove in msr_exit() */
+ cpuhp_msr_state = err;
+ return 0;
+
+out_class:
+ class_destroy(msr_class);
+out_chrdev:
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
+ return err;
+}
+module_init(msr_init);
+
+/* Module exit: undo msr_init() in reverse order. */
+static void __exit msr_exit(void)
+{
+ cpuhp_remove_state(cpuhp_msr_state);
+ class_destroy(msr_class);
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
+}
+module_exit(msr_exit)
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic MSR driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
new file mode 100644
index 0000000..18bc9b5
--- /dev/null
+++ b/arch/x86/kernel/nmi.c
@@ -0,0 +1,554 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2011 Don Zickus Red Hat, Inc.
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/nmi.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/sched/clock.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <linux/atomic.h>
+#include <asm/traps.h>
+#include <asm/mach_traps.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+#include <asm/cache.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/nmi.h>
+
+/*
+ * One handler chain per NMI type: @head is the list of registered
+ * nmiaction entries, @lock is taken by the register/unregister functions
+ * while they modify that list.
+ */
+struct nmi_desc {
+ raw_spinlock_t lock;
+ struct list_head head;
+};
+
+/* Statically initialised chains, indexed by NMI type (see nmi_to_desc()). */
+static struct nmi_desc nmi_desc[NMI_MAX] =
+{
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[0].head),
+ },
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[1].head),
+ },
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[2].head),
+ },
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[3].head),
+ },
+
+};
+
+/*
+ * Per-CPU NMI counters:
+ *   normal   - events handled by NMI_LOCAL handlers
+ *   unknown  - NMIs that ended up in unknown_nmi_error()
+ *   external - NMIs with a reason bit set in the NMI reason value
+ *   swallow  - back-to-back NMIs dropped by the swallow logic
+ */
+struct nmi_stats {
+ unsigned int normal;
+ unsigned int unknown;
+ unsigned int external;
+ unsigned int swallow;
+};
+
+static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
+
+/* While non-zero, do_nmi() does not call default_do_nmi(); see stop_nmi()/restart_nmi(). */
+static int ignore_nmis __read_mostly;
+
+/* Set by the "unknown_nmi_panic" boot parameter; makes unknown NMIs call nmi_panic(). */
+int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
+/* Handler for the "unknown_nmi_panic" boot parameter; @str is not used. */
+static int __init setup_unknown_nmi_panic(char *str)
+{
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+/* Map an NMI type to its handler chain descriptor. */
+#define nmi_to_desc(type) (&nmi_desc[type])
+
+/*
+ * Handlers running at least this long (in ns) are candidates for the
+ * "took too long" report in nmi_handle().
+ */
+static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
+/* Expose nmi_longest_ns as a debugfs file under arch_debugfs_dir. */
+static int __init nmi_warning_debugfs(void)
+{
+ debugfs_create_u64("nmi_longest_ns", 0644,
+ arch_debugfs_dir, &nmi_longest_ns);
+ return 0;
+}
+fs_initcall(nmi_warning_debugfs);
+
+/*
+ * irq_work callback queued by nmi_handle(): report, rate limited, the
+ * longest duration recorded for the handler that owns @w.
+ */
+static void nmi_max_handler(struct irq_work *w)
+{
+	struct nmiaction *action = container_of(w, struct nmiaction, irq_work);
+	u64 msecs = READ_ONCE(action->max_duration);
+	int rem_ns;
+	int frac;
+
+	/* msecs holds the quotient afterwards, the remainder is returned */
+	rem_ns = do_div(msecs, (1000 * 1000));
+	frac = rem_ns / 1000;
+
+	printk_ratelimited(KERN_INFO
+		"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+		action->handler, msecs, frac);
+}
+
+/*
+ * Run every handler registered for NMI type @type and return the sum of
+ * their return values.  Each handler is timed with sched_clock(); when a
+ * handler runs at least as long as both nmi_longest_ns and its previous
+ * maximum, the new maximum is stored and its irq_work is queued so that
+ * nmi_max_handler() can report it.
+ */
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ struct nmiaction *a;
+ int handled=0;
+
+ rcu_read_lock();
+
+ /*
+ * NMIs are edge-triggered, which means if you have enough
+ * of them concurrently, you can lose some because only one
+ * can be latched at any given time. Walk the whole list
+ * to handle those situations.
+ */
+ list_for_each_entry_rcu(a, &desc->head, list) {
+ int thishandled;
+ u64 delta;
+
+ delta = sched_clock();
+ thishandled = a->handler(type, regs);
+ handled += thishandled;
+ delta = sched_clock() - delta;
+ trace_nmi_handler(a->handler, (int)delta, thishandled);
+
+ /* only a new per-handler maximum above the threshold is reported */
+ if (delta < nmi_longest_ns || delta < a->max_duration)
+ continue;
+
+ a->max_duration = delta;
+ irq_work_queue(&a->irq_work);
+ }
+
+ rcu_read_unlock();
+
+ /* return total number of NMI events handled */
+ return handled;
+}
+NOKPROBE_SYMBOL(nmi_handle);
+
+/*
+ * Add @action to the handler chain of NMI type @type.
+ *
+ * Actions flagged NMI_FLAG_FIRST are inserted at the head of the chain,
+ * all others at the tail.  Returns 0 on success or -EINVAL when @action
+ * has no handler.
+ */
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ unsigned long flags;
+
+ if (!action->handler)
+ return -EINVAL;
+
+ /* set up the irq_work used by nmi_handle() to report slow handlers */
+ init_irq_work(&action->irq_work, nmi_max_handler);
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /*
+ * Indicate if there are multiple registrations on the
+ * internal NMI handler call chains (SERR and IO_CHECK).
+ */
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
+
+ /*
+ * some handlers need to be executed first otherwise a fake
+ * event confuses some handlers (kdump uses this flag)
+ */
+ if (action->flags & NMI_FLAG_FIRST)
+ list_add_rcu(&action->list, &desc->head);
+ else
+ list_add_tail_rcu(&action->list, &desc->head);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(__register_nmi_handler);
+
+/*
+ * Remove the first handler named @name from the chain of NMI type @type
+ * and wait with synchronize_rcu() before returning.  If no handler with
+ * that name is registered, the chain is left unchanged.
+ */
+void unregister_nmi_handler(unsigned int type, const char *name)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ struct nmiaction *n;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ list_for_each_entry_rcu(n, &desc->head, list) {
+ /*
+ * the name passed in to describe the nmi handler
+ * is used as the lookup key
+ */
+ if (!strcmp(n->name, name)) {
+ WARN(in_nmi(),
+ "Trying to free NMI (%s) from NMI context!\n", n->name);
+ list_del_rcu(&n->list);
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+
+/*
+ * Handle an NMI whose reason value has NMI_REASON_SERR set.  Registered
+ * NMI_SERR handlers get the first chance; if none handles it, the error is
+ * logged, nmi_panic() is called when panic_on_unrecovered_nmi is set, and
+ * the SERR clear bit is written to the NMI reason port.
+ */
+static void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs))
+ return;
+
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ if (panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(pci_serr_error);
+
+/*
+ * Handle an NMI whose reason value has NMI_REASON_IOCHK set.  Registered
+ * NMI_IO_CHECK handlers get the first chance; otherwise the error and the
+ * registers are logged.  With panic_on_io_nmi set, nmi_panic() is called;
+ * without it the IOCHK clear bit is set in the reason port, a delay loop
+ * runs, and the bit is cleared again.
+ */
+static void
+io_check_error(unsigned char reason, struct pt_regs *regs)
+{
+ unsigned long i;
+
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs))
+ return;
+
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ show_regs(regs);
+
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+
+ /* roughly 20000 * 100us, touching the NMI watchdog on every iteration */
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
+
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(io_check_error);
+
+/*
+ * Handle an NMI that nobody claimed so far.  NMI_UNKNOWN handlers are run
+ * first; if they report handled events, only the per-CPU 'unknown' counter
+ * is updated.  Otherwise the NMI is counted and logged, and nmi_panic() is
+ * called when unknown_nmi_panic or panic_on_unrecovered_nmi is set.
+ */
+static void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+ int handled;
+
+ /*
+ * Use 'false' as back-to-back NMIs are dealt with one level up.
+ * Of course this makes having multiple 'unknown' handlers useless
+ * as only the first one is ever run (unless it can actually determine
+ * if it caused the NMI)
+ */
+ handled = nmi_handle(NMI_UNKNOWN, regs);
+ if (handled) {
+ __this_cpu_add(nmi_stats.unknown, handled);
+ return;
+ }
+
+ __this_cpu_add(nmi_stats.unknown, 1);
+
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+}
+NOKPROBE_SYMBOL(unknown_nmi_error);
+
+/* Set when one NMI handled more than one event; the next back-to-back NMI may be swallowed. */
+static DEFINE_PER_CPU(bool, swallow_nmi);
+/* Instruction pointer interrupted by the previous NMI, used to detect back-to-back NMIs. */
+static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+
+/*
+ * Dispatch one NMI: first the CPU-local (NMI_LOCAL) handlers, then the
+ * external sources indicated by the platform's NMI reason value (SERR,
+ * IOCHK), and finally either swallow the NMI or treat it as unknown.
+ */
+static void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+ int handled;
+ bool b2b = false;
+
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+
+ /*
+ * Back-to-back NMIs are interesting because they can either
+ * be two NMI or more than two NMIs (any thing over two is dropped
+ * due to NMI being edge-triggered). If this is the second half
+ * of the back-to-back NMI, assume we dropped things and process
+ * more handlers. Otherwise reset the 'swallow' NMI behaviour
+ */
+ if (regs->ip == __this_cpu_read(last_nmi_rip))
+ b2b = true;
+ else
+ __this_cpu_write(swallow_nmi, false);
+
+ __this_cpu_write(last_nmi_rip, regs->ip);
+
+ handled = nmi_handle(NMI_LOCAL, regs);
+ __this_cpu_add(nmi_stats.normal, handled);
+ if (handled) {
+ /*
+ * There are cases when a NMI handler handles multiple
+ * events in the current NMI. One of these events may
+ * be queued for in the next NMI. Because the event is
+ * already handled, the next NMI will result in an unknown
+ * NMI. Instead lets flag this for a potential NMI to
+ * swallow.
+ */
+ if (handled > 1)
+ __this_cpu_write(swallow_nmi, true);
+ return;
+ }
+
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
+ reason = x86_platform.get_nmi_reason();
+
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
+#endif
+ __this_cpu_add(nmi_stats.external, 1);
+ raw_spin_unlock(&nmi_reason_lock);
+ return;
+ }
+ raw_spin_unlock(&nmi_reason_lock);
+
+ /*
+ * Only one NMI can be latched at a time. To handle
+ * this we may process multiple nmi handlers at once to
+ * cover the case where an NMI is dropped. The downside
+ * to this approach is we may process an NMI prematurely,
+ * while its real NMI is sitting latched. This will cause
+ * an unknown NMI on the next run of the NMI processing.
+ *
+ * We tried to flag that condition above, by setting the
+ * swallow_nmi flag when we process more than one event.
+ * This condition is also only present on the second half
+ * of a back-to-back NMI, so we flag that condition too.
+ *
+ * If both are true, we assume we already processed this
+ * NMI previously and we swallow it. Otherwise we reset
+ * the logic.
+ *
+ * There are scenarios where we may accidentally swallow
+ * a 'real' unknown NMI. For example, while processing
+ * a perf NMI another perf NMI comes in along with a
+ * 'real' unknown NMI. These two NMIs get combined into
+ * one (as described above). When the next NMI gets
+ * processed, it will be flagged by perf as handled, but
+ * no one will know that there was a 'real' unknown NMI sent
+ * also. As a result it gets swallowed. Or if the first
+ * perf NMI returns two events handled then the second
+ * NMI will get eaten by the logic below, again losing a
+ * 'real' unknown NMI. But this is the best we can do
+ * for now.
+ */
+ if (b2b && __this_cpu_read(swallow_nmi))
+ __this_cpu_add(nmi_stats.swallow, 1);
+ else
+ unknown_nmi_error(reason, regs);
+}
+NOKPROBE_SYMBOL(default_do_nmi);
+
+/*
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due to an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupt
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they can not take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in
+ * after setting state to NOT_RUNNING but before updating the nmi_cr2.
+ */
+/*
+ * Per-CPU NMI nesting state.  The numeric order matters: do_nmi()
+ * decrements the state on exit, so LATCHED becomes EXECUTING and
+ * EXECUTING becomes NOT_RUNNING.
+ */
+enum nmi_states {
+ NMI_NOT_RUNNING = 0,
+ NMI_EXECUTING,
+ NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+/* CR2 value saved on NMI entry and restored by do_nmi() if it changed. */
+static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+
+#ifdef CONFIG_X86_64
+/*
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
+
+/*
+ * NMI entry point.  A nested NMI (state not NMI_NOT_RUNNING) is only
+ * recorded as NMI_LATCHED and returns immediately.  Otherwise the state
+ * becomes NMI_EXECUTING, CR2 is saved, and default_do_nmi() is run unless
+ * NMIs are being ignored.  On exit CR2 is restored if it changed, and the
+ * handler is rerun when a nested NMI was latched in the meantime.
+ * @error_code is not used.
+ */
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
+ /*
+ * If we interrupted a breakpoint, it is possible that
+ * the nmi handler will have breakpoints too. We need to
+ * change the IDT such that breakpoints that happen here
+ * continue to use the NMI stack.
+ */
+ if (unlikely(is_debug_stack(regs->sp))) {
+ debug_stack_set_zero();
+ this_cpu_write(update_debug_stack, 1);
+ }
+#endif
+
+ nmi_enter();
+
+ inc_irq_stat(__nmi_count);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ /* restore CR2 before the state can drop back to NMI_NOT_RUNNING */
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ /* non-zero after the decrement means an NMI was latched: run again */
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
+}
+NOKPROBE_SYMBOL(do_nmi);
+
+/* Increment ignore_nmis; while it is non-zero do_nmi() skips default_do_nmi(). */
+void stop_nmi(void)
+{
+ ignore_nmis++;
+}
+
+/* Decrement ignore_nmis, undoing one stop_nmi() call. */
+void restart_nmi(void)
+{
+ ignore_nmis--;
+}
+
+/*
+ * Reset the back-to-back NMI logic: clearing last_nmi_rip on this CPU
+ * keeps the next NMI from matching the previously recorded instruction
+ * pointer (unless it interrupts address 0).
+ */
+void local_touch_nmi(void)
+{
+ __this_cpu_write(last_nmi_rip, 0);
+}
+EXPORT_SYMBOL_GPL(local_touch_nmi);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 0000000..a1a96df
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/x86/kernel/nmi_selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS 0
+#define FAILURE 1
+#define TIMEOUT 2
+
+static int __initdata nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
+
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata expected_testcase_failures;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
+
+/*
+ * NMI_UNKNOWN handler installed while the selftest runs: count the stray
+ * NMI and claim it as handled.
+ */
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+	unexpected_testcase_unknowns += 1;
+
+	return NMI_HANDLED;
+}
+
+/*
+ * Install nmi_unk_cb() as an NMI_UNKNOWN handler named "nmi_selftest_unk".
+ * The return value of register_nmi_handler() is not checked here.
+ */
+static void __init init_nmi_testsuite(void)
+{
+ /* trap all the unknown NMIs we may generate */
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
+ __initdata);
+}
+
+/* Remove the "nmi_selftest_unk" handler installed by init_nmi_testsuite(). */
+static void __init cleanup_nmi_testsuite(void)
+{
+ unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+/*
+ * NMI_LOCAL handler used by the IPI tests. If the current CPU's bit is set
+ * in nmi_ipi_mask, clear it and report NMI_HANDLED; otherwise the NMI is
+ * not ours and NMI_DONE is returned.
+ */
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+	struct cpumask *pending = to_cpumask(nmi_ipi_mask);
+
+	if (!cpumask_test_and_clear_cpu(raw_smp_processor_id(), pending))
+		return NMI_DONE;
+
+	return NMI_HANDLED;
+}
+
+/*
+ * Send an NMI IPI to every CPU in @mask and wait for @mask to become empty.
+ *
+ * Sets nmi_fail to FAILURE if the handler cannot be registered and to
+ * TIMEOUT if @mask is still non-empty when the wait budget runs out.
+ */
+static void __init test_nmi_ipi(struct cpumask *mask)
+{
+	/* Don't wait longer than a second */
+	unsigned long timeout = USEC_PER_SEC;
+
+	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+				 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
+		nmi_fail = FAILURE;
+		return;
+	}
+
+	/* sync above data before sending NMI */
+	wmb();
+
+	apic->send_IPI_mask(mask, NMI_VECTOR);
+
+	/* Poll in 1us steps until @mask is empty or the budget is used up. */
+	while (!cpumask_empty(mask)) {
+		if (--timeout == 0)
+			break;
+		udelay(1);
+	}
+
+	/* What happens if we timeout, do we still unregister?? */
+	unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+	if (timeout == 0)
+		nmi_fail = TIMEOUT;
+}
+
+/*
+ * Testcase: NMI IPI to every online CPU except the current one. Nothing is
+ * sent when there is no other online CPU.
+ */
+static void __init remote_ipi(void)
+{
+	struct cpumask *targets = to_cpumask(nmi_ipi_mask);
+
+	cpumask_copy(targets, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), targets);
+
+	if (cpumask_empty(targets))
+		return;
+
+	test_nmi_ipi(targets);
+}
+
+/* Testcase: NMI IPI addressed to the current CPU only. */
+static void __init local_ipi(void)
+{
+	struct cpumask *self = to_cpumask(nmi_ipi_mask);
+
+	cpumask_clear(self);
+	cpumask_set_cpu(smp_processor_id(), self);
+	test_nmi_ipi(self);
+}
+
+/* Clear the per-test result so the next testcase starts from SUCCESS (0). */
+static void __init reset_nmi(void)
+{
+	nmi_fail = SUCCESS;
+}
+
+/*
+ * Run one testcase and record its outcome.
+ * @testcase_fn: test body; it reports its result through nmi_fail
+ * @expected: value nmi_fail is expected to hold after the test ran
+ *
+ * Prints one result cell with KERN_CONT, updates the counters and resets
+ * nmi_fail for the next testcase.
+ */
+static void __init dotest(void (*testcase_fn)(void), int expected)
+{
+	testcase_fn();
+
+	if (nmi_fail == expected) {
+		testcase_successes++;
+		printk(KERN_CONT " ok |");
+	} else {
+		/* Any result other than the expected one counts as a failure. */
+		unexpected_testcase_failures++;
+
+		switch (nmi_fail) {
+		case FAILURE:
+			printk(KERN_CONT "FAILED |");
+			break;
+		case TIMEOUT:
+			printk(KERN_CONT "TIMEOUT|");
+			break;
+		default:
+			printk(KERN_CONT "ERROR |");
+			break;
+		}
+		dump_stack();
+	}
+	testcase_total++;
+
+	reset_nmi();
+}
+
+/* Print the row label for a testcase: @testname right-aligned in 12 columns plus ':'. */
+static inline void __init print_testname(const char *testname)
+{
+ printk("%12s:", testname);
+}
+
+/*
+ * Entry point of the NMI selftest: run the remote and local IPI testcases
+ * between init_nmi_testsuite() and cleanup_nmi_testsuite(), then print a
+ * summary chosen from the failure/success counters.
+ */
+void __init nmi_selftest(void)
+{
+	init_nmi_testsuite();
+
+	/*
+	 * Run the testsuite:
+	 */
+	printk("----------------\n");
+	printk("| NMI testsuite:\n");
+	printk("--------------------\n");
+
+	print_testname("remote IPI");
+	dotest(remote_ipi, SUCCESS);
+	printk(KERN_CONT "\n");
+	print_testname("local IPI");
+	dotest(local_ipi, SUCCESS);
+	printk(KERN_CONT "\n");
+
+	cleanup_nmi_testsuite();
+
+	/* Every summary variant starts with the same separator line. */
+	printk("--------------------\n");
+
+	if (unexpected_testcase_failures) {
+		printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+		       unexpected_testcase_failures, testcase_total);
+		printk("-----------------------------------------------------------------\n");
+		return;
+	}
+
+	if (!expected_testcase_failures) {
+		printk("Good, all %3d testcases passed! |\n",
+		       testcase_successes);
+		printk("---------------------------------\n");
+		return;
+	}
+
+	/* Only expected failures from here on. */
+	if (testcase_successes) {
+		printk("%3d out of %3d testcases failed, as expected. |\n",
+		       expected_testcase_failures, testcase_total);
+		printk("----------------------------------------------------\n");
+	} else {
+		printk("All %3d testcases failed, as expected. |\n",
+		       expected_testcase_failures);
+		printk("----------------------------------------\n");
+	}
+}
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 0000000..71f2d11
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+
+#include <asm/paravirt.h>
+
+/*
+ * Out-of-line wrapper around native_queued_spin_unlock(). The thunk macro
+ * below produces __raw_callee_save___native_queued_spin_unlock, the symbol
+ * pv_is_native_spin_unlock() compares against.
+ */
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+ native_queued_spin_unlock(lock);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+/* Return true while pv_lock_ops.queued_spin_unlock still points at the native thunk. */
+bool pv_is_native_spin_unlock(void)
+{
+ return pv_lock_ops.queued_spin_unlock.func ==
+ __raw_callee_save___native_queued_spin_unlock;
+}
+
+/* Native vcpu_is_preempted implementation: always reports false; @cpu is ignored. */
+__visible bool __native_vcpu_is_preempted(long cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+/* Return true while pv_lock_ops.vcpu_is_preempted still points at the native thunk. */
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_lock_ops.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
+
+/*
+ * Default (native) lock operations. The members are only initialised when
+ * CONFIG_SMP is set; wait and kick default to no-ops.
+ */
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+ .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .wait = paravirt_nop,
+ .kick = paravirt_nop,
+ .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+#endif /* SMP */
+};
+EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
new file mode 100644
index 0000000..8dc69d8
--- /dev/null
+++ b/arch/x86/kernel/paravirt.c
@@ -0,0 +1,483 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
+*/
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+#include <linux/highmem.h>
+#include <linux/kprobes.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/pgtable.h>
+#include <asm/time.h>
+#include <asm/pgalloc.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/special_insns.h>
+#include <asm/tlb.h>
+
+/*
+ * nop stub, which must not clobber anything *including the stack* to
+ * avoid confusing the entry prologues.
+ *
+ * It is therefore written directly in assembly as a global function symbol
+ * in .entry.text whose whole body is a single "ret".
+ */
+extern void _paravirt_nop(void);
+asm (".pushsection .entry.text, \"ax\"\n"
+ ".global _paravirt_nop\n"
+ "_paravirt_nop:\n\t"
+ "ret\n\t"
+ ".size _paravirt_nop, . - _paravirt_nop\n\t"
+ ".type _paravirt_nop, @function\n\t"
+ ".popsection");
+
+/*
+ * identity function, which can be inlined
+ *
+ * paravirt_patch_default() compares op pointers against this function and
+ * uses paravirt_patch_ident_32() for call sites that target it.
+ */
+u32 notrace _paravirt_ident_32(u32 x)
+{
+ return x;
+}
+
+/* 64-bit identity function; call sites targeting it are patched via paravirt_patch_ident_64(). */
+u64 notrace _paravirt_ident_64(u64 x)
+{
+ return x;
+}
+
+/* Log, at KERN_INFO, the platform name recorded in pv_info.name. */
+void __init default_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ pv_info.name);
+}
+
+/* Undefined instruction for dealing with missing ops pointers. */
+static const unsigned char ud2a[] = { 0x0f, 0x0b };
+
+/*
+ * Layout of a 5-byte branch instruction as written by paravirt_patch_call()
+ * and paravirt_patch_jmp(): one opcode byte followed by a 32-bit
+ * displacement. Packed so that no padding separates the two fields.
+ */
+struct branch {
+ unsigned char opcode;
+ u32 delta;
+} __attribute__((packed));
+
+/*
+ * paravirt_patch_call - emit a direct call (opcode 0xe8) to @target.
+ * @insnbuf: buffer the instruction is written to
+ * @target: call destination
+ * @tgt_clobbers: unused here
+ * @addr: address of the patch site; the displacement is relative to @addr + 5
+ * @site_clobbers: unused here
+ * @len: number of bytes available at the patch site
+ *
+ * Returns 5 when the call was written. If the site is shorter than 5 bytes
+ * nothing is written and @len is returned; with CONFIG_RETPOLINE this also
+ * triggers a one-time warning.
+ */
+unsigned paravirt_patch_call(void *insnbuf,
+			     const void *target, u16 tgt_clobbers,
+			     unsigned long addr, u16 site_clobbers,
+			     unsigned len)
+{
+	struct branch *insn = insnbuf;
+
+	BUILD_BUG_ON(sizeof(*insn) != 5);
+
+	if (len >= 5) {
+		insn->opcode = 0xe8; /* call */
+		insn->delta = (unsigned long)target - (addr + 5);
+		return 5;
+	}
+
+	/* call too long for patch site */
+#ifdef CONFIG_RETPOLINE
+	WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
+#endif
+	return len;
+}
+
+/*
+ * paravirt_patch_jmp - emit a direct jump (opcode 0xe9) to @target.
+ * @insnbuf: buffer the instruction is written to
+ * @target: jump destination
+ * @addr: address of the patch site; the displacement is relative to @addr + 5
+ * @len: number of bytes available at the patch site
+ *
+ * Returns 5 when the jump was written. If the site is shorter than 5 bytes
+ * nothing is written and @len is returned; with CONFIG_RETPOLINE this also
+ * triggers a one-time warning.
+ */
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+			    unsigned long addr, unsigned len)
+{
+	struct branch *insn = insnbuf;
+
+	if (len >= 5) {
+		insn->opcode = 0xe9; /* jmp */
+		insn->delta = (unsigned long)target - (addr + 5);
+		return 5;
+	}
+
+	/* jmp too long for patch site */
+#ifdef CONFIG_RETPOLINE
+	WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
+#endif
+	return len;
+}
+
+DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+
+/*
+ * virt_spin_lock_key is defined true above; turn it off when the CPU does
+ * not report X86_FEATURE_HYPERVISOR.
+ */
+void __init native_pv_lock_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+/*
+ * Neat trick to map patch type back to the call within the
+ * corresponding structure.
+ *
+ * A local paravirt_patch_template is filled with copies of the current ops
+ * structures and then treated as an array of pointers; @type is the index
+ * of the wanted slot. pv_lock_ops is only copied when
+ * CONFIG_PARAVIRT_SPINLOCKS is set.
+ */
+static void *get_call_destination(u8 type)
+{
+ struct paravirt_patch_template tmpl = {
+ .pv_init_ops = pv_init_ops,
+ .pv_time_ops = pv_time_ops,
+ .pv_cpu_ops = pv_cpu_ops,
+ .pv_irq_ops = pv_irq_ops,
+ .pv_mmu_ops = pv_mmu_ops,
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ .pv_lock_ops = pv_lock_ops,
+#endif
+ };
+ return *((void **)&tmpl + type);
+}
+
+/*
+ * paravirt_patch_default - generic patching of one paravirt call site.
+ * @type: patch type, used to look up the op's current target
+ * @clobbers: clobber mask of the call site, passed on to paravirt_patch_call()
+ * @insnbuf: buffer the replacement instructions are written to
+ * @addr: address of the patch site
+ * @len: number of bytes available at the patch site
+ *
+ * Returns whatever the selected patching helper returns (0 for a nop op).
+ */
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+				unsigned long addr, unsigned len)
+{
+	void *opfunc = get_call_destination(type);
+
+	/* If there's no function, patch it with a ud2a (BUG) */
+	if (opfunc == NULL)
+		return paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
+
+	/* A nop target yields a zero-length patch. */
+	if (opfunc == _paravirt_nop)
+		return 0;
+
+	/* identity functions just return their single argument */
+	if (opfunc == _paravirt_ident_32)
+		return paravirt_patch_ident_32(insnbuf, len);
+	if (opfunc == _paravirt_ident_64)
+		return paravirt_patch_ident_64(insnbuf, len);
+
+	/* If operation requires a jmp, then jmp */
+	if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+	    type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
+		return paravirt_patch_jmp(insnbuf, opfunc, addr, len);
+
+	/*
+	 * Otherwise call the function; assume target could
+	 * clobber any caller-save reg
+	 */
+	return paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
+				   addr, clobbers, len);
+}
+
+/*
+ * paravirt_patch_insns - copy a replacement instruction sequence.
+ * @insnbuf: destination buffer
+ * @len: number of bytes available at the patch site
+ * @start: first byte of the replacement sequence
+ * @end: one past its last byte
+ *
+ * Returns the number of bytes copied. If the sequence does not fit in @len
+ * bytes, or @start is NULL, nothing is copied and @len is returned.
+ */
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+			      const char *start, const char *end)
+{
+	unsigned nbytes = end - start;
+
+	if (start == NULL || nbytes > len)
+		return len;
+
+	memcpy(insnbuf, start, nbytes);
+	return nbytes;
+}
+
+/* Wrapper around __native_flush_tlb(); installed as pv_mmu_ops.flush_tlb_user. */
+static void native_flush_tlb(void)
+{
+ __native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ *
+ * Wrapper around __native_flush_tlb_global(); installed as
+ * pv_mmu_ops.flush_tlb_kernel.
+ */
+static void native_flush_tlb_global(void)
+{
+ __native_flush_tlb_global();
+}
+
+/* Wrapper around __native_flush_tlb_one_user() for @addr; installed as pv_mmu_ops.flush_tlb_one_user. */
+static void native_flush_tlb_one_user(unsigned long addr)
+{
+ __native_flush_tlb_one_user(addr);
+}
+
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
+
+/* Native steal_clock implementation: always reports 0, whatever @cpu is. */
+static u64 native_steal_clock(int cpu)
+{
+ return 0;
+}
+
+/* These are in entry.S */
+extern void native_iret(void);
+extern void native_usergs_sysret64(void);
+
+/*
+ * Busy I/O resource spanning 0..IO_SPACE_LIMIT; claimed by
+ * paravirt_disable_iospace().
+ */
+static struct resource reserve_ioports = {
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .name = "paravirt-ioport",
+ .flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware. This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ *
+ * Returns the result of request_resource().
+ */
+int paravirt_disable_iospace(void)
+{
+ return request_resource(&ioport_resource, &reserve_ioports);
+}
+
+static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+
+/*
+ * Switch this CPU's paravirt_lazy_mode to @mode. Lazy modes do not nest:
+ * it is a BUG if the CPU is not currently in PARAVIRT_LAZY_NONE.
+ */
+static inline void enter_lazy(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+
+ this_cpu_write(paravirt_lazy_mode, mode);
+}
+
+/*
+ * Return this CPU to PARAVIRT_LAZY_NONE. It is a BUG if the CPU is not
+ * currently in @mode.
+ */
+static void leave_lazy(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
+
+ this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+}
+
+/* Put this CPU into PARAVIRT_LAZY_MMU mode (see enter_lazy()). */
+void paravirt_enter_lazy_mmu(void)
+{
+ enter_lazy(PARAVIRT_LAZY_MMU);
+}
+
+/* Take this CPU out of PARAVIRT_LAZY_MMU mode (see leave_lazy()). */
+void paravirt_leave_lazy_mmu(void)
+{
+ leave_lazy(PARAVIRT_LAZY_MMU);
+}
+
+/*
+ * If currently in lazy MMU mode, leave it and immediately re-enter it.
+ * Preemption is disabled around the check and the leave/enter pair.
+ * NOTE(review): presumably leaving the mode is what flushes the pending
+ * updates - that happens in the arch_* hooks, not visible here.
+ */
+void paravirt_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
+/*
+ * Called with preemption disabled at the start of a context switch.
+ * If this CPU is in lazy MMU mode, leave it and flag @prev with
+ * TIF_LAZY_MMU_UPDATES so paravirt_end_context_switch() can re-enter the
+ * mode when that task is switched in again. Then enter lazy CPU mode.
+ */
+void paravirt_start_context_switch(struct task_struct *prev)
+{
+ BUG_ON(preemptible());
+
+ if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
+ enter_lazy(PARAVIRT_LAZY_CPU);
+}
+
+/*
+ * Called with preemption disabled at the end of a context switch.
+ * Leave lazy CPU mode and, if @next was flagged with TIF_LAZY_MMU_UPDATES,
+ * clear the flag and re-enter lazy MMU mode.
+ */
+void paravirt_end_context_switch(struct task_struct *next)
+{
+ BUG_ON(preemptible());
+
+ leave_lazy(PARAVIRT_LAZY_CPU);
+
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
+}
+
+/*
+ * Return this CPU's current lazy mode. In interrupt context the answer is
+ * always PARAVIRT_LAZY_NONE, whatever the per-cpu variable holds.
+ */
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
+{
+	return in_interrupt() ? PARAVIRT_LAZY_NONE
+			      : this_cpu_read(paravirt_lazy_mode);
+}
+
+/* Default platform description; .name is what default_banner() prints. */
+struct pv_info pv_info = {
+ .name = "bare hardware",
+ .kernel_rpl = 0,
+ .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
+
+#ifdef CONFIG_X86_64
+ .extra_user_64bit_cs = __USER_CS,
+#endif
+};
+
+/* Default init ops: call sites are patched by native_patch(). */
+struct pv_init_ops pv_init_ops = {
+ .patch = native_patch,
+};
+
+/* Default time ops; the native steal clock always reports 0. */
+struct pv_time_ops pv_time_ops = {
+ .sched_clock = native_sched_clock,
+ .steal_clock = native_steal_clock,
+};
+
+/*
+ * Default interrupt-flag ops. The four flag operations are wrapped in
+ * __PV_IS_CALLEE_SAVE(); safe_halt and halt are plain function pointers.
+ */
+__visible struct pv_irq_ops pv_irq_ops = {
+ .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+ .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+};
+
+/*
+ * Default CPU ops: every hook points at its native_* implementation, except
+ * the LDT alloc/free and context-switch hooks, which default to no-ops.
+ * Members inside CONFIG_X86_64 blocks exist on 64-bit builds only.
+ */
+__visible struct pv_cpu_ops pv_cpu_ops = {
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .write_cr4 = native_write_cr4,
+#ifdef CONFIG_X86_64
+ .read_cr8 = native_read_cr8,
+ .write_cr8 = native_write_cr8,
+#endif
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+ .read_msr_safe = native_read_msr_safe,
+ .write_msr_safe = native_write_msr_safe,
+ .read_pmc = native_read_pmc,
+ .load_tr_desc = native_load_tr_desc,
+ .set_ldt = native_set_ldt,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = native_load_gs_index,
+#endif
+ .write_ldt_entry = native_write_ldt_entry,
+ .write_gdt_entry = native_write_gdt_entry,
+ .write_idt_entry = native_write_idt_entry,
+
+ .alloc_ldt = paravirt_nop,
+ .free_ldt = paravirt_nop,
+
+ .load_sp0 = native_load_sp0,
+
+#ifdef CONFIG_X86_64
+ .usergs_sysret64 = native_usergs_sysret64,
+#endif
+ .iret = native_iret,
+ .swapgs = native_swapgs,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .io_delay = native_io_delay,
+
+ .start_context_switch = paravirt_nop,
+ .end_context_switch = paravirt_nop,
+};
+
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
+/*
+ * Default MMU ops. Page-table value conversions use PTE_IDENT (the identity
+ * function matching the page-table entry width), bookkeeping hooks default
+ * to no-ops, and the set of members depends on CONFIG_PGTABLE_LEVELS and
+ * CONFIG_X86_PAE. The structure is marked __ro_after_init.
+ */
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
+
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = __native_read_cr3,
+ .write_cr3 = native_write_cr3,
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_one_user = native_flush_tlb_one_user,
+ .flush_tlb_others = native_flush_tlb_others,
+ /* tlb_remove_page is cast to the hook's signature. */
+ .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page,
+
+ .pgd_alloc = __paravirt_pgd_alloc,
+ .pgd_free = paravirt_nop,
+
+ .alloc_pte = paravirt_nop,
+ .alloc_pmd = paravirt_nop,
+ .alloc_pud = paravirt_nop,
+ .alloc_p4d = paravirt_nop,
+ .release_pte = paravirt_nop,
+ .release_pmd = paravirt_nop,
+ .release_pud = paravirt_nop,
+ .release_p4d = paravirt_nop,
+
+ .set_pte = native_set_pte,
+ .set_pte_at = native_set_pte_at,
+ .set_pmd = native_set_pmd,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+#if CONFIG_PGTABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = native_set_pte_atomic,
+ .pte_clear = native_pte_clear,
+ .pmd_clear = native_pmd_clear,
+#endif
+ .set_pud = native_set_pud,
+
+ .pmd_val = PTE_IDENT,
+ .make_pmd = PTE_IDENT,
+
+#if CONFIG_PGTABLE_LEVELS >= 4
+ .pud_val = PTE_IDENT,
+ .make_pud = PTE_IDENT,
+
+ .set_p4d = native_set_p4d,
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ .p4d_val = PTE_IDENT,
+ .make_p4d = PTE_IDENT,
+
+ .set_pgd = native_set_pgd,
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
+
+ .pte_val = PTE_IDENT,
+ .pgd_val = PTE_IDENT,
+
+ .make_pte = PTE_IDENT,
+ .make_pgd = PTE_IDENT,
+
+ .dup_mmap = paravirt_nop,
+ .exit_mmap = paravirt_nop,
+ .activate_mm = paravirt_nop,
+
+ .lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ .flush = paravirt_nop,
+ },
+
+ .set_fixmap = native_set_fixmap,
+};
+
+EXPORT_SYMBOL_GPL(pv_time_ops);
+EXPORT_SYMBOL (pv_cpu_ops);
+EXPORT_SYMBOL (pv_mmu_ops);
+EXPORT_SYMBOL_GPL(pv_info);
+EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
new file mode 100644
index 0000000..758e69d
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
+
+/*
+ * Native 32-bit instruction sequences for individual paravirt ops.
+ * native_patch() below refers to them through the start_<ops>_<name> and
+ * end_<ops>_<name> symbols. NOTE(review): DEF_NATIVE itself is defined
+ * outside this file - confirm it emits exactly those symbols.
+ */
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
+DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
+#endif
+
+/*
+ * 32-bit kernel: the identity op needs no instructions because argument and
+ * return value share a register, so a zero-length patch is reported.
+ */
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ /* arg in %eax, return in %eax */
+ return 0;
+}
+
+/*
+ * 32-bit kernel: the 64-bit identity op likewise needs no instructions, so
+ * a zero-length patch is reported.
+ */
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ /* arg in %edx:%eax, return in %edx:%eax */
+ return 0;
+}
+
+extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
+
+/*
+ * native_patch - patch one paravirt call site for native hardware (32-bit).
+ * @type: patch type identifying the op
+ * @clobbers: clobber mask, passed on to paravirt_patch_default()
+ * @ibuf: buffer the replacement instructions are written to
+ * @addr: address of the patch site
+ * @len: number of bytes available at the patch site
+ *
+ * Ops listed with PATCH_SITE() are replaced by their DEF_NATIVE sequence.
+ * The two lock ops are inlined only while they still point at the native
+ * implementation. Everything else goes through paravirt_patch_default().
+ * Returns the value of the helper that did the patching.
+ */
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ const unsigned char *start, *end;
+ unsigned ret;
+
+/* Expands to a case label that selects the native sequence and jumps to patch_site. */
+#define PATCH_SITE(ops, x) \
+ case PARAVIRT_PATCH(ops.x): \
+ start = start_##ops##_##x; \
+ end = end_##ops##_##x; \
+ goto patch_site
+ switch (type) {
+ PATCH_SITE(pv_irq_ops, irq_disable);
+ PATCH_SITE(pv_irq_ops, irq_enable);
+ PATCH_SITE(pv_irq_ops, restore_fl);
+ PATCH_SITE(pv_irq_ops, save_fl);
+ PATCH_SITE(pv_cpu_ops, iret);
+ PATCH_SITE(pv_mmu_ops, read_cr2);
+ PATCH_SITE(pv_mmu_ops, read_cr3);
+ PATCH_SITE(pv_mmu_ops, write_cr3);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
+#endif
+
+ default:
+ /* The label is only jumped to when CONFIG_PARAVIRT_SPINLOCKS is set. */
+patch_default: __maybe_unused
+ ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ break;
+
+ /* Reached only via goto, with start/end already set. */
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
+ }
+#undef PATCH_SITE
+ return ret;
+}
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
new file mode 100644
index 0000000..9cb98f7
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
+#include <asm/asm-offsets.h>
+#include <linux/stringify.h>
+
+/*
+ * Native 64-bit instruction sequences for individual paravirt ops.
+ * native_patch() below refers to them through the start_<ops>_<name> and
+ * end_<ops>_<name> symbols; the two entries with an empty ops name are used
+ * by the identity patchers as start__mov32/start__mov64.
+ * NOTE(review): DEF_NATIVE itself is defined outside this file - confirm it
+ * emits exactly those symbols.
+ */
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+
+DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
+DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
+
+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
+#endif
+
+/*
+ * 64-bit kernel: patch a 32-bit identity op with the mov32 sequence
+ * ("mov %edi, %eax"). Returns the result of paravirt_patch_insns().
+ */
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov32, end__mov32);
+}
+
+/*
+ * 64-bit kernel: patch a 64-bit identity op with the mov64 sequence
+ * ("mov %rdi, %rax"). Returns the result of paravirt_patch_insns().
+ */
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov64, end__mov64);
+}
+
+extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
+
+/*
+ * native_patch - patch one paravirt call site for native hardware (64-bit).
+ * @type: patch type identifying the op
+ * @clobbers: clobber mask, passed on to paravirt_patch_default()
+ * @ibuf: buffer the replacement instructions are written to
+ * @addr: address of the patch site
+ * @len: number of bytes available at the patch site
+ *
+ * Ops listed with PATCH_SITE() are replaced by their DEF_NATIVE sequence.
+ * The two lock ops are inlined only while they still point at the native
+ * implementation. Everything else goes through paravirt_patch_default().
+ * Returns the value of the helper that did the patching.
+ */
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ const unsigned char *start, *end;
+ unsigned ret;
+
+/* Expands to a case label that selects the native sequence and jumps to patch_site. */
+#define PATCH_SITE(ops, x) \
+ case PARAVIRT_PATCH(ops.x): \
+ start = start_##ops##_##x; \
+ end = end_##ops##_##x; \
+ goto patch_site
+ switch(type) {
+ PATCH_SITE(pv_irq_ops, restore_fl);
+ PATCH_SITE(pv_irq_ops, save_fl);
+ PATCH_SITE(pv_irq_ops, irq_enable);
+ PATCH_SITE(pv_irq_ops, irq_disable);
+ PATCH_SITE(pv_cpu_ops, usergs_sysret64);
+ PATCH_SITE(pv_cpu_ops, swapgs);
+ PATCH_SITE(pv_mmu_ops, read_cr2);
+ PATCH_SITE(pv_mmu_ops, read_cr3);
+ PATCH_SITE(pv_mmu_ops, write_cr3);
+ PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
+#endif
+
+ default:
+ /* The label is only jumped to when CONFIG_PARAVIRT_SPINLOCKS is set. */
+patch_default: __maybe_unused
+ ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ break;
+
+ /* Reached only via goto, with start/end already set. */
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
+ }
+#undef PATCH_SITE
+ return ret;
+}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
new file mode 100644
index 0000000..bbfc8b1
--- /dev/null
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -0,0 +1,1612 @@
+/*
+ * Derived from arch/powerpc/kernel/iommu.c
+ *
+ * Copyright IBM Corporation, 2006-2007
+ * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
+ *
+ * Author: Jon Mason <jdmason@kudzu.us>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define pr_fmt(fmt) "Calgary: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/crash_dump.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
+#include <linux/bitmap.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+
+#include <asm/iommu.h>
+#include <asm/calgary.h>
+#include <asm/tce.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/rio.h>
+#include <asm/bios_ebda.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+#define CALGARY_MAPPING_ERROR 0
+
+/* Initial value follows CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT. */
+#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+int use_calgary __read_mostly = 1;
+#else
+int use_calgary __read_mostly = 0;
+#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */
+
+#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
+#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
+
+/* register offsets inside the host bridge space */
+#define CALGARY_CONFIG_REG 0x0108
+#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
+#define PHB_PLSSR_OFFSET 0x0120
+#define PHB_CONFIG_RW_OFFSET 0x0160
+#define PHB_IOBASE_BAR_LOW 0x0170
+#define PHB_IOBASE_BAR_HIGH 0x0180
+#define PHB_MEM_1_LOW 0x0190
+#define PHB_MEM_1_HIGH 0x01A0
+#define PHB_IO_ADDR_SIZE 0x01B0
+#define PHB_MEM_1_SIZE 0x01C0
+#define PHB_MEM_ST_OFFSET 0x01D0
+#define PHB_AER_OFFSET 0x0200
+#define PHB_CONFIG_0_HIGH 0x0220
+#define PHB_CONFIG_0_LOW 0x0230
+#define PHB_CONFIG_0_END 0x0240
+#define PHB_MEM_2_LOW 0x02B0
+#define PHB_MEM_2_HIGH 0x02C0
+#define PHB_MEM_2_SIZE_HIGH 0x02D0
+#define PHB_MEM_2_SIZE_LOW 0x02E0
+#define PHB_DOSHOLE_OFFSET 0x08E0
+
+/* CalIOC2 specific */
+#define PHB_SAVIOR_L2 0x0DB0
+#define PHB_PAGE_MIG_CTRL 0x0DA8
+#define PHB_PAGE_MIG_DEBUG 0x0DA0
+#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
+
+/* PHB_CONFIG_RW */
+#define PHB_TCE_ENABLE 0x20000000
+#define PHB_SLOT_DISABLE 0x1C000000
+#define PHB_DAC_DISABLE 0x01000000
+#define PHB_MEM2_ENABLE 0x00400000
+#define PHB_MCSR_ENABLE 0x00100000
+/* TAR (Table Address Register) */
+#define TAR_SW_BITS 0x0000ffffffff800fUL
+#define TAR_VALID 0x0000000000000008UL
+/* CSR (Channel/DMA Status Register) */
+#define CSR_AGENT_MASK 0xffe0ffff
+/* CCR (Calgary Configuration Register) */
+#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
+/* PMCR/PMDR (Page Migration Control/Debug Registers) */
+#define PMR_SOFTSTOP 0x80000000
+#define PMR_SOFTSTOPFAULT 0x40000000
+#define PMR_HARDSTOP 0x20000000
+
+/*
+ * The maximum PHB bus number.
+ * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
+ * x3950M2: 4 chassis, 48 PHBs per chassis = 192
+ * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
+ * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
+ */
+#define MAX_PHB_BUS_NUM 256
+
+#define PHBS_PER_CALGARY 4
+
+/*
+ * register offsets in Calgary's internal register space
+ *
+ * One TAR offset per entry, four in total (matching PHBS_PER_CALGARY).
+ * NOTE(review): presumably indexed by PHB id - users are outside this view.
+ */
+static const unsigned long tar_offsets[] = {
+ 0x0580 /* TAR0 */,
+ 0x0588 /* TAR1 */,
+ 0x0590 /* TAR2 */,
+ 0x0598 /* TAR3 */
+};
+
+/* Offsets of the four split queue registers, one entry per queue. */
+static const unsigned long split_queue_offsets[] = {
+ 0x4870 /* SPLIT QUEUE 0 */,
+ 0x5870 /* SPLIT QUEUE 1 */,
+ 0x6870 /* SPLIT QUEUE 2 */,
+ 0x7870 /* SPLIT QUEUE 3 */
+};
+
+/* Offsets of the four PHB register blocks, one entry per PHB. */
+static const unsigned long phb_offsets[] = {
+ 0x8000 /* PHB0 */,
+ 0x9000 /* PHB1 */,
+ 0xA000 /* PHB2 */,
+ 0xB000 /* PHB3 */
+};
+
+/* PHB debug registers: offset of each PHB's debug block, one entry per PHB. */
+
+static const unsigned long phb_debug_offsets[] = {
+ 0x4000 /* PHB 0 DEBUG */,
+ 0x5000 /* PHB 1 DEBUG */,
+ 0x6000 /* PHB 2 DEBUG */,
+ 0x7000 /* PHB 3 DEBUG */
+};
+
+/*
+ * STUFF register for each debug PHB,
+ * byte 1 = start bus number, byte 2 = end bus number
+ */
+
+#define PHB_DEBUG_STUFF_OFFSET 0x0020
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
+static int translate_empty_slots __read_mostly = 0;
+static int calgary_detected __read_mostly = 0;
+
+static struct rio_table_hdr *rio_table_hdr __initdata;
+static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
+
+/*
+ * Per-bus bookkeeping; bus_info[] holds MAX_PHB_BUS_NUM of these.
+ * NOTE(review): field meanings below are inferred from their names - the
+ * code that fills them in is outside this view.
+ */
+struct calgary_bus_info {
+ void *tce_space; /* presumably the bus's TCE table memory */
+ unsigned char translation_disabled; /* presumably non-zero if translation is off for this bus */
+ signed char phbid; /* presumably the PHB index within its Calgary */
+ void __iomem *bbar; /* presumably the mapped register space */
+};
+
+static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calgary_tce_cache_blast(struct iommu_table *tbl);
+static void calgary_dump_error_regs(struct iommu_table *tbl);
+static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calioc2_tce_cache_blast(struct iommu_table *tbl);
+static void calioc2_dump_error_regs(struct iommu_table *tbl);
+static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
+static void get_tce_space_from_tar(void);
+
+/* Chipset hooks for Calgary; tce_cache_blast is invoked from iommu_range_alloc(). */
+static const struct cal_chipset_ops calgary_chip_ops = {
+ .handle_quirks = calgary_handle_quirks,
+ .tce_cache_blast = calgary_tce_cache_blast,
+ .dump_error_regs = calgary_dump_error_regs
+};
+
+/* Chipset hooks for CalIOC2; same interface as calgary_chip_ops. */
+static const struct cal_chipset_ops calioc2_chip_ops = {
+ .handle_quirks = calioc2_handle_quirks,
+ .tce_cache_blast = calioc2_tce_cache_blast,
+ .dump_error_regs = calioc2_dump_error_regs
+};
+
+static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
+
+/*
+ * Return non-zero if translation is enabled, i.e. if @tbl is not NULL:
+ * only PHBs with translation enabled have an IOMMU table.
+ */
+static inline int translation_enabled(struct iommu_table *tbl)
+{
+	return tbl != NULL;
+}
+
+/*
+ * iommu_range_reserve - mark a range of IOMMU table entries as in use.
+ * @tbl: table whose allocation bitmap is updated
+ * @start_addr: address of the first page to reserve
+ * @npages: number of pages to reserve
+ *
+ * A range that starts beyond the table is ignored. A range that starts
+ * inside the table but runs past its end is clamped to the table size, so
+ * the bitmap is never written beyond tbl->it_size bits.
+ */
+static void iommu_range_reserve(struct iommu_table *tbl,
+	unsigned long start_addr, unsigned int npages)
+{
+	unsigned long index;
+	unsigned long end;
+	unsigned long flags;
+
+	index = start_addr >> PAGE_SHIFT;
+
+	/* bail out if we're asked to reserve a region we don't cover */
+	if (index >= tbl->it_size)
+		return;
+
+	end = index + npages;
+	if (end > tbl->it_size) /* don't go off the table */
+		end = tbl->it_size;
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	/*
+	 * Set only the clamped range [index, end). Using npages here would
+	 * run past the end of it_map whenever the request was clamped above.
+	 */
+	bitmap_set(tbl->it_map, index, end - index);
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+/*
+ * iommu_range_alloc - allocate @npages contiguous entries from @tbl.
+ * @dev: device the mapping is for; supplies the segment boundary
+ * @tbl: IOMMU table to allocate from
+ * @npages: number of entries wanted; must not be 0
+ *
+ * Searches from tbl->it_hint first. If that fails, the chipset's TCE cache
+ * is blasted and the search is retried from the start of the table.
+ * Returns the index of the first allocated entry. If both searches fail it
+ * returns CALGARY_MAPPING_ERROR, or panics when panic_on_overflow is set.
+ */
+static unsigned long iommu_range_alloc(struct device *dev,
+ struct iommu_table *tbl,
+ unsigned int npages)
+{
+ unsigned long flags;
+ unsigned long offset;
+ unsigned long boundary_size;
+
+ /* Segment boundary of the device, expressed in pages. */
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ BUG_ON(npages == 0);
+
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
+ offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
+ npages, 0, boundary_size, 0);
+ if (offset == ~0UL) {
+ tbl->chip_ops->tce_cache_blast(tbl);
+
+ /* Second attempt: search again from index 0 instead of the hint. */
+ offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
+ npages, 0, boundary_size, 0);
+ if (offset == ~0UL) {
+ pr_warn("IOMMU full\n");
+ /* Drop the lock before panicking or returning the error. */
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+ if (panic_on_overflow)
+ panic("Calgary: fix the allocator.\n");
+ else
+ return CALGARY_MAPPING_ERROR;
+ }
+ }
+
+ /* The next search starts right after this allocation. */
+ tbl->it_hint = offset + npages;
+ BUG_ON(tbl->it_hint > tbl->it_size);
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+ return offset;
+}
+
+/*
+ * Map @npages pages starting at @vaddr for DMA through @tbl: grab a free
+ * range of table entries, program the TCEs for it and return the resulting
+ * bus address, which keeps @vaddr's offset within the page.
+ *
+ * Returns CALGARY_MAPPING_ERROR if no range could be allocated.
+ */
+static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+	void *vaddr, unsigned int npages, int direction)
+{
+	unsigned long uaddr = (unsigned long)vaddr;
+	unsigned long first;
+	dma_addr_t bus_addr;
+
+	first = iommu_range_alloc(dev, tbl, npages);
+	if (unlikely(first == CALGARY_MAPPING_ERROR)) {
+		pr_warn("failed to allocate %u pages in iommu %p\n",
+			npages, tbl);
+		return CALGARY_MAPPING_ERROR;
+	}
+
+	/* table entry number plus the offset inside the first page */
+	bus_addr = (first << PAGE_SHIFT) | (uaddr & ~PAGE_MASK);
+
+	/* put the TCEs in the HW table */
+	tce_build(tbl, first, npages, uaddr & PAGE_MASK, direction);
+
+	return bus_addr;
+}
+
+/*
+ * Release the @npages table entries that back bus address @dma_addr:
+ * the TCEs are freed first, then the range is cleared in the allocation
+ * bitmap under it_lock.
+ *
+ * Addresses inside the reserved emergency area at CALGARY_MAPPING_ERROR
+ * are rejected with a warning.
+ */
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+	unsigned int npages)
+{
+	unsigned long irqflags;
+	unsigned long first;
+	unsigned long reserved_end;
+
+	/* were we called with bad_dma_address? */
+	reserved_end = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE);
+	if (unlikely(dma_addr < reserved_end)) {
+		WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
+		       "address 0x%Lx\n", dma_addr);
+		return;
+	}
+
+	first = dma_addr >> PAGE_SHIFT;
+	BUG_ON(first + npages > tbl->it_size);
+
+	tce_free(tbl, first, npages);
+
+	spin_lock_irqsave(&tbl->it_lock, irqflags);
+	bitmap_clear(tbl->it_map, first, npages);
+	spin_unlock_irqrestore(&tbl->it_lock, irqflags);
+}
+
+/*
+ * Find the IOMMU table responsible for @dev by walking from the device's
+ * own bus up through the parent buses. A bus counts only if its table's
+ * it_busno equals the bus number. Returns NULL when no such bus is found.
+ */
+static inline struct iommu_table *find_iommu_table(struct device *dev)
+{
+	struct pci_bus *bus = to_pci_dev(dev)->bus;
+	struct iommu_table *found;
+
+	/* search up the device tree for an iommu */
+	for (;;) {
+		found = pci_iommu(bus);
+		if (found && found->it_busno == bus->number)
+			break;
+
+		found = NULL;
+		bus = bus->parent;
+		if (!bus)
+			break;
+	}
+
+	BUG_ON(found && (found->it_busno != bus->number));
+
+	return found;
+}
+
+/*
+ * Unmap a scatterlist previously mapped by calgary_map_sg(). Processing
+ * stops at the first entry whose dma_length is 0. Nothing is done when
+ * the device has no IOMMU table.
+ */
+static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
+			     int nelems,enum dma_data_direction dir,
+			     unsigned long attrs)
+{
+	struct iommu_table *tbl = find_iommu_table(dev);
+	struct scatterlist *ent;
+	int n;
+
+	if (!translation_enabled(tbl))
+		return;
+
+	for_each_sg(sglist, ent, nelems, n) {
+		dma_addr_t addr = ent->dma_address;
+		unsigned int len = ent->dma_length;
+
+		/* a zero length marks the end of the mapped entries */
+		if (len == 0)
+			break;
+
+		iommu_free(tbl, addr, iommu_num_pages(addr, len, PAGE_SIZE));
+	}
+}
+
+/*
+ * Map every entry of the scatterlist @sg (@nelems entries) for DMA.
+ *
+ * Returns @nelems on success. If any entry cannot be allocated, everything
+ * mapped so far is unmapped again, every entry is marked with
+ * CALGARY_MAPPING_ERROR and a zero length, and 0 is returned.
+ */
+static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+			  int nelems, enum dma_data_direction dir,
+			  unsigned long attrs)
+{
+	struct iommu_table *tbl = find_iommu_table(dev);
+	struct scatterlist *s;
+	unsigned long vaddr;
+	unsigned int npages;
+	unsigned long entry;
+	int i;
+
+	for_each_sg(sg, s, nelems, i) {
+		BUG_ON(!sg_page(s));
+
+		vaddr = (unsigned long) sg_virt(s);
+		npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
+
+		entry = iommu_range_alloc(dev, tbl, npages);
+		if (entry == CALGARY_MAPPING_ERROR) {
+			/* makes sure unmap knows to stop */
+			s->dma_length = 0;
+			goto error;
+		}
+
+		s->dma_address = (entry << PAGE_SHIFT) | s->offset;
+
+		/* insert into HW table */
+		tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
+
+		s->dma_length = s->length;
+	}
+
+	return nelems;
+error:
+	calgary_unmap_sg(dev, sg, nelems, dir, 0);
+	/* poison every entry (the cursor s, not just the list head sg) */
+	for_each_sg(sg, s, nelems, i) {
+		s->dma_address = CALGARY_MAPPING_ERROR;
+		s->dma_length = 0;
+	}
+	return 0;
+}
+
+/*
+ * map_page hook: map @size bytes at @offset inside @page through the
+ * device's IOMMU table and return the bus address (or
+ * CALGARY_MAPPING_ERROR, as returned by iommu_alloc()).
+ */
+static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   unsigned long attrs)
+{
+	void *kaddr = page_address(page) + offset;
+	struct iommu_table *tbl = find_iommu_table(dev);
+	unsigned int npages;
+
+	/* number of IOMMU pages touched by [kaddr, kaddr + size) */
+	npages = iommu_num_pages((unsigned long)kaddr, size, PAGE_SIZE);
+
+	return iommu_alloc(dev, tbl, kaddr, npages, dir);
+}
+
+/* unmap_page hook: give back the table entries behind @dma_addr/@size */
+static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	struct iommu_table *tbl = find_iommu_table(dev);
+
+	iommu_free(tbl, dma_addr, iommu_num_pages(dma_addr, size, PAGE_SIZE));
+}
+
+/*
+ * alloc hook: allocate zeroed pages covering @size bytes and map them
+ * bidirectionally through the device's IOMMU table.
+ *
+ * Returns the kernel virtual address and stores the bus address in
+ * @dma_handle. Returns NULL if either the page allocation or the IOMMU
+ * mapping fails; in that case the pages are freed again.
+ */
+static void* calgary_alloc_coherent(struct device *dev, size_t size,
+	dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
+{
+	struct iommu_table *tbl = find_iommu_table(dev);
+	unsigned int npages, order;
+	dma_addr_t mapping;
+	void *mem;
+
+	size = PAGE_ALIGN(size); /* size rounded up to full pages */
+	npages = size >> PAGE_SHIFT;
+	order = get_order(size);
+
+	/* alloc enough pages (and possibly more) */
+	mem = (void *)__get_free_pages(flag, order);
+	if (!mem)
+		return NULL;
+	memset(mem, 0, size);
+
+	/* set up tces to cover the allocated range */
+	mapping = iommu_alloc(dev, tbl, mem, npages, DMA_BIDIRECTIONAL);
+	if (mapping == CALGARY_MAPPING_ERROR) {
+		free_pages((unsigned long)mem, get_order(size));
+		return NULL;
+	}
+
+	*dma_handle = mapping;
+	return mem;
+}
+
+/*
+ * free hook: undo calgary_alloc_coherent() by releasing the IOMMU entries
+ * for @dma_handle and then the pages at @vaddr. @size is rounded up to
+ * whole pages first.
+ */
+static void calgary_free_coherent(struct device *dev, size_t size,
+				  void *vaddr, dma_addr_t dma_handle,
+				  unsigned long attrs)
+{
+	struct iommu_table *tbl = find_iommu_table(dev);
+	size_t bytes = PAGE_ALIGN(size);
+	unsigned int npages = bytes >> PAGE_SHIFT;
+
+	iommu_free(tbl, dma_handle, npages);
+	free_pages((unsigned long)vaddr, get_order(bytes));
+}
+
+static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == CALGARY_MAPPING_ERROR;
+}
+
+/* DMA operations installed on PCI devices that sit behind a translated PHB */
+static const struct dma_map_ops calgary_dma_ops = {
+	.map_page = calgary_map_page,
+	.unmap_page = calgary_unmap_page,
+	.map_sg = calgary_map_sg,
+	.unmap_sg = calgary_unmap_sg,
+	.alloc = calgary_alloc_coherent,
+	.free = calgary_free_coherent,
+	.mapping_error = calgary_mapping_error,
+	.dma_supported = dma_direct_supported,
+};
+
+/* mapped config-space base recorded for PCI bus @num */
+static inline void __iomem * busno_to_bbar(unsigned char num)
+{
+	const struct calgary_bus_info *info = &bus_info[num];
+
+	return info->bbar;
+}
+
+/* PHB id recorded for PCI bus @num */
+static inline int busno_to_phbid(unsigned char num)
+{
+	const struct calgary_bus_info *info = &bus_info[num];
+
+	return info->phbid;
+}
+
+/* split-queue register offset for the PHB that owns bus @num */
+static inline unsigned long split_queue_offset(unsigned char num)
+{
+	return split_queue_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* TAR register offset for the PHB that owns bus @num */
+static inline unsigned long tar_offset(unsigned char num)
+{
+	return tar_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* register-block offset of the PHB that owns bus @num */
+static inline unsigned long phb_offset(unsigned char num)
+{
+	return phb_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* address of a register: @offset is OR-ed into the mapped base @bar */
+static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
+{
+	return (void __iomem*)(((unsigned long)bar) | offset);
+}
+
+/* 1 if PCI device id @device is a CalIOC2, 0 otherwise */
+static inline int is_calioc2(unsigned short device)
+{
+	return device == PCI_DEVICE_ID_IBM_CALIOC2 ? 1 : 0;
+}
+
+/* 1 if PCI device id @device is a Calgary, 0 otherwise */
+static inline int is_calgary(unsigned short device)
+{
+	return device == PCI_DEVICE_ID_IBM_CALGARY ? 1 : 0;
+}
+
+/* 1 if PCI device id @device is either a Calgary or a CalIOC2 */
+static inline int is_cal_pci_dev(unsigned short device)
+{
+	if (is_calgary(device))
+		return 1;
+
+	return is_calioc2(device);
+}
+
+/*
+ * tce_cache_blast hook for Calgary: with bus arbitration switched off,
+ * poll the split queues until DMA activity is done, rewrite the TAR to
+ * invalidate the TCE cache, then restore the saved arbitration setting.
+ */
+static void calgary_tce_cache_blast(struct iommu_table *tbl)
+{
+ u64 val;
+ u32 aer;
+ int i = 0;
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+
+ /* disable arbitration on the bus */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+ aer = readl(target);
+ writel(0, target);
+
+ /* read plssr to ensure it got there */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+ val = readl(target);
+
+ /* poll split queues until all DMA activity is done */
+ target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
+ do {
+ val = readq(target);
+ i++;
+ } while ((val & 0xff) != 0xff && i < 100);
+ /* at most 100 reads; after that we only warn and carry on */
+ if (i == 100)
+ pr_warn("PCI bus not quiesced, continuing anyway\n");
+
+ /* invalidate TCE cache */
+ target = calgary_reg(bbar, tar_offset(tbl->it_busno));
+ writeq(tbl->tar_val, target);
+
+ /* enable arbitration */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+ writel(aer, target);
+ (void)readl(target); /* flush */
+}
+
+/*
+ * tce_cache_blast hook for CalIOC2.
+ *
+ * Sequence: set SoftStop, poll the split queues until DMA activity is
+ * done, check for a SoftStopFault (restarting from step 1 if one is
+ * seen, for fewer than 100 attempts in total), read PMCR to enter
+ * HardStop, rewrite the TAR to invalidate the TCE cache and finally
+ * clear PMCR again.
+ * If too many SoftStopFaults are seen the function gives up without
+ * invalidating the cache.
+ */
+static void calioc2_tce_cache_blast(struct iommu_table *tbl)
+{
+	void __iomem *bbar = tbl->bbar;
+	void __iomem *target;
+	u64 val64;
+	u32 val;
+	int i = 0;
+	int count = 1;
+	unsigned char bus = tbl->it_busno;
+
+begin:
+	printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
+	       "sequence - count %d\n", bus, count);
+
+	/* 1. using the Page Migration Control reg set SoftStop */
+	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+	val = be32_to_cpu(readl(target));
+	printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
+	val |= PMR_SOFTSTOP;
+	printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
+	writel(cpu_to_be32(val), target);
+
+	/* 2. poll split queues until all DMA activity is done */
+	printk(KERN_DEBUG "2a. starting to poll split queues\n");
+	target = calgary_reg(bbar, split_queue_offset(bus));
+	/*
+	 * Every attempt gets the full polling budget: without this reset a
+	 * restart via "goto begin" would continue counting from the previous
+	 * attempt and could skip both the polling and the warning below.
+	 */
+	i = 0;
+	do {
+		val64 = readq(target);
+		i++;
+	} while ((val64 & 0xff) != 0xff && i < 100);
+	if (i == 100)
+		pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
+
+	/* 3. poll Page Migration DEBUG for SoftStopFault */
+	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+	val = be32_to_cpu(readl(target));
+	printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
+
+	/* 4. if SoftStopFault - goto (1) */
+	if (val & PMR_SOFTSTOPFAULT) {
+		if (++count < 100)
+			goto begin;
+		else {
+			pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
+			return; /* pray for the best */
+		}
+	}
+
+	/* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
+	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+	printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
+	val = be32_to_cpu(readl(target));
+	printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
+	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+	val = be32_to_cpu(readl(target));
+	printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
+
+	/* 6. invalidate TCE cache */
+	printk(KERN_DEBUG "6. invalidating TCE cache\n");
+	target = calgary_reg(bbar, tar_offset(bus));
+	writeq(tbl->tar_val, target);
+
+	/* 7. Re-read PMCR */
+	printk(KERN_DEBUG "7a. Re-reading PMCR\n");
+	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+	val = be32_to_cpu(readl(target));
+	printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
+
+	/* 8. Remove HardStop */
+	printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
+	target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+	val = 0;
+	printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
+	writel(cpu_to_be32(val), target);
+	val = be32_to_cpu(readl(target));
+	printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
+}
+
+/*
+ * Reserve the IOMMU pages for the window [@start, @limit] on @dev's bus.
+ * @limit is first rounded up to the end of its 1MB block (low 20 bits
+ * set, plus one).
+ */
+static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
+	u64 limit)
+{
+	u64 end = (limit | 0xfffff) + 1;
+	unsigned int numpages = ((end - start) >> PAGE_SHIFT);
+
+	iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
+}
+
+/*
+ * Read the PHB's peripheral MEM_1 registers (low, high, size) for @dev's
+ * bus and reserve the described window in the bus's IOMMU bitmap.
+ */
+static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
+{
+ void __iomem *target;
+ u64 low, high, sizelow;
+ u64 start, limit;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+
+ /* peripheral MEM_1 region */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
+ low = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
+ high = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
+ sizelow = be32_to_cpu(readl(target));
+
+ /* 64-bit start address assembled from the two 32-bit halves */
+ start = (high << 32) | low;
+ /*
+ * NOTE(review): the SIZE register value is handed on as an end address
+ * ("limit") - confirm against the chip documentation.
+ */
+ limit = sizelow;
+
+ calgary_reserve_mem_region(dev, start, limit);
+}
+
+/*
+ * Same as calgary_reserve_peripheral_mem_1() for the MEM_2 window, which
+ * is only reserved when PHB_MEM2_ENABLE is set in the PHB config register
+ * and which has a 64-bit size register pair.
+ */
+static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
+{
+ void __iomem *target;
+ u32 val32;
+ u64 low, high, sizelow, sizehigh;
+ u64 start, limit;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+
+ /* is it enabled? */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ if (!(val32 & PHB_MEM2_ENABLE))
+ return;
+
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
+ low = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
+ high = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
+ sizelow = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
+ sizehigh = be32_to_cpu(readl(target));
+
+ /* both values are assembled from a high and a low 32-bit register */
+ start = (high << 32) | low;
+ limit = (sizehigh << 32) | sizelow;
+
+ calgary_reserve_mem_region(dev, start, limit);
+}
+
+/*
+ * Parts of the IO address space are never translated, so devices must not
+ * be handed IO addresses that fall inside them: the 640KB-1MB region and
+ * the two PCI peripheral memory holes. Mark all of them (plus the
+ * emergency pages at CALGARY_MAPPING_ERROR) as used in the IOMMU bitmap
+ * so the allocator never gives them out.
+ */
+static void __init calgary_reserve_regions(struct pci_dev *dev)
+{
+	struct iommu_table *tbl = pci_iommu(dev->bus);
+	unsigned int low_pages;
+	u64 low_start;
+
+	/* reserve EMERGENCY_PAGES from bad_dma_address and up */
+	iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES);
+
+	if (!is_calgary(dev->device)) {
+		/* CalIOC2: avoid the entire first MB */
+		low_start = 0;
+		low_pages = (1 * 1024 * 1024) >> PAGE_SHIFT;
+	} else {
+		/* Calgary: avoid the BIOS/VGA 640KB-1MB region */
+		low_start = (640 * 1024);
+		low_pages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+	}
+	iommu_range_reserve(tbl, low_start, low_pages);
+
+	/* reserve the two PCI peripheral memory regions in IO space */
+	calgary_reserve_peripheral_mem_1(dev);
+	calgary_reserve_peripheral_mem_2(dev);
+}
+
+/*
+ * Set up translation state for @dev's bus: build the TCE table, point it
+ * at the bus's TCE space, initialise the table contents, pick the chipset
+ * ops, reserve the untranslated regions and finally program the PHB's TAR
+ * with the table's physical address and size.
+ *
+ * Returns 0 on success or the error from build_tce_table().
+ */
+static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
+{
+ u64 val64;
+ u64 table_phys;
+ void __iomem *target;
+ int ret;
+ struct iommu_table *tbl;
+
+ /* build TCE tables for each PHB */
+ ret = build_tce_table(dev, bbar);
+ if (ret)
+ return ret;
+
+ tbl = pci_iommu(dev->bus);
+ tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
+
+ /*
+ * kdump: rebuild the bitmap from the entries already in the table;
+ * otherwise start with every entry freed.
+ */
+ if (is_kdump_kernel())
+ calgary_init_bitmap_from_tce_table(tbl);
+ else
+ tce_free(tbl, 0, tbl->it_size);
+
+ if (is_calgary(dev->device))
+ tbl->chip_ops = &calgary_chip_ops;
+ else if (is_calioc2(dev->device))
+ tbl->chip_ops = &calioc2_chip_ops;
+ else
+ BUG();
+
+ calgary_reserve_regions(dev);
+
+ /* set TARs for each PHB */
+ target = calgary_reg(bbar, tar_offset(dev->bus->number));
+ val64 = be64_to_cpu(readq(target));
+
+ /* zero out all TAR bits under sw control */
+ val64 &= ~TAR_SW_BITS;
+ table_phys = (u64)__pa(tbl->it_base);
+
+ val64 |= table_phys;
+
+ /* the table size code shares the register with the table address */
+ BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
+ val64 |= (u64) specified_table_size;
+
+ /* remembered so the cache-blast hooks can rewrite the same value */
+ tbl->tar_val = cpu_to_be64(val64);
+
+ writeq(tbl->tar_val, target);
+ readq(target); /* flush */
+
+ return 0;
+}
+
+/*
+ * Tear down the translation state of @dev's bus: clear the software
+ * controlled TAR bits, free the allocation bitmap and the iommu_table,
+ * detach the table from the bus and forget the bus's TCE space pointer.
+ */
+static void __init calgary_free_bus(struct pci_dev *dev)
+{
+ u64 val64;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+ void __iomem *target;
+ unsigned int bitmapsz;
+
+ /* clear the table address/size bits in the TAR */
+ target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
+ val64 = be64_to_cpu(readq(target));
+ val64 &= ~TAR_SW_BITS;
+ writeq(cpu_to_be64(val64), target);
+ readq(target); /* flush */
+
+ /* one bit per table entry */
+ bitmapsz = tbl->it_size / BITS_PER_BYTE;
+ free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
+ tbl->it_map = NULL;
+
+ kfree(tbl);
+
+ set_pci_iommu(dev->bus, NULL);
+
+ /* Can't free bootmem allocated memory after system is up :-( */
+ bus_info[dev->bus->number].tce_space = NULL;
+}
+
+/*
+ * dump_error_regs hook for Calgary: read the PHB's CSR and PLSSR and log
+ * both at emergency level.
+ */
+static void calgary_dump_error_regs(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u32 csr, plssr;
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+ csr = be32_to_cpu(readl(target));
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+ plssr = be32_to_cpu(readl(target));
+
+ /* If no error, the agent ID in the CSR is not valid */
+ pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
+ tbl->it_busno, csr, plssr);
+}
+
+/*
+ * dump_error_regs hook for CalIOC2: log CSR, PLSSR, CSMR and MCK, then
+ * the seven error registers at 0x810..0x870 and the root complex status,
+ * all at emergency level.
+ */
+static void calioc2_dump_error_regs(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ u32 csr, csmr, plssr, mck, rcstat;
+ void __iomem *target;
+ unsigned long phboff = phb_offset(tbl->it_busno);
+ unsigned long erroff;
+ u32 errregs[7];
+ int i;
+
+ /* dump CSR */
+ target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
+ csr = be32_to_cpu(readl(target));
+ /* dump PLSSR */
+ target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
+ plssr = be32_to_cpu(readl(target));
+ /* dump CSMR */
+ target = calgary_reg(bbar, phboff | 0x290);
+ csmr = be32_to_cpu(readl(target));
+ /* dump mck */
+ target = calgary_reg(bbar, phboff | 0x800);
+ mck = be32_to_cpu(readl(target));
+
+ pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
+
+ pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
+ csr, plssr, csmr, mck);
+
+ /* dump rest of error regs */
+ /* empty message opens the line; the registers follow via pr_cont() */
+ pr_emerg("");
+ for (i = 0; i < ARRAY_SIZE(errregs); i++) {
+ /* err regs are at 0x810 - 0x870 */
+ erroff = (0x810 + (i * 0x10));
+ target = calgary_reg(bbar, phboff | erroff);
+ errregs[i] = be32_to_cpu(readl(target));
+ pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
+ }
+ pr_cont("\n");
+
+ /* root complex status */
+ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
+ rcstat = be32_to_cpu(readl(target));
+ printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
+ PHB_ROOT_COMPLEX_STATUS);
+}
+
+/*
+ * Watchdog timer callback for one IOMMU table. Reads the PHB's CSR; if
+ * the agent field is set, the error registers are dumped, the CSR is
+ * cleared and the slot is disabled (the timer is not re-armed in that
+ * case). Otherwise the timer is re-armed to fire again in two seconds.
+ */
+static void calgary_watchdog(struct timer_list *t)
+{
+ struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer);
+ void __iomem *bbar = tbl->bbar;
+ u32 val32;
+ void __iomem *target;
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+
+ /* If no error, the agent ID in the CSR is not valid */
+ if (val32 & CSR_AGENT_MASK) {
+ tbl->chip_ops->dump_error_regs(tbl);
+
+ /* reset error */
+ writel(0, target);
+
+ /* Disable bus that caused the error */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
+ PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 |= PHB_SLOT_DISABLE;
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+ } else {
+ /* Reset the timer */
+ mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
+ }
+}
+
+/*
+ * Program the split completion timeout of the PHB that owns @busnum.
+ * Each PHB (id 0-3) has a 4-bit field at its own position in
+ * CALGARY_CONFIG_REG; the field is cleared and replaced by @timeout.
+ * Any other PHB id hits the BUG_ON() in the default case.
+ */
+static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
+ unsigned char busnum, unsigned long timeout)
+{
+ u64 val64;
+ void __iomem *target;
+ unsigned int phb_shift = ~0; /* silence gcc */
+ u64 mask;
+
+ /* bit position of this PHB's timeout field */
+ switch (busno_to_phbid(busnum)) {
+ case 0: phb_shift = (63 - 19);
+ break;
+ case 1: phb_shift = (63 - 23);
+ break;
+ case 2: phb_shift = (63 - 27);
+ break;
+ case 3: phb_shift = (63 - 35);
+ break;
+ default:
+ BUG_ON(busno_to_phbid(busnum));
+ }
+
+ target = calgary_reg(bbar, CALGARY_CONFIG_REG);
+ val64 = be64_to_cpu(readq(target));
+
+ /* zero out this PHB's timer bits */
+ mask = ~(0xFUL << phb_shift);
+ val64 &= mask;
+ val64 |= (timeout << phb_shift);
+ writeq(cpu_to_be64(val64), target);
+ readq(target); /* flush */
+}
+
+/*
+ * handle_quirks hook for CalIOC2: set the recommended bit in the PHB's
+ * PHB_SAVIOR_L2 register of the PHB that owns @dev's bus.
+ */
+static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+	unsigned char busnum = dev->bus->number;
+	void __iomem *bbar = tbl->bbar;
+	void __iomem *target;
+	u32 val;
+
+	/*
+	 * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
+	 */
+	target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
+	/* register reads are big-endian -> CPU, as everywhere else here */
+	val = be32_to_cpu(readl(target));
+	val |= 0x00800000;
+	writel(cpu_to_be32(val), target);
+}
+
+/*
+ * handle_quirks hook for Calgary: only bus 1 of a Calgary needs anything,
+ * namely a 2 second split completion timeout.
+ */
+static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+	unsigned char busnum = dev->bus->number;
+
+	if (!is_calgary(dev->device) || busnum != 1)
+		return;
+
+	/*
+	 * Give split completion a longer timeout on bus 1 for aic94xx
+	 * http://bugzilla.kernel.org/show_bug.cgi?id=7180
+	 */
+	calgary_set_split_completion_timeout(tbl->bbar, busnum,
+					     CCR_2SEC_TIMEOUT);
+}
+
+/*
+ * Switch on TCE translation for @dev's bus by setting PHB_TCE_ENABLE,
+ * PHB_DAC_DISABLE and PHB_MCSR_ENABLE in the PHB config register, then
+ * start the table's watchdog timer.
+ */
+static void __init calgary_enable_translation(struct pci_dev *dev)
+{
+ u32 val32;
+ unsigned char busnum;
+ void __iomem *target;
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+
+ busnum = dev->bus->number;
+ tbl = pci_iommu(dev->bus);
+ bbar = tbl->bbar;
+
+ /* enable TCE in PHB Config Register */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
+
+ printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
+ (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
+ "Calgary" : "CalIOC2", busnum);
+ printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
+ "bus.\n");
+
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+
+ /* arm the watchdog to fire right away */
+ timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0);
+ mod_timer(&tbl->watchdog_timer, jiffies);
+}
+
+/*
+ * Switch off TCE translation for @dev's bus by clearing the bits that
+ * calgary_enable_translation() set, then stop the watchdog timer.
+ */
+static void __init calgary_disable_translation(struct pci_dev *dev)
+{
+ u32 val32;
+ unsigned char busnum;
+ void __iomem *target;
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+
+ busnum = dev->bus->number;
+ tbl = pci_iommu(dev->bus);
+ bbar = tbl->bbar;
+
+ /* disable TCE in PHB Config Register */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
+
+ printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+
+ del_timer_sync(&tbl->watchdog_timer);
+}
+
+/*
+ * Register a bridge whose translation was disabled by the user: take a
+ * reference on @dev, leave its bus without an IOMMU table and record @dev
+ * as the bus's (or, behind a bridge, the parent bus's) "self" device.
+ */
+static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
+{
+	struct pci_bus *owner;
+
+	pci_dev_get(dev);
+	set_pci_iommu(dev->bus, NULL);
+
+	/* is the device behind a bridge? */
+	owner = dev->bus->parent ? dev->bus->parent : dev->bus;
+	owner->self = dev;
+}
+
+/*
+ * Bring up translation for one Calgary/CalIOC2 bridge: program its TAR,
+ * take a reference on @dev, record it as the "self" device of its bus (or
+ * parent bus), apply chipset quirks and enable translation.
+ *
+ * Returns 0 on success or the error from calgary_setup_tar().
+ */
+static int __init calgary_init_one(struct pci_dev *dev)
+{
+	struct pci_bus *parent;
+	struct iommu_table *tbl;
+	int err;
+
+	err = calgary_setup_tar(dev, busno_to_bbar(dev->bus->number));
+	if (err)
+		return err;
+
+	pci_dev_get(dev);
+
+	parent = dev->bus->parent;
+	if (!parent) {
+		dev->bus->self = dev;
+	} else {
+		if (parent->self)
+			printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
+			       "bus->parent->self!\n", dev);
+		parent->self = dev;
+	}
+
+	tbl = pci_iommu(dev->bus);
+	tbl->chip_ops->handle_quirks(tbl, dev);
+
+	calgary_enable_translation(dev);
+
+	return 0;
+}
+
+/*
+ * Map the 1MB config space of every Calgary listed in the Rio Grande
+ * table and record, for each PCI bus behind it, which mapping and which
+ * PHB the bus belongs to (bus_info[].bbar and bus_info[].phbid).
+ *
+ * Returns 0 on success. If an ioremap fails, every mapping made so far is
+ * unmapped exactly once, the bus_info[].bbar pointers are cleared and
+ * -ENODATA is returned.
+ */
+static int __init calgary_locate_bbars(void)
+{
+	int ret;
+	int rioidx, phb, bus;
+	void __iomem *bbar;
+	void __iomem *target;
+	unsigned long offset;
+	u8 start_bus, end_bus;
+	u32 val;
+
+	ret = -ENODATA;
+	for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
+		struct rio_detail *rio = rio_devs[rioidx];
+
+		if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
+			continue;
+
+		/* map entire 1MB of Calgary config space */
+		bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
+		if (!bbar)
+			goto error;
+
+		for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+			offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
+			target = calgary_reg(bbar, offset);
+
+			val = be32_to_cpu(readl(target));
+
+			/*
+			 * NOTE(review): bus numbers come straight from the
+			 * register and are not checked against the size of
+			 * bus_info[] - confirm MAX_PHB_BUS_NUM covers them.
+			 */
+			start_bus = (u8)((val & 0x00FF0000) >> 16);
+			end_bus = (u8)((val & 0x0000FF00) >> 8);
+
+			if (end_bus) {
+				for (bus = start_bus; bus <= end_bus; bus++) {
+					bus_info[bus].bbar = bbar;
+					bus_info[bus].phbid = phb;
+				}
+			} else {
+				bus_info[start_bus].bbar = bbar;
+				bus_info[start_bus].phbid = phb;
+			}
+		}
+	}
+
+	return 0;
+
+error:
+	/*
+	 * One ioremap'd bbar is stored in the bus_info[] slot of every bus
+	 * behind that Calgary. Unmap each mapping once and clear all slots
+	 * that refer to it so no stale pointer is left behind.
+	 */
+	for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) {
+		int other;
+
+		bbar = bus_info[bus].bbar;
+		if (!bbar)
+			continue;
+
+		for (other = bus; other < ARRAY_SIZE(bus_info); other++)
+			if (bus_info[other].bbar == bbar)
+				bus_info[other].bbar = NULL;
+
+		iounmap(bbar);
+	}
+
+	return ret;
+}
+
+/*
+ * Main Calgary bring-up: locate the BBARs, (in a kdump kernel) recover the
+ * TCE tables of the first kernel, initialise every Calgary/CalIOC2 bridge
+ * and finally install calgary_dma_ops on every PCI device that sits
+ * behind a translated bus.
+ *
+ * Returns 0 on success, or the error from calgary_locate_bbars() or
+ * calgary_init_one(); in the latter case the error path below tears
+ * bridges down again.
+ */
+static int __init calgary_init(void)
+{
+ int ret;
+ struct pci_dev *dev = NULL;
+ struct calgary_bus_info *info;
+
+ ret = calgary_locate_bbars();
+ if (ret)
+ return ret;
+
+ /* Purely for kdump kernel case */
+ if (is_kdump_kernel())
+ get_tce_space_from_tar();
+
+ /* pass 1: set up every Calgary/CalIOC2 bridge among the IBM devices */
+ do {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
+
+ info = &bus_info[dev->bus->number];
+ if (info->translation_disabled) {
+ calgary_init_one_nontraslated(dev);
+ continue;
+ }
+
+ /* no TCE space was set up for this bus: leave it untranslated */
+ if (!info->tce_space && !translate_empty_slots)
+ continue;
+
+ ret = calgary_init_one(dev);
+ if (ret)
+ goto error;
+ } while (1);
+
+ /* pass 2: devices behind a translated bus get the Calgary DMA ops */
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ struct iommu_table *tbl;
+
+ tbl = find_iommu_table(&dev->dev);
+
+ if (translation_enabled(tbl))
+ dev->dev.dma_ops = &calgary_dma_ops;
+ }
+
+ return ret;
+
+error:
+ /*
+ * NOTE(review): dev still points at the bridge that failed, and it is
+ * passed back into pci_get_device(), so this walk appears to continue
+ * with the devices after the failing one rather than revisiting the
+ * bridges initialised before it - confirm this is the intended
+ * teardown.
+ */
+ do {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
+
+ info = &bus_info[dev->bus->number];
+ if (info->translation_disabled) {
+ pci_dev_put(dev);
+ continue;
+ }
+ if (!info->tce_space && !translate_empty_slots)
+ continue;
+
+ calgary_disable_translation(dev);
+ calgary_free_bus(dev);
+ pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+ dev->dev.dma_ops = NULL;
+ } while (1);
+
+ return ret;
+}
+
+/*
+ * Pick the TCE table size code. A size given by the user always wins.
+ * Otherwise a kdump kernel with saved_max_pfn set derives the size from
+ * the amount of RAM, capped at TCE_TABLE_SIZE_8M; every other case uses
+ * TCE_TABLE_SIZE_8M.
+ */
+static inline int __init determine_tce_table_size(void)
+{
+	int order;
+
+	if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
+		return specified_table_size;
+
+	/*
+	 * Use 8M by default (suggested by Muli) if it's not
+	 * kdump kernel and saved_max_pfn isn't set.
+	 */
+	if (!is_kdump_kernel() || !saved_max_pfn)
+		return TCE_TABLE_SIZE_8M;
+
+	/*
+	 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+	 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+	 * larger table size has twice as many entries, so shift the
+	 * max ram address by 13 to divide by 8K and then look at the
+	 * order of the result to choose between 0-7.
+	 */
+	order = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
+	if (order > TCE_TABLE_SIZE_8M)
+		order = TCE_TABLE_SIZE_8M;
+
+	return order;
+}
+
+/*
+ * Fill scal_devs[] and rio_devs[] with pointers to the detail records
+ * that follow the Rio Grande table header. The records start 3 bytes past
+ * rio_table_hdr; their sizes depend on the table version (11 and 13 bytes
+ * for version 2, 12 and 15 bytes for version 3).
+ *
+ * Returns 0 on success, -ENODEV if the table lists more nodes than
+ * MAX_NUMNODES and -EPROTO for an unknown table version.
+ */
+static int __init build_detail_arrays(void)
+{
+	unsigned long cursor;
+	unsigned nodes, n;
+	int scal_stride, rio_stride;
+
+	nodes = rio_table_hdr->num_scal_dev;
+	if (nodes > MAX_NUMNODES){
+		printk(KERN_WARNING
+			"Calgary: MAX_NUMNODES too low! Defined as %d, "
+			"but system has %d nodes.\n",
+			MAX_NUMNODES, nodes);
+		return -ENODEV;
+	}
+
+	switch (rio_table_hdr->version){
+	case 3:
+		scal_stride = 12;
+		rio_stride = 15;
+		break;
+	case 2:
+		scal_stride = 11;
+		rio_stride = 13;
+		break;
+	default:
+		printk(KERN_WARNING
+		       "Calgary: Invalid Rio Grande Table Version: %d\n",
+		       rio_table_hdr->version);
+		return -EPROTO;
+	}
+
+	cursor = ((unsigned long)rio_table_hdr) + 3;
+
+	/* the scalability records come first ... */
+	for (n = 0; n < nodes; n++) {
+		scal_devs[n] = (struct scal_detail *)cursor;
+		cursor += scal_stride;
+	}
+
+	/* ... immediately followed by the rio records */
+	for (n = 0; n < rio_table_hdr->num_rio_dev; n++) {
+		rio_devs[n] = (struct rio_detail *)cursor;
+		cursor += rio_stride;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 1 if anything answers a config read in slots 1-7 of @bus, 0 if
+ * all of them read back as 0xffffffff. CalIOC2 buses are always reported
+ * as populated.
+ */
+static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
+{
+	int slot;
+
+	if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
+		/*
+		 * FIXME: properly scan for devices across the
+		 * PCI-to-PCI bridge on every CalIOC2 port.
+		 */
+		return 1;
+	}
+
+	for (slot = 1; slot < 8; slot++) {
+		if (read_pci_config(bus, slot, 0, 0) != 0xffffffff)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * calgary_init_bitmap_from_tce_table():
+ * Used in the kdump case. In the second/kdump kernel, rebuild the
+ * allocation bitmap from the TCE table left behind by the first kernel:
+ * every non-zero entry is marked as in use.
+ */
+static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
+{
+	u64 *tce = (u64 *)tbl->it_base;
+	unsigned int n;
+
+	for (n = 0; n < tbl->it_size; n++) {
+		if (tce[n] != 0x0)
+			set_bit(n, tbl->it_map);
+	}
+}
+
+/*
+ * get_tce_space_from_tar():
+ * Function for kdump case. Get the tce tables from first kernel
+ * by reading the contents of the base address register of calgary iommu.
+ * The result is stored in bus_info[].tce_space for every Calgary/CalIOC2
+ * bus that has translation enabled and either has devices or is covered
+ * by translate_empty_slots.
+ */
+static void __init get_tce_space_from_tar(void)
+{
+ int bus;
+ void __iomem *target;
+ unsigned long tce_space;
+
+ for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+ struct calgary_bus_info *info = &bus_info[bus];
+ unsigned short pci_device;
+ u32 val;
+
+ /* first config dword of device 0; the device id is the upper half */
+ val = read_pci_config(bus, 0, 0, 0);
+ pci_device = (val & 0xFFFF0000) >> 16;
+
+ if (!is_cal_pci_dev(pci_device))
+ continue;
+ if (info->translation_disabled)
+ continue;
+
+ if (calgary_bus_has_devices(bus, pci_device) ||
+ translate_empty_slots) {
+ target = calgary_reg(bus_info[bus].bbar,
+ tar_offset(bus));
+ tce_space = be64_to_cpu(readq(target));
+ /* keep only the software controlled TAR bits */
+ tce_space = tce_space & TAR_SW_BITS;
+
+ /* strip the table size that shares the register */
+ tce_space = tce_space & (~specified_table_size);
+ info->tce_space = (u64 *)__va(tce_space);
+ }
+ }
+ return;
+}
+
+/*
+ * iommu_init hook installed by detect_calgary(): run calgary_init() and
+ * log a message if it fails. Returns calgary_init()'s result.
+ */
+static int __init calgary_iommu_init(void)
+{
+	int err;
+
+	/* ok, we're trying to use Calgary - let's roll */
+	printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+	err = calgary_init();
+	if (!err)
+		return 0;
+
+	printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+	       "falling back to no_iommu\n", err);
+	return err;
+}
+
+/*
+ * Detect Calgary/CalIOC2 IOMMUs.
+ *
+ * Looks up the Rio Grande table in the BIOS EBDA, builds the detail
+ * arrays, chooses the TCE table size and then probes device 0 of every
+ * bus below MAX_PHB_BUS_NUM. For each Calgary/CalIOC2 bus that is to be
+ * translated a TCE table is allocated (except in a kdump kernel, which
+ * reuses the first kernel's tables). When at least one such bus exists,
+ * calgary_iommu_init() is installed as the x86 iommu_init hook.
+ *
+ * Returns 1 if a Calgary was found and 0 if not. Returns a negative
+ * errno if detection is disabled or impossible (-ENODEV), if the detail
+ * arrays cannot be built (the build_detail_arrays() error), or if a TCE
+ * table cannot be allocated (-ENOMEM).
+ */
+int __init detect_calgary(void)
+{
+	int bus;
+	void *tbl;
+	int calgary_found = 0;
+	unsigned long ptr;
+	unsigned int offset, prev_offset;
+	int ret;
+
+	/*
+	 * if the user specified iommu=off or iommu=soft or we found
+	 * another HW IOMMU already, bail out.
+	 */
+	if (no_iommu || iommu_detected)
+		return -ENODEV;
+
+	if (!use_calgary)
+		return -ENODEV;
+
+	if (!early_pci_allowed())
+		return -ENODEV;
+
+	printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
+
+	ptr = (unsigned long)phys_to_virt(get_bios_ebda());
+
+	rio_table_hdr = NULL;
+	prev_offset = 0;
+	offset = 0x180;
+	/*
+	 * The next offset is stored in the 1st word.
+	 * Only parse up until the offset increases:
+	 */
+	while (offset > prev_offset) {
+		/* The block id is stored in the 2nd word */
+		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+			/* set the pointer past the offset & block id */
+			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+			break;
+		}
+		prev_offset = offset;
+		offset = *((unsigned short *)(ptr + offset));
+	}
+	if (!rio_table_hdr) {
+		printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
+		       "in EBDA - bailing!\n");
+		return -ENODEV;
+	}
+
+	ret = build_detail_arrays();
+	if (ret) {
+		printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
+		/* hand back the real reason instead of a made-up -ENOMEM */
+		return ret;
+	}
+
+	specified_table_size = determine_tce_table_size();
+
+	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+		struct calgary_bus_info *info = &bus_info[bus];
+		unsigned short pci_device;
+		u32 val;
+
+		/* first config dword of device 0; device id is the upper half */
+		val = read_pci_config(bus, 0, 0, 0);
+		pci_device = (val & 0xFFFF0000) >> 16;
+
+		if (!is_cal_pci_dev(pci_device))
+			continue;
+
+		if (info->translation_disabled)
+			continue;
+
+		if (calgary_bus_has_devices(bus, pci_device) ||
+		    translate_empty_slots) {
+			/*
+			 * If it is kdump kernel, find and use tce tables
+			 * from first kernel, else allocate tce tables here
+			 */
+			if (!is_kdump_kernel()) {
+				tbl = alloc_tce_table();
+				if (!tbl)
+					goto cleanup;
+				info->tce_space = tbl;
+			}
+			calgary_found = 1;
+		}
+	}
+
+	printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
+	       calgary_found ? "found" : "not found");
+
+	if (calgary_found) {
+		iommu_detected = 1;
+		calgary_detected = 1;
+		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
+		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+		       specified_table_size);
+
+		x86_init.iommu.iommu_init = calgary_iommu_init;
+	}
+	return calgary_found;
+
+cleanup:
+	/* free the tables of the buses handled before the one that failed */
+	for (--bus; bus >= 0; --bus) {
+		struct calgary_bus_info *info = &bus_info[bus];
+
+		if (info->tce_space) {
+			free_tce_table(info->tce_space);
+			/* don't leave a pointer to freed memory in bus_info[] */
+			info->tce_space = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Parse the "calgary=" kernel parameter: a comma separated list of
+ *   64k|128k|256k|512k|1M|2M|4M|8M   TCE table size
+ *   translate_empty_slots            also translate buses without devices
+ *   disable=<bus>                    no translation for that PHB bus number
+ *
+ * Parsing stops at the first malformed "disable" value. Always returns 1.
+ */
+static int __init calgary_parse_options(char *p)
+{
+	unsigned int bridge;
+	unsigned long val;
+	size_t len;
+	int ret;
+
+	while (*p) {
+		if (!strncmp(p, "64k", 3))
+			specified_table_size = TCE_TABLE_SIZE_64K;
+		else if (!strncmp(p, "128k", 4))
+			specified_table_size = TCE_TABLE_SIZE_128K;
+		else if (!strncmp(p, "256k", 4))
+			specified_table_size = TCE_TABLE_SIZE_256K;
+		else if (!strncmp(p, "512k", 4))
+			specified_table_size = TCE_TABLE_SIZE_512K;
+		else if (!strncmp(p, "1M", 2))
+			specified_table_size = TCE_TABLE_SIZE_1M;
+		else if (!strncmp(p, "2M", 2))
+			specified_table_size = TCE_TABLE_SIZE_2M;
+		else if (!strncmp(p, "4M", 2))
+			specified_table_size = TCE_TABLE_SIZE_4M;
+		else if (!strncmp(p, "8M", 2))
+			specified_table_size = TCE_TABLE_SIZE_8M;
+
+		len = strlen("translate_empty_slots");
+		if (!strncmp(p, "translate_empty_slots", len))
+			translate_empty_slots = 1;
+
+		len = strlen("disable");
+		if (!strncmp(p, "disable", len)) {
+			char num[16];
+			size_t numlen;
+
+			p += len;
+			if (*p == '=')
+				++p;
+			if (*p == '\0')
+				break;
+
+			/*
+			 * kstrtoul() rejects trailing characters, so give it
+			 * only the text up to the next ','. Otherwise
+			 * "disable=1,64k" would fail to parse and the
+			 * remaining options would be dropped.
+			 */
+			numlen = strcspn(p, ",");
+			if (numlen >= sizeof(num))
+				break;
+			memcpy(num, p, numlen);
+			num[numlen] = '\0';
+
+			ret = kstrtoul(num, 0, &val);
+			if (ret)
+				break;
+
+			bridge = val;
+			if (bridge < MAX_PHB_BUS_NUM) {
+				printk(KERN_INFO "Calgary: disabling "
+				       "translation for PHB %#x\n", bridge);
+				bus_info[bridge].translation_disabled = 1;
+			}
+		}
+
+		/* move on to the option after the next ',' (if any) */
+		p = strpbrk(p, ",");
+		if (!p)
+			break;
+
+		p++; /* skip ',' */
+	}
+	return 1;
+}
+__setup("calgary=", calgary_parse_options);
+
+/*
+ * Reserve, in the IOMMU bitmap of @dev's bus, the address ranges of the
+ * bridge's four window resources that are memory resources, so no TCE is
+ * ever handed out for them.
+ */
+static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
+{
+	struct iommu_table *tbl = pci_iommu(dev->bus);
+	struct resource *win = &dev->resource[PCI_BRIDGE_RESOURCES];
+	unsigned int npages;
+	int left;
+
+	for (left = 4; left > 0; left--, win++) {
+		/* Don't give out TCEs that map MEM resources */
+		if (!(win->flags & IORESOURCE_MEM))
+			continue;
+
+		/* 0-based? we reserve the whole 1st MB anyway */
+		if (!win->start)
+			continue;
+
+		/* cover the whole region */
+		npages = resource_size(win) >> PAGE_SHIFT;
+		npages++;
+
+		iommu_range_reserve(tbl, win->start, npages);
+	}
+}
+
+/*
+ * Initcall: for every Calgary/CalIOC2 bridge that is translated and has a
+ * TCE space, reserve its memory windows in the IOMMU bitmap.
+ *
+ * Returns -ENODEV when Calgary is not in use (no_iommu, swiotlb, or no
+ * Calgary detected), 0 otherwise.
+ */
+static int __init calgary_fixup_tce_spaces(void)
+{
+	struct pci_dev *dev = NULL;
+
+	if (no_iommu || swiotlb || !calgary_detected)
+		return -ENODEV;
+
+	printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
+
+	while ((dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev)) != NULL) {
+		struct calgary_bus_info *info;
+
+		if (!is_cal_pci_dev(dev->device))
+			continue;
+
+		info = &bus_info[dev->bus->number];
+		if (info->translation_disabled || !info->tce_space)
+			continue;
+
+		calgary_fixup_one_tce_space(dev);
+	}
+
+	return 0;
+}
+
+/*
+ * This needs to be called after pcibios_assign_resources (fs_initcall
+ * level) and before device_initcall.
+ */
+rootfs_initcall(calgary_fixup_tce_spaces);
+
+IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
new file mode 100644
index 0000000..7ba73fe
--- /dev/null
+++ b/arch/x86/kernel/pci-dma.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-direct.h>
+#include <linux/dma-debug.h>
+#include <linux/dmar.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/calgary.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static bool disable_dac_quirk __read_mostly;
+
+const struct dma_map_ops *dma_ops = &dma_direct_ops;
+EXPORT_SYMBOL(dma_ops);
+
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1; /* default; iommu_setup() may change it */
+int force_iommu __read_mostly = 1; /* default; iommu_setup() may change it */
+#else /* !CONFIG_IOMMU_DEBUG */
+int panic_on_overflow __read_mostly = 0; /* default; iommu_setup() may change it */
+int force_iommu __read_mostly = 0; /* default; iommu_setup() may change it */
+#endif /* CONFIG_IOMMU_DEBUG */
+
+int iommu_merge __read_mostly = 0;
+
+int no_iommu __read_mostly;
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA translation for
+ * devices and allow every device to access the whole physical memory. This is
+ * useful if a user wants to use an IOMMU only for KVM device assignment to
+ * guests and not for driver dma translation.
+ * It is also possible to disable by default in kernel config, and enable with
+ * iommu=nopt at boot time.
+ */
+#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
+int iommu_pass_through __read_mostly = 1;
+#else
+int iommu_pass_through __read_mostly;
+#endif
+
+extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
+
+/* Dummy device substituted for NULL device arguments (normally ISA); see arch_dma_alloc_attrs(). */
+struct device x86_dma_fallback_dev = {
+ .init_name = "fallback device",
+ .coherent_dma_mask = ISA_DMA_BIT_MASK,
+ .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, /* points at the coherent mask above */
+};
+EXPORT_SYMBOL(x86_dma_fallback_dev);
+
+/* Order the IOMMU table, then run each detect hook and early_init the hits. */
+void __init pci_iommu_alloc(void)
+{
+	struct iommu_table_entry *entry;
+
+	sort_iommu_table(__iommu_table, __iommu_table_end);
+	check_iommu_entries(__iommu_table, __iommu_table_end);
+	for (entry = __iommu_table; entry < __iommu_table_end; entry++) {
+		if (!entry || !entry->detect || entry->detect() <= 0)
+			continue;
+		entry->flags |= IOMMU_DETECTED;
+		if (entry->early_init)
+			entry->early_init();
+		if (entry->flags & IOMMU_FINISH_IF_DETECTED)
+			break;	/* this entry ends the detection pass */
+	}
+}
+
+/* Substitute the fallback device for a NULL *dev; false if it cannot do DMA. */
+bool arch_dma_alloc_attrs(struct device **dev)
+{
+	struct device *target = *dev;
+
+	if (!target)
+		*dev = target = &x86_dma_fallback_dev;
+
+	return is_device_dma_capable(target);
+}
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
+
+/*
+ * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
+ * parameter documentation.  Parses the comma-separated "iommu=" item list;
+ * keywords are matched as prefixes.  Returns 0, or -EINVAL without a value.
+ */
+static __init int iommu_setup(char *p)
+{
+	iommu_merge = 1;
+
+	if (!p)
+		return -EINVAL;
+
+	while (*p) {
+		if (!strncmp(p, "off", 3))
+			no_iommu = 1;
+		/* gart_parse_options has more force support */
+		if (!strncmp(p, "force", 5))
+			force_iommu = 1;
+		if (!strncmp(p, "noforce", 7)) {
+			iommu_merge = 0;
+			force_iommu = 0;
+		}
+
+		if (!strncmp(p, "biomerge", 8)) {
+			iommu_merge = 1;
+			force_iommu = 1;
+		}
+		if (!strncmp(p, "panic", 5))
+			panic_on_overflow = 1;
+		if (!strncmp(p, "nopanic", 7))
+			panic_on_overflow = 0;
+		if (!strncmp(p, "merge", 5)) {
+			iommu_merge = 1;
+			force_iommu = 1;
+		}
+		if (!strncmp(p, "nomerge", 7))
+			iommu_merge = 0;
+		if (!strncmp(p, "forcesac", 8))
+			pr_warn("forcesac option ignored.\n");
+		if (!strncmp(p, "allowdac", 8))
+			pr_warn("allowdac option ignored.\n");
+		if (!strncmp(p, "nodac", 5))
+			pr_warn("nodac option ignored.\n");
+		/* keep parsing: an early return would drop the remaining items */
+		if (!strncmp(p, "usedac", 6))
+			disable_dac_quirk = true;
+#ifdef CONFIG_SWIOTLB
+		if (!strncmp(p, "soft", 4))
+			swiotlb = 1;
+#endif
+		if (!strncmp(p, "pt", 2))
+			iommu_pass_through = 1;
+		if (!strncmp(p, "nopt", 4))
+			iommu_pass_through = 0;
+
+		gart_parse_options(p);
+
+#ifdef CONFIG_CALGARY_IOMMU
+		if (!strncmp(p, "calgary", 7))
+			use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+		p += strcspn(p, ",");
+		if (*p == ',')
+			++p;
+	}
+	return 0;
+}
+early_param("iommu", iommu_setup);
+
+/* Run the platform iommu_init hook, then late_init for each detected entry. */
+static int __init pci_iommu_init(void)
+{
+	struct iommu_table_entry *entry = __iommu_table;
+
+	x86_init.iommu.iommu_init();
+
+	for (; entry < __iommu_table_end; entry++)
+		if (entry && (entry->flags & IOMMU_DETECTED) && entry->late_init)
+			entry->late_init();
+
+	return 0;
+}
+/* Must execute after PCI subsystem */
+rootfs_initcall(pci_iommu_init);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+/* pci_walk_bus() callback used by via_no_dac() for each device it visits. */
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+ pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); /* cap the device's bus DMA mask at 32 bits */
+ return 0; /* always 0; @data is unused */
+}
+
+static void via_no_dac(struct pci_dev *dev)
+{
+	if (disable_dac_quirk)	/* set by "iommu=usedac" */
+		return;
+	dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
+	pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+	PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
+#endif
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 0000000..2e9006c
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-mapping.h>
+#include <asm/iommu_table.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
+
+
+#define DEBUG 1
+
+/* Return the entry in [start, finish) whose detect hook is q->depend, or NULL. */
+static struct iommu_table_entry * __init
+find_dependents_of(struct iommu_table_entry *start,
+		   struct iommu_table_entry *finish,
+		   struct iommu_table_entry *q)
+{
+	struct iommu_table_entry *cur = start;
+
+	if (!q)
+		return NULL;
+
+	while (cur < finish && cur->detect != q->depend)
+		cur++;
+
+	return cur < finish ? cur : NULL;
+}
+
+
+/*
+ * Reorder [start, finish) so the entry a slot depends on does not sit after it.
+ */
+void __init sort_iommu_table(struct iommu_table_entry *start,
+			     struct iommu_table_entry *finish)
+{
+	struct iommu_table_entry *cur, *dep, saved;
+
+	for (cur = start; cur < finish; cur++) {
+		/*
+		 * Compare addresses: while the entry this slot depends on
+		 * lives past the slot, swap the two and re-check the slot.
+		 */
+		while ((dep = find_dependents_of(start, finish, cur)) > cur) {
+			saved = *cur;
+			memmove(cur, dep, sizeof(*cur));
+			*dep = saved;
+		}
+	}
+}
+
+#ifdef DEBUG
+void __init check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish)
+{
+ struct iommu_table_entry *p, *q, *x;
+
+ /* Simple cyclic dependency checker: only finds two-entry cycles (p -> q -> p). */
+ for (p = start; p < finish; p++) {
+ q = find_dependents_of(start, finish, p); /* entry p depends on, or NULL */
+ x = find_dependents_of(start, finish, q); /* entry q depends on; NULL if q is NULL */
+ if (p == x) {
+ printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+ p->detect, q->detect);
+ /* Heavy handed way: drop p's dependency (x == p here). */
+ x->depend = NULL;
+ }
+ }
+ /* Report any entry whose dependency is still located after it in the table. */
+ for (p = start; p < finish; p++) {
+ q = find_dependents_of(p, finish, p);
+ if (q && q > p) {
+ printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+ p->detect, q->detect);
+ }
+ }
+}
+#else /* !DEBUG */
+void __init check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish)
+{
+}
+#endif /* DEBUG */
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
new file mode 100644
index 0000000..71c0b01
--- /dev/null
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/swiotlb.h>
+#include <linux/bootmem.h>
+#include <linux/dma-direct.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/iommu.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/xen/swiotlb-xen.h>
+#include <asm/iommu_table.h>
+
+int swiotlb __read_mostly;
+
+/*
+ * pci_swiotlb_detect_override - set swiotlb to 1 if swiotlb_force says so
+ *
+ * Returns the resulting value of swiotlb, i.e. non-zero if swiotlb use was
+ * forced here or swiotlb had already been set (e.g. by "iommu=soft").
+ */
+int __init pci_swiotlb_detect_override(void)
+{
+ if (swiotlb_force == SWIOTLB_FORCE)
+ swiotlb = 1;
+
+ return swiotlb;
+}
+IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
+ pci_xen_swiotlb_detect,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
+
+/*
+ * If max_possible_pfn exceeds MAX_DMA32_PFN (and iommu=off not set) or if
+ * SME is active then set swiotlb to 1. Returns the value of swiotlb.
+ */
+int __init pci_swiotlb_detect_4gb(void)
+{
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
+
+ /*
+ * If SME is active then swiotlb will be set to 1 so that bounce
+ * buffers are allocated and used for devices that do not support
+ * the addressing range required for the encryption mask.
+ */
+ if (sme_active())
+ swiotlb = 1;
+
+ return swiotlb;
+}
+IOMMU_INIT(pci_swiotlb_detect_4gb,
+ pci_swiotlb_detect_override,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
+
+void __init pci_swiotlb_init(void)
+{
+	if (!swiotlb)	/* swiotlb not selected: leave dma_ops alone */
+		return;
+	swiotlb_init(0);
+	dma_ops = &swiotlb_dma_ops;
+}
+
+void __init pci_swiotlb_late_init(void)
+{
+	if (!swiotlb) {
+		/* An IOMMU turned us off. */
+		swiotlb_exit();
+		return;
+	}
+	printk(KERN_INFO "PCI-DMA: "
+	       "Using software bounce buffering for IO (SWIOTLB)\n");
+	swiotlb_print_info();
+}
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644
index 0000000..4a710ff
--- /dev/null
+++ b/arch/x86/kernel/pcspeaker.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/platform_device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+
+/* Register the "pcspkr" platform device; 0 on success, else the PTR_ERR code. */
+static __init int add_pcspkr(void)
+{
+	struct platform_device *spkr =
+		platform_device_register_simple("pcspkr", -1, NULL, 0);
+
+	return PTR_ERR_OR_ZERO(spkr);
+}
+device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 0000000..c06c4c1
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else /* !CONFIG_X86_32 */
+ /*
+ * The pt_regs struct does not store ds, es, fs, gs in 64 bit mode:
+ * the four slots following SS hold (unsigned int) -1 instead.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif /* CONFIG_X86_64 */
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+	/* An index past the offset table warns once and reads as 0. */
+	if (!WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+		return regs_get_register(regs, pt_regs_offset[idx]);
+	return 0;
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
+
+#ifdef CONFIG_X86_32
+int perf_reg_validate(u64 mask)
+{
+	/* Reject an empty mask and any bit at or above PERF_REG_X86_MAX. */
+	if (mask && !(mask & REG_RESERVED))
+		return 0;
+	return -EINVAL;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32; /* @task is not consulted in the CONFIG_X86_32 build */
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ regs_user->regs = task_pt_regs(current); /* @regs and @regs_user_copy are unused here */
+ regs_user->abi = perf_reg_abi(current); /* PERF_SAMPLE_REGS_ABI_32 in this build */
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+	/*
+	 * Besides an empty mask and reserved bits, the segment registers
+	 * named in REG_NOSUPPORT cannot be requested.
+	 */
+	if (!mask || (mask & (REG_RESERVED | REG_NOSUPPORT)))
+		return -EINVAL;
+	return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+	/* TIF_IA32 on @task selects the 32-bit register ABI. */
+	bool ia32 = test_tsk_thread_flag(task, TIF_IA32);
+
+	return ia32 ? PERF_SAMPLE_REGS_ABI_32 : PERF_SAMPLE_REGS_ABI_64;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ struct pt_regs *user_regs = task_pt_regs(current);
+
+ /*
+ * If we're in an NMI that interrupted task_pt_regs setup, then
+ * we can't sample user regs at all. This check isn't really
+ * sufficient, though, as we could be in an NMI inside an interrupt
+ * that happened during task_pt_regs setup.
+ */
+ if (regs->sp > (unsigned long)&user_regs->r11 &&
+ regs->sp <= (unsigned long)(user_regs + 1)) {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ return;
+ }
+
+ /*
+ * These registers are always saved on 64-bit syscall entry.
+ * On 32-bit entry points, they are saved too except r8..r11.
+ */
+ regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->ax = user_regs->ax;
+ regs_user_copy->cx = user_regs->cx;
+ regs_user_copy->dx = user_regs->dx;
+ regs_user_copy->si = user_regs->si;
+ regs_user_copy->di = user_regs->di;
+ regs_user_copy->r8 = user_regs->r8;
+ regs_user_copy->r9 = user_regs->r9;
+ regs_user_copy->r10 = user_regs->r10;
+ regs_user_copy->r11 = user_regs->r11;
+ regs_user_copy->orig_ax = user_regs->orig_ax;
+ regs_user_copy->flags = user_regs->flags;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ /*
+ * Store user space frame-pointer value on sample
+ * to facilitate stack unwinding for cases when
+ * user space executable code has such support
+ * enabled at compile time:
+ */
+ regs_user_copy->bp = user_regs->bp;
+ /* bx and r12..r15 are not copied from user_regs; they are reported as -1. */
+ regs_user_copy->bx = -1;
+ regs_user_copy->r12 = -1;
+ regs_user_copy->r13 = -1;
+ regs_user_copy->r14 = -1;
+ regs_user_copy->r15 = -1;
+ /*
+ * For this to be at all useful, we need a reasonable guess for
+ * the ABI. Be careful: we're in NMI context, and we're
+ * considering current to be the current task, so we should
+ * be careful not to look at any other percpu variables that might
+ * change during context switches.
+ */
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
+
+ regs_user->regs = regs_user_copy;
+}
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
new file mode 100644
index 0000000..b348a67
--- /dev/null
+++ b/arch/x86/kernel/platform-quirks.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+void __init x86_early_init_platform_quirks(void)
+{
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
+ x86_platform.legacy.rtc = 1;
+ x86_platform.legacy.warm_reset = 1;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.legacy.devices.pnpbios = 1;
+ /* The assignments above are defaults; adjust them per hardware subarch. */
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_PC:
+ x86_platform.legacy.reserve_bios_regions = 1;
+ break;
+ case X86_SUBARCH_XEN:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ break;
+ case X86_SUBARCH_INTEL_MID:
+ case X86_SUBARCH_CE4100:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ break;
+ }
+ /* The optional platform hook runs last, after the subarch adjustments. */
+ if (x86_platform.set_legacy_features)
+ x86_platform.set_legacy_features();
+}
+
+bool __init x86_pnpbios_disabled(void)
+{
+	return !x86_platform.legacy.devices.pnpbios;	/* true when the flag is 0 */
+}
+
+#if defined(CONFIG_PNPBIOS)
+bool __init arch_pnpbios_disabled(void)
+{
+ return x86_pnpbios_disabled(); /* plain forwarder, built only with CONFIG_PNPBIOS */
+}
+#endif /* CONFIG_PNPBIOS */
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 0000000..6b07faa
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+
+static int found(struct resource *res, void *data)
+{
+ return 1; /* unconditional 1 for any resource passed in; @res and @data are ignored */
+}
+
+/* Register the "e820_pmem" platform device when legacy pmem ranges exist. */
+static __init int register_e820_pmem(void)
+{
+	struct platform_device *pdev;
+	int rc;
+
+	rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+				 IORESOURCE_MEM, 0, -1, NULL, found);
+	if (rc <= 0)
+		return 0;
+	/* drivers/nvdimm/e820.c implements it; this only triggers the module load. */
+	pdev = platform_device_alloc("e820_pmem", -1);
+	rc = platform_device_add(pdev);	/* -EINVAL for a NULL pdev */
+	if (rc)
+		platform_device_put(pdev);	/* a failed add leaves the alloc reference */
+	return rc;
+}
+device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
new file mode 100644
index 0000000..ee02863
--- /dev/null
+++ b/arch/x86/kernel/probe_roms.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/mmzone.h>
+#include <linux/ioport.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <linux/export.h>
+
+#include <asm/probe_roms.h>
+#include <asm/pci-direct.h>
+#include <asm/e820/api.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/setup_arch.h>
+
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000, /* also the upper bound of the video ROM scan in probe_roms() */
+ .end = 0, /* find_oprom() stops at the first slot whose end is 0 */
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+/* Does this oprom support @pdev, or any device its bound driver supports? */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+	struct pci_driver *drv = pdev->driver;
+	const struct pci_device_id *entry;
+
+	if (pdev->vendor == vendor && pdev->device == device)
+		return true;
+
+	if (!drv || !drv->id_table)
+		return false;
+	/* the id table ends at the first entry with a zero vendor */
+	for (entry = drv->id_table; entry->vendor; entry++)
+		if (entry->vendor == vendor && entry->device == device)
+			return true;
+	return false;
+}
+
+/* Scan the 16-bit device-ID list at @rom_list (0 or a fault ends it) for a match. */
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+		       const unsigned char *rom_list)
+{
+	unsigned short device;
+
+	for (;; rom_list += 2) {
+		/* an unreadable entry is treated like the terminator */
+		if (probe_kernel_address(rom_list, device) != 0)
+			device = 0;
+
+		if (!device)
+			return false;
+		if (match_id(pdev, vendor, device))
+			return true;
+	}
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const unsigned char *rom;
+
+ if (res->end == 0)
+ break; /* probe_roms() fills slots in order, so the first empty one ends the scan */
+ /* The 16-bit value at rom + 0x18 is the offset of the header holding the IDs. */
+ rom = isa_bus_to_virt(res->start);
+ if (probe_kernel_address(rom + 0x18, offset) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+ /* Otherwise try the device list at +0x8, only honoured for revision (+0xc) >= 3. */
+ if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+ probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+/*
+ * Map the adapter ROM matching @pdev.  Returns NULL when find_oprom() finds
+ * none, otherwise whatever ioremap() returns.
+ */
+void __iomem *pci_map_biosrom(struct pci_dev *pdev)
+{
+	struct resource *rom = find_oprom(pdev);
+	return rom ? ioremap(rom->start, resource_size(rom)) : NULL;
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image); /* @image: presumably a mapping returned by pci_map_biosrom() */
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+ /* Size of the matching adapter ROM resource, or 0 when none matches @pdev. */
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+	unsigned short word;	/* first 16 bits of @rom, if readable */
+	if (probe_kernel_address((const unsigned short *)rom, word) != 0)
+		return 0;
+	return word == ROMSIGNATURE;
+}
+
+/* 1 iff all @length bytes at @rom are readable and sum to 0 modulo 256. */
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char byte, total = 0;
+	for (; length && probe_kernel_address(rom++, byte) == 0; length--)
+		total += byte;
+	return !length && !total;
+}
+
+void __init probe_roms(void)
+{
+ const unsigned char *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ int i;
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start; /* kept even if the length byte below is unreadable */
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+ /* Adapter scan starts at the first 2k boundary past the video ROM, but not below adapter_rom_resources[0].start. */
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = resource_size(&extension_rom_resource);
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+ /* Resume from the 2k block holding the ROM's last byte; the loop step adds 2048. */
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
new file mode 100644
index 0000000..7d31192
--- /dev/null
+++ b/arch/x86/kernel/process.c
@@ -0,0 +1,858 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/prctl.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/pm.h>
+#include <linux/tick.h>
+#include <linux/random.h>
+#include <linux/user-return-notifier.h>
+#include <linux/dmi.h>
+#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/cpuidle.h>
+#include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/syscalls.h>
+#include <linux/uaccess.h>
+#include <asm/mwait.h>
+#include <asm/fpu/internal.h>
+#include <asm/debugreg.h>
+#include <asm/nmi.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
+
+#include "process.h"
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+ .x86_tss = {
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+ /*
+ * .sp1 is cpu_current_top_of_stack. The init task never
+ * runs user code, but cpu_current_top_of_stack should still
+ * be well defined before the first context switch.
+ */
+ .sp1 = TOP_OF_INIT_STACK,
+
+#ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+#endif /* CONFIG_X86_32 */
+ },
+#ifdef CONFIG_X86_32
+ /*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif /* CONFIG_X86_32 */
+};
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
+
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
+
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread. Returns the result of fpu__copy().
+ */
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+ memcpy(dst, src, arch_task_struct_size);
+#ifdef CONFIG_VM86
+ dst->thread.vm86 = NULL; /* do not keep the pointer value copied from @src */
+#endif /* CONFIG_VM86 */
+
+ return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
+}
+
+/*
+ * Free @tsk's thread data structures: I/O bitmap, vm86 state and FPU state.
+ */
+void exit_thread(struct task_struct *tsk)
+{
+ struct thread_struct *t = &tsk->thread;
+ unsigned long *bp = t->io_bitmap_ptr;
+ struct fpu *fpu = &t->fpu;
+
+ if (bp) {
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
+
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP); /* NOTE(review): acts on current, not @tsk — presumably tsk == current here */
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+ t->io_bitmap_max = 0;
+ put_cpu(); /* pairs with get_cpu() above */
+ kfree(bp);
+ }
+
+ free_vm86(t);
+
+ fpu__drop(fpu);
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+ flush_ptrace_hw_breakpoint(tsk);
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ /* With breakpoints flushed and the TLS array zeroed, clear the FPU state of current. */
+ fpu__clear(&tsk->thread.fpu);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ cr4_set_bits(X86_CR4_TSD); /* only when TIF_NOTSC was not already set */
+ preempt_enable();
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ cr4_clear_bits(X86_CR4_TSD); /* only when TIF_NOTSC was set before */
+ preempt_enable();
+}
+
+/* Store PR_TSC_SIGSEGV or PR_TSC_ENABLE, per TIF_NOTSC, to user address @adr. */
+int get_tsc_mode(unsigned long adr)
+{
+	unsigned int __user *uaddr = (unsigned int __user *)adr;
+	unsigned int mode;
+
+	mode = test_thread_flag(TIF_NOTSC) ? PR_TSC_SIGSEGV : PR_TSC_ENABLE;
+
+	/* put_user() result is handed back unchanged */
+	return put_user(mode, uaddr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+	if (val != PR_TSC_SIGSEGV && val != PR_TSC_ENABLE)
+		return -EINVAL;
+	/* only the two PR_TSC_* modes get here */
+	if (val == PR_TSC_SIGSEGV)
+		disable_TSC();
+	else
+		enable_TSC();
+	return 0;
+}
+
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
+
+static void set_cpuid_faulting(bool on)
+{
+ u64 msrval;
+ /* Set or clear the CPUID-fault enable bit in this CPU's shadow value and in the MSR. */
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval); /* shadow gets the same value as the MSR */
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+}
+
+static void disable_cpuid(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(true); /* only when TIF_NOCPUID was not already set */
+ }
+ preempt_enable();
+}
+
+static void enable_cpuid(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(false); /* only when TIF_NOCPUID was set before */
+ }
+ preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+ return !test_thread_flag(TIF_NOCPUID); /* 1 when TIF_NOCPUID is clear, 0 when it is set */
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+{
+ if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+ return -ENODEV;
+ /* NOTE(review): @task is unused; the helpers below use the *_thread_flag() calls — presumably on current. */
+ if (cpuid_enabled)
+ enable_cpuid();
+ else
+ disable_cpuid();
+
+ return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+ /* If cpuid was previously disabled for this task, re-enable it. */
+ if (test_thread_flag(TIF_NOCPUID))
+ enable_cpuid(); /* clears TIF_NOCPUID and turns CPUID faulting off */
+}
+
+static inline void switch_to_bitmap(struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long tifp, unsigned long tifn)
+{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ /* Load next's I/O bitmap into this CPU's TSS, or reset prev's range to 0xff if next has none. */
+ if (tifn & _TIF_IO_BITMAP) {
+ /*
+ * Copy the relevant range of the IO bitmap.
+ * Normally this is 128 bytes or less:
+ */
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ /*
+ * Make sure that the TSS limit is correct for the CPU
+ * to notice the IO bitmap.
+ */
+ refresh_tss_limit();
+ } else if (tifp & _TIF_IO_BITMAP) {
+ /*
+ * Clear any possible leftover bits:
+ */
+ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+ struct ssb_state *shared_state; /* state shared by the HT siblings of a core */
+ raw_spinlock_t lock; /* taken around disable_state updates */
+ unsigned int disable_state; /* count of siblings that currently have SSBD set */
+ unsigned long local_state; /* per-CPU bits, see LSTATE_SSB */
+};
+
+#define LSTATE_SSB 0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ st->local_state = 0; /* clears all local bits, LSTATE_SSB included */
+
+ /*
+ * Shared state setup happens once on the first bringup
+ * of the CPU. It's not destroyed on CPU hotunplug.
+ */
+ if (st->shared_state)
+ return;
+
+ raw_spin_lock_init(&st->lock);
+
+ /*
+ * Go over HT siblings and check whether one of them has set up the
+ * shared state pointer already.
+ */
+ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+ if (cpu == this_cpu)
+ continue;
+
+ if (!per_cpu(ssb_state, cpu).shared_state)
+ continue;
+
+ /* Link it to the state of the sibling: */
+ st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+ return;
+ }
+
+ /*
+ * First HT sibling to come up on the core. Link shared state of
+ * the first HT sibling to itself. The siblings on the same core
+ * which come up later will see the shared state pointer and link
+ * themselves to the state of this CPU.
+ */
+ st->shared_state = st;
+}
+
+/*
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and last sibling to disable it, disables it for the whole core. This is how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
+ */
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ u64 msr = x86_amd_ls_cfg_base;
+
+ if (!static_cpu_has(X86_FEATURE_ZEN)) { /* no sibling accounting: write LS_CFG directly */
+ msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ return;
+ }
+
+ if (tifn & _TIF_SSBD) {
+ /*
+ * Since this can race with prctl(), block reentry on the
+ * same CPU.
+ */
+ if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ msr |= x86_amd_ls_cfg_ssbd_mask;
+
+ raw_spin_lock(&st->shared_state->lock);
+ /* First sibling enables SSBD: */
+ if (!st->shared_state->disable_state)
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ st->shared_state->disable_state++;
+ raw_spin_unlock(&st->shared_state->lock);
+ } else {
+ if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) /* was not set from this CPU */
+ return;
+
+ raw_spin_lock(&st->shared_state->lock);
+ st->shared_state->disable_state--;
+ if (!st->shared_state->disable_state) /* last sibling: write the base value back */
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ raw_spin_unlock(&st->shared_state->lock);
+ }
+}
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+	/* !CONFIG_SMP variant: LS_CFG is written directly from the TIF bits. */
+	wrmsrl(MSR_AMD64_LS_CFG,
+	       x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn));
+}
+#endif
+
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
+{
+ /*
+ * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, so the
+ * ssbd_tif_to_spec_ctrl() result is written to VIRT_SPEC_CTRL as is.
+ */
+ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
+
+/*
+ * Update the MSRs managing speculation control, during context switch.
+ *
+ * tifp: Previous task's thread flags
+ * tifn: Next task's thread flags
+ */
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+						      unsigned long tifn)
+{
+	unsigned long tif_diff = tifp ^ tifn;
+	u64 msr = x86_spec_ctrl_base;
+	bool updmsr = false;
+
+	/*
+	 * Handle a change of TIF_SSBD depending on the mitigation method.
+	 * With SPEC_CTRL based SSBD the next task's bit is always folded
+	 * into msr: the STIBP handling below may trigger the MSR write on
+	 * its own, and that write must not drop a task's SSBD bit.
+	 */
+	if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+		if (tif_diff & _TIF_SSBD)
+			amd_set_ssb_virt_state(tifn);
+	} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+		if (tif_diff & _TIF_SSBD)
+			amd_set_core_ssb_state(tifn);
+	} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+		   static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+		updmsr |= !!(tif_diff & _TIF_SSBD);
+		msr |= ssbd_tif_to_spec_ctrl(tifn);
+	}
+
+	/*
+	 * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
+	 * otherwise avoid the MSR write.
+	 */
+	if (IS_ENABLED(CONFIG_SMP) &&
+	    static_branch_unlikely(&switch_to_cond_stibp)) {
+		updmsr |= !!(tif_diff & _TIF_SPEC_IB);
+		msr |= stibp_tif_to_spec_ctrl(tifn);
+	}
+
+	if (updmsr)
+		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+}
+
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
+{
+	/* Resync TIF_SSBD/TIF_SPEC_IB only when a forced update was pending */
+	if (!test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE))
+		return task_thread_info(tsk)->flags;
+
+	if (task_spec_ssb_disable(tsk))
+		set_tsk_thread_flag(tsk, TIF_SSBD);
+	else
+		clear_tsk_thread_flag(tsk, TIF_SSBD);
+	if (task_spec_ib_disable(tsk))
+		set_tsk_thread_flag(tsk, TIF_SPEC_IB);
+	else
+		clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
+	return task_thread_info(tsk)->flags;	/* the updated flags */
+}
+
+void speculation_ctrl_update(unsigned long tif)
+{
+ /* Forced update: passing ~tif as previous flags makes every TIF bit differ */
+ preempt_disable();
+ __speculation_ctrl_update(~tif, tif);
+ preempt_enable();
+}
+
+/* Called from seccomp/prctl update: resync current's TIF flags, then force the MSR update */
+void speculation_ctrl_update_current(void)
+{
+ preempt_disable();
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
+ preempt_enable();
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev, *next;
+ unsigned long tifp, tifn;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ tifn = READ_ONCE(task_thread_info(next_p)->flags); /* snapshot of next's TIF flags */
+ tifp = READ_ONCE(task_thread_info(prev_p)->flags); /* snapshot of prev's TIF flags */
+ switch_to_bitmap(prev, next, tifp, tifn);
+
+ propagate_user_return_notify(prev_p, next_p);
+
+ if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+ arch_has_block_step()) {
+ unsigned long debugctl, msk;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ msk = tifn & _TIF_BLOCKSTEP;
+ debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; /* BTF follows next's TIF_BLOCKSTEP */
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
+ if ((tifp ^ tifn) & _TIF_NOTSC) /* flag differs between prev and next */
+ cr4_toggle_bits_irqsoff(X86_CR4_TSD);
+
+ if ((tifp ^ tifn) & _TIF_NOCPUID)
+ set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
+ __speculation_ctrl_update(tifp, tifn);
+ } else {
+ speculation_ctrl_update_tif(prev_p);
+ tifn = speculation_ctrl_update_tif(next_p);
+
+ /* Enforce MSR update to ensure consistent state */
+ __speculation_ctrl_update(~tifn, tifn);
+ }
+}
+
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; /* changed by the "idle=" boot parameter */
+EXPORT_SYMBOL(boot_option_idle_override);
+
+static void (*x86_idle)(void); /* idle routine invoked by arch_cpu_idle() */
+
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
+{
+ BUG(); /* !CONFIG_SMP stub: reaching this is treated as a bug */
+}
+#endif
+
+void arch_cpu_idle_enter(void)
+{
+ tsc_verify_tsc_adjust(false); /* NOTE(review): meaning of the 'false' argument not visible here - confirm */
+ local_touch_nmi();
+}
+
+void arch_cpu_idle_dead(void)
+{
+ play_dead(); /* on !CONFIG_SMP this is the BUG() stub in this file */
+}
+
+/*
+ * Called from the generic idle code; dispatches to the x86_idle routine.
+ */
+void arch_cpu_idle(void)
+{
+ x86_idle();
+}
+
+/*
+ * Fallback idle routine, used if there is no better one: halts via safe_halt().
+ */
+void __cpuidle default_idle(void)
+{
+ trace_cpu_idle_rcuidle(1, smp_processor_id()); /* trace idle entry */
+ safe_halt();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); /* trace idle exit */
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
+{
+	/* Report whether an idle routine had been installed before this call. */
+	const bool was_set = x86_idle != NULL;
+
+	x86_idle = default_idle;
+	return was_set;
+}
+#endif
+
+void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU from the online mask and disable its local APIC:
+ */
+ set_cpu_online(smp_processor_id(), false);
+ disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+
+ /*
+ * Use wbinvd on processors that support SME. This provides support
+ * for performing a successful kexec when going from SME inactive
+ * to SME active (or vice-versa). The cache must be cleared so that
+ * if there are entries with the same physical address, both with and
+ * without the encryption bit, they don't race each other when flushed
+ * and potentially end up with the wrong entry being committed to
+ * memory.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME))
+ native_wbinvd();
+ for (;;) { /* never returns */
+ /*
+ * Use native_halt() so that memory contents don't change
+ * (stack usage and variables) after possibly issuing the
+ * native_wbinvd() above.
+ */
+ native_halt();
+ }
+}
+
+/*
+ * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
+ * states (local apic timer and TSC stop).
+ */
+static void amd_e400_idle(void)
+{
+ /*
+ * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
+ * gets set after static_cpu_has() places have been converted via
+ * alternatives.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ default_idle(); /* bug not set: plain default idle, no broadcast handling */
+ return;
+ }
+
+ tick_broadcast_enter();
+
+ default_idle();
+
+ /*
+ * The switch back from broadcast mode needs to be called with
+ * interrupts disabled.
+ */
+ local_irq_disable();
+ tick_broadcast_exit();
+ local_irq_enable();
+}
+
+/*
+ * Decide whether MWAIT should serve as the default C1 idle method.
+ *
+ * Intel Core2 and older machines prefer MWAIT over HALT for C1, and cpuidle
+ * cannot be relied on to install MWAIT because it does not load on systems
+ * that support only C1. Some AMD machines are the opposite and depend on
+ * HALT. Hence, for the default C1 used during boot until cpuidle loads:
+ * MWAIT-C1 on Intel HW that has it (and lacks the MONITOR bug), else HALT.
+ * Returns 1 to prefer MWAIT, 0 to prefer HALT.
+ */
+static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+{
+	const bool is_intel = c->x86_vendor == X86_VENDOR_INTEL;
+
+	if (is_intel && cpu_has(c, X86_FEATURE_MWAIT) &&
+	    !static_cpu_has_bug(X86_BUG_MONITOR))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation. The monitored word is current's thread flags.
+ */
+static __cpuidle void mwait_idle(void)
+{
+	if (!current_set_polling_and_test()) {
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
+		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
+			mb(); /* quirk */
+			clflush((void *)&current_thread_info()->flags);
+			mb(); /* quirk */
+		}
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		if (!need_resched())
+			__sti_mwait(0, 0);
+		else
+			local_irq_enable();
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	} else {
+		local_irq_enable();
+	}
+	__current_clr_polling();
+}
+
+void select_idle_routine(const struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
+#endif
+ if (x86_idle || boot_option_idle_override == IDLE_POLL) /* routine already set, or polling requested */
+ return;
+
+ if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
+ pr_info("using AMD E400 aware idle routine\n");
+ x86_idle = amd_e400_idle;
+ } else if (prefer_mwait_c1_over_halt(c)) {
+ pr_info("using mwait in idle threads\n");
+ x86_idle = mwait_idle;
+ } else
+ x86_idle = default_idle; /* fallback when neither case above applies */
+}
+
+void amd_e400_c1e_apic_setup(void)
+{
+	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))	/* bug flag not set */
+		return;
+	pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+	local_irq_disable();
+	tick_broadcast_force();	/* called with interrupts disabled */
+	local_irq_enable();
+}
+
+void __init arch_post_acpi_subsys_init(void)
+{
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+ * AMD E400 detection needs to happen after ACPI has been enabled. If
+ * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
+ * MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return; /* mask bits clear: machine not affected */
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ pr_info("System has AMD C1E enabled\n");
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "poll")) {
+ pr_info("using polling idle threads\n");
+ boot_option_idle_override = IDLE_POLL;
+ cpu_idle_poll_ctrl(true);
+ } else if (!strcmp(str, "halt")) {
+ /*
+ * When the boot option of idle=halt is added, halt is
+ * forced to be used for CPU idle: default_idle is
+ * installed as the idle routine right here and the
+ * override is recorded as IDLE_HALT.
+ * NOTE(review): effect on the CPU idle driver - confirm.
+ */
+ x86_idle = default_idle;
+ boot_option_idle_override = IDLE_HALT;
+ } else if (!strcmp(str, "nomwait")) {
+ /*
+ * If the boot option of "idle=nomwait" is added,
+ * it means that mwait will be disabled for CPU C2/C3
+ * states. Only boot_option_idle_override is set to
+ * IDLE_NOMWAIT here; x86_idle is left untouched.
+ */
+ boot_option_idle_override = IDLE_NOMWAIT;
+ } else
+ return -1; /* unrecognized idle= value */
+
+ return 0;
+}
+early_param("idle", idle_setup);
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192; /* move sp down by a random 0..8191 bytes */
+ return sp & ~0xf; /* clear the low four bits: 16-byte aligned result */
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ return randomize_page(mm->brk, 0x02000000); /* range argument is 32 MiB, starting at mm->brk */
+}
+
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long start, bottom, top, sp, fp, ip, ret = 0;
+ int count = 0;
+
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+
+ if (!try_get_task_stack(p))
+ return 0;
+
+ start = (unsigned long)task_stack_page(p);
+ if (!start)
+ goto out;
+
+ /*
+ * Layout of the stack page:
+ *
+ * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
+ * PADDING
+ * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
+ * stack
+ * ----------- bottom = start
+ *
+ * The task's stack pointer points at the location where the
+ * framepointer is stored. The data on the stack is:
+ * ... IP FP ... IP FP
+ *
+ * We need to read FP and IP, so we need to adjust the upper
+ * bound by another unsigned long.
+ */
+ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
+ top -= 2 * sizeof(unsigned long);
+ bottom = start;
+
+ sp = READ_ONCE(p->thread.sp);
+ if (sp < bottom || sp > top)
+ goto out;
+
+ fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
+ do {
+ if (fp < bottom || fp > top)
+ goto out;
+ ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
+ if (!in_sched_functions(ip)) { /* first IP outside the scheduler is the result */
+ ret = ip;
+ goto out;
+ }
+ fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
+ } while (count++ < 16 && p->state != TASK_RUNNING); /* bounded walk, stops once the task runs */
+
+out:
+ put_task_stack(p);
+ return ret;
+}
+
+long do_arch_prctl_common(struct task_struct *task, int option,
+			  unsigned long cpuid_enabled)
+{
+	/* Only the two CPUID related options are handled here. */
+	if (option == ARCH_GET_CPUID)
+		return get_cpuid_mode();
+	if (option == ARCH_SET_CPUID)
+		return set_cpuid_mode(task, cpuid_enabled);
+
+	/* Every other option is rejected. */
+	return -EINVAL;
+}
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
new file mode 100644
index 0000000..898e97c
--- /dev/null
+++ b/arch/x86/kernel/process.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Code shared between 32 and 64 bit
+
+#include <asm/spec-ctrl.h>
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
+
+/*
+ * This needs to be inline to optimize for the common case where no extra
+ * work needs to be done.
+ */
+static inline void switch_to_extra(struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long prev_tif = task_thread_info(prev)->flags;
+
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /*
+ * Avoid __switch_to_xtra() invocation when conditional
+ * STIBP is disabled and the only different bit is
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
+ * in the TIF_WORK_CTXSW masks.
+ */
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
+ prev_tif &= ~_TIF_SPEC_IB; /* masked in the local copies only */
+ next_tif &= ~_TIF_SPEC_IB;
+ }
+ }
+
+ /*
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
+ * speculation mitigations etc.
+ */
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
+ prev_tif & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev, next);
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644
index 0000000..d3e593e
--- /dev/null
+++ b/arch/x86/kernel/process_32.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling.
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/mc146818rtc.h>
+#include <linux/export.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/personality.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kdebug.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+#include <asm/desc.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+#include <asm/syscalls.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/vm86.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/proto.h>
+
+#include "process.h"
+
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned long sp;
+ unsigned short ss, gs;
+
+ if (user_mode(regs)) { /* user mode: sp/ss/gs come from the saved regs */
+ sp = regs->sp;
+ ss = regs->ss;
+ gs = get_user_gs(regs);
+ } else { /* otherwise: kernel stack pointer and the live segment registers */
+ sp = kernel_stack_pointer(regs);
+ savesegment(ss, ss);
+ savesegment(gs, gs);
+ }
+
+ show_ip(regs, KERN_DEFAULT);
+
+ printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ regs->ax, regs->bx, regs->cx, regs->dx);
+ printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ regs->si, regs->di, regs->bp, sp);
+ printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
+
+ if (mode != SHOW_REGS_ALL) /* control and debug registers only for SHOW_REGS_ALL */
+ return;
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+ printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ cr0, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))
+ return;
+
+ printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ d0, d1, d2, d3);
+ printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
+ d6, d7);
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+ BUG_ON(dead_task->mm); /* a dead task still owning an mm is treated as a bug */
+ release_vm86_irqs(dead_task);
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+ struct pt_regs *childregs = task_pt_regs(p);
+ struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+ struct inactive_task_frame *frame = &fork_frame->frame;
+ struct task_struct *tsk;
+ int err;
+
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.sp0 = (unsigned long) (childregs+1);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ frame->bx = sp; /* function */
+ frame->di = arg;
+ p->thread.io_bitmap_ptr = NULL;
+ return 0;
+ }
+ frame->bx = 0;
+ *childregs = *current_pt_regs(); /* child starts from a copy of the parent's user regs */
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+ task_user_gs(p) = get_user_gs(current_pt_regs());
+
+ p->thread.io_bitmap_ptr = NULL;
+ tsk = current;
+ err = -ENOMEM; /* overwritten before use: the failure path below returns -ENOMEM directly */
+
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ err = 0;
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS)
+ err = do_set_thread_area(p, -1,
+ (struct user_desc __user *)tls, 0);
+
+ if (err && p->thread.io_bitmap_ptr) { /* TLS setup failed: drop the copied IO bitmap */
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+ return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ set_user_gs(regs, 0);
+ regs->fs = 0;
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+ regs->ip = new_ip; /* caller-supplied instruction pointer */
+ regs->sp = new_sp; /* caller-supplied stack pointer */
+ regs->flags = X86_EFLAGS_IF; /* only IF is set in the new flags */
+ force_iret();
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPUs, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %ax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+__visible __notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
+ int cpu = smp_processor_id();
+
+ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+
+ switch_fpu_prepare(prev_fpu, cpu);
+
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel. Doing this
+ * before setting the new TLS descriptors avoids the situation
+ * where we temporarily have non-reloadable segments in %fs
+ * and %gs. This could be an issue if the NMI handler ever
+ * used %fs or %gs (it does not today), or if the kernel is
+ * running inside of a hypervisor layer.
+ */
+ lazy_save_gs(prev->gs);
+
+ /*
+ * Load the per-thread Thread-Local Storage descriptor.
+ */
+ load_TLS(next, cpu);
+
+ /*
+ * Restore IOPL if needed. In normal use, the flags restore
+ * in the switch assembly will handle this. But if the kernel
+ * is running virtualized at a non-zero CPL, the popf will
+ * not restore flags, so it must be done in a separate step.
+ */
+ if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+ set_iopl_mask(next->iopl);
+
+ switch_to_extra(prev_p, next_p); /* TIF-driven extra work, see process.h */
+
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before fpu__restore(), so the TS bit is up
+ * to date.
+ */
+ arch_end_context_switch(next_p);
+
+ /*
+ * Reload esp0 and cpu_current_top_of_stack. This changes
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
+ */
+ update_task_stack(next_p);
+ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
+
+ /*
+ * Restore %gs if needed (which is common)
+ */
+ if (prev->gs | next->gs)
+ lazy_load_gs(next->gs);
+
+ switch_fpu_finish(next_fpu, cpu);
+
+ this_cpu_write(current_task, next_p);
+
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
+ return prev_p; /* the "prev" task, as described in the header comment */
+}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, option, arg2); /* only the common options are handled here */
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
new file mode 100644
index 0000000..a0854f2
--- /dev/null
+++ b/arch/x86/kernel/process_64.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * X86-64 port
+ * Andi Kleen.
+ *
+ * CPU hotplug support - ashok.raj@intel.com
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling.
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/ftrace.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+#include <asm/mmu_context.h>
+#include <asm/prctl.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/ia32.h>
+#include <asm/syscalls.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/vdso.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/unistd.h>
+#ifdef CONFIG_IA32_EMULATION
+/* Not included via unistd.h */
+#include <asm/unistd_32_ia32.h>
+#endif
+
+#include "process.h"
+
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
+
+/* Also prints some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned int fsindex, gsindex;
+ unsigned int ds, cs, es;
+
+ show_iret_regs(regs);
+
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
+ printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ regs->ax, regs->bx, regs->cx);
+ printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ regs->dx, regs->si, regs->di);
+ printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
+ regs->bp, regs->r8, regs->r9);
+ printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ if (mode == SHOW_REGS_SHORT) /* short mode stops after the general purpose registers */
+ return;
+
+ if (mode == SHOW_REGS_USER) { /* user mode: only the FS base and the shadow GS base follow */
+ rdmsrl(MSR_FS_BASE, fs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n",
+ fs, shadowgs);
+ return;
+ }
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
+ asm("movl %%fs,%0" : "=r" (fsindex));
+ asm("movl %%gs,%0" : "=r" (gsindex));
+
+ rdmsrl(MSR_FS_BASE, fs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+
+ printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs, fsindex, gs, gsindex, shadowgs);
+ printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ es, cr0);
+ printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
+ d0, d1, d2);
+ printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
+ d3, d6, d7);
+ }
+
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+	struct mm_struct *mm = dead_task->mm;
+
+	if (!mm)		/* no mm attached: nothing to check */
+		return;
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	if (mm->context.ldt) {
+		pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm,
+			mm->context.ldt->entries, mm->context.ldt->nr_entries);
+		BUG();
+	}
+#endif
+}
+
+enum which_selector { /* selects which segment register a helper operates on */
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) { /* intentionally empty: the saved base is kept as is */
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0; /* nonzero selector: the saved base is reset */
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex); /* store the current FS selector */
+ savesegment(gs, task->thread.gsindex); /* store the current GS selector */
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+/*
+ * While a process is running, current->thread.fsbase and current->thread.gsbase
+ * may not match the corresponding CPU registers (see save_base_legacy()). KVM
+ * wants an efficient way to save and restore FSBASE and GSBASE.
+ * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
+ */
+void save_fsgs_for_kvm(void)
+{
+ save_fsgs(current);
+}
+EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
+#endif
+
+static __always_inline void loadseg(enum which_selector which,
+				    unsigned short sel)
+{
+	if (which != FS)	/* i.e. GS: goes through load_gs_index() */
+		load_gs_index(sel);
+	else
+		loadsegment(fs, sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, /* GS base is written to MSR_KERNEL_GS_BASE */
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+ int err;
+ struct pt_regs *childregs;
+ struct fork_frame *fork_frame;
+ struct inactive_task_frame *frame;
+ struct task_struct *me = current;
+
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.io_bitmap_ptr = NULL;
+
+ savesegment(gs, p->thread.gsindex);
+ p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase; /* parent's base is inherited only with a zero selector */
+ savesegment(fs, p->thread.fsindex);
+ p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ frame->bx = sp; /* function */
+ frame->r12 = arg;
+ return 0;
+ }
+ frame->bx = 0;
+ *childregs = *current_pt_regs(); /* child starts from a copy of the parent's user regs */
+
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+ err = -ENOMEM; /* overwritten before use: the failure path below returns -ENOMEM directly */
+ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+ if (in_ia32_syscall())
+ err = do_set_thread_area(p, -1,
+ (struct user_desc __user *)tls, 0);
+ else
+#endif
+ err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
+ if (err)
+ goto out;
+ }
+ err = 0;
+out:
+ if (err && p->thread.io_bitmap_ptr) { /* TLS setup failed: drop the copied IO bitmap */
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+
+ return err;
+}
+
+static void
+start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp,
+ unsigned int _cs, unsigned int _ss, unsigned int _ds)
+{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
+ loadsegment(fs, 0);
+ loadsegment(es, _ds);
+ loadsegment(ds, _ds);
+ load_gs_index(0);
+
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->cs = _cs;
+ regs->ss = _ss;
+ regs->flags = X86_EFLAGS_IF;
+ force_iret();
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ __USER_CS, __USER_DS, 0);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ test_thread_flag(TIF_X32)
+ ? __USER_CS : __USER32_CS,
+ __USER_DS, __USER_DS);
+}
+#endif
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
+ * The function graph tracer is not supported either.
+ */
+__visible __notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
+ int cpu = smp_processor_id();
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
+
+ switch_fpu_prepare(prev_fpu, cpu);
+
+ /* We must save %fs and %gs before load_TLS() because
+ * %fs and %gs may be cleared by load_TLS().
+ *
+ * (e.g. xen_load_tls())
+ */
+ save_fsgs(prev_p);
+
+ /*
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
+ */
+ load_TLS(next, cpu);
+
+ /*
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them, and it must
+ * be done before fpu__restore(), so the TS bit is up to
+ * date.
+ */
+ arch_end_context_switch(next_p);
+
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
+ */
+ savesegment(es, prev->es);
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+ savesegment(ds, prev->ds);
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+
+ switch_fpu_finish(next_fpu, cpu);
+
+ /*
+ * Switch the PDA and FPU contexts.
+ */
+ this_cpu_write(current_task, next_p);
+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+
+ /* Reload sp0. */
+ update_task_stack(next_p);
+
+ switch_to_extra(prev_p, next_p);
+
+#ifdef CONFIG_XEN_PV
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+ /*
+ * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+ * does not update the cached descriptor. As a result, if we
+ * do SYSRET while SS is NULL, we'll end up in user mode with
+ * SS apparently equal to __USER_DS but actually unusable.
+ *
+ * The straightforward workaround would be to fix it up just
+ * before SYSRET, but that would slow down the system call
+ * fast paths. Instead, we ensure that SS is never NULL in
+ * system call context. We do this by replacing NULL SS
+ * selectors at every context switch. SYSCALL sets up a valid
+ * SS, so the only way to get NULL is to re-enter the kernel
+ * from CPL 3 through an interrupt. Since that can't happen
+ * in the same task as a running syscall, we are guaranteed to
+ * context switch between every interrupt vector entry and a
+ * subsequent SYSRET.
+ *
+ * We read SS first because SS reads are much faster than
+ * writes. Out of caution, we force SS to __KERNEL_DS even if
+ * it previously had a different non-NULL value.
+ */
+ unsigned short ss_sel;
+ savesegment(ss, ss_sel);
+ if (ss_sel != __KERNEL_DS)
+ loadsegment(ss, __KERNEL_DS);
+ }
+
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
+ return prev_p;
+}
+
+void set_personality_64bit(void)
+{
+ /* inherit personality from parent */
+
+ /* Make sure to be in 64bit mode */
+ clear_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ADDR32);
+ clear_thread_flag(TIF_X32);
+ /* Pretend that this comes from a 64bit execve */
+ task_pt_regs(current)->orig_ax = __NR_execve;
+ current_thread_info()->status &= ~TS_COMPAT;
+
+ /* Ensure the corresponding mm is not marked. */
+ if (current->mm)
+ current->mm->context.ia32_compat = 0;
+
+ /* TBD: overwrites user setup. Should have two bits.
+ But 64bit processes have always behaved this way,
+ so it's not too bad. The main problem is just that
+ 32bit children are affected again. */
+ current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+static void __set_personality_x32(void)
+{
+#ifdef CONFIG_X86_X32
+ clear_thread_flag(TIF_IA32);
+ set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /*
+ * in_compat_syscall() uses the presence of the x32 syscall bit
+ * flag to determine compat status. The x86 mmap() code relies on
+ * the syscall bitness so set x32 syscall bit right here to make
+ * in_compat_syscall() work during exec().
+ *
+ * Pretend to come from a x32 execve.
+ */
+ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
+ current_thread_info()->status &= ~TS_COMPAT;
+#endif
+}
+
+static void __set_personality_ia32(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ set_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
+ current->personality |= force_personality32;
+ /* Prepare the first "return" to user space */
+ task_pt_regs(current)->orig_ax = __NR_ia32_execve;
+ current_thread_info()->status |= TS_COMPAT;
+#endif
+}
+
+void set_personality_ia32(bool x32)
+{
+ /* Make sure to be in 32bit mode */
+ set_thread_flag(TIF_ADDR32);
+
+ if (x32)
+ __set_personality_x32();
+ else
+ __set_personality_ia32();
+}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
+{
+ int ret = 0;
+ int doit = task == current;
+ int cpu;
+
+ switch (option) {
+ case ARCH_SET_GS:
+ if (arg2 >= TASK_SIZE_MAX)
+ return -EPERM;
+ cpu = get_cpu();
+ task->thread.gsindex = 0;
+ task->thread.gsbase = arg2;
+ if (doit) {
+ load_gs_index(0);
+ ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
+ }
+ put_cpu();
+ break;
+ case ARCH_SET_FS:
+ /* Not strictly needed for fs, but do it for symmetry
+ with gs */
+ if (arg2 >= TASK_SIZE_MAX)
+ return -EPERM;
+ cpu = get_cpu();
+ task->thread.fsindex = 0;
+ task->thread.fsbase = arg2;
+ if (doit) {
+ /* set the selector to 0 to not confuse __switch_to */
+ loadsegment(fs, 0);
+ ret = wrmsrl_safe(MSR_FS_BASE, arg2);
+ }
+ put_cpu();
+ break;
+ case ARCH_GET_FS: {
+ unsigned long base;
+
+ if (doit)
+ rdmsrl(MSR_FS_BASE, base);
+ else
+ base = task->thread.fsbase;
+ ret = put_user(base, (unsigned long __user *)arg2);
+ break;
+ }
+ case ARCH_GET_GS: {
+ unsigned long base;
+
+ if (doit)
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = task->thread.gsbase;
+ ret = put_user(base, (unsigned long __user *)arg2);
+ break;
+ }
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, arg2);
+# endif
+# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, arg2);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, arg2);
+#endif
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ long ret;
+
+ ret = do_arch_prctl_64(current, option, arg2);
+ if (ret == -EINVAL)
+ ret = do_arch_prctl_common(current, option, arg2);
+
+ return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, option, arg2);
+}
+#endif
+
+unsigned long KSTK_ESP(struct task_struct *task)
+{
+ return task_pt_regs(task)->sp;
+}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
new file mode 100644
index 0000000..e2ee403
--- /dev/null
+++ b/arch/x86/kernel/ptrace.c
@@ -0,0 +1,1401 @@
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
+#include <asm/syscall.h>
+
+#include "tls.h"
+
+enum x86_regset {
+ REGSET_GENERAL,
+ REGSET_FP,
+ REGSET_XFP,
+ REGSET_IOPERM64 = REGSET_XFP,
+ REGSET_XSTATE,
+ REGSET_TLS,
+ REGSET_IOPERM32,
+};
+
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL.
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL.
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ */
+#define FLAG_MASK_32 ((unsigned long) \
+ (X86_EFLAGS_CF | X86_EFLAGS_PF | \
+ X86_EFLAGS_AF | X86_EFLAGS_ZF | \
+ X86_EFLAGS_SF | X86_EFLAGS_TF | \
+ X86_EFLAGS_DF | X86_EFLAGS_OF | \
+ X86_EFLAGS_RF | X86_EFLAGS_AC))
+
+/*
+ * Determines whether a value may be installed in a segment register.
+ */
+static inline bool invalid_selector(u16 value)
+{
+ return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
+}
+
+#ifdef CONFIG_X86_32
+
+#define FLAG_MASK FLAG_MASK_32
+
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+	unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+	unsigned long sp = (unsigned long)&regs->sp;
+	u32 *prev_esp;
+	/* &regs->sp lies in the same THREAD_SIZE-aligned area as regs: use it. */
+	if (context == (sp & ~(THREAD_SIZE - 1)))
+		return sp;
+	/* Otherwise try the word stored at the base of that aligned area. */
+	prev_esp = (u32 *)(context);
+	if (*prev_esp)
+		return (unsigned long)*prev_esp;
+
+	return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+{
+	BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); /* bx must be first */
+	return &regs->bx + (regno >> 2); /* callers pass a byte offset */
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+ /*
+ * Returning the value truncates it to 16 bits.
+ */
+ unsigned int retval;
+ if (offset != offsetof(struct user_regs_struct, gs))
+ retval = *pt_regs_access(task_pt_regs(task), offset);
+ else {
+ if (task == current)
+ retval = get_user_gs(task_pt_regs(task));
+ else
+ retval = task_user_gs(task);
+ }
+ return retval;
+}
+
+static int set_segment_reg(struct task_struct *task,
+ unsigned long offset, u16 value)
+{
+ /*
+ * The value argument was already truncated to 16 bits.
+ */
+ if (invalid_selector(value))
+ return -EIO;
+
+ /*
+ * For %cs and %ss we cannot permit a null selector.
+ * We can permit a bogus selector as long as it has USER_RPL.
+ * Null selectors are fine for other segment registers, but
+ * we will never get back to user mode with invalid %cs or %ss
+ * and will take the trap in iret instead. Much code relies
+ * on user_mode() to distinguish a user trap frame (which can
+ * safely use invalid selectors) from a kernel trap frame.
+ */
+ switch (offset) {
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ss):
+ if (unlikely(value == 0))
+ return -EIO;
+ /* Else, fall through: a non-null %cs/%ss is stored like the rest. */
+ default:
+ *pt_regs_access(task_pt_regs(task), offset) = value;
+ break;
+
+ case offsetof(struct user_regs_struct, gs):
+ if (task == current)
+ set_user_gs(task_pt_regs(task), value);
+ else
+ task_user_gs(task) = value;
+ }
+
+ return 0;
+}
+
+#else /* CONFIG_X86_64 */
+
+#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
+{
+	BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); /* r15 must be first */
+	return &regs->r15 + (offset / sizeof(regs->r15)); /* byte offset -> slot */
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+ /*
+ * Returning the value truncates it to 16 bits.
+ */
+ unsigned int seg;
+
+ switch (offset) {
+ case offsetof(struct user_regs_struct, fs):
+ if (task == current) {
+ /* Older gas can't assemble movq %?s,%r?? */
+ asm("movl %%fs,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.fsindex;
+ case offsetof(struct user_regs_struct, gs):
+ if (task == current) {
+ asm("movl %%gs,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.gsindex;
+ case offsetof(struct user_regs_struct, ds):
+ if (task == current) {
+ asm("movl %%ds,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.ds;
+ case offsetof(struct user_regs_struct, es):
+ if (task == current) {
+ asm("movl %%es,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.es;
+
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ss):
+ break;
+ }
+ return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int set_segment_reg(struct task_struct *task,
+ unsigned long offset, u16 value)
+{
+ /*
+ * The value argument was already truncated to 16 bits.
+ */
+ if (invalid_selector(value))
+ return -EIO;
+
+ switch (offset) {
+ case offsetof(struct user_regs_struct,fs):
+ task->thread.fsindex = value;
+ if (task == current)
+ loadsegment(fs, task->thread.fsindex);
+ break;
+ case offsetof(struct user_regs_struct,gs):
+ task->thread.gsindex = value;
+ if (task == current)
+ load_gs_index(task->thread.gsindex);
+ break;
+ case offsetof(struct user_regs_struct,ds):
+ task->thread.ds = value;
+ if (task == current)
+ loadsegment(ds, task->thread.ds);
+ break;
+ case offsetof(struct user_regs_struct,es):
+ task->thread.es = value;
+ if (task == current)
+ loadsegment(es, task->thread.es);
+ break;
+
+ /*
+ * Can't actually change these in 64-bit mode.
+ */
+ case offsetof(struct user_regs_struct,cs):
+ if (unlikely(value == 0))
+ return -EIO;
+ task_pt_regs(task)->cs = value;
+ break;
+ case offsetof(struct user_regs_struct,ss):
+ if (unlikely(value == 0))
+ return -EIO;
+ task_pt_regs(task)->ss = value;
+ break;
+ }
+
+ return 0;
+}
+
+#endif /* CONFIG_X86_32 */
+
+static unsigned long get_flags(struct task_struct *task)
+{
+ unsigned long retval = task_pt_regs(task)->flags;
+
+ /*
+ * If the debugger set TF, hide it from the readout.
+ */
+ if (test_tsk_thread_flag(task, TIF_FORCED_TF))
+ retval &= ~X86_EFLAGS_TF;
+
+ return retval;
+}
+
+static int set_flags(struct task_struct *task, unsigned long value)
+{
+ struct pt_regs *regs = task_pt_regs(task);
+
+ /*
+ * If the user value contains TF, mark that
+ * it was not "us" (the debugger) that set it.
+ * If not, make sure it stays set if we had.
+ */
+ if (value & X86_EFLAGS_TF)
+ clear_tsk_thread_flag(task, TIF_FORCED_TF);
+ else if (test_tsk_thread_flag(task, TIF_FORCED_TF))
+ value |= X86_EFLAGS_TF;
+
+ regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK);
+
+ return 0;
+}
+
+static int putreg(struct task_struct *child,
+ unsigned long offset, unsigned long value)
+{
+ switch (offset) {
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ds):
+ case offsetof(struct user_regs_struct, es):
+ case offsetof(struct user_regs_struct, fs):
+ case offsetof(struct user_regs_struct, gs):
+ case offsetof(struct user_regs_struct, ss):
+ return set_segment_reg(child, offset, value);
+
+ case offsetof(struct user_regs_struct, flags):
+ return set_flags(child, value);
+
+#ifdef CONFIG_X86_64
+ case offsetof(struct user_regs_struct,fs_base):
+ if (value >= TASK_SIZE_MAX)
+ return -EIO;
+ /*
+ * When changing the segment base, use do_arch_prctl_64
+ * to set thread.fsindex to 0 and thread.fsbase to the
+ * new value.
+ */
+ if (child->thread.fsbase != value)
+ return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ return 0;
+ case offsetof(struct user_regs_struct,gs_base):
+ /*
+ * Exactly the same here as the %fs handling above.
+ */
+ if (value >= TASK_SIZE_MAX)
+ return -EIO;
+ if (child->thread.gsbase != value)
+ return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ return 0;
+#endif
+ }
+
+ *pt_regs_access(task_pt_regs(child), offset) = value;
+ return 0;
+}
+
+static unsigned long getreg(struct task_struct *task, unsigned long offset)
+{
+ switch (offset) {
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ds):
+ case offsetof(struct user_regs_struct, es):
+ case offsetof(struct user_regs_struct, fs):
+ case offsetof(struct user_regs_struct, gs):
+ case offsetof(struct user_regs_struct, ss):
+ return get_segment_reg(task, offset);
+
+ case offsetof(struct user_regs_struct, flags):
+ return get_flags(task);
+
+#ifdef CONFIG_X86_64
+ case offsetof(struct user_regs_struct, fs_base): {
+ /*
+ * XXX: This will not behave as expected if called on
+ * current or if fsindex != 0.
+ */
+ return task->thread.fsbase;
+ }
+ case offsetof(struct user_regs_struct, gs_base): {
+ /*
+ * XXX: This will not behave as expected if called on
+ * current or if gsindex != 0.
+ */
+ return task->thread.gsbase;
+ }
+#endif
+ }
+
+ return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int genregs_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ if (kbuf) {
+ unsigned long *k = kbuf;
+ while (count >= sizeof(*k)) {
+ *k++ = getreg(target, pos);
+ count -= sizeof(*k);
+ pos += sizeof(*k);
+ }
+ } else {
+ unsigned long __user *u = ubuf;
+ while (count >= sizeof(*u)) {
+ if (__put_user(getreg(target, pos), u++))
+ return -EFAULT;
+ count -= sizeof(*u);
+ pos += sizeof(*u);
+ }
+ }
+
+ return 0;
+}
+
+static int genregs_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ int ret = 0;
+ if (kbuf) {
+ const unsigned long *k = kbuf;
+ while (count >= sizeof(*k) && !ret) {
+ ret = putreg(target, pos, *k++);
+ count -= sizeof(*k);
+ pos += sizeof(*k);
+ }
+ } else {
+ const unsigned long __user *u = ubuf;
+ while (count >= sizeof(*u) && !ret) {
+ unsigned long word;
+ ret = __get_user(word, u++);
+ if (ret)
+ break;
+ ret = putreg(target, pos, word);
+ count -= sizeof(*u);
+ pos += sizeof(*u);
+ }
+ }
+ return ret;
+}
+
+static void ptrace_triggered(struct perf_event *bp,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int i;
+ struct thread_struct *thread = &(current->thread);
+
+ /*
+ * Store in the virtual DR6 register the fact that the breakpoint
+ * was hit so the thread's debugger will see it.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ if (thread->ptrace_bps[i] == bp)
+ break;
+ }
+
+ thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
+/*
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
+ */
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
+{
+ int i;
+ int dr7 = 0;
+ struct arch_hw_breakpoint *info;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ if (bp[i] && !bp[i]->attr.disabled) {
+ info = counter_arch_bp(bp[i]);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ }
+ }
+
+ return dr7;
+}
+
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+ int len, int type, bool disabled)
+{
+ int err, bp_len, bp_type;
+
+ err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+ if (!err) {
+ attr->bp_len = bp_len;
+ attr->bp_type = bp_type;
+ attr->disabled = disabled;
+ }
+
+ return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+ unsigned long addr, bool disabled)
+{
+ struct perf_event_attr attr;
+ int err;
+
+ ptrace_breakpoint_init(&attr);
+ attr.bp_addr = addr;
+
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return ERR_PTR(err);
+
+ return register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
+}
+
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ int disabled)
+{
+ struct perf_event_attr attr = bp->attr;
+ int err;
+
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return err;
+
+ return modify_user_hw_breakpoint(bp, &attr);
+}
+
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+ struct thread_struct *thread = &tsk->thread;
+ unsigned long old_dr7;
+ bool second_pass = false;
+ int i, rc, ret = 0;
+
+ data &= ~DR_CONTROL_RESERVED;
+ old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
+restore:
+ rc = 0;
+ for (i = 0; i < HBP_NUM; i++) {
+ unsigned len, type;
+ bool disabled = !decode_dr7(data, i, &len, &type);
+ struct perf_event *bp = thread->ptrace_bps[i];
+
+ if (!bp) {
+ if (disabled)
+ continue;
+
+ bp = ptrace_register_breakpoint(tsk,
+ len, type, 0, disabled);
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ break;
+ }
+
+ thread->ptrace_bps[i] = bp;
+ continue;
+ }
+
+ rc = ptrace_modify_breakpoint(bp, len, type, disabled);
+ if (rc)
+ break;
+ }
+
+ /* Restore if the first pass failed, second_pass shouldn't fail. */
+ if (rc && !WARN_ON(second_pass)) {
+ ret = rc;
+ data = old_dr7;
+ second_pass = true;
+ goto restore;
+ }
+
+ return ret;
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+ struct thread_struct *thread = &tsk->thread;
+ unsigned long val = 0;
+
+ if (n < HBP_NUM) {
+ struct perf_event *bp = thread->ptrace_bps[n];
+
+ if (bp)
+ val = bp->hw.info.address;
+ } else if (n == 6) {
+ val = thread->debugreg6;
+ } else if (n == 7) {
+ val = thread->ptrace_dr7;
+ }
+ return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+ unsigned long addr)
+{
+ struct thread_struct *t = &tsk->thread;
+ struct perf_event *bp = t->ptrace_bps[nr];
+ int err = 0;
+
+ if (!bp) {
+ /*
+ * Put stub len and type to create an inactive but correct bp.
+ *
+ * CHECKME: the previous code returned -EIO if the addr wasn't
+ * a valid task virtual addr. The new one will return -EINVAL in
+ * this case.
+ * -EINVAL may be what we want for in-kernel breakpoints users,
+ * but -EIO looks better for ptrace, since we refuse a register
+ * writing for the user. And anyway this is the previous
+ * behaviour.
+ */
+ bp = ptrace_register_breakpoint(tsk,
+ X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+ addr, true);
+ if (IS_ERR(bp))
+ err = PTR_ERR(bp);
+ else
+ t->ptrace_bps[nr] = bp;
+ } else {
+ struct perf_event_attr attr = bp->attr;
+
+ attr.bp_addr = addr;
+ err = modify_user_hw_breakpoint(bp, &attr);
+ }
+
+ return err;
+}
+
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+ unsigned long val)
+{
+ struct thread_struct *thread = &tsk->thread;
+ /* There are no DR4 or DR5 registers */
+ int rc = -EIO;
+
+ if (n < HBP_NUM) {
+ rc = ptrace_set_breakpoint_addr(tsk, n, val);
+ } else if (n == 6) {
+ thread->debugreg6 = val;
+ rc = 0;
+ } else if (n == 7) {
+ rc = ptrace_write_dr7(tsk, val);
+ if (!rc)
+ thread->ptrace_dr7 = val;
+ }
+ return rc;
+}
+
+/*
+ * These access the current or another (stopped) task's io permission
+ * bitmap for debugging or core dump.
+ */
+static int ioperm_active(struct task_struct *target,
+ const struct user_regset *regset)
+{
+ return target->thread.io_bitmap_max / regset->size;
+}
+
+static int ioperm_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ if (!target->thread.io_bitmap_ptr)
+ return -ENXIO;
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ target->thread.io_bitmap_ptr,
+ 0, IO_BITMAP_BYTES);
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching.
+ *
+ * Make sure the single step bit is not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+ user_disable_single_step(child);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+static const struct user_regset_view user_x86_32_view; /* Initialized below. */
+#endif
+
+long arch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
+{
+ int ret;
+ unsigned long __user *datap = (unsigned long __user *)data; /* data reinterpreted as a user pointer */
+
+ switch (request) { /* cases either return directly or set ret and break */
+ /* Read the word at offset addr in the USER area and store it at datap. */
+ case PTRACE_PEEKUSR: {
+ unsigned long tmp;
+
+ ret = -EIO; /* result for an unaligned addr or one outside struct user */
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
+ break;
+
+ tmp = 0; /* Default: offsets that are neither a register nor a debug register read as 0 */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data)); /* byte offset -> register index */
+ }
+ ret = put_user(tmp, datap);
+ break;
+ }
+
+ case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+ ret = -EIO; /* stays -EIO when addr names neither a register nor a debug register */
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+#ifdef CONFIG_X86_32
+ case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_XFP,
+ 0, sizeof(struct user_fxsr_struct),
+ datap) ? -EIO : 0; /* any failure is reported as -EIO */
+
+ case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_XFP,
+ 0, sizeof(struct user_fxsr_struct),
+ datap) ? -EIO : 0; /* any failure is reported as -EIO */
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case PTRACE_GET_THREAD_AREA:
+ if ((int) addr < 0) /* reject values that are negative when viewed as int */
+ return -EIO;
+ ret = do_get_thread_area(child, addr,
+ (struct user_desc __user *)data);
+ break;
+
+ case PTRACE_SET_THREAD_AREA:
+ if ((int) addr < 0) /* reject values that are negative when viewed as int */
+ return -EIO;
+ ret = do_set_thread_area(child, addr,
+ (struct user_desc __user *)data, 0);
+ break;
+#endif
+
+#ifdef CONFIG_X86_64
+ /* Normal 64-bit interface to access TLS data.
+ Works just like arch_prctl, except that the arguments
+ are reversed: data is passed first, then addr. */
+ case PTRACE_ARCH_PRCTL:
+ ret = do_arch_prctl_64(child, data, addr);
+ break;
+#endif
+
+ default: /* everything else is handed to ptrace_request() */
+ ret = ptrace_request(child, request, addr, data);
+ break;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <asm/ia32.h>
+#include <asm/user32.h>
+
+#define R32(l,q) \
+ case offsetof(struct user32, regs.l): \
+ regs->q = value; break
+
+#define SEG32(rs) \
+ case offsetof(struct user32, regs.rs): \
+ return set_segment_reg(child, \
+ offsetof(struct user_regs_struct, rs), \
+ value); \
+ break
+
+static int putreg32(struct task_struct *child, unsigned regno, u32 value)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+
+ switch (regno) { /* regno is a byte offset into struct user32 */
+
+ SEG32(cs); /* segment registers go through set_segment_reg() */
+ SEG32(ds);
+ SEG32(es);
+ SEG32(fs);
+ SEG32(gs);
+ SEG32(ss);
+
+ R32(ebx, bx); /* plain registers are stored straight into pt_regs */
+ R32(ecx, cx);
+ R32(edx, dx);
+ R32(edi, di);
+ R32(esi, si);
+ R32(ebp, bp);
+ R32(eax, ax);
+ R32(eip, ip);
+ R32(esp, sp);
+
+ case offsetof(struct user32, regs.orig_eax):
+ /*
+ * Warning: bizarre corner case fixup here. A 32-bit
+ * debugger setting orig_eax to -1 wants to disable
+ * syscall restart. Make sure that the syscall
+ * restart code sign-extends orig_ax. Also make sure
+ * we interpret the -ERESTART* codes correctly if
+ * loaded into regs->ax in case the task is not
+ * actually still sitting at the exit from a 32-bit
+ * syscall with TS_COMPAT still set.
+ */
+ regs->orig_ax = value;
+ if (syscall_get_nr(child, regs) >= 0)
+ child->thread_info.status |= TS_I386_REGS_POKED;
+ break;
+
+ case offsetof(struct user32, regs.eflags):
+ return set_flags(child, value);
+
+ case offsetof(struct user32, u_debugreg[0]) ...
+ offsetof(struct user32, u_debugreg[7]):
+ regno -= offsetof(struct user32, u_debugreg[0]);
+ return ptrace_set_debugreg(child, regno / 4, value); /* 4-byte slots -> register index */
+
+ default:
+ if (regno > sizeof(struct user32) || (regno & 3))
+ return -EIO;
+
+ /*
+ * Other dummy fields in the virtual user structure are ignored.
+ * NOTE(review): '>' also lets regno == sizeof(struct user32) through.
+ */
+ break;
+ }
+ return 0;
+}
+
+#undef R32
+#undef SEG32
+
+#define R32(l,q) \
+ case offsetof(struct user32, regs.l): \
+ *val = regs->q; break
+
+#define SEG32(rs) \
+ case offsetof(struct user32, regs.rs): \
+ *val = get_segment_reg(child, \
+ offsetof(struct user_regs_struct, rs)); \
+ break
+
+static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+
+ switch (regno) { /* regno is a byte offset into struct user32 */
+
+ SEG32(ds); /* these four are fetched through get_segment_reg() */
+ SEG32(es);
+ SEG32(fs);
+ SEG32(gs);
+
+ R32(cs, cs); /* cs and ss are read from pt_regs like the plain registers */
+ R32(ss, ss);
+ R32(ebx, bx);
+ R32(ecx, cx);
+ R32(edx, dx);
+ R32(edi, di);
+ R32(esi, si);
+ R32(ebp, bp);
+ R32(eax, ax);
+ R32(orig_eax, orig_ax);
+ R32(eip, ip);
+ R32(esp, sp);
+
+ case offsetof(struct user32, regs.eflags):
+ *val = get_flags(child);
+ break;
+
+ case offsetof(struct user32, u_debugreg[0]) ...
+ offsetof(struct user32, u_debugreg[7]):
+ regno -= offsetof(struct user32, u_debugreg[0]);
+ *val = ptrace_get_debugreg(child, regno / 4); /* 4-byte slots -> register index */
+ break;
+
+ default:
+ if (regno > sizeof(struct user32) || (regno & 3))
+ return -EIO; /* *val is left untouched on this path */
+
+ /*
+ * Other dummy fields in the virtual user structure read as zero.
+ * NOTE(review): '>' also lets regno == sizeof(struct user32) through.
+ */
+ *val = 0;
+ break;
+ }
+ return 0;
+}
+
+#undef R32
+#undef SEG32
+
+static int genregs32_get(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int pos, unsigned int count,
+			 void *kbuf, void __user *ubuf)
+{
+	compat_ulong_t __user *uptr = ubuf;
+	compat_ulong_t *kptr = kbuf;
+	compat_ulong_t reg;
+
+	/*
+	 * Walk the requested range one 32-bit word at a time.  Each word
+	 * is fetched with getreg32() and stored either straight into the
+	 * kernel buffer or, when kbuf is NULL, into user memory.
+	 */
+	for (; count >= sizeof(reg); count -= sizeof(reg), pos += sizeof(reg)) {
+		if (kptr) {
+			getreg32(target, pos, kptr++);
+			continue;
+		}
+		getreg32(target, pos, &reg);
+		if (__put_user(reg, uptr++))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int genregs32_set(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int pos, unsigned int count,
+			 const void *kbuf, const void __user *ubuf)
+{
+	const compat_ulong_t __user *uptr = ubuf;
+	const compat_ulong_t *kptr = kbuf;
+	int err = 0;
+
+	/* One 32-bit word per pass; the first failure ends the walk. */
+	while (!err && count >= sizeof(*kptr)) {
+		compat_ulong_t reg;
+
+		if (kptr) {
+			reg = *kptr++;
+		} else {
+			err = __get_user(reg, uptr++);
+			if (err)
+				break;
+		}
+		err = putreg32(target, pos, reg);
+		count -= sizeof(reg);
+		pos += sizeof(reg);
+	}
+
+	return err;
+}
+
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+ unsigned long addr = caddr; /* widen the compat arguments */
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+ __u32 val;
+
+ switch (request) { /* cases either return directly or set ret and break */
+ case PTRACE_PEEKUSR: /* read one 32-bit word of struct user32 into *datap */
+ ret = getreg32(child, addr, &val);
+ if (ret == 0)
+ ret = put_user(val, (__u32 __user *)datap);
+ break;
+
+ case PTRACE_POKEUSR: /* write data at offset addr of struct user32 */
+ ret = putreg32(child, addr, data);
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct32),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_GENERAL, 0,
+ sizeof(struct user_regs_struct32),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_FP, 0,
+ sizeof(struct user_i387_ia32_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(
+ child, &user_x86_32_view, REGSET_FP,
+ 0, sizeof(struct user_i387_ia32_struct), datap);
+
+ case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_XFP, 0,
+ sizeof(struct user32_fxsr_struct),
+ datap);
+
+ case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_XFP, 0,
+ sizeof(struct user32_fxsr_struct),
+ datap);
+
+ case PTRACE_GET_THREAD_AREA:
+ case PTRACE_SET_THREAD_AREA:
+ return arch_ptrace(child, request, addr, data); /* same code path as the native handler */
+
+ default: /* everything else is handed to compat_ptrace_request() */
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+ compat_long_t request, compat_ulong_t caddr,
+ compat_ulong_t cdata)
+{
+ unsigned long addr = caddr; /* widen the compat arguments */
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+
+ switch (request) { /* cases either return directly or set ret and break */
+ /* Read 32 bits at location addr in the USER area. Only allow
+ to return the lower 32 bits of segment and debug registers. */
+ case PTRACE_PEEKUSR: {
+ u32 tmp;
+
+ ret = -EIO; /* result for unaligned, out-of-range, or below-cs offsets */
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr); /* result is truncated to the u32 tmp */
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, (__u32 __user *)datap);
+ break;
+ }
+
+ /* Write the word at location addr in the USER area. Only allow
+ to update segment and debug registers with the upper 32 bits
+ zero-extended. */
+ case PTRACE_POKEUSR:
+ ret = -EIO; /* also the result when addr names neither kind of register */
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ default: /* everything else is handed to compat_ptrace_request() */
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_COMPAT
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (!in_ia32_syscall()) /* compat request that is not an ia32 syscall: use the x32 handler */
+ return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+ return 0; /* reached only when IA32 emulation is not built in */
+#endif
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_X86_64
+
+static struct user_regset x86_64_regsets[] __ro_after_init = { /* written once by update_regset_xstate_info() */
+ [REGSET_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct) / sizeof(long),
+ .size = sizeof(long), .align = sizeof(long),
+ .get = genregs_get, .set = genregs_set
+ },
+ [REGSET_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct user_i387_struct) / sizeof(long),
+ .size = sizeof(long), .align = sizeof(long),
+ .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ },
+ [REGSET_XSTATE] = { /* no .n here: it is set by update_regset_xstate_info() */
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
+ [REGSET_IOPERM64] = { /* no .set handler is provided for this regset */
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long), .align = sizeof(long),
+ .active = ioperm_active, .get = ioperm_get
+ },
+};
+
+static const struct user_regset_view user_x86_64_view = { /* returned by task_user_regset_view() */
+ .name = "x86_64", .e_machine = EM_X86_64,
+ .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets)
+};
+
+#else /* CONFIG_X86_32 */
+
+#define user_regs_struct32 user_regs_struct /* 32-bit build: the "32" names alias the native ones */
+#define genregs32_get genregs_get
+#define genregs32_set genregs_set
+
+#endif /* CONFIG_X86_64 */
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+static struct user_regset x86_32_regsets[] __ro_after_init = { /* written once by update_regset_xstate_info() */
+ [REGSET_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .get = genregs32_get, .set = genregs32_set
+ },
+ [REGSET_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set
+ },
+ [REGSET_XFP] = {
+ .core_note_type = NT_PRXFPREG,
+ .n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ },
+ [REGSET_XSTATE] = { /* no .n here: it is set by update_regset_xstate_info() */
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
+ [REGSET_TLS] = {
+ .core_note_type = NT_386_TLS,
+ .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
+ .size = sizeof(struct user_desc),
+ .align = sizeof(struct user_desc),
+ .active = regset_tls_active,
+ .get = regset_tls_get, .set = regset_tls_set
+ },
+ [REGSET_IOPERM32] = { /* no .set handler is provided for this regset */
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = ioperm_active, .get = ioperm_get
+ },
+};
+
+static const struct user_regset_view user_x86_32_view = { /* definition of the view declared earlier */
+ .name = "i386", .e_machine = EM_386,
+ .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets)
+};
+#endif
+
+/*
+ * This represents bytes 464..511 in the memory layout exported through
+ * the REGSET_XSTATE interface. Its XCR0 word is set at init time.
+ */
+u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+{
+#ifdef CONFIG_X86_64
+ x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); /* size in bytes -> count of u64 slots */
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); /* same count for the 32-bit view */
+#endif
+ xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; /* record the mask in the exported sw bytes */
+}
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (!user_64bit_mode(task_pt_regs(task))) /* this test guards the 32-bit return just below */
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ return &user_x86_32_view; /* unconditional on a pure 32-bit build */
+#endif
+#ifdef CONFIG_X86_64
+ return &user_x86_64_view; /* 64-bit build: taken when the 32-bit return above is not */
+#endif
+}
+
+static void fill_sigtrap_info(struct task_struct *tsk,
+			      struct pt_regs *regs,
+			      int error_code, int si_code,
+			      struct siginfo *info)
+{
+	/* Note the debug trap on the thread, then describe the SIGTRAP. */
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_nr = X86_TRAP_DB;
+	info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
+	info->si_code = si_code;
+	info->si_signo = SIGTRAP;
+}
+
+void user_single_step_siginfo(struct task_struct *tsk,
+ struct pt_regs *regs,
+ struct siginfo *info)
+{
+ fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); /* NOTE(review): si_code is TRAP_BRKPT for a single-step report - confirm intended */
+}
+
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info); /* start from a cleared siginfo before filling in fields */
+ fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
+ /* Deliver the SIGTRAP described by info to tsk */
+ force_sig_info(SIGTRAP, &info, tsk);
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 0000000..637982e
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,167 @@
+/* paravirtual clock -- common code used by kvm/xen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <linux/nmi.h>
+
+#include <asm/fixmap.h>
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
+
+static u8 valid_flags __read_mostly = 0; /* mask applied to flags read from the pvclock area */
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; /* see pvclock_{set,get}_pvti_cpu0_va() */
+
+void pvclock_set_flags(u8 flags)
+{
+ valid_flags = flags; /* replaces the whole mask; nothing is OR-ed in */
+}
+
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+	u64 khz = 1000000ULL << 32;
+	/* (10^6 << 32) / mul, then shift the opposite way to tsc_shift. */
+	do_div(khz, src->tsc_to_system_mul);
+	if (src->tsc_shift >= 0)
+		khz >>= src->tsc_shift;
+	else
+		khz <<= -src->tsc_shift;
+	return khz;
+}
+
+void pvclock_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync(); /* called from pvclock_clocksource_read() when PVCLOCK_GUEST_STOPPED is seen */
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ reset_hung_task_detector();
+}
+
+static atomic64_t last_value = ATOMIC64_INIT(0); /* largest value handed out by pvclock_clocksource_read() */
+
+void pvclock_resume(void)
+{
+ atomic64_set(&last_value, 0); /* forget the previous maximum */
+}
+
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ u8 flags;
+
+ do { /* re-read until pvclock_read_retry() accepts the snapshot */
+ version = pvclock_read_begin(src);
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ return flags & valid_flags; /* only bits enabled via pvclock_set_flags() are reported */
+}
+
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ u64 ret;
+ u64 last;
+ u8 flags;
+
+ do { /* re-read until pvclock_read_retry() accepts the snapshot */
+ version = pvclock_read_begin(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED; /* acknowledge by clearing the bit in the shared area */
+ pvclock_touch_watchdogs();
+ }
+
+ if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+ (flags & PVCLOCK_TSC_STABLE_BIT))
+ return ret; /* stable TSC: skip the global monotonicity fixup below */
+
+ /*
+ * Assumption here is that last_value, a global accumulator, always goes
+ * forward. If we are less than that, we should not be much smaller.
+ * We assume there is an error margin we're inside, and then the correction
+ * does not sacrifice accuracy.
+ *
+ * For reads: global may have changed between test and return,
+ * but this means someone else updated/poked the clock at a later time.
+ * We just need to make sure we are not seeing a backwards event.
+ *
+ * For updates: last_value = ret is not enough, since two vcpus could be
+ * updating at the same time, and one of them could be slightly behind,
+ * making the assumption that last_value always goes forward fail to hold.
+ */
+ last = atomic64_read(&last_value);
+ do {
+ if (ret < last)
+ return last; /* never report a time older than one already handed out */
+ last = atomic64_cmpxchg(&last_value, last, ret);
+ } while (unlikely(last != ret)); /* loop until last_value is observed to hold ret */
+
+ return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+ struct pvclock_vcpu_time_info *vcpu_time,
+ struct timespec64 *ts)
+{
+ u32 version;
+ u64 delta;
+ struct timespec64 now;
+
+ /* get wallclock at system boot */
+ do { /* retry while the version is odd or changed during the read */
+ version = wall_clock->version;
+ rmb(); /* fetch version before time */
+ /*
+ * Note: wall_clock->sec is a u32 value, so it can
+ * only store dates between 1970 and 2106. To allow
+ * times beyond that, we need to create a new hypercall
+ * interface with an extended pvclock_wall_clock structure
+ * like ARM has.
+ */
+ now.tv_sec = wall_clock->sec;
+ now.tv_nsec = wall_clock->nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+ delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec; /* add the boot wallclock, in nanoseconds */
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC); /* do_div() leaves the quotient in delta, returns the remainder */
+ now.tv_sec = delta;
+
+ set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
+}
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+ WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); /* warn if the pvclock vclock mode was already in use */
+ pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return pvti_cpu0_va; /* NULL until pvclock_set_pvti_cpu0_va() has stored a pointer */
+}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 0000000..736348e
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#include <asm/hpet.h>
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+
+static void quirk_intel_irqbalance(struct pci_dev *dev)
+{
+ u8 config;
+ u16 word;
+
+ /* BIOS may enable hardware IRQ balancing for
+ * E7520/E7320/E7525(revision ID 0x9 and below)
+ * based platforms.
+ * Disable SW irqbalance/affinity on those platforms.
+ */
+ if (dev->revision > 0x9)
+ return;
+
+ /* enable access to config space */
+ pci_read_config_byte(dev, 0xf4, &config);
+ pci_write_config_byte(dev, 0xf4, config|0x2);
+
+ /*
+ * read xTPR register. We may not have a pci_dev for device 8
+ * because it might be hidden until the above write.
+ */
+ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
+
+ if (!(word & (1 << 13))) { /* bit 13 clear: apply the workaround */
+ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
+ "disabling irq balancing and affinity\n");
+ noirqdebug_setup("");
+#ifdef CONFIG_PROC_FS
+ no_irq_affinity = 1;
+#endif
+ }
+
+ /* put back the original value for config space */
+ if (!(config & 0x2)) /* only needed if bit 1 was clear before our write */
+ pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH,
+ quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH,
+ quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
+ quirk_intel_irqbalance);
+#endif
+
+#if defined(CONFIG_HPET_TIMER)
+unsigned long force_hpet_address; /* HPET base recorded by a quirk below; 0 while none has */
+
+static enum {
+ NONE_FORCE_HPET_RESUME, /* first enumerator, so also the zero-initialized default */
+ OLD_ICH_FORCE_HPET_RESUME,
+ ICH_FORCE_HPET_RESUME,
+ VT8237_FORCE_HPET_RESUME,
+ NVIDIA_FORCE_HPET_RESUME,
+ ATI_FORCE_HPET_RESUME,
+} force_hpet_resume_type; /* selects the handler run by force_hpet_resume() */
+
+static void __iomem *rcba_base; /* mapping created in ich_force_enable_hpet() */
+
+static void ich_force_hpet_resume(void)
+{
+	u32 hptc;
+
+	/* No forced HPET address recorded: nothing to do. */
+	if (!force_hpet_address)
+		return;
+
+	BUG_ON(!rcba_base);
+
+	/* read the Function Disable register, dword mode only */
+	hptc = readl(rcba_base + 0x3404);
+	if ((hptc & 0x80) == 0)
+		writel(hptc | 0x80, rcba_base + 0x3404);	/* bit 7 clear: try to set it */
+
+	/*
+	 * Read back; the enable bit failing to stick is treated as fatal.
+	 */
+	hptc = readl(rcba_base + 0x3404);
+	if ((hptc & 0x80) == 0)
+		BUG();
+
+	printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void ich_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 val;
+ u32 uninitialized_var(rcba);
+ int err = 0;
+
+ if (hpet_address || force_hpet_address) /* an HPET address is already known */
+ return;
+
+ pci_read_config_dword(dev, 0xF0, &rcba);
+ rcba &= 0xFFFFC000;
+ if (rcba == 0) {
+ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
+ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* use bits 31:14, 16 kB aligned */
+ rcba_base = ioremap_nocache(rcba, 0x4000);
+ if (rcba_base == NULL) {
+ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
+ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* read the Function Disable register, dword mode only */
+ val = readl(rcba_base + 0x3404);
+
+ if (val & 0x80) {
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val = val & 0x3; /* low two bits select the 4 KiB slot above 0xFED00000 */
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ iounmap(rcba_base); /* resume type is left unchanged on this path */
+ return;
+ }
+
+ /* HPET disabled in HPTC. Trying to enable */
+ writel(val | 0x80, rcba_base + 0x3404);
+
+ val = readl(rcba_base + 0x3404); /* read back to see whether the bit stuck */
+ if (!(val & 0x80)) {
+ err = 1;
+ } else {
+ val = val & 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ }
+
+ if (err) {
+ force_hpet_address = 0;
+ iounmap(rcba_base);
+ dev_printk(KERN_DEBUG, &dev->dev,
+ "Failed to force enable HPET\n");
+ } else {
+ force_hpet_resume_type = ICH_FORCE_HPET_RESUME; /* mapping is kept for ich_force_hpet_resume() */
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ }
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */
+ ich_force_enable_hpet);
+
+static struct pci_dev *cached_dev;	/* device remembered for the resume handlers */
+
+static void hpet_print_force_info(void)
+{
+	/* Hint printed by the quirks that only act when hpet_force_user is set. */
+	printk(KERN_INFO "HPET not enabled in BIOS. You might try hpet=force boot option\n");
+}
+
+static void old_ich_force_hpet_resume(void)
+{
+ u32 val;
+ u32 uninitialized_var(gen_cntl);
+
+ if (!force_hpet_address || !cached_dev) /* nothing was force-enabled through this path */
+ return;
+
+ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+ gen_cntl &= (~(0x7 << 15)); /* clear bits 17:15 */
+ gen_cntl |= (0x4 << 15); /* set bit 17 only, leaving bits 16:15 zero */
+
+ pci_write_config_dword(cached_dev, 0xD0, gen_cntl);
+ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); /* read back to verify */
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val == 0x4) /* the three bits read back exactly as written */
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+ else
+ BUG();
+}
+
+static void old_ich_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 val;
+ u32 uninitialized_var(gen_cntl);
+
+ if (hpet_address || force_hpet_address) /* an HPET address is already known */
+ return;
+
+ pci_read_config_dword(dev, 0xD0, &gen_cntl);
+ /*
+ * Bit 17 is HPET enable bit.
+ * Bit 16:15 control the HPET base address.
+ */
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val & 0x4) { /* already enabled: just record the address */
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+ force_hpet_address);
+ return;
+ }
+
+ /*
+ * HPET is disabled. Trying enabling at FED00000 and check
+ * whether it sticks
+ */
+ gen_cntl &= (~(0x7 << 15));
+ gen_cntl |= (0x4 << 15);
+ pci_write_config_dword(dev, 0xD0, gen_cntl);
+
+ pci_read_config_dword(dev, 0xD0, &gen_cntl);
+
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val & 0x4) {
+ /* The enable bit stuck: the HPET is now force-enabled */
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ cached_dev = dev; /* remembered for old_ich_force_hpet_resume() */
+ force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
+ return;
+ }
+
+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+/*
+ * Undocumented chipset features. Only force-enable the HPET when the
+ * user asked for it (hpet_force_user is set).
+ */
+static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
+{
+ if (hpet_force_user)
+ old_ich_force_enable_hpet(dev);
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0,
+ old_ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12,
+ old_ich_force_enable_hpet);
+
+
+static void vt8237_force_hpet_resume(void)
+{
+ u32 val;
+
+ if (!force_hpet_address || !cached_dev) /* nothing was force-enabled through this path */
+ return;
+
+ val = 0xfed00000 | 0x80; /* same value vt8237_force_enable_hpet() writes */
+ pci_write_config_dword(cached_dev, 0x68, val);
+
+ pci_read_config_dword(cached_dev, 0x68, &val); /* read back to verify */
+ if (val & 0x80)
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+ else
+ BUG();
+}
+
+static void vt8237_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (hpet_address || force_hpet_address) /* an HPET address is already known */
+ return;
+
+ if (!hpet_force_user) { /* only act when the user requested it */
+ hpet_print_force_info();
+ return;
+ }
+
+ pci_read_config_dword(dev, 0x68, &val);
+ /*
+ * Bit 7 is HPET enable bit.
+ * Bit 31:10 is HPET base address (contrary to what datasheet claims)
+ */
+ if (val & 0x80) { /* already enabled: just record the address */
+ force_hpet_address = (val & ~0x3ff);
+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+ force_hpet_address);
+ return;
+ }
+
+ /*
+ * HPET is disabled. Trying enabling at FED00000 and check
+ * whether it sticks
+ */
+ val = 0xfed00000 | 0x80;
+ pci_write_config_dword(dev, 0x68, val);
+
+ pci_read_config_dword(dev, 0x68, &val);
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ cached_dev = dev; /* remembered for vt8237_force_hpet_resume() */
+ force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
+ return;
+ }
+
+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
+ vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
+ vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
+ vt8237_force_enable_hpet);
+
+static void ati_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x14, 0xfed00000); /* same base-address write as ati_force_enable_hpet(); no read-back */
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static u32 ati_ixp4x0_rev(struct pci_dev *dev)
+{
+ int err = 0;
+ u32 d = 0;
+ u8 b = 0;
+
+ err = pci_read_config_byte(dev, 0xac, &b);
+ b &= ~(1<<5); /* clear bit 5 of config byte 0xac */
+ err |= pci_write_config_byte(dev, 0xac, b);
+ err |= pci_read_config_dword(dev, 0x70, &d);
+ d |= 1<<8; /* set bit 8 of config dword 0x70 */
+ err |= pci_write_config_dword(dev, 0x70, d);
+ err |= pci_read_config_dword(dev, 0x8, &d);
+ d &= 0xff; /* keep the low byte of config dword 0x8 as the revision */
+ dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+
+ WARN_ON_ONCE(err); /* err ORs together the return codes of every access above */
+
+ return d;
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 d, val;
+ u8 b;
+
+ if (hpet_address || force_hpet_address) /* an HPET address is already known */
+ return;
+
+ if (!hpet_force_user) { /* only act when the user requested it */
+ hpet_print_force_info();
+ return;
+ }
+
+ d = ati_ixp4x0_rev(dev);
+ if (d < 0x82) /* revisions below 0x82 are left alone */
+ return;
+
+ /* base address */
+ pci_write_config_dword(dev, 0x14, 0xfed00000);
+ pci_read_config_dword(dev, 0x14, &val); /* val is what gets recorded as the HPET address below */
+
+ /* enable interrupt */
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ b |= 0x1;
+ outb(0x72, 0xcd6); outb(b, 0xcd7);
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ if (!(b & 0x1)) /* bit 0 did not stick: give up silently */
+ return;
+ pci_read_config_dword(dev, 0x64, &d);
+ d |= (1<<10);
+ pci_write_config_dword(dev, 0x64, d);
+ pci_read_config_dword(dev, 0x64, &d);
+ if (!(d & (1<<10))) /* bit 10 did not stick: give up silently */
+ return;
+
+ force_hpet_address = val;
+ force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev; /* remembered for ati_force_hpet_resume() */
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ ati_force_enable_hpet);
+
+/*
+ * Undocumented chipset feature taken from LinuxBIOS.
+ */
+static void nvidia_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x44, 0xfed00001); /* same value nvidia_force_enable_hpet() writes; no read-back */
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void nvidia_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (hpet_address || force_hpet_address) /* an HPET address is already known */
+ return;
+
+ if (!hpet_force_user) { /* only act when the user requested it */
+ hpet_print_force_info();
+ return;
+ }
+
+ pci_write_config_dword(dev, 0x44, 0xfed00001);
+ pci_read_config_dword(dev, 0x44, &val);
+ force_hpet_address = val & 0xfffffffe; /* drop bit 0 from the value read back */
+ force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev; /* remembered for nvidia_force_hpet_resume() */
+ return;
+}
+
+/* ISA Bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051,
+ nvidia_force_enable_hpet);
+
+/* LPC bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
+ nvidia_force_enable_hpet);
+
+/*
+ * Call the chipset-specific resume handler selected by
+ * force_hpet_resume_type, which the force-enable quirks set on success.
+ *
+ * Any other value (for example NONE_FORCE_HPET_RESUME, as used by the
+ * e6xx quirk) has no handler here and results in no action.
+ */
+void force_hpet_resume(void)
+{
+	const int type = force_hpet_resume_type;
+
+	if (type == ICH_FORCE_HPET_RESUME)
+		ich_force_hpet_resume();
+	else if (type == OLD_ICH_FORCE_HPET_RESUME)
+		old_ich_force_hpet_resume();
+	else if (type == VT8237_FORCE_HPET_RESUME)
+		vt8237_force_hpet_resume();
+	else if (type == NVIDIA_FORCE_HPET_RESUME)
+		nvidia_force_hpet_resume();
+	else if (type == ATI_FORCE_HPET_RESUME)
+		ati_force_hpet_resume();
+}
+
+/*
+ * According to the datasheet, e6xx systems have the HPET hardwired to
+ * 0xfed00000, so this quirk only records that address; no resume handler
+ * is selected (NONE_FORCE_HPET_RESUME).
+ */
+static void e6xx_force_enable_hpet(struct pci_dev *dev)
+{
+	if (hpet_address || force_hpet_address)
+		return;
+
+	force_hpet_resume_type = NONE_FORCE_HPET_RESUME;
+	force_hpet_address = 0xFED00000;
+	dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+		   force_hpet_address);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
+			 e6xx_force_enable_hpet);
+
+/*
+ * HPET MSI on some boards (ATI SB700/SB800) has a side effect on
+ * floppy DMA, so disable HPET MSI on such platforms.
+ * See erratum #27 (Misinterpreted MSI Requests May Result in
+ * Corrupted LPC DMA Data) in AMD Publication #46837,
+ * "SB700 Family Product Errata", Rev. 1.0, March 2010.
+ */
+static void force_disable_hpet_msi(struct pci_dev *unused)
+{
+	hpet_msi_disable = true;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+			 force_disable_hpet_msi);
+
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/*
+ * Set correct numa_node information for AMD NB functions: the bus node
+ * OR'ed with bits 2:0 of config register 0x60 on function 0 of the slot.
+ */
+static void quirk_amd_nb_node(struct pci_dev *dev)
+{
+	unsigned int fn0 = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+	struct pci_dev *nb_ht = pci_get_slot(dev->bus, fn0);
+	u32 node, val;
+
+	if (!nb_ht)
+		return;
+
+	pci_read_config_dword(nb_ht, 0x60, &val);
+	node = pcibus_to_node(dev->bus) | (val & 7);
+
+	/* Some hardware may return an invalid node ID, so check it first. */
+	if (node_online(node))
+		set_dev_node(&dev->dev, node);
+
+	pci_dev_put(nb_ht);
+}
+
+/* K8, family 10h and family 15h northbridge functions */
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
+			quirk_amd_nb_node);
+
+#endif
+
+#ifdef CONFIG_PCI
+/*
+ * The processor does not ensure that the DRAM scrub read/write sequence
+ * is atomic with respect to accesses to the CC6 save state area. If a
+ * concurrent scrub read/write hits the same address, the entry may
+ * appear as if it was never written. This quirk applies to Fam16h
+ * models 00h-0Fh.
+ *
+ * See "Revision Guide" for AMD F16h models 00h-0fh,
+ * document 51810 rev. 3.04, Nov 2013
+ */
+static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
+{
+	u32 reg;
+
+	/*
+	 * Suggested workaround:
+	 * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
+	 *
+	 * A register is only written back when it has one of those bits set.
+	 */
+	/* D18F3x58[4:0] */
+	pci_read_config_dword(dev, 0x58, &reg);
+	if (reg & 0x1F)
+		pci_write_config_dword(dev, 0x58, reg & ~0x1F);
+
+	/* D18F3x5C[0] */
+	pci_read_config_dword(dev, 0x5C, &reg);
+	if (reg & BIT(0))
+		pci_write_config_dword(dev, 0x5c, reg & ~BIT(0));
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
+			amd_disable_seq_and_redirect_scrub);
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell: bit 4 of CAPID0 (0x84) enables mcsafe_key */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+	u32 capid0;
+
+	pci_read_config_dword(pdev, 0x84, &capid0);
+	if (!(capid0 & 0x10))
+		return;
+	static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+	u32 capid0, capid5;
+	bool advanced_ras, nvdimm_modes;
+
+	pci_read_config_dword(pdev, 0x84, &capid0);
+	pci_read_config_dword(pdev, 0x98, &capid5);
+
+	/* CAPID0{7:6} indicate whether this is an advanced RAS SKU. */
+	advanced_ras = (capid0 & 0xc0) == 0xc0;
+	/* CAPID5{8:5}: NVDIMM usage modes enabled, so MC recovery is too. */
+	nvdimm_modes = (capid5 & 0x1e0) != 0;
+
+	if (advanced_ras || nvdimm_modes)
+		static_branch_inc(&mcsafe_key);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
+#endif
+
+bool x86_apple_machine;	/* set by early_platform_quirks() from the DMI vendor */
+EXPORT_SYMBOL(x86_apple_machine);
+
+void __init early_platform_quirks(void)
+{
+	x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") ||
+			    dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc.");
+}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
new file mode 100644
index 0000000..725624b
--- /dev/null
+++ b/arch/x86/kernel/reboot.c
@@ -0,0 +1,887 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/efi.h>
+#include <linux/dmi.h>
+#include <linux/sched.h>
+#include <linux/tboot.h>
+#include <linux/delay.h>
+#include <linux/frame.h>
+#include <acpi/reboot.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/reboot_fixups.h>
+#include <asm/reboot.h>
+#include <asm/pci_x86.h>
+#include <asm/virtext.h>
+#include <asm/cpu.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include <linux/ctype.h>
+#include <linux/mc146818rtc.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
+#include <asm/efi.h>
+
+/*
+ * Power off hook, if any; native_machine_power_off() calls it when set
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+/*
+ * This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be in
+ * an inconsistent state and won't be able to do a clean shutdown.
+ */
+static int reboot_emergency;
+
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/*
+ * Reboot options and system auto-detection code provided by
+ * Dell Inc. so their systems "just work". :-)
+ */
+
+/*
+ * Some machines require the "reboot=a" command line option; this quirk
+ * makes that automatic.
+ */
+static int __init set_acpi_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_ACPI)
+		pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+			d->ident, "ACPI");
+	reboot_type = BOOT_ACPI;
+	return 0;
+}
+
+/*
+ * Some machines require the "reboot=b" or "reboot=k" command line
+ * options; this quirk makes that automatic by selecting the BIOS
+ * method.
+ */
+static int __init set_bios_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_BIOS)
+		pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+			d->ident, "BIOS");
+	reboot_type = BOOT_BIOS;
+	return 0;
+}
+
+/* Restart via real mode: jump to real_mode_header's restart code with @type. */
+void __noreturn machine_real_restart(unsigned int type)
+{
+	local_irq_disable();
+
+	/*
+	 * Write zero to CMOS register number 0x0f, which the BIOS POST
+	 * routine will recognize as telling it to do a proper reboot.  (Well
+	 * that's what this book in front of me says -- it may only apply to
+	 * the Phoenix BIOS though, it's not clear).  At the same time,
+	 * disable NMIs by setting the top bit in the CMOS address register,
+	 * as we're about to do peculiar things to the CPU.  I'm not sure if
+	 * `outb_p' is needed instead of just `outb'.  Use it to be on the
+	 * safe side.  (Yes, CMOS_WRITE does outb_p's. -  Paul G.)
+	 */
+	spin_lock(&rtc_lock);
+	CMOS_WRITE(0x00, 0x8f);
+	spin_unlock(&rtc_lock);
+
+	/* Switch back to the initial page table. */
+#ifdef CONFIG_X86_32
+	load_cr3(initial_page_table);
+#else
+	write_cr3(real_mode_header->trampoline_pgd);
+
+	/* Exiting long mode will fail if CR4.PCIDE is set. */
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr4_clear_bits(X86_CR4_PCIDE);
+#endif
+
+	/* Jump to the identity-mapped low memory code */
+#ifdef CONFIG_X86_32
+	asm volatile("jmpl *%0" : :
+		     "rm" (real_mode_header->machine_real_restart_asm),
+		     "a" (type));
+#else
+	asm volatile("ljmpl *%0" : :
+		     "m" (real_mode_header->machine_real_restart_asm),
+		     "D" (type));
+#endif
+	/* The jump does not come back; tell the compiler so. */
+	unreachable();
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
+
+/*
+ * Some Apple MacBooks and MacBookPros need reboot=p to be able to
+ * reboot; this selects BOOT_CF9_FORCE, reported as "PCI" in the log.
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_CF9_FORCE)
+		pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+			d->ident, "PCI");
+	reboot_type = BOOT_CF9_FORCE;
+	return 0;
+}
+
+/* DMI callback: make the keyboard controller (BOOT_KBD) the reboot method. */
+static int __init set_kbd_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_KBD)
+		pr_info("%s series board detected. Selecting %s-method for reboot.\n",
+			d->ident, "KBD");
+	reboot_type = BOOT_KBD;
+	return 0;
+}
+
+/*
+ * One DMI table for all reboot quirks; reboot_init() checks it at boot.
+ */
+static const struct dmi_system_id reboot_dmi_table[] __initconst = {
+
+	/* Acer */
+	{	/* Handle reboot issue on Acer Aspire One */
+		.callback = set_kbd_reboot,
+		.ident = "Acer Aspire One A110",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+		},
+	},
+
+	/* Apple */
+	{	/* Handle problems with rebooting on Apple MacBook5 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBook5",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+		},
+	},
+	{	/* Handle problems with rebooting on Apple MacBookPro5 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBookPro5",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+		},
+	},
+	{	/* Handle problems with rebooting on Apple Macmini3,1 */
+		.callback = set_pci_reboot,
+		.ident = "Apple Macmini3,1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+		},
+	},
+	{	/* Handle problems with rebooting on the iMac9,1. */
+		.callback = set_pci_reboot,
+		.ident = "Apple iMac9,1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+		},
+	},
+	{	/* Handle problems with rebooting on the iMac10,1. */
+		.callback = set_pci_reboot,
+		.ident = "Apple iMac10,1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+		},
+	},
+
+	/* ASRock */
+	{	/* Handle problems with rebooting on ASRock Q1900DC-ITX */
+		.callback = set_pci_reboot,
+		.ident = "ASRock Q1900DC-ITX",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+			DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+		},
+	},
+
+	/* ASUS */
+	{	/* Handle problems with rebooting on ASUS P4S800 */
+		.callback = set_bios_reboot,
+		.ident = "ASUS P4S800",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+		},
+	},
+	{	/* Handle problems with rebooting on ASUS EeeBook X205TA */
+		.callback = set_acpi_reboot,
+		.ident = "ASUS EeeBook X205TA",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"),
+		},
+	},
+	{	/* Handle problems with rebooting on ASUS EeeBook X205TAW */
+		.callback = set_acpi_reboot,
+		.ident = "ASUS EeeBook X205TAW",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"),
+		},
+	},
+
+	/* Certec */
+	{       /* Handle problems with rebooting on Certec BPC600 */
+		.callback = set_pci_reboot,
+		.ident = "Certec BPC600",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Certec"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"),
+		},
+	},
+
+	/* Dell */
+	{	/* Handle problems with rebooting on Dell DXP061 */
+		.callback = set_bios_reboot,
+		.ident = "Dell DXP061",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell E520's */
+		.callback = set_bios_reboot,
+		.ident = "Dell E520",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E5410. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E5410",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E5420. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E5420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E6320. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E6320",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E6420. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E6420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell OptiPlex 330 with 0KP561 */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 330",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell OptiPlex 360 with 0T656F */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 360",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+			DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+		},
+	},
+	{	/* Dell OptiPlex 745 SFF; NOTE(review): no board match, so it likely also covers the two below - confirm */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 745",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell OptiPlex 745's DFF */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 745",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+			DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell OptiPlex 745 with 0KW626 */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 745",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+			DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 760",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+			DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+		},
+	},
+	{	/* Handle problems with rebooting on the OptiPlex 990. */
+		.callback = set_pci_reboot,
+		.ident = "Dell OptiPlex 990",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell 300's */
+		.callback = set_bios_reboot,
+		.ident = "Dell PowerEdge 300",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell 1300's */
+		.callback = set_bios_reboot,
+		.ident = "Dell PowerEdge 1300",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell 2400's */
+		.callback = set_bios_reboot,
+		.ident = "Dell PowerEdge 2400",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Dell PowerEdge C6100. */
+		.callback = set_pci_reboot,
+		.ident = "Dell PowerEdge C6100",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Precision M6600. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Precision M6600",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell T5400's */
+		.callback = set_bios_reboot,
+		.ident = "Dell Precision T5400",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell T7400's */
+		.callback = set_bios_reboot,
+		.ident = "Dell Precision T7400",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell XPS710 */
+		.callback = set_bios_reboot,
+		.ident = "Dell XPS710",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+		},
+	},
+	{	/* Handle problems with rebooting on Dell OptiPlex 7450 AIO */
+		.callback = set_acpi_reboot,
+		.ident = "Dell OptiPlex 7450 AIO",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"),
+		},
+	},
+
+	/* Hewlett-Packard */
+	{	/* Handle problems with rebooting on HP laptops */
+		.callback = set_bios_reboot,
+		.ident = "HP Compaq Laptop",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+		},
+	},
+
+	/* Sony */
+	{	/* Handle problems with rebooting on Sony VGN-Z540N */
+		.callback = set_bios_reboot,
+		.ident = "Sony VGN-Z540N",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+		},
+	},
+
+	{ }
+};
+
+/* Pick the default reboot method: DMI quirks first, then forced EFI. */
+static int __init reboot_init(void)
+{
+	/*
+	 * Only do the DMI check if reboot_type hasn't been overridden
+	 * on the command line.
+	 */
+	if (!reboot_default)
+		return 0;
+
+	/* The DMI quirks table takes precedence; any match settles it. */
+	if (dmi_check_system(reboot_dmi_table))
+		return 0;
+
+	/*
+	 * No quirk entry matched: if the ACPI Hardware Reduced bit is set
+	 * and EFI runtime services are enabled, force EFI reboot.
+	 */
+	if (efi_reboot_required() && !efi_runtime_disabled())
+		reboot_type = BOOT_EFI;
+
+	return 0;
+}
+core_initcall(reboot_init);
+
+/*
+ * Poll port 0x64 until bit 1 reads clear; give up after 0x10000 polls.
+ */
+static inline void kb_wait(void)
+{
+	int tries = 0x10000;
+
+	while (tries-- > 0 && (inb(0x64) & 0x02))
+		udelay(2);
+}
+
+/* Callback handed to nmi_shootdown_cpus(): run cpu_emergency_vmxoff() there. */
+static void vmxoff_nmi(int cpu, struct pt_regs *regs)
+{
+	cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
+static void emergency_vmx_disable_all(void)
+{
+	/* Just make sure we won't change CPUs while doing this */
+	local_irq_disable();
+
+	/*
+	 * We need to disable VMX on all CPUs before rebooting, otherwise
+	 * we risk hanging the machine, because a CPU ignores INIT signals
+	 * when VMX is enabled.
+	 *
+	 * We can't take any locks and we may be in an inconsistent state,
+	 * so we use NMIs as IPIs to tell the other CPUs to disable VMX and
+	 * halt.
+	 *
+	 * For safety, we avoid running nmi_shootdown_cpus() unnecessarily,
+	 * but we have no way to check whether other CPUs have VMX enabled,
+	 * so we call it only if the CPU we are running on has VMX enabled.
+	 *
+	 * We will miss cases where VMX is not enabled on all CPUs. This
+	 * shouldn't do much harm because KVM always enables VMX on all
+	 * CPUs anyway. But we can miss it in the small window where KVM
+	 * is still enabling VMX.
+	 */
+	if (!cpu_has_vmx() || !cpu_vmx_enabled())
+		return;
+
+	/* Disable VMX on this CPU. */
+	cpu_vmxoff();
+
+	/* Halt and disable VMX on the other CPUs */
+	nmi_shootdown_cpus(vmxoff_nmi);
+}
+
+/* Weak no-op default; a non-weak mach_reboot_fixups() replaces it. */
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+/*
+ * To the best of our knowledge Windows compatible x86 hardware expects
+ * the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
+ *
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
+ *
+ * This function never returns: each method that fails hands over to the
+ * next one in an endless loop, so the worst case is a hang, not a return.
+ */
+static void native_machine_emergency_restart(void)
+{
+	int i;
+	int attempt = 0;
+	int orig_reboot_type = reboot_type;
+	unsigned short mode;
+
+	if (reboot_emergency)
+		emergency_vmx_disable_all();
+
+	tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
+	/* Tell the BIOS if we want cold or warm reboot (word at 0x472) */
+	mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+	*((unsigned short *)__va(0x472)) = mode;
+
+	/*
+	 * If an EFI capsule has been registered with the firmware then
+	 * override the reboot= parameter.
+	 */
+	if (efi_capsule_pending(NULL)) {
+		pr_info("EFI capsule is pending, forcing EFI reboot.\n");
+		reboot_type = BOOT_EFI;
+	}
+
+	for (;;) {
+		/* Could also try the reset bit in the Hammer NB */
+		switch (reboot_type) {
+		case BOOT_ACPI:
+			acpi_reboot();
+			reboot_type = BOOT_KBD;
+			break;
+
+		case BOOT_KBD:
+			mach_reboot_fixups(); /* For board specific fixups */
+
+			for (i = 0; i < 10; i++) {
+				kb_wait();
+				udelay(50);
+				outb(0xfe, 0x64); /* Pulse reset low */
+				udelay(50);
+			}
+			if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+				attempt = 1;	/* go back to ACPI only once */
+				reboot_type = BOOT_ACPI;
+			} else {
+				reboot_type = BOOT_EFI;
+			}
+			break;
+
+		case BOOT_EFI:
+			efi_reboot(reboot_mode, NULL);
+			reboot_type = BOOT_BIOS;
+			break;
+
+		case BOOT_BIOS:
+			machine_real_restart(MRR_BIOS);
+
+			/* We're probably dead after this, but... */
+			reboot_type = BOOT_CF9_SAFE;
+			break;
+
+		case BOOT_CF9_FORCE:
+			port_cf9_safe = true;
+			/* Fall through */
+
+		case BOOT_CF9_SAFE:
+			if (port_cf9_safe) {
+				u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E;
+				u8 cf9 = inb(0xcf9) & ~reboot_code;
+				outb(cf9|2, 0xcf9); /* Request hard reset */
+				udelay(50);
+				/* Actually do the reset */
+				outb(cf9|reboot_code, 0xcf9);
+				udelay(50);
+			}
+			reboot_type = BOOT_TRIPLE;
+			break;
+
+		case BOOT_TRIPLE:
+			idt_invalidate(NULL);
+			__asm__ __volatile__("int3");
+
+			/* We're probably dead after this, but... */
+			reboot_type = BOOT_KBD;
+			break;
+		}
+	}
+}
+
+void native_machine_shutdown(void)
+{
+	/* Quiesce in order: IO APIC, other CPUs, local APIC, HPET, IOMMU */
+#ifdef CONFIG_X86_IO_APIC
+	/*
+	 * Disabling IO APIC before local APIC is a workaround for
+	 * erratum AVR31 in "Intel Atom Processor C2000 Product Family
+	 * Specification Update". In this situation, interrupts that target
+	 * a Logical Processor whose Local APIC is either in the process of
+	 * being hardware disabled or software disabled are neither delivered
+	 * nor discarded. When this erratum occurs, the processor may hang.
+	 *
+	 * Even without the erratum, it still makes sense to quiet IO APIC
+	 * before disabling Local APIC.
+	 */
+	clear_IO_APIC();
+#endif
+
+#ifdef CONFIG_SMP
+	/*
+	 * Stop all of the others. Also disable the local irq to
+	 * not receive the per-cpu timer interrupt which may trigger
+	 * scheduler's load balance.
+	 */
+	local_irq_disable();
+	stop_other_cpus();
+#endif
+
+	lapic_shutdown();
+	restore_boot_irq_mode();
+
+#ifdef CONFIG_HPET_TIMER
+	hpet_disable();
+#endif
+
+#ifdef CONFIG_X86_64
+	x86_platform.iommu_shutdown();
+#endif
+}
+
+static void __machine_emergency_restart(int emergency)
+{
+	reboot_emergency = emergency;	/* read by native_machine_emergency_restart() */
+	machine_ops.emergency_restart();
+}
+
+/* Restart: orderly shutdown unless reboot_force, then the non-emergency path. */
+static void native_machine_restart(char *__unused)
+{
+	pr_notice("machine restart\n");
+	if (!reboot_force)
+		machine_shutdown();
+	__machine_emergency_restart(0);
+}
+
+/* Halt: machine_shutdown(), tboot_shutdown(HALT), then stop the calling CPU. */
+static void native_machine_halt(void)
+{
+	/* Stop other cpus and apics */
+	machine_shutdown();
+	tboot_shutdown(TB_SHUTDOWN_HALT);
+
+	stop_this_cpu(NULL);
+}
+
+static void native_machine_power_off(void)
+{
+	if (pm_power_off) {
+		if (!reboot_force)
+			machine_shutdown();
+		pm_power_off();
+	}
+	/* Fallback: reached when pm_power_off is unset or has returned */
+	tboot_shutdown(TB_SHUTDOWN_HALT);
+}
+
+struct machine_ops machine_ops __ro_after_init = {	/* native_* handlers */
+	.power_off = native_machine_power_off,
+	.shutdown = native_machine_shutdown,
+	.emergency_restart = native_machine_emergency_restart,
+	.restart = native_machine_restart,
+	.halt = native_machine_halt,
+#ifdef CONFIG_KEXEC_CORE
+	.crash_shutdown = native_machine_crash_shutdown,
+#endif
+};
+
+void machine_power_off(void)
+{
+	machine_ops.power_off();	/* dispatch through machine_ops */
+}
+
+void machine_shutdown(void)
+{
+	machine_ops.shutdown();		/* dispatch through machine_ops */
+}
+
+void machine_emergency_restart(void)
+{
+	__machine_emergency_restart(1);	/* 1: mark this as the emergency path */
+}
+
+void machine_restart(char *cmd)
+{
+	machine_ops.restart(cmd);	/* dispatch through machine_ops */
+}
+
+void machine_halt(void)
+{
+	machine_ops.halt();		/* dispatch through machine_ops */
+}
+
+#ifdef CONFIG_KEXEC_CORE
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+	machine_ops.crash_shutdown(regs);	/* dispatch through machine_ops */
+}
+#endif
+
+
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
+#if defined(CONFIG_SMP)
+
+static nmi_shootdown_cb shootdown_callback;	/* set by nmi_shootdown_cpus() */
+
+static atomic_t waiting_for_crash_ipi;	/* CPUs yet to finish the callback */
+static int crash_ipi_issued;		/* set once the NMI has been sent */
+
+/* Shootdown NMI handler: run the callback, drop the wait count, then park. */
+static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+	int cpu;
+
+	cpu = raw_smp_processor_id();
+
+	/*
+	 * Don't do anything if this handler is invoked on crashing cpu.
+	 * Otherwise, system will completely hang. Crashing cpu can get
+	 * an NMI if system was initially booted with nmi_watchdog parameter.
+	 */
+	if (cpu == crashing_cpu)
+		return NMI_HANDLED;
+	local_irq_disable();
+
+	shootdown_callback(cpu, regs);
+	atomic_dec(&waiting_for_crash_ipi);
+	/* Assume hlt works */
+	halt();
+	for (;;)
+		cpu_relax();
+	/* Not reached: the loop above never exits. */
+	return NMI_HANDLED;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+	apic->send_IPI_allbutself(NMI_VECTOR);
+}
+
+/*
+ * Halt all other CPUs, calling the specified function on each of them
+ *
+ * Used at crash or emergency reboot time. @callback runs inside an NMI
+ * handler on every other CPU; this CPU then waits up to one second for
+ * them. Returns early, sending nothing, if the handler can't register.
+ */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+	unsigned long msecs;
+	local_irq_disable();
+
+	/* Make a note of crashing cpu. Will be used in NMI callback. */
+	crashing_cpu = safe_smp_processor_id();
+
+	shootdown_callback = callback;
+
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+	/* Would it be better to replace the trap vector here? */
+	if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
+				 NMI_FLAG_FIRST, "crash"))
+		return;		/* Return what? */
+	/*
+	 * Ensure the new callback function is set before sending
+	 * out the NMI
+	 */
+	wmb();
+
+	smp_send_nmi_allbutself();
+
+	/* Kick CPUs looping in NMI context. */
+	WRITE_ONCE(crash_ipi_issued, 1);
+
+	msecs = 1000; /* Wait at most a second for the other cpus to stop */
+	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+		mdelay(1);
+		msecs--;
+	}
+
+	/* Leave the nmi callback set */
+}
+
+/*
+ * If the crash dumping IPI has been issued, call its callback directly.
+ * Used when we are already inside an NMI handler. Once the IPI is out,
+ * crash_nmi_callback() does not return on a CPU other than crashing_cpu.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+	if (crash_ipi_issued)
+		crash_nmi_callback(0, regs);
+}
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+	while (1) {
+		/* Spin here until a crash IPI is issued, then run its callback. */
+		run_crash_ipi_callback(regs);
+		cpu_relax();
+	}
+}
+
+#else /* !CONFIG_SMP */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+	/* !CONFIG_SMP: no other CPUs to shoot down, @callback is never called */
+}
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
+#endif
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644
index 0000000..b7c0f14
--- /dev/null
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This is a good place to put board specific reboot fixups.
+ *
+ * List of supported fixups:
+ * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
+ * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
+ *
+ */
+
+#include <asm/delay.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <asm/reboot_fixups.h>
+#include <asm/msr.h>
+#include <linux/cs5535.h>
+
+/* Warm-reset the system through the cs5530a reset control register. */
+static void cs5530a_warm_reset(struct pci_dev *dev)
+{
+	/* Writing 1 to reset control register 0x44 triggers the warm reset. */
+	pci_write_config_byte(dev, 0x44, 0x1);
+	/* Shouldn't get here, but be safe and spin a while. */
+	udelay(50);
+}
+
+static void cs5536_warm_reset(struct pci_dev *dev)
+{
+	/* writing 1 to the LSB of MSR_DIVIL_SOFT_RESET causes a hard reset; @dev is unused */
+	wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL);
+	udelay(50); /* shouldn't get here but be safe and spin a while */
+}
+
+/* Reset an RDC321x via its watchdog timer; see the steps below. */
+static void rdc321x_reset(struct pci_dev *dev)
+{
+	unsigned wdt;
+
+	/* Voluntarily reset the watchdog timer */
+	outl(0x80003840, 0xCF8);
+	/* CPU reset on next tick, at the minimum timer resolution */
+	wdt = inl(0xCFC) | 0x1600;
+	outl(wdt, 0xCFC);
+	outb(1, 0x92);
+}
+
+static void ce4100_reset(struct pci_dev *dev)
+{
+	int left = 10;	/* number of writes to port 0xcf9 */
+
+	while (left-- > 0) {
+		outb(0x2, 0xcf9);
+		udelay(50);	/* 50us between writes */
+	}
+}
+
+struct device_fixup {
+	unsigned int vendor;	/* PCI vendor ID passed to pci_get_device() */
+	unsigned int device;	/* PCI device ID passed to pci_get_device() */
+	void (*reboot_fixup)(struct pci_dev *);	/* called with the device found */
+};
+
+/*
+ * PCI IDs used solely by fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
+
+static const struct device_fixup fixups_table[] = {
+{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
+{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
+};
+
+/*
+ * See if any fixup is available for the current hardware. If there is
+ * one, call it; we expect it never to return. If it does return, keep
+ * looking and eventually fall back to the standard reboot path in the
+ * caller.
+ */
+void mach_reboot_fixups(void)
+{
+	const struct device_fixup *fixup = fixups_table;
+	const struct device_fixup *const end = fixup + ARRAY_SIZE(fixups_table);
+
+	/*
+	 * We can be called from sysrq-B code, in which case digging
+	 * through PCI is prohibited.
+	 */
+	if (in_interrupt())
+		return;
+
+	for (; fixup < end; fixup++) {
+		struct pci_dev *dev = pci_get_device(fixup->vendor, fixup->device, NULL);
+
+		if (dev) {
+			fixup->reboot_fixup(dev);
+			pci_dev_put(dev);
+		}
+	}
+}
+
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644
index 0000000..77630d5
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -0,0 +1,277 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/kexec.h>
+#include <asm/processor-flags.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2) /* byte offset of entry x in a list of 4-byte words */
+
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state, stored on entry and reloaded in virtual_mapped */
+#define ESP DATA(0x0)
+#define CR0 DATA(0x4)
+#define CR3 DATA(0x8)
+#define CR4 DATA(0xc)
+
+/* other data, stored on entry and read back on the jump-back path */
+#define CP_VA_CONTROL_PAGE DATA(0x10)
+#define CP_PA_PGD DATA(0x14)
+#define CP_PA_SWAP_PAGE DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
+
+ .text
+ .globl relocate_kernel
+relocate_kernel:
+ /* Save the CPU context, used for jumping back */
+ /* (five 4-byte pushes, hence the 20+N(%esp) argument offsets below) */
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushf
+
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi /* %edi = list entry VA_CONTROL_PAGE */
+ movl %esp, ESP(%edi) /* reloaded by virtual_mapped */
+ movl %cr0, %eax
+ movl %eax, CR0(%edi)
+ movl %cr3, %eax
+ movl %eax, CR3(%edi)
+ movl %cr4, %eax
+ movl %eax, CR4(%edi)
+
+ /* read the arguments and say goodbye to the stack */
+ movl 20+4(%esp), %ebx /* page_list */
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl 20+12(%esp), %edx /* start address */
+ movl 20+16(%esp), %ecx /* cpu_has_pae */
+ movl 20+20(%esp), %esi /* preserve_context */
+
+ /* zero out flags, and disable interrupts */
+ pushl $0
+ popfl
+
+ /* save some information for jumping back */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %edi, CP_VA_CONTROL_PAGE(%edi)
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, CP_PA_PGD(%edi)
+ movl PTR(PA_SWAP_PAGE)(%ebp), %eax
+ movl %eax, CP_PA_SWAP_PAGE(%edi)
+ movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) /* first argument, reused by the jump-back swap_pages call */
+
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
+
+ /* switch to new set of page tables */
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea PAGE_SIZE(%edi), %esp
+
+ /* jump to identity mapped page */
+ movl %edi, %eax
+ addl $(identity_mapped - relocate_kernel), %eax /* physical control page + offset of identity_mapped */
+ pushl %eax
+ ret /* pops the address just pushed, i.e. jumps to it */
+
+identity_mapped:
+ /* set return address to 0 if not preserving context */
+ pushl $0
+ /* store the start address on the stack */
+ pushl %edx
+
+ /*
+ * Set cr0 to a known state:
+ * - Paging disabled
+ * - Alignment check disabled
+ * - Write protect disabled
+ * - No task switch
+ * - Don't do FP software emulation.
+ * - Protected mode enabled
+ */
+ movl %cr0, %eax
+ andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
+ orl $(X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ /* clear cr4 if applicable */
+ testl %ecx, %ecx /* %ecx = cpu_has_pae argument */
+ jz 1f /* zero: leave %cr4 untouched */
+ /*
+ * Set cr4 to a known state:
+ * Setting everything to zero seems safe.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr4
+
+ jmp 1f
+1:
+
+ /* Flush the TLB (needed?) */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ movl CP_PA_SWAP_PAGE(%edi), %eax /* %edi = physical control page */
+ pushl %eax /* second argument: swap page */
+ pushl %ebx /* first argument: page_list */
+ call swap_pages
+ addl $8, %esp /* drop the two arguments */
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB, it's handy, and not processor dependent.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %esp alone
+ */
+
+ testl %esi, %esi /* %esi = preserve_context argument */
+ jnz 1f /* non-zero: take the jump-back path below */
+ xorl %edi, %edi
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %ebp, %ebp
+ ret /* pops the start address pushed on entry */
+1:
+ popl %edx /* start address pushed on entry */
+ movl CP_PA_SWAP_PAGE(%edi), %esp
+ addl $PAGE_SIZE, %esp /* stack now begins at the end of the swap page */
+2:
+ call *%edx
+
+ /* get the re-entry point of the peer system */
+ movl 0(%esp), %ebp
+ call 1f /* call/pop pair: find out where this code is running */
+1:
+ popl %ebx
+ subl $(1b - relocate_kernel), %ebx /* %ebx = run-time address of relocate_kernel */
+ movl CP_VA_CONTROL_PAGE(%ebx), %edi
+ lea PAGE_SIZE(%ebx), %esp /* stack at the end of this page */
+ movl CP_PA_SWAP_PAGE(%ebx), %eax
+ movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+ pushl %eax /* second argument: swap page */
+ pushl %edx /* first argument: value saved from the original first argument */
+ call swap_pages
+ addl $8, %esp
+ movl CP_PA_PGD(%ebx), %eax
+ movl %eax, %cr3
+ movl %cr0, %eax
+ orl $X86_CR0_PG, %eax /* paging back on */
+ movl %eax, %cr0
+ lea PAGE_SIZE(%edi), %esp /* same stack slot, now via the virtual address */
+ movl %edi, %eax
+ addl $(virtual_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret /* jumps to virtual_mapped at its virtual address */
+
+virtual_mapped:
+ movl CR4(%edi), %eax /* reload the control registers saved by relocate_kernel */
+ movl %eax, %cr4
+ movl CR3(%edi), %eax
+ movl %eax, %cr3
+ movl CR0(%edi), %eax
+ movl %eax, %cr0
+ movl ESP(%edi), %esp /* back on the stack relocate_kernel was called on */
+ movl %ebp, %eax /* return value: the peer's re-entry point */
+ /* Undo the five pushes made on entry, in reverse order */
+ popf
+ popl %ebp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
+ /* Do the copies */
+swap_pages:
+ movl 8(%esp), %edx /* second argument: swap page */
+ movl 4(%esp), %ecx /* first argument: first entry word */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl %ecx, %ebx
+ jmp 1f
+
+0: /* top, read another word from the indirection page */
+ movl (%ebx), %ecx
+ addl $4, %ebx
+1:
+ testb $0x1, %cl /* is it a destination page */
+ jz 2f
+ movl %ecx, %edi
+ andl $0xfffff000, %edi /* strip the flag bits, keep the page address */
+ jmp 0b
+2:
+ testb $0x2, %cl /* is it an indirection page */
+ jz 2f
+ movl %ecx, %ebx
+ andl $0xfffff000, %ebx /* continue reading entries from that page */
+ jmp 0b
+2:
+ testb $0x4, %cl /* is it the done indicator */
+ jz 2f
+ jmp 3f
+2:
+ testb $0x8, %cl /* is it the source indicator */
+ jz 0b /* Ignore it otherwise */
+ movl %ecx, %esi /* For every source page do a copy */
+ andl $0xfffff000, %esi
+ /* Exchange source and destination pages through the swap page (%edx) */
+ movl %edi, %eax /* %eax = destination page */
+ movl %esi, %ebp /* %ebp = source page */
+
+ movl %edx, %edi /* 1) source -> swap page */
+ movl $1024, %ecx /* 1024 dwords = 4096 bytes */
+ rep ; movsl
+
+ movl %ebp, %edi /* 2) destination -> source */
+ movl %eax, %esi
+ movl $1024, %ecx
+ rep ; movsl
+
+ movl %eax, %edi /* 3) swap page -> destination */
+ movl %edx, %esi
+ movl $1024, %ecx
+ rep ; movsl
+
+ lea PAGE_SIZE(%ebp), %esi /* %esi = source + PAGE_SIZE; %edi was advanced by the last copy */
+ jmp 0b
+3:
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+ ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
new file mode 100644
index 0000000..11eda21
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -0,0 +1,290 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/kexec.h>
+#include <asm/processor-flags.h>
+#include <asm/pgtable_types.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 3) /* byte offset of entry x in a list of 8-byte words */
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state, stored on entry and reloaded in virtual_mapped */
+#define RSP DATA(0x0)
+#define CR0 DATA(0x8)
+#define CR3 DATA(0x10)
+#define CR4 DATA(0x18)
+
+/* other data, stored on entry and read back on the jump-back path */
+#define CP_PA_TABLE_PAGE DATA(0x20)
+#define CP_PA_SWAP_PAGE DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+
+ .text
+ .align PAGE_SIZE
+ .code64
+ .globl relocate_kernel
+relocate_kernel:
+ /*
+ * %rdi indirection_page
+ * %rsi page_list
+ * %rdx start address
+ * %rcx preserve_context
+ * %r8 sme_active
+ */
+
+ /* Save the CPU context, used for jumping back */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushf
+
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* %r11 = page_list entry VA_CONTROL_PAGE */
+ movq %rsp, RSP(%r11) /* reloaded by virtual_mapped */
+ movq %cr0, %rax
+ movq %rax, CR0(%r11)
+ movq %cr3, %rax
+ movq %rax, CR3(%r11)
+ movq %cr4, %rax
+ movq %rax, CR4(%r11)
+
+ /* Save CR4. Required to enable the right paging mode later. */
+ movq %rax, %r13 /* %rax still holds the %cr4 value read above */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* Save SME active flag */
+ movq %r8, %r12 /* %r8 is reused for the control page address below */
+
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+
+ /* get physical address of page table now too */
+ movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+ /* get physical address of swap page now */
+ movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+ /* save some information for jumping back */
+ movq %r9, CP_PA_TABLE_PAGE(%r11)
+ movq %r10, CP_PA_SWAP_PAGE(%r11)
+ movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) /* indirection_page, reused by the jump-back swap_pages call */
+
+ /* Switch to the identity mapped page tables */
+ movq %r9, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea PAGE_SIZE(%r8), %rsp
+
+ /* jump to identity mapped page */
+ addq $(identity_mapped - relocate_kernel), %r8 /* physical control page + offset of identity_mapped */
+ pushq %r8
+ ret /* pops the address just pushed, i.e. jumps to it */
+
+identity_mapped:
+ /* set return address to 0 if not preserving context */
+ pushq $0
+ /* store the start address on the stack */
+ pushq %rdx
+
+ /*
+ * Set cr0 to a known state:
+ * - Paging enabled
+ * - Alignment check disabled
+ * - Write protect disabled
+ * - No task switch
+ * - Don't do FP software emulation.
+ * - Protected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
+ orl $(X86_CR0_PG | X86_CR0_PE), %eax
+ movq %rax, %cr0
+
+ /*
+ * Set cr4 to a known state:
+ * - physical address extension enabled
+ * - 5-level paging, if it was enabled before
+ */
+ movl $X86_CR4_PAE, %eax
+ testq $X86_CR4_LA57, %r13 /* %r13 = %cr4 value saved in relocate_kernel */
+ jz 1f
+ orl $X86_CR4_LA57, %eax
+1:
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Flush the TLB (needed?) */
+ movq %r9, %cr3 /* %r9 = PA_TABLE_PAGE entry loaded in relocate_kernel */
+
+ /*
+ * If SME is active, there could be old encrypted cache line
+ * entries that will conflict with the now unencrypted memory
+ * used by kexec. Flush the caches before copying the kernel.
+ */
+ testq %r12, %r12 /* %r12 = sme_active argument */
+ jz 1f
+ wbinvd
+1:
+
+ movq %rcx, %r11 /* keep preserve_context: swap_pages overwrites %rcx */
+ call swap_pages /* expects the list entry in %rdi and the swap page in %r10 */
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %rsp alone
+ */
+
+ testq %r11, %r11 /* preserve_context */
+ jnz 1f /* non-zero: take the jump-back path below */
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ xorl %r11d, %r11d
+ xorl %r12d, %r12d
+ xorl %r13d, %r13d
+ xorl %r14d, %r14d
+ xorl %r15d, %r15d
+
+ ret /* pops the start address pushed on entry */
+
+1:
+ popq %rdx /* start address pushed on entry */
+ leaq PAGE_SIZE(%r10), %rsp /* stack now begins at the end of the swap page */
+ call *%rdx
+
+ /* get the re-entry point of the peer system */
+ movq 0(%rsp), %rbp
+ call 1f /* call/pop pair: find out where this code is running */
+1:
+ popq %r8
+ subq $(1b - relocate_kernel), %r8 /* %r8 = run-time address of relocate_kernel */
+ movq CP_PA_SWAP_PAGE(%r8), %r10
+ movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+ movq CP_PA_TABLE_PAGE(%r8), %rax
+ movq %rax, %cr3
+ lea PAGE_SIZE(%r8), %rsp /* stack at the end of this page */
+ call swap_pages
+ movq $virtual_mapped, %rax /* absolute address of virtual_mapped, not a relative one */
+ pushq %rax
+ ret
+
+virtual_mapped:
+ movq RSP(%r8), %rsp /* back on the stack relocate_kernel was called on */
+ movq CR4(%r8), %rax
+ movq %rax, %cr4
+ movq CR3(%r8), %rax
+ movq CR0(%r8), %r8 /* last load through %r8, done before %cr3 is switched */
+ movq %rax, %cr3
+ movq %r8, %cr0
+ movq %rbp, %rax /* return value: the peer's re-entry point */
+ /* Undo the seven pushes made on entry, in reverse order */
+ popf
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ ret
+
+ /* Do the copies */
+swap_pages:
+ movq %rdi, %rcx /* Put the page_list in %rcx */
+ xorl %edi, %edi
+ xorl %esi, %esi
+ jmp 1f
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+1:
+ testb $0x1, %cl /* is it a destination page? */
+ jz 2f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi /* strip the flag bits, keep the page address */
+ jmp 0b
+2:
+ testb $0x2, %cl /* is it an indirection page? */
+ jz 2f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx /* continue reading entries from that page */
+ jmp 0b
+2:
+ testb $0x4, %cl /* is it the done indicator? */
+ jz 2f
+ jmp 3f
+2:
+ testb $0x8, %cl /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For every source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+ /* Exchange source and destination pages through the swap page (%r10) */
+ movq %rdi, %rdx /* %rdx = destination page */
+ movq %rsi, %rax /* %rax = source page */
+
+ movq %r10, %rdi /* 1) source -> swap page */
+ movl $512, %ecx /* 512 quadwords = 4096 bytes */
+ rep ; movsq
+
+ movq %rax, %rdi /* 2) destination -> source */
+ movq %rdx, %rsi
+ movl $512, %ecx
+ rep ; movsq
+
+ movq %rdx, %rdi /* 3) swap page -> destination */
+ movq %r10, %rsi
+ movl $512, %ecx
+ rep ; movsq
+
+ lea PAGE_SIZE(%rax), %rsi /* %rsi = source + PAGE_SIZE; %rdi was advanced by the last copy */
+ jmp 0b
+3:
+ ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 0000000..9b9fb78
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ioport.h>
+#include <asm/e820/api.h>
+
+/* Shrink @res so that it no longer overlaps the inclusive range [start, end]. */
+static void resource_clip(struct resource *res, resource_size_t start,
+			  resource_size_t end)
+{
+	resource_size_t below, above;
+
+	/* Nothing to do unless the range actually touches the resource */
+	if (start > res->end || end < res->start)
+		return;
+
+	/* How much of the resource sticks out under and over the conflict */
+	below = (res->start < start) ? start - res->start : 0;
+	above = (res->end > end) ? res->end - end : 0;
+
+	/* Keep the larger side; on a tie the part above the conflict wins */
+	if (below > above)
+		res->end = start - 1;
+	else
+		res->start = end + 1;
+}
+
+/* Clip @avail against every entry of e820_table, in table order. */
+static void remove_e820_regions(struct resource *avail)
+{
+	struct e820_entry *ent;
+	int idx = 0;
+
+	while (idx < e820_table->nr_entries) {
+		ent = &e820_table->entries[idx++];
+		/* resource_clip() takes an inclusive end address */
+		resource_clip(avail, ent->addr, ent->addr + ent->size - 1);
+	}
+}
+
+void arch_remove_reservations(struct resource *avail)
+{
+	if (!(avail->flags & IORESOURCE_MEM))
+		return;
+
+	/*
+	 * Memory resource: trim out the BIOS area (high 2MB), then the E820
+	 * regions.  The low 1MB is not removed unconditionally, because some
+	 * ISA cards (e.g. the i82365 PCMCIA controller) need a range there.
+	 */
+	resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+	remove_e820_regions(avail);
+}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
new file mode 100644
index 0000000..586f718
--- /dev/null
+++ b/arch/x86/kernel/rtc.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RTC related functions
+ */
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/bcd.h>
+#include <linux/export.h>
+#include <linux/pnp.h>
+#include <linux/of.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/time.h>
+#include <asm/intel-mid.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_X86_32
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with. It is required for NMI access to the
+ * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock;
+EXPORT_SYMBOL(cmos_lock);
+#endif /* CONFIG_X86_32 */
+
+/* For two digit years assume time is always after that */
+#define CMOS_YEARS_OFFS 2000
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+/*
+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * called 500 ms after the second nowtime has started, because when
+ * nowtime is written into the registers of the CMOS clock, it will
+ * jump to the next second precisely 500 ms later. Check the Motorola
+ * MC146818A or Dallas DS12887 data sheet for details.
+ */
+int mach_set_rtc_mmss(const struct timespec64 *now)
+{
+	unsigned long long secs = now->tv_sec;
+	struct rtc_time tm;
+	int err;
+
+	rtc_time64_to_tm(secs, &tm);
+	if (rtc_valid_tm(&tm)) {
+		/* Non-zero from rtc_valid_tm(): leave the hardware alone */
+		printk(KERN_ERR
+		       "%s: Invalid RTC value: write of %llx to RTC failed\n",
+		       __func__, secs);
+		return -EINVAL;
+	}
+	err = mc146818_set_time(&tm);
+	if (err)
+		printk(KERN_ERR "%s: RTC write failed with error %d\n",
+		       __func__, err);
+	return err;
+}
+
+void mach_get_cmos_time(struct timespec64 *now)
+{
+ unsigned int status, year, mon, day, hour, min, sec, century = 0;
+ unsigned long flags;
+ /* Reads the date/time registers via CMOS_READ() and stores the result in *now. */
+ /*
+ * If pm_trace abused the RTC as storage, set the timespec to 0,
+ * which tells the caller that this RTC value is unusable.
+ */
+ if (!pm_trace_rtc_valid()) {
+ now->tv_sec = now->tv_nsec = 0;
+ return;
+ }
+
+ spin_lock_irqsave(&rtc_lock, flags); /* held across every CMOS read below */
+
+ /*
+ * If UIP is clear, then we have >= 244 microseconds before
+ * RTC registers will be updated. Spec sheet says that this
+ * is the reliable way to read RTC - registers. If UIP is set
+ * then the register access might be invalid.
+ */
+ while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
+ cpu_relax();
+
+ sec = CMOS_READ(RTC_SECONDS);
+ min = CMOS_READ(RTC_MINUTES);
+ hour = CMOS_READ(RTC_HOURS);
+ day = CMOS_READ(RTC_DAY_OF_MONTH);
+ mon = CMOS_READ(RTC_MONTH);
+ year = CMOS_READ(RTC_YEAR);
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ acpi_gbl_FADT.century) /* FADT names a century register: read it too */
+ century = CMOS_READ(acpi_gbl_FADT.century);
+#endif
+
+ status = CMOS_READ(RTC_CONTROL);
+ WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { /* values are BCD: convert */
+ sec = bcd2bin(sec);
+ min = bcd2bin(min);
+ hour = bcd2bin(hour);
+ day = bcd2bin(day);
+ mon = bcd2bin(mon);
+ year = bcd2bin(year);
+ }
+
+ if (century) {
+ century = bcd2bin(century); /* converted whatever RTC_DM_BINARY says */
+ year += century * 100;
+ } else
+ year += CMOS_YEARS_OFFS; /* no century value: fall back to the fixed offset */
+
+ now->tv_sec = mktime64(year, mon, day, hour, min, sec);
+ now->tv_nsec = 0;
+}
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+	unsigned char data;
+
+	lock_cmos_prefix(addr);
+	outb(addr, RTC_PORT(0));	/* register index goes to port 0 */
+	data = inb(RTC_PORT(1));	/* its contents come back on port 1 */
+	lock_cmos_suffix(addr);
+
+	return data;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+ lock_cmos_prefix(addr);
+ outb(addr, RTC_PORT(0)); /* register index goes to port 0 */
+ outb(val, RTC_PORT(1)); /* new contents go to port 1 */
+ lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+int update_persistent_clock64(struct timespec64 now)
+{
+ return x86_platform.set_wallclock(&now); /* forwards to the platform hook and returns its result */
+}
+
+/* not static: needed by APM */
+void read_persistent_clock64(struct timespec64 *ts)
+{
+ x86_platform.get_wallclock(ts); /* the platform hook fills in *ts */
+}
+
+
+static struct resource rtc_resources[] = {
+ [0] = { /* I/O port range RTC_PORT(0)..RTC_PORT(1) */
+ .start = RTC_PORT(0),
+ .end = RTC_PORT(1),
+ .flags = IORESOURCE_IO,
+ },
+ [1] = { /* the single interrupt RTC_IRQ */
+ .start = RTC_IRQ,
+ .end = RTC_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct platform_device rtc_device = { /* registered by add_rtc_cmos() */
+ .name = "rtc_cmos",
+ .id = -1,
+ .resource = rtc_resources, /* the port range and IRQ declared above */
+ .num_resources = ARRAY_SIZE(rtc_resources),
+};
+
+static __init int add_rtc_cmos(void)
+{
+#ifdef CONFIG_PNP
+	static const char * const rtc_ids[] __initconst =
+	    { "PNP0b00", "PNP0b01", "PNP0b02", };
+	struct pnp_dev *pnp;
+	struct pnp_id *pid;
+	int n;
+
+	/* A PNP device carrying one of these ids exists: register nothing */
+	pnp_for_each_dev(pnp) {
+		for (pid = pnp->id; pid; pid = pid->next)
+			for (n = 0; n < ARRAY_SIZE(rtc_ids); n++)
+				if (compare_pnp_id(pid, rtc_ids[n]))
+					return 0;
+	}
+#endif
+	/* No legacy RTC flagged for this platform: no device either */
+	if (!x86_platform.legacy.rtc)
+		return -ENODEV;
+
+	platform_device_register(&rtc_device);
+	dev_info(&rtc_device.dev,
+		 "registered platform RTC device (no PNP device found)\n");
+
+	return 0;
+}
+device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
new file mode 100644
index 0000000..b4866ba
--- /dev/null
+++ b/arch/x86/kernel/setup.c
@@ -0,0 +1,1311 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/sfi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/export.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+#include <linux/dma-contiguous.h>
+#include <xen/xen.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+
+#include <linux/kallsyms.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
+#include <linux/percpu.h>
+#include <linux/crash_dump.h>
+#include <linux/tboot.h>
+#include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
+
+#include <linux/usb/xhci-dbgp.h>
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/efi.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/sections.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/bugs.h>
+#include <asm/kasan.h>
+
+#include <asm/vsyscall.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <asm/paravirt.h>
+#include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
+
+#include <asm/percpu.h>
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+#include <asm/amd_nb.h>
+#include <asm/mce.h>
+#include <asm/alternative.h>
+#include <asm/prom.h>
+#include <asm/microcode.h>
+#include <asm/kaslr.h>
+#include <asm/unwind.h>
+
+/*
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
+#ifdef CONFIG_DMI
+RESERVE_BRK(dmi_alloc, 65536);
+#endif
+
+
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base; /* set to 0 by reserve_brk() to lock the area */
+unsigned long _brk_end = (unsigned long)__brk_base; /* current end of the brk area; advanced by extend_brk() */
+
+struct boot_params boot_params;
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+
+#ifdef CONFIG_X86_32
+/* cpu data as detected by the assembly code in head_32.S */
+struct cpuinfo_x86 new_cpu_data;
+
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#else
+struct ist_info ist_info;
+#endif
+
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+__visible unsigned long mmu_cr4_features __ro_after_init;
+#else
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
+#endif
+
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
+ */
+static inline void __init copy_edd(void)
+{
+	edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+	edd.edd_info_nr = boot_params.eddbuf_entries;
+	memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+	memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+	       sizeof(edd.mbr_signature));
+}
+#else
+static inline void __init copy_edd(void)
+{ /* neither CONFIG_EDD nor CONFIG_EDD_MODULE: nothing to copy */
+}
+#endif
+
+void * __init extend_brk(size_t size, size_t align)
+{
+	const size_t low_bits = align - 1;
+	void *chunk;
+
+	/* The area must not be locked yet; align may have only one bit set */
+	BUG_ON(!_brk_start);
+	BUG_ON(align & low_bits);
+
+	/* Round the current end up to the requested alignment */
+	_brk_end = (_brk_end + low_bits) & ~low_bits;
+	BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+	/* Carve the chunk off the end and hand it back zeroed */
+	chunk = (void *)_brk_end;
+	_brk_end += size;
+	return memset(chunk, 0, size);
+}
+
+#ifdef CONFIG_X86_32
+static void __init cleanup_highmap(void)
+{ /* empty definition used for CONFIG_X86_32 builds */
+}
+#endif
+
+static void __init reserve_brk(void)
+{
+	unsigned long start = _brk_start, end = _brk_end;
+
+	/* Reserve whatever extend_brk() has handed out so far */
+	if (start < end)
+		memblock_reserve(__pa_symbol(start), end - start);
+	/* Lock the area: extend_brk() BUGs once _brk_start is zero */
+	_brk_start = 0;
+}
+
+u64 relocated_ramdisk;
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+static u64 __init get_ramdisk_image(void)
+{
+	/* Low 32 bits from the setup header, high 32 bits from boot_params */
+	u64 lo = boot_params.hdr.ramdisk_image;
+	u64 hi = boot_params.ext_ramdisk_image;
+
+	return (hi << 32) | lo;
+}
+static u64 __init get_ramdisk_size(void)
+{
+	/* Low 32 bits from the setup header, high 32 bits from boot_params */
+	u64 lo = boot_params.hdr.ramdisk_size;
+	u64 hi = boot_params.ext_ramdisk_size;
+
+	return (hi << 32) | lo;
+}
+
+static void __init relocate_initrd(void)
+{
+	/* Only the end of the image is assumed not to be page aligned */
+	const u64 src = get_ramdisk_image();
+	const u64 len = get_ramdisk_size();
+	const u64 reserve_len = PAGE_ALIGN(len);
+	u64 dst;
+
+	/* The initrd has to be moved down into directly mapped memory */
+	dst = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+				     reserve_len, PAGE_SIZE);
+	relocated_ramdisk = dst;
+	if (!dst)
+		panic("Cannot find place for new RAMDISK of size %lld\n", len);
+
+	/* Note: this includes all the mem currently occupied by
+	   the initrd; keeping the data intact relies on that fact. */
+	memblock_reserve(dst, reserve_len);
+	initrd_start = dst + PAGE_OFFSET;
+	initrd_end = initrd_start + len;
+	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+	       dst, dst + len - 1);
+
+	copy_from_early_mem((void *)initrd_start, src, len);
+
+	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+	       " [mem %#010llx-%#010llx]\n",
+	       src, src + len - 1,
+	       dst, dst + len - 1);
+}
+
+static void __init early_reserve_initrd(void)
+{
+	const u64 start = get_ramdisk_image();
+	const u64 size = get_ramdisk_size();
+	/* The start is assumed page aligned; only the end is rounded up */
+	const u64 end = PAGE_ALIGN(start + size);
+
+	/* Any of these being zero means the bootloader provided no initrd */
+	if (!(boot_params.hdr.type_of_loader && start && size))
+		return;
+
+	memblock_reserve(start, end - start);
+}
+static void __init reserve_initrd(void)
+{
+	const u64 start = get_ramdisk_image();
+	const u64 size = get_ramdisk_size();
+	/* The start is assumed page aligned; only the end is rounded up */
+	const u64 end = PAGE_ALIGN(start + size);
+	u64 half_mapped;
+
+	/* Any of these being zero means the bootloader provided no initrd */
+	if (!(boot_params.hdr.type_of_loader && start && size))
+		return;
+
+	initrd_start = 0;
+
+	/* An initrd of at least half the mapped memory is refused outright */
+	half_mapped = memblock_mem_size(max_pfn_mapped) >> 1;
+	if (size >= half_mapped)
+		panic("initrd too large to handle, "
+		      "disabling initrd (%lld needed, %lld available)\n",
+		      size, half_mapped);
+
+	printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", start,
+	       end - 1);
+
+	if (!pfn_range_is_mapped(PFN_DOWN(start), PFN_DOWN(end))) {
+		/* Not fully mapped: move it, then release the old range */
+		relocate_initrd();
+		memblock_free(start, end - start);
+		return;
+	}
+
+	/* All are mapped, easy case */
+	initrd_start = start + PAGE_OFFSET;
+	initrd_end = initrd_start + size;
+}
+
+#else
+static void __init early_reserve_initrd(void)
+{ /* no-op: CONFIG_BLK_DEV_INITRD is not set */
+}
+static void __init reserve_initrd(void)
+{ /* no-op: CONFIG_BLK_DEV_INITRD is not set */
+}
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+static void __init parse_setup_data(void)
+{
+	u64 node_pa, next_pa;
+
+	for (node_pa = boot_params.hdr.setup_data; node_pa; node_pa = next_pa) {
+		struct setup_data *hdr;
+		u32 total_len, type;
+
+		/* Map only the header, just long enough to copy its fields */
+		hdr = early_memremap(node_pa, sizeof(*hdr));
+		total_len = hdr->len + sizeof(struct setup_data);
+		type = hdr->type;
+		next_pa = hdr->next;
+		early_memunmap(hdr, sizeof(*hdr));
+
+		/* Handlers get the physical address; other types are skipped */
+		switch (type) {
+		case SETUP_E820_EXT:
+			e820__memory_setup_extended(node_pa, total_len);
+			break;
+		case SETUP_DTB:
+			add_dtb(node_pa);
+			break;
+		case SETUP_EFI:
+			parse_efi_setup(node_pa, total_len);
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+static void __init memblock_x86_reserve_range_setup_data(void)
+{
+	u64 node_pa = boot_params.hdr.setup_data;
+	struct setup_data *hdr;
+
+	while (node_pa) {
+		/* Reserve each node as its header plus hdr->len more bytes */
+		hdr = early_memremap(node_pa, sizeof(*hdr));
+		memblock_reserve(node_pa, sizeof(*hdr) + hdr->len);
+		node_pa = hdr->next;
+		early_memunmap(hdr, sizeof(*hdr));
+	}
+}
+
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+#ifdef CONFIG_KEXEC_CORE
+
+/* 16M alignment for crash kernel regions */
+#define CRASH_ALIGN (16 << 20)
+
+/*
+ * Keep the crash kernel below this limit. On 32 bits earlier kernels
+ * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64bit, old kexec-tools need it to be under 896MiB.
+ */
+#ifdef CONFIG_X86_32
+# define CRASH_ADDR_LOW_MAX (512 << 20)
+# define CRASH_ADDR_HIGH_MAX (512 << 20)
+#else
+# define CRASH_ADDR_LOW_MAX (896UL << 20)
+# define CRASH_ADDR_HIGH_MAX MAXMEM
+#endif
+
+static int __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+	unsigned long long ignored_base, start = 0, size = 0;
+	unsigned long low_ram;
+	int err;
+
+	/* Memory below 4G: the pfn limit passed is 1 << (32 - PAGE_SHIFT) */
+	low_ram = memblock_mem_size(1UL << (32 - PAGE_SHIFT));
+
+	/* crashkernel=Y,low */
+	err = parse_crashkernel_low(boot_command_line, low_ram, &size, &ignored_base);
+	if (!err && !size)
+		return 0;	/* passed with crashkernel=0,low ? */
+
+	if (err) {
+		/*
+		 * Parsing failed, so size a default from two parts of
+		 * lib/swiotlb.c:
+		 * -swiotlb size: user-specified with swiotlb= or default.
+		 * -swiotlb overflow buffer: now hardcoded to 32k, rounded
+		 *  to 8M for other buffers that may need to stay low too.
+		 * The 256M floor makes sure enough extra low memory is set
+		 * aside that 32-bit devices do not run out of DMA buffers.
+		 */
+		size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
+	}
+
+	start = memblock_find_in_range(0, 1ULL << 32, size, CRASH_ALIGN);
+	if (!start) {
+		pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
+		       (unsigned long)(size >> 20));
+		return -ENOMEM;
+	}
+
+	err = memblock_reserve(start, size);
+	if (err) {
+		pr_err("%s: Error reserving crashkernel low memblock.\n", __func__);
+		return err;
+	}
+
+	pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+		(unsigned long)(size >> 20),
+		(unsigned long)(start >> 20),
+		(unsigned long)(low_ram >> 20));
+
+	crashk_low_res.start = start;
+	crashk_low_res.end = start + size - 1;
+	insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+	return 0;
+}
+
+/*
+ * reserve_crashkernel - reserve the main crash kernel region
+ *
+ * Parses "crashkernel=" (and, if that yields nothing, "crashkernel=X,high")
+ * from the boot command line. With no explicit base address the region is
+ * searched for automatically, below CRASH_ADDR_LOW_MAX or, for the ",high"
+ * form, below CRASH_ADDR_HIGH_MAX. With an explicit base the exact range
+ * must be available.
+ *
+ * When the region ends up at or above 4G, reserve_crashkernel_low() is also
+ * called; if that fails the main reservation is released again.
+ *
+ * On success the region is published through crashk_res. Every failure path
+ * simply returns, leaving crashk_res untouched.
+ */
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long crash_size, crash_base, total_mem;
+ bool high = false;
+ int ret;
+
+ total_mem = memblock_phys_mem_size();
+
+ /* crashkernel=XM */
+ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0) {
+ /* crashkernel=X,high */
+ ret = parse_crashkernel_high(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+ high = true;
+ }
+
+ if (xen_pv_domain()) {
+ pr_info("Ignoring crashkernel for a Xen PV domain\n");
+ return;
+ }
+
+ /* 0 means: find the address automatically */
+ if (crash_base <= 0) {
+ /*
+ * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+ * as old kexec-tools loads bzImage below that, unless
+ * "crashkernel=size[KMG],high" is specified.
+ */
+ crash_base = memblock_find_in_range(CRASH_ALIGN,
+ high ? CRASH_ADDR_HIGH_MAX
+ : CRASH_ADDR_LOW_MAX,
+ crash_size, CRASH_ALIGN);
+ if (!crash_base) {
+ pr_info("crashkernel reservation failed - No suitable area found.\n");
+ return;
+ }
+
+ } else {
+ unsigned long long start;
+
+ /*
+ * Explicit base: the search window is exactly the requested
+ * range, so any result other than crash_base means the range
+ * is not fully available.
+ */
+ start = memblock_find_in_range(crash_base,
+ crash_base + crash_size,
+ crash_size, 1 << 20);
+ if (start != crash_base) {
+ pr_info("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+ }
+ ret = memblock_reserve(crash_base, crash_size);
+ if (ret) {
+ pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
+ return;
+ }
+
+ /* region at or above 4G: also needs low memory; undo on failure */
+ if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
+ memblock_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ /* publish the region (inclusive end address) in the iomem tree */
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
+}
+#else
+/* Without CONFIG_KEXEC_CORE there is no crash kernel region to reserve. */
+static void __init reserve_crashkernel(void)
+{
+}
+#endif
+
+/*
+ * I/O port ranges that reserve_standard_io_resources() claims from
+ * ioport_resource. Each entry is a named, inclusive port range marked busy.
+ */
+static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+/*
+ * Claim every entry of standard_io_resources[] from the I/O port
+ * resource tree. The result of each request is not checked.
+ */
+void __init reserve_standard_io_resources(void)
+{
+	struct resource *res = standard_io_resources;
+	struct resource *const end = res + ARRAY_SIZE(standard_io_resources);
+
+	/* request I/O space for devices used on all i[345]86 PCs */
+	while (res < end) {
+		request_resource(&ioport_resource, res);
+		res++;
+	}
+}
+
+/*
+ * Ask find_ibft_region() for a region and, when it reports a non-zero
+ * size, reserve that region in memblock.
+ */
+static __init void reserve_ibft_region(void)
+{
+	unsigned long ibft_size = 0;
+	unsigned long ibft_addr = find_ibft_region(&ibft_size);
+
+	if (!ibft_size)
+		return;
+
+	memblock_reserve(ibft_addr, ibft_size);
+}
+
+/*
+ * Report whether the PCI device at 00:02.0 is an Intel (vendor 0x8086)
+ * device whose ID is in the snb_ids[] list. Always false without
+ * CONFIG_PCI or when early PCI config access is not allowed.
+ */
+static bool __init snb_gfx_workaround_needed(void)
+{
+#ifdef CONFIG_PCI
+	static const __initconst u16 snb_ids[] = {
+		0x0102,
+		0x0112,
+		0x0122,
+		0x0106,
+		0x0116,
+		0x0126,
+		0x010a,
+	};
+	const u16 *id = snb_ids;
+	const u16 *const last = snb_ids + ARRAY_SIZE(snb_ids);
+	u16 devid;
+
+	/* Assume no if something weird is going on with PCI */
+	if (!early_pci_allowed())
+		return false;
+
+	if (read_pci_config_16(0, 2, 0, PCI_VENDOR_ID) != 0x8086)
+		return false;
+
+	devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+	while (id < last) {
+		if (*id == devid)
+			return true;
+		id++;
+	}
+#endif
+
+	return false;
+}
+
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, so keep them
+ * out of the allocator: the whole first 1 MB plus a fixed list of
+ * individual pages. Does nothing unless snb_gfx_workaround_needed().
+ */
+static void __init trim_snb_memory(void)
+{
+	static const __initconst unsigned long bad_pages[] = {
+		0x20050000,
+		0x20110000,
+		0x20130000,
+		0x20138000,
+		0x40004000,
+	};
+	unsigned int idx;
+
+	if (!snb_gfx_workaround_needed())
+		return;
+
+	printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+	/*
+	 * Reserve all memory below the 1 MB mark that has not
+	 * already been reserved.
+	 */
+	memblock_reserve(0, 1<<20);
+
+	for (idx = 0; idx < ARRAY_SIZE(bad_pages); idx++) {
+		const unsigned long page = bad_pages[idx];
+
+		if (!memblock_reserve(page, PAGE_SIZE))
+			continue;
+
+		printk(KERN_WARNING "failed to reserve 0x%08lx\n", page);
+	}
+}
+
+/*
+ * Here we put platform-specific memory range workarounds, i.e.
+ * memory known to be corrupt or otherwise in need of being reserved on
+ * specific platforms. Currently the only one is the Sandy Bridge
+ * graphics workaround in trim_snb_memory().
+ *
+ * If this gets used more widely it could use a real dispatch mechanism.
+ */
+static void __init trim_platform_memory_ranges(void)
+{
+ trim_snb_memory();
+}
+
+/*
+ * Adjust the E820 table for BIOS-owned low memory: mark the first page
+ * reserved, drop RAM entries covering BIOS_BEGIN..BIOS_END, then re-run
+ * e820__update_table() on the result.
+ */
+static void __init trim_bios_range(void)
+{
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ *
+ * Only the first PAGE_SIZE bytes are retyped here. The larger
+ * low-memory reservation (64KiB by default, since some BIOSes are
+ * known to corrupt low memory; see the Kconfig help text for
+ * X86_RESERVE_LOW) is done separately in trim_low_memory_range().
+ */
+ e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+
+ /*
+ * special case: Some BIOSen report the PC BIOS
+ * area (640->1Mb) as ram even though it is not.
+ * take them out.
+ */
+ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
+
+ e820__update_table(e820_table);
+}
+
+/*
+ * Make sure the kernel image (_text up to _end) is described as RAM in the
+ * E820 table. Called before trim_bios_range() to spare an extra sanitize.
+ */
+static void __init e820_add_kernel_range(void)
+{
+	const u64 kstart = __pa_symbol(_text);
+	const u64 ksize = __pa_symbol(_end) - kstart;
+	const u64 kend = kstart + ksize;
+
+	/*
+	 * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and
+	 * attempt to fix it by adding the range. We may have a confused BIOS,
+	 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+	 * exclude kernel range. If we really are running on top non-RAM,
+	 * we will crash later anyways.
+	 */
+	if (!e820__mapped_all(kstart, kend, E820_TYPE_RAM)) {
+		pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
+		e820__range_remove(kstart, ksize, E820_TYPE_RAM, 0);
+		e820__range_add(kstart, ksize, E820_TYPE_RAM);
+	}
+}
+
+/* Bytes of low memory to reserve; Kconfig value is in KiB, hence << 10. */
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+
+/*
+ * "reservelow=" early parameter: override reserve_low with a size parsed
+ * by memparse(), limited to the range 4096 bytes .. 640 KiB.
+ * Returns -EINVAL when no argument was supplied, 0 otherwise.
+ */
+static int __init parse_reservelow(char *p)
+{
+	unsigned long long bytes;
+
+	if (p == NULL)
+		return -EINVAL;
+
+	bytes = memparse(p, &p);
+
+	if (bytes > 640*1024)
+		bytes = 640*1024;
+	else if (bytes < 4096)
+		bytes = 4096;
+
+	reserve_low = bytes;
+
+	return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
+/*
+ * Reserve the first reserve_low bytes of memory (rounded up to a whole
+ * number of pages) in memblock. reserve_low defaults to
+ * CONFIG_X86_RESERVE_LOW KiB and can be changed with "reservelow=".
+ */
+static void __init trim_low_memory_range(void)
+{
+ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+}
+
+/*
+ * Panic notifier callback: print the KASLR kernel offset, or state that
+ * it is disabled. The notifier arguments are unused; always returns 0.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+	if (!kaslr_enabled()) {
+		pr_emerg("Kernel Offset: disabled\n");
+		return 0;
+	}
+
+	pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+		 kaslr_offset(),
+		 __START_KERNEL,
+		 __START_KERNEL_map,
+		 MODULES_VADDR-1);
+
+	return 0;
+}
+
+/*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ * @cmdline_p: set to point at the kernel's copy of the command line
+ *
+ * Runs the x86 early boot sequence in a fixed order: early memblock
+ * reservations, copying boot loader data out of boot_params, E820 and
+ * command line processing, EFI/DMI/hypervisor detection, computing the
+ * max_pfn family of limits, setting up memblock and the kernel memory
+ * mapping, ACPI/NUMA initialization, crash kernel reservation, and finally
+ * APIC, resource and console setup. Many steps depend on earlier ones (see
+ * the individual comments), so the ordering must be preserved.
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
+void __init setup_arch(char **cmdline_p)
+{
+ /* Reserve the kernel image, from _text up to __bss_stop. */
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__bss_stop - (unsigned long)_text);
+
+ /*
+ * Make sure page 0 is always reserved because on systems with
+ * L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, PAGE_SIZE);
+
+ early_reserve_initrd();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
+#ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ load_cr3(swapper_pg_dir);
+ /*
+ * Note: Quark X1000 CPUs advertise PGE incorrectly and require
+ * a cr3 based tlb flush, so the following __flush_tlb_all()
+ * will not flush anything because the cpu quirk which clears
+ * X86_FEATURE_PGE has not been invoked yet. Though due to the
+ * load_cr3() above the TLB has been flushed already. The
+ * quirk is invoked before subsequent calls to __flush_tlb_all()
+ * so proper operation is guaranteed.
+ */
+ __flush_tlb_all();
+#else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
+#endif
+
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
+ olpc_ofw_detect();
+
+ idt_setup_early_traps();
+ early_cpu_init();
+ arch_init_ideal_nops();
+ jump_label_init();
+ early_ioremap_init();
+
+ setup_olpc_ofw_pgd();
+
+ /* Copy boot loader supplied settings out of boot_params. */
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ /*
+ * A loader ID of 0xe in the upper nibble means the real ID is
+ * ext_loader_type + 0x10; the version nibble is kept as is.
+ */
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+ /* Flag an EFI boot (and its bitness) based on the loader signature. */
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+ }
+#endif
+
+ x86_init.oem.arch_setup();
+
+ /* Highest physical address representable with x86_phys_bits. */
+ iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
+ e820__memory_setup();
+ parse_setup_data();
+
+ copy_edd();
+
+ /* root_flags == 0: drop MS_RDONLY from the root mount flags */
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+ init_mm.brk = _brk_end;
+
+ mpx_mm_init(&init_mm);
+
+ /* Physical extents of kernel code, data and bss (inclusive ends). */
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ data_resource.start = __pa_symbol(_etext);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ /* built-in command line replaces the boot loader's one */
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ /*
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()). It may then be called
+ * again from within noexec_setup() during parsing early parameters
+ * to honor the respective command line option.
+ */
+ x86_configure_nx();
+
+ parse_early_param();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ x86_report_nx();
+
+ /* after early param, so could get panic from serial */
+ memblock_x86_reserve_range_setup_data();
+
+ /* acpi_mps_check() non-zero: turn the local APIC off */
+ if (acpi_mps_check()) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ disable_apic = 1;
+#endif
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ }
+
+ e820__reserve_setup_data();
+ e820__finish_early_params();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_init();
+
+ dmi_scan_machine();
+ dmi_memdev_walk();
+ dmi_set_dump_stack_arch_desc();
+
+ /*
+ * VMware detection requires dmi to be available, so this
+ * needs to be done after dmi_scan_machine(), for the boot CPU.
+ */
+ init_hypervisor_platform();
+
+ tsc_early_init();
+ x86_init.resources.probe_roms();
+
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ e820_add_kernel_range();
+ trim_bios_range();
+#ifdef CONFIG_X86_32
+ if (ppro_with_ram_bug()) {
+ e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
+ E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820__print_table("bad_ppro");
+ }
+#else
+ early_gart_iommu_check();
+#endif
+
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820__end_of_ram_pfn();
+
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+ if (mtrr_trim_uncached_memory(max_pfn))
+ max_pfn = e820__end_of_ram_pfn();
+
+ max_possible_pfn = max_pfn;
+
+ /*
+ * This call is required when the CPU does not support PAT. If
+ * mtrr_bp_init() invoked it already via pat_init() the call has no
+ * effect.
+ */
+ init_cache_modes();
+
+ /*
+ * Define random base addresses for memory sections after max_pfn is
+ * defined and before each memory section base is used.
+ */
+ kernel_randomize_memory();
+
+#ifdef CONFIG_X86_32
+ /* max_low_pfn get updated here */
+ find_low_pfn_range();
+#else
+ check_x2apic();
+
+ /* How many end-of-memory variables you have, grandma! */
+ /* need this before calling reserve_initrd */
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820__end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+
+ reserve_ibft_region();
+
+ early_alloc_pgt_buf();
+
+ /*
+ * Need to conclude brk, before e820__memblock_setup()
+ * it could use memblock_find_in_range, could overlap with
+ * brk area.
+ */
+ reserve_brk();
+
+ cleanup_highmap();
+
+ memblock_set_current_limit(ISA_END_ADDRESS);
+ e820__memblock_setup();
+
+ reserve_bios_regions();
+
+ if (efi_enabled(EFI_MEMMAP)) {
+ efi_fake_memmap();
+ efi_find_mirror();
+ efi_esrt_init();
+
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
+ efi_reserve_boot_services();
+ }
+
+ /* preallocate 4k for mptable mpc */
+ e820__memblock_alloc_reserved_mpc_new();
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+ setup_bios_corruption_check();
+#endif
+
+#ifdef CONFIG_X86_32
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
+
+ reserve_real_mode();
+
+ trim_platform_memory_ranges();
+ trim_low_memory_range();
+
+ init_mem_mapping();
+
+ idt_setup_early_pf();
+
+ /*
+ * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+ * with the current CR4 value. This may not be necessary, but
+ * auditing all the early-boot CR4 manipulation would be needed to
+ * rule it out.
+ *
+ * Mask off features that don't work outside long mode (just
+ * PCIDE for now).
+ */
+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
+
+ memblock_set_current_limit(get_max_mapped());
+
+ /*
+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+ */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
+ /* Allocate bigger log buffer */
+ setup_log_buf(1);
+
+ /* Log the secure boot state reported in boot_params. */
+ if (efi_enabled(EFI_BOOT)) {
+ switch (boot_params.secure_boot) {
+ case efi_secureboot_mode_disabled:
+ pr_info("Secure boot disabled\n");
+ break;
+ case efi_secureboot_mode_enabled:
+ pr_info("Secure boot enabled\n");
+ break;
+ default:
+ pr_info("Secure boot could not be determined\n");
+ break;
+ }
+ }
+
+ reserve_initrd();
+
+ acpi_table_upgrade();
+
+ vsmp_init();
+
+ io_delay_init();
+
+ early_platform_quirks();
+
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+
+ early_acpi_boot_init();
+
+ initmem_init();
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
+
+ /*
+ * Reserve memory for crash kernel after SRAT is parsed so that it
+ * won't consume hotpluggable memory.
+ */
+ reserve_crashkernel();
+
+ memblock_find_dma_reserve();
+
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
+
+ x86_init.paging.pagetable_init();
+
+ kasan_init();
+
+ /*
+ * Sync back kernel address range.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
+
+ tboot_probe();
+
+ map_vsyscall();
+
+ generic_apic_probe();
+
+ early_quirks();
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+ sfi_init();
+ x86_dtb_init();
+
+ /*
+ * get boot-time SMP configuration:
+ */
+ get_smp_config();
+
+ /*
+ * Systems w/o ACPI and mptables might not have it mapped the local
+ * APIC yet, but prefill_possible_map() might need to access it.
+ */
+ init_apic_mappings();
+
+ prefill_possible_map();
+
+ init_cpu_to_node();
+
+ io_apic_init_mappings();
+
+ x86_init.hyper.guest_late_init();
+
+ e820__reserve_resources();
+ e820__register_nosave_regions(max_low_pfn);
+
+ x86_init.resources.reserve_resources();
+
+ e820__setup_pci_gap();
+
+#ifdef CONFIG_VT
+ /*
+ * Default console: VGA unless this is an EFI boot where 0xa0000 is
+ * conventional memory; the dummy console if VGA console support is
+ * not built in.
+ */
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+ x86_init.oem.banner();
+
+ x86_init.timers.wallclock_init();
+
+ mcheck_init();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
+
+#ifdef CONFIG_EFI
+ if (efi_enabled(EFI_BOOT))
+ efi_apply_memmap_quirks();
+#endif
+
+ unwind_init();
+}
+
+#ifdef CONFIG_X86_32
+
+/* Memory range 0xa0000-0xbffff, claimed as busy by i386_reserve_resources(). */
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/*
+ * 32-bit only: claim the video RAM area from iomem_resource and the
+ * standard I/O port ranges from ioport_resource.
+ */
+void __init i386_reserve_resources(void)
+{
+ request_resource(&iomem_resource, &video_ram_resource);
+ reserve_standard_io_resources();
+}
+
+#endif /* CONFIG_X86_32 */
+
+/* Notifier block that invokes dump_kernel_offset(). */
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+/*
+ * Initcall: hook kernel_offset_notifier into the panic notifier list so
+ * the kernel offset is printed on panic. Always returns 0.
+ */
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
new file mode 100644
index 0000000..ea554f8
--- /dev/null
+++ b/arch/x86/kernel/setup_percpu.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/smp.h>
+#include <linux/topology.h>
+#include <linux/pfn.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/highmem.h>
+#include <asm/proto.h>
+#include <asm/cpumask.h>
+#include <asm/cpu.h>
+#include <asm/stackprotector.h>
+
+/* Per-cpu copy of each CPU's own number; filled in setup_per_cpu_areas(). */
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
+/*
+ * Initial per-cpu offset, in effect until setup_per_cpu_areas() assigns
+ * the real offsets: __per_cpu_load on 64-bit, 0 on 32-bit.
+ */
+#ifdef CONFIG_X86_64
+#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
+#else
+#define BOOT_PERCPU_OFFSET 0
+#endif
+
+/* Per-cpu copy of the CPU's own per-cpu offset. */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
+/* Per-cpu offset of every possible CPU, indexed by CPU number. */
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
+ [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
+};
+EXPORT_SYMBOL(__per_cpu_offset);
+
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations. Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base. On x86_32, anything can
+ * address anywhere, so there is no need to reserve space in the
+ * first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE 0
+#endif
+
+#ifdef CONFIG_X86_32
+/**
+ * pcpu_need_numa - decide whether percpu allocation must be NUMA aware
+ *
+ * Walks all possible CPUs and compares the node data of consecutive
+ * CPUs. NUMA only matters when two of them differ; with NUMA not
+ * configured, or a single node in use, there is nothing to consider.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	pg_data_t *prev = NULL;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int nid = early_cpu_to_node(cpu);
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		/* an online node with data that differs from the previous CPU's */
+		if (node_online(nid) && pgdat && prev && prev != pgdat)
+			return true;
+
+		prev = pgdat;
+	}
+#endif
+	return false;
+}
+#endif
+
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration: with multiple nodes it allocates from the cpu's node
+ * when that node is online and has node data, and falls back to a
+ * node-agnostic allocation otherwise. All allocations use
+ * __pa(MAX_DMA_ADDRESS) as the goal address.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+ unsigned long align)
+{
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ int node = early_cpu_to_node(cpu);
+ void *ptr;
+
+ /*
+ * NOTE(review): ptr is passed to __pa() in the debug messages below
+ * without a NULL check, although the _nopanic allocators may fail.
+ */
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = __alloc_bootmem_nopanic(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+ cpu, size, __pa(ptr));
+ } else {
+ ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+ size, align, goal);
+ pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
+ cpu, size, node, __pa(ptr));
+ }
+ return ptr;
+#else
+ return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * Helpers for first chunk memory allocation
+ */
+
+/*
+ * Allocation callback handed to the first chunk setup functions in
+ * setup_per_cpu_areas(); forwards to pcpu_alloc_bootmem().
+ */
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
+{
+ return pcpu_alloc_bootmem(cpu, size, align);
+}
+
+/*
+ * Free callback handed to the first chunk setup functions in
+ * setup_per_cpu_areas(); returns @size bytes at @ptr to bootmem.
+ */
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+ free_bootmem(__pa(ptr), size);
+}
+
+/*
+ * Distance callback for pcpu_embed_first_chunk(): two CPUs are remote
+ * only when multiple nodes are configured and their early node IDs
+ * differ; in every other case they are local.
+ */
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	if (early_cpu_to_node(from) != early_cpu_to_node(to))
+		return REMOTE_DISTANCE;
+#endif
+	return LOCAL_DISTANCE;
+}
+
+/*
+ * PTE population callback handed to pcpu_page_first_chunk() in
+ * setup_per_cpu_areas(); forwards @addr to populate_extra_pte().
+ */
+static void __init pcpup_populate_pte(unsigned long addr)
+{
+ populate_extra_pte(addr);
+}
+
+/*
+ * On 32-bit, write the GDT_ENTRY_PERCPU descriptor of @cpu's GDT: flags
+ * 0x8092, base per_cpu_offset(@cpu), limit 0xFFFFF. Does nothing on 64-bit.
+ */
+static inline void setup_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
+ 0xFFFFF);
+
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
+#endif
+}
+
+/*
+ * setup_per_cpu_areas - allocate and initialize the per-cpu areas
+ *
+ * Sets up the first percpu chunk, preferring the embedding allocator and
+ * falling back to the page allocator (panics if both fail). Then, for
+ * every possible CPU, records the per-cpu offset, CPU number and segment
+ * setup, and copies the early per-cpu maps (APIC IDs, NUMA node, ...) into
+ * the real per-cpu variables. Finally clears the early map pointers and
+ * sets up the node/cpu masks.
+ */
+void __init setup_per_cpu_areas(void)
+{
+ unsigned int cpu;
+ unsigned long delta;
+ int rc;
+
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
+ NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+
+ /*
+ * Allocate percpu area. Embedding allocator is our favorite;
+ * however, on NUMA configurations, it can result in very
+ * sparse unit mapping and vmalloc area isn't spacious enough
+ * on 32bit. Use page in that case.
+ */
+#ifdef CONFIG_X86_32
+ if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ /* stays negative when the embed allocator is skipped, forcing page */
+ rc = -EINVAL;
+ if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+ const size_t dyn_size = PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
+ rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, atom_size,
+ pcpu_cpu_distance,
+ pcpu_fc_alloc, pcpu_fc_free);
+ if (rc < 0)
+ pr_warning("%s allocator failed (%d), falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
+ }
+ if (rc < 0)
+ rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_fc_alloc, pcpu_fc_free,
+ pcpup_populate_pte);
+ if (rc < 0)
+ panic("cannot initialize percpu area (err=%d)", rc);
+
+ /* alrighty, percpu areas up and running */
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu) {
+ per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
+ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+ per_cpu(cpu_number, cpu) = cpu;
+ setup_percpu_segment(cpu);
+ setup_stack_canary_segment(cpu);
+ /*
+ * Copy data used in early init routines from the
+ * initial arrays to the per cpu data areas. These
+ * arrays then become expendable and the *_early_ptr's
+ * are zeroed indicating that the static arrays are
+ * gone.
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+ per_cpu(x86_cpu_to_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+ per_cpu(x86_cpu_to_acpiid, cpu) =
+ early_per_cpu_map(x86_cpu_to_acpiid, cpu);
+#endif
+#ifdef CONFIG_X86_32
+ per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
+#endif
+#ifdef CONFIG_X86_64
+ /* points IRQ_STACK_SIZE bytes past the start of the irq stack */
+ per_cpu(irq_stack_ptr, cpu) =
+ per_cpu(irq_stack_union.irq_stack, cpu) +
+ IRQ_STACK_SIZE;
+#endif
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
+ /*
+ * Ensure that the boot cpu numa_node is correct when the boot
+ * cpu is on a node that doesn't have memory installed.
+ * Also cpu_up() will call cpu_to_node() for APs when
+ * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
+ * up later with c_init aka intel_init/amd_init.
+ * So set them all (boot cpu and all APs).
+ */
+ set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
+#endif
+ /*
+ * Up to this point, the boot CPU has been using .init.data
+ * area. Reload any changed state for the boot CPU.
+ */
+ if (!cpu)
+ switch_to_new_gdt(cpu);
+ }
+
+ /* indicate the early static arrays will soon be gone */
+#ifdef CONFIG_X86_LOCAL_APIC
+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+ early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
+#endif
+#ifdef CONFIG_X86_32
+ early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
+#endif
+#ifdef CONFIG_NUMA
+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
+
+ /* Setup node to cpumask map */
+ setup_node_to_cpumask_map();
+
+ /* Setup cpu initialized, callin, callout masks */
+ setup_cpu_local_masks();
+
+ /*
+ * Sync back kernel address range again. We already did this in
+ * setup_arch(), but percpu data also needs to be available in
+ * the smpboot asm. We can't reliably pick up percpu mappings
+ * using vmalloc_fault(), because exception dispatch needs
+ * percpu data.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
new file mode 100644
index 0000000..92a3b31
--- /dev/null
+++ b/arch/x86/kernel/signal.c
@@ -0,0 +1,895 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-2002 x86-64 support by Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/tracehook.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+#include <linux/context_tracking.h>
+#include <linux/syscalls.h>
+
+#include <asm/processor.h>
+#include <asm/ucontext.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/vdso.h>
+#include <asm/mce.h>
+#include <asm/sighandling.h>
+#include <asm/vm86.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#include <asm/ia32_unistd.h>
+#endif /* CONFIG_X86_64 */
+
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
+
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+
+#define COPY(x) do { /* regs->x = sc->x; uses the expansion site's regs and sc */ \
+ get_user_ex(regs->x, &sc->x); \
+} while (0)
+
+#define GET_SEG(seg) ({ /* evaluates to the unsigned short read from sc->seg */ \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ tmp; \
+})
+
+#define COPY_SEG(seg) do { /* regs->seg = sc->seg, value unmodified */ \
+ regs->seg = GET_SEG(seg); \
+} while (0)
+
+#define COPY_SEG_CPL3(seg) do { /* like COPY_SEG, but ORs 3 into the value */ \
+ regs->seg = GET_SEG(seg) | 3; \
+} while (0)
+
+#ifdef CONFIG_X86_64
+/*
+ * If regs->ss will cause an IRET fault, change it. Otherwise leave it
+ * alone. Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+ u32 ar; /* what LAR reports for regs->ss, or 0 when LAR rejects it */
+ asm volatile ("lar %[old_ss], %[ar]\n\t"
+ "jz 1f\n\t" /* If invalid: */
+ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
+ "1:"
+ : [ar] "=r" (ar)
+ : [old_ss] "rm" ((u16)regs->ss));
+
+ /*
+ * For a valid 64-bit user context, we need DPL 3, type
+ * read-write data or read-write exp-down data, and S and P
+ * set. We can't use VERW because VERW doesn't check the
+ * P bit.
+ */
+ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+ if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+ ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+ regs->ss = __USER_DS; /* neither accepted form matched: use __USER_DS */
+}
+#endif
+
+static int restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *sc,
+ unsigned long uc_flags)
+{ /* Reload regs and FPU state from the user sigcontext at sc; callers treat nonzero as a bad frame. */
+ unsigned long buf_val; /* sc->fpstate as a raw integer */
+ void __user *buf; /* the same value, as the user pointer handed to the FPU code */
+ unsigned int tmpflags; /* flags word as stored in the sigcontext */
+ unsigned int err = 0;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ get_user_try {
+
+#ifdef CONFIG_X86_32
+ set_user_gs(regs, GET_SEG(gs));
+ COPY_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+#endif /* CONFIG_X86_32 */
+
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip); COPY(ax);
+
+#ifdef CONFIG_X86_64
+ COPY(r8);
+ COPY(r9);
+ COPY(r10);
+ COPY(r11);
+ COPY(r12);
+ COPY(r13);
+ COPY(r14);
+ COPY(r15);
+#endif /* CONFIG_X86_64 */
+
+ COPY_SEG_CPL3(cs); /* cs and ss are loaded with their low two bits set */
+ COPY_SEG_CPL3(ss);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) &&
+ user_64bit_mode(regs)))
+ force_valid_ss(regs);
+#endif
+
+ get_user_ex(tmpflags, &sc->flags); /* only its FIX_EFLAGS bits are used */
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_ax = -1; /* disable syscall checks */
+
+ get_user_ex(buf_val, &sc->fpstate);
+ buf = (void __user *)buf_val;
+ } get_user_catch(err);
+
+ err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); /* folded into the same err */
+
+ force_iret();
+
+ return err;
+}
+
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{ /* Store regs, trap info, the fpstate pointer and mask into the user sigcontext at sc. */
+ int err = 0; /* returned as-is after put_user_catch() has had a chance to set it */
+
+ put_user_try {
+
+#ifdef CONFIG_X86_32
+ put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
+ put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
+ put_user_ex(regs->es, (unsigned int __user *)&sc->es);
+ put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
+#endif /* CONFIG_X86_32 */
+
+ put_user_ex(regs->di, &sc->di);
+ put_user_ex(regs->si, &sc->si);
+ put_user_ex(regs->bp, &sc->bp);
+ put_user_ex(regs->sp, &sc->sp);
+ put_user_ex(regs->bx, &sc->bx);
+ put_user_ex(regs->dx, &sc->dx);
+ put_user_ex(regs->cx, &sc->cx);
+ put_user_ex(regs->ax, &sc->ax);
+#ifdef CONFIG_X86_64
+ put_user_ex(regs->r8, &sc->r8);
+ put_user_ex(regs->r9, &sc->r9);
+ put_user_ex(regs->r10, &sc->r10);
+ put_user_ex(regs->r11, &sc->r11);
+ put_user_ex(regs->r12, &sc->r12);
+ put_user_ex(regs->r13, &sc->r13);
+ put_user_ex(regs->r14, &sc->r14);
+ put_user_ex(regs->r15, &sc->r15);
+#endif /* CONFIG_X86_64 */
+
+ put_user_ex(current->thread.trap_nr, &sc->trapno);
+ put_user_ex(current->thread.error_code, &sc->err);
+ put_user_ex(regs->ip, &sc->ip);
+#ifdef CONFIG_X86_32
+ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->sp, &sc->sp_at_signal); /* second copy of sp, 32-bit layout only */
+ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
+#else /* !CONFIG_X86_32 */
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->cs, &sc->cs);
+ put_user_ex(0, &sc->gs); /* gs and fs are stored as 0 in this layout */
+ put_user_ex(0, &sc->fs);
+ put_user_ex(regs->ss, &sc->ss);
+#endif /* CONFIG_X86_32 */
+
+ put_user_ex(fpstate, &sc->fpstate);
+
+ /* non-iBCS2 extensions.. */
+ put_user_ex(mask, &sc->oldmask);
+ put_user_ex(current->thread.cr2, &sc->cr2);
+ } put_user_catch(err);
+
+ return err;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+/*
+ * Determine which stack to use..
+ */
+static unsigned long align_sigframe(unsigned long sp)
+{
+#ifdef CONFIG_X86_32
+	/*
+	 * i386 ABI alignment: the handler has to be entered with
+	 * ((sp + 4) & 15) == 0.
+	 */
+	return ((sp + 4) & -16ul) - 4;
+#else /* !CONFIG_X86_32 */
+	/* 64-bit: round down to a 16-byte boundary, then back off by 8. */
+	return round_down(sp, 16) - 8;
+#endif
+}
+
+/*
+ * Pick the user stack address for a signal frame and save FPU state there.
+ */
+static void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+	     void __user **fpstate)
+{
+	/* Default to using normal stack */
+	unsigned long math_size = 0;
+	unsigned long sp = regs->sp;
+	unsigned long buf_fx = 0;
+	int onsigstack = on_sig_stack(sp);
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (IS_ENABLED(CONFIG_X86_64))
+		sp -= 128;	/* redzone */
+
+	/* This is the X/Open sanctioned signal stack switching. */
+	if (ka->sa.sa_flags & SA_ONSTACK) {
+		if (sas_ss_flags(sp) == 0)
+			sp = current->sas_ss_sp + current->sas_ss_size;
+	} else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack &&
+		   regs->ss != __USER_DS &&
+		   !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) {
+		/* This is the legacy signal stack switching. */
+		sp = (unsigned long) ka->sa.sa_restorer;
+	}
+
+	if (fpu->initialized) {
+		sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
+					  &buf_fx, &math_size);
+		*fpstate = (void __user *)sp;	/* left untouched otherwise */
+	}
+
+	sp = align_sigframe(sp - frame_size);
+
+	/*
+	 * If we are on the alternate signal stack and would overflow it, don't.
+	 * Return an always-bogus address instead so we will die with SIGSEGV.
+	 */
+	if (onsigstack && !likely(on_sig_stack(sp)))
+		return (void __user *)-1L;
+
+	/* save i387 and extended state; a failed save also yields -1 */
+	if (fpu->initialized &&
+	    copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
+		return (void __user *)-1L;
+
+	return (void __user *)sp;
+}
+
+#ifdef CONFIG_X86_32
+static const struct {
+ u16 poplmovl; /* two one-byte opcodes, see the initializer */
+ u32 val; /* immediate operand: the syscall number */
+ u16 int80;
+} __attribute__((packed)) retcode = { /* 2 + 4 + 2 bytes; written to the frame as one u64 */
+ 0xb858, /* popl %eax; movl $..., %eax */
+ __NR_sigreturn,
+ 0x80cd, /* int $0x80 */
+};
+
+static const struct {
+ u8 movl;
+ u32 val; /* immediate operand: the syscall number */
+ u16 int80;
+ u8 pad; /* brings the packed struct to 8 bytes; it is written as one u64 */
+} __attribute__((packed)) rt_retcode = {
+ 0xb8, /* movl $..., %eax */
+ __NR_rt_sigreturn,
+ 0x80cd, /* int $0x80 */
+ 0
+};
+
+static int
+__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
+ struct pt_regs *regs)
+{ /* Build a struct sigframe on the user stack and point regs at the handler; 0 or -EFAULT. */
+ struct sigframe __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL; /* set by get_sigframe() only when it saves FPU state */
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (__put_user(sig, &frame->sig))
+ return -EFAULT;
+
+ if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+ return -EFAULT;
+
+ if (_NSIG_WORDS > 1) { /* mask words beyond sig[0] go into extramask */
+ if (__copy_to_user(&frame->extramask, &set->sig[1],
+ sizeof(frame->extramask)))
+ return -EFAULT;
+ }
+
+ if (current->mm->context.vdso)
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_sigreturn;
+ else
+ restorer = &frame->retcode;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) /* an explicit restorer overrides both defaults */
+ restorer = ksig->ka.sa.sa_restorer;
+
+ /* Set up to return from userspace. */
+ err |= __put_user(restorer, &frame->pretcode);
+
+ /*
+ * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long)frame;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
+ regs->ax = (unsigned long)sig;
+ regs->dx = 0;
+ regs->cx = 0;
+
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+
+ return 0;
+}
+
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
+ sigset_t *set, struct pt_regs *regs)
+{ /* 32-bit variant: build a struct rt_sigframe on the user stack; 0 or -EFAULT. */
+ struct rt_sigframe __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL; /* set by get_sigframe() only when it saves FPU state */
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ put_user_try {
+ put_user_ex(sig, &frame->sig);
+ put_user_ex(&frame->info, &frame->pinfo);
+ put_user_ex(&frame->uc, &frame->puc);
+
+ /* Create the ucontext. */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+
+ /* Set up to return from userspace. */
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_rt_sigreturn;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) /* an explicit restorer overrides the vdso stub */
+ restorer = ksig->ka.sa.sa_restorer;
+ put_user_ex(restorer, &frame->pretcode);
+
+ /*
+ * This is movl $__NR_rt_sigreturn, %eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
+ } put_user_catch(err);
+
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long)frame;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
+ regs->ax = (unsigned long)sig;
+ regs->dx = (unsigned long)&frame->info;
+ regs->cx = (unsigned long)&frame->uc;
+
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+
+ return 0;
+}
+#else /* !CONFIG_X86_32 */
+/* Compute the uc_flags word stored in a new 64-bit or x32 signal frame. */
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+	/* UC_SIGCONTEXT_SS is always present; the other bits are conditional. */
+	unsigned long uc_flags = UC_SIGCONTEXT_SS;
+
+	if (boot_cpu_has(X86_FEATURE_XSAVE))
+		uc_flags |= UC_FP_XSTATE;
+
+	if (likely(user_64bit_mode(regs)))
+		uc_flags |= UC_STRICT_RESTORE_SS;
+
+	return uc_flags;
+}
+
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
+ sigset_t *set, struct pt_regs *regs)
+{ /* 64-bit variant: build a struct rt_sigframe on the user stack; 0 or -EFAULT. */
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL; /* set by get_sigframe() only when it saves FPU state */
+ int err = 0;
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) { /* siginfo is only written for SA_SIGINFO handlers */
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ put_user_try {
+ /* Create the ucontext. */
+ put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ /* x86-64 should always use SA_RESTORER. */
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode);
+ } else {
+ /* could use a vstub here */
+ err |= -EFAULT;
+ }
+ } put_user_catch(err);
+
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->di = sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value. We also have a compatibility issue here: DOSEMU
+ * relies on the contents of the SS register indicating the
+ * SS value at the time of the signal, even though that code in
+ * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
+ * avoids relying on sigreturn to restore SS; instead it uses
+ * a trampoline.) So we do our best: if the old SS was valid,
+ * we keep it. Otherwise we replace it.
+ */
+ regs->cs = __USER_CS;
+
+ if (unlikely(regs->ss != __USER_DS))
+ force_valid_ss(regs);
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+static int x32_setup_rt_frame(struct ksignal *ksig,
+ compat_sigset_t *set,
+ struct pt_regs *regs)
+{ /* Build a struct rt_sigframe_x32 on the user stack and point regs at the handler. */
+#ifdef CONFIG_X86_X32_ABI
+ struct rt_sigframe_x32 __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL; /* set by get_sigframe() only when it saves FPU state */
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) { /* siginfo is only written for SA_SIGINFO handlers */
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
+ return -EFAULT;
+ }
+
+ put_user_try {
+ /* Create the ucontext. */
+ put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+ put_user_ex(0, &frame->uc.uc__pad0);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
+ } else {
+ /* could use a vstub here */
+ restorer = NULL;
+ err |= -EFAULT; /* no restorer: the setup fails below */
+ }
+ put_user_ex(restorer, &frame->pretcode);
+ } put_user_catch(err);
+
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+#endif /* CONFIG_X86_X32_ABI */
+
+ return 0; /* also the entire body when CONFIG_X86_X32_ABI is not set */
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+#ifdef CONFIG_X86_32
+SYSCALL_DEFINE0(sigreturn)
+{ /* Undo a struct sigframe: restore the blocked mask and registers saved in it. */
+ struct pt_regs *regs = current_pt_regs();
+ struct sigframe __user *frame;
+ sigset_t set;
+
+ frame = (struct sigframe __user *)(regs->sp - 8); /* frame starts 8 bytes below sp */
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
+ && __copy_from_user(&set.sig[1], &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ /*
+ * x86_32 has no uc_flags bits relevant to restore_sigcontext.
+ * Save a few cycles by skipping the __get_user.
+ */
+ if (restore_sigcontext(regs, &frame->sc, 0))
+ goto badframe;
+ return regs->ax; /* the ax value just restored from the frame */
+
+badframe:
+ signal_fault(regs, frame, "sigreturn");
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+SYSCALL_DEFINE0(rt_sigreturn)
+{ /* Undo a struct rt_sigframe: restore the mask, registers and alternate stack settings. */
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned long uc_flags; /* read from the frame and passed to restore_sigcontext() */
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax; /* the ax value just restored from the frame */
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
+{
+	return IS_ENABLED(CONFIG_IA32_EMULATION) ?	/* never without emulation */
+	       (ksig->ka.sa.sa_flags & SA_IA32_ABI) != 0 : 0;
+}
+
+static inline int is_ia32_frame(struct ksignal *ksig)
+{	/* always on a CONFIG_X86_32 build, else only for ia32 compat handlers */
+	return IS_ENABLED(CONFIG_X86_32) ? 1 : !!is_ia32_compat_frame(ksig);
+}
+
+static inline int is_x32_frame(struct ksignal *ksig)
+{
+	return IS_ENABLED(CONFIG_X86_X32_ABI) ?	/* never without x32 support */
+	       (ksig->ka.sa.sa_flags & SA_X32_ABI) != 0 : 0;
+}
+
+static int
+setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+	int signr = ksig->sig;
+	sigset_t *mask = sigmask_to_save();
+	compat_sigset_t *compat_mask = (compat_sigset_t *) mask;
+
+	/*
+	 * Let rseq increment its event counter and fix up the
+	 * pre-signal frame before a new frame is built.
+	 */
+	rseq_signal_deliver(ksig, regs);
+
+	/* Dispatch to the frame builder for the handler's ABI. */
+	if (is_ia32_frame(ksig)) {
+		if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+			return ia32_setup_rt_frame(signr, ksig, compat_mask, regs);
+		return ia32_setup_frame(signr, ksig, compat_mask, regs);
+	}
+
+	if (is_x32_frame(ksig))
+		return x32_setup_rt_frame(ksig, compat_mask, regs);
+
+	return __setup_rt_frame(ksig->sig, ksig, mask, regs);
+}
+
+/*
+ * Deliver ksig: settle syscall restart, build the user frame, report outcome.
+ */
+static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
+{
+	bool stepping, failed;
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (v8086_mode(regs))
+		save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
+	/* Are we from a system call? */
+	if (syscall_get_nr(current, regs) >= 0) {
+		/* If so, check system call restarting.. */
+		switch (syscall_get_error(current, regs)) {
+		case -ERESTART_RESTARTBLOCK:
+		case -ERESTARTNOHAND:
+			regs->ax = -EINTR;
+			break;
+
+		case -ERESTARTSYS:
+			if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
+				regs->ax = -EINTR;
+				break;
+			}
+		/* fallthrough */
+		case -ERESTARTNOINTR:
+			regs->ax = regs->orig_ax;
+			regs->ip -= 2;	/* back onto the syscall insn at ip - 2 */
+			break;
+		}
+	}
+
+	/*
+	 * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+	 * so that register information in the sigcontext is correct and
+	 * then notify the tracer before entering the signal handler.
+	 */
+	stepping = test_thread_flag(TIF_SINGLESTEP);
+	if (stepping)
+		user_disable_single_step(current);
+
+	failed = (setup_rt_frame(ksig, regs) < 0);
+	if (!failed) {
+		/*
+		 * Clear the direction flag as per the ABI for function entry.
+		 *
+		 * Clear RF when entering the signal handler, because
+		 * it might disable possible debug exception from the
+		 * signal handler.
+		 *
+		 * Clear TF for the case when it wasn't set by debugger to
+		 * avoid the recursive send_sigtrap() in SIGTRAP handler.
+		 */
+		regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+		/* Ensure the signal handler starts with the new fpu state. */
+		if (fpu->initialized)
+			fpu__clear(fpu);
+	}
+	signal_setup_done(failed, ksig, stepping);
+}
+
+static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
+{
+	unsigned long nr = __NR_restart_syscall;
+
+	/*
+	 * This function is fundamentally broken as currently
+	 * implemented.
+	 *
+	 * What we want is to invoke the restart_syscall() syscall in
+	 * such a way that in_ia32_syscall(), in_x32_syscall(), etc.
+	 * give the same answers as they did for the syscall being
+	 * restarted. The assumption is that the syscall instruction
+	 * at (regs->ip - 2) is the same kind of instruction that was
+	 * used to enter the kernel in the first place.
+	 *
+	 * The trouble is that ptrace can poke syscall-like values
+	 * into regs even when we are not in a syscall at all.
+	 *
+	 * For now, keep the historical behavior and guess based on
+	 * stored state. Recording the actual syscall arch in
+	 * restart_block, or (with caveats on x32) checking whether
+	 * regs->ip points to 'int $0x80', would do better. The guess
+	 * is wrong when tracer and tracee differ in bitness.
+	 */
+#ifdef CONFIG_IA32_EMULATION
+	if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+		return __NR_ia32_restart_syscall;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+	/* Carry over __X32_SYSCALL_BIT from the interrupted syscall number. */
+	nr |= regs->orig_ax & __X32_SYSCALL_BIT;
+#endif
+	return nr;
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+void do_signal(struct pt_regs *regs)
+{ /* Deliver one pending signal if get_signal() finds one; otherwise handle syscall restart. */
+ struct ksignal ksig;
+
+ if (get_signal(&ksig)) {
+ /* Whee! Actually deliver the signal. */
+ handle_signal(&ksig, regs);
+ return;
+ }
+
+ /* Did we come from a system call? */
+ if (syscall_get_nr(current, regs) >= 0) {
+ /* Restart the system call - no handlers present */
+ switch (syscall_get_error(current, regs)) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ regs->ax = regs->orig_ax; /* same syscall number again */
+ regs->ip -= 2;
+ break;
+
+ case -ERESTART_RESTARTBLOCK:
+ regs->ax = get_nr_restart_syscall(regs); /* the restart syscall runs instead */
+ regs->ip -= 2;
+ break;
+ }
+ }
+
+ /*
+ * If there's no signal to deliver, we just put the saved sigmask
+ * back.
+ */
+ restore_saved_sigmask();
+}
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+	struct task_struct *tsk = current;
+
+	/* Log the bad frame (if enabled and not rate-limited), then force SIGSEGV. */
+	if (!show_unhandled_signals || !printk_ratelimit())
+		goto kill;
+	printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+	       task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
+	       tsk->comm, tsk->pid, where, frame,
+	       regs->ip, regs->sp, regs->orig_ax);
+	print_vma_addr(KERN_CONT " in ", regs->ip);
+	pr_cont("\n");
+kill:
+	force_sig(SIGSEGV, tsk);
+}
+
+#ifdef CONFIG_X86_X32_ABI
+asmlinkage long sys32_x32_rt_sigreturn(void)
+{ /* x32 counterpart of rt_sigreturn: undo a struct rt_sigframe_x32. */
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long uc_flags; /* read from the frame and passed to restore_sigcontext() */
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); /* frame starts 8 bytes below sp */
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack)) /* compat layout of uc_stack */
+ goto badframe;
+
+ return regs->ax; /* the ax value just restored from the frame */
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 0000000..9ccbf05
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+
+/*
+ * The compat_siginfo_t structure and handling code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the compat_siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+static inline void signal_compat_build_tests(void)
+{ /* Contains only BUILD_BUG_ON() layout checks for siginfo_t and compat_siginfo_t. */
+ int _sifields_offset = offsetof(compat_siginfo_t, _sifields);
+
+ /*
+ * If adding a new si_code, there is probably new data in
+ * the siginfo. Make sure folks bumping the si_code
+ * limits also have to look at this code. Make sure any
+ * new fields are handled in copy_siginfo_to_user32()!
+ */
+ BUILD_BUG_ON(NSIGILL != 11);
+ BUILD_BUG_ON(NSIGFPE != 15);
+ BUILD_BUG_ON(NSIGSEGV != 7);
+ BUILD_BUG_ON(NSIGBUS != 5);
+ BUILD_BUG_ON(NSIGTRAP != 5);
+ BUILD_BUG_ON(NSIGCHLD != 6);
+ BUILD_BUG_ON(NSIGSYS != 1);
+
+ /* This is part of the ABI and can never change in size: */
+ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
+ /*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * move and are always at the beginning:
+ */
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int));
+#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name))
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8);
+
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8);
+ /*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to be updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name))
+#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name))
+
+ CHECK_CSI_OFFSET(_kill);
+ CHECK_CSI_SIZE (_kill, 2*sizeof(int));
+ CHECK_SI_SIZE (_kill, 2*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
+
+ CHECK_CSI_OFFSET(_timer);
+ CHECK_CSI_SIZE (_timer, 3*sizeof(int));
+ CHECK_SI_SIZE (_timer, 6*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
+
+ CHECK_CSI_OFFSET(_rt);
+ CHECK_CSI_SIZE (_rt, 3*sizeof(int));
+ CHECK_SI_SIZE (_rt, 4*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
+
+ CHECK_CSI_OFFSET(_sigchld);
+ CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
+ CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C);
+
+#ifdef CONFIG_X86_X32_ABI
+ CHECK_CSI_OFFSET(_sigchld_x32);
+ CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
+ /* no _sigchld_x32 in the generic siginfo_t */
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20);
+#endif
+
+ CHECK_CSI_OFFSET(_sigfault);
+ CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
+ CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18);
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
+
+ CHECK_CSI_OFFSET(_sigpoll);
+ CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
+ CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10);
+
+ CHECK_CSI_OFFSET(_sigsys);
+ CHECK_CSI_SIZE (_sigsys, 3*sizeof(int));
+ CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14);
+
+ /* any new si_fields should be added here */
+}
+
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+	signal_compat_build_tests();
+
+	/* The ABI marker bits are in-kernel only: never show them to user space. */
+	if (oact)
+		oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+	if (act) {
+		/*
+		 * Discard whatever user space put in those bits and set
+		 * them from the ABI of the syscall being executed.
+		 */
+		act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+		if (in_ia32_syscall())
+			act->sa.sa_flags |= SA_IA32_ABI;
+		if (in_x32_syscall())
+			act->sa.sa_flags |= SA_X32_ABI;
+	}
+}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
new file mode 100644
index 0000000..04adc8d
--- /dev/null
+++ b/arch/x86/kernel/smp.c
@@ -0,0 +1,328 @@
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/gfp.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+#include <asm/kexec.h>
+#include <asm/virtext.h>
+
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ * None of 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ * B stepping CPUs may hang. There are hardware work arounds
+ * for this. We warn about it in case your board doesn't have the work
+ * arounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
+ */
+
+static atomic_t stopping_cpu = ATOMIC_INIT(-1); /* id of the CPU that claimed native_stop_other_cpus(); -1 while unclaimed */
+static bool smp_no_nmi_ipi = false; /* set by the "nonmi_ipi" boot option; disables the NMI fallback in native_stop_other_cpus() */
+
+/*
+ * Send a RESCHEDULE_VECTOR IPI to @cpu. Nothing is serialized here; the
+ * worst case is a lost reschedule. An offline target only triggers a
+ * warning, no IPI is sent to it.
+ */
+static void native_smp_send_reschedule(int cpu)
+{
+	if (likely(!cpu_is_offline(cpu))) {
+		apic->send_IPI(cpu, RESCHEDULE_VECTOR);
+		return;
+	}
+	WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
+}
+
+void native_send_call_func_single_ipi(int cpu)
+{
+ apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); /* IPI exactly one CPU (@cpu) with the single-call vector */
+}
+
+void native_send_call_func_ipi(const struct cpumask *mask)
+{
+	cpumask_var_t others;
+
+	/* Send CALL_FUNCTION_VECTOR to @mask; with no scratch mask, use the plain mask send */
+	if (!alloc_cpumask_var(&others, GFP_ATOMIC)) {
+		apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+		return;
+	}
+	/* others = all online CPUs except the current one */
+	cpumask_copy(others, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), others);
+
+	if (!cpumask_equal(mask, others) ||
+	    !cpumask_equal(cpu_online_mask, cpu_callout_mask))
+		apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+	else
+		apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+	free_cpumask_var(others);
+}
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+	/*
+	 * Also registered on the stopping CPU: that one only acks the NMI.
+	 */
+	if (raw_smp_processor_id() != atomic_read(&stopping_cpu)) {
+		cpu_emergency_vmxoff();
+		stop_this_cpu(NULL);
+	}
+	return NMI_HANDLED;
+}
+
+/*
+ * Reboot IPI handler: presumably where REBOOT_VECTOR, sent by native_stop_other_cpus(), is delivered (confirm in the IDT setup).
+ */
+
+asmlinkage __visible void smp_reboot_interrupt(void)
+{
+ ipi_entering_ack_irq();
+ cpu_emergency_vmxoff();
+ stop_this_cpu(NULL);
+ irq_exit(); /* NOTE(review): only reached if stop_this_cpu() returns - confirm whether it ever does */
+}
+
+static void native_stop_other_cpus(int wait)
+{
+ unsigned long flags;
+ unsigned long timeout;
+
+ if (reboot_force) /* reboot_force set: return without touching the other CPUs */
+ return;
+
+ /*
+ * Stop all other online CPUs; with @wait != 0 the polling loops below have no timeout.
+ * An own vector is used because smp_call_function does lots of things not suitable in a panic situation.
+ */
+
+ /*
+ * We start by using the REBOOT_VECTOR irq.
+ * The irq is treated as a sync point to allow critical
+ * regions of code on other cpus to release their spin locks
+ * and re-enable irqs. Jumping straight to an NMI might
+ * accidentally cause deadlocks with further shutdown/panic
+ * code. By syncing, we give the cpus up to one second to
+ * finish their work before we force them off with the NMI.
+ */
+ if (num_online_cpus() > 1) {
+ /* did someone beat us here? */
+ if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
+ return;
+
+ /* sync above data before sending IRQ */
+ wmb();
+
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
+
+ /*
+ * Don't wait longer than a second if the caller
+ * didn't ask us to wait.
+ */
+ timeout = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && (wait || timeout--))
+ udelay(1);
+ }
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
+ if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ /* Hope the REBOOT_IRQ is good enough */
+ goto finish;
+
+ /* sync above data before sending IRQ */
+ wmb();
+
+ pr_emerg("Shutting down cpus with NMI\n");
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+
+ /*
+ * Don't wait longer than 10 ms if the caller
+ * didn't ask us to wait.
+ */
+ timeout = USEC_PER_MSEC * 10;
+ while (num_online_cpus() > 1 && (wait || timeout--))
+ udelay(1);
+ }
+
+finish:
+ local_irq_save(flags); /* the calling CPU's own cleanup runs with local interrupts off */
+ disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+ local_irq_restore(flags);
+}
+
+/*
+ * Reschedule IPI handler. KVM also uses this interrupt to force a CPU
+ * out of guest mode. The irq_enter()/irq_exit() bracket is only taken
+ * here when the resched IPI tracepoint is enabled.
+ */
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	inc_irq_stat(irq_resched_count);
+	kvm_set_cpu_l1tf_flush_l1d();
+
+	/* Untraced path: go straight to the scheduler */
+	if (!trace_resched_ipi_enabled()) {
+		scheduler_ipi();
+		return;
+	}
+
+	/* scheduler_ipi() might call irq_enter() as well; nested calls are fine */
+	irq_enter();
+	trace_reschedule_entry(RESCHEDULE_VECTOR);
+	scheduler_ipi();
+	trace_reschedule_exit(RESCHEDULE_VECTOR);
+	irq_exit();
+}
+
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
+{
+ ipi_entering_ack_irq(); /* handler for the CALL_FUNCTION_VECTOR IPI (see the trace calls below) */
+ trace_call_function_entry(CALL_FUNCTION_VECTOR);
+ inc_irq_stat(irq_call_count); /* same counter as smp_call_function_single_interrupt() */
+ generic_smp_call_function_interrupt(); /* the actual work is done by this helper, which is not defined in this file */
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
+ exiting_irq();
+}
+
+__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
+{
+ ipi_entering_ack_irq(); /* handler for the CALL_FUNCTION_SINGLE_VECTOR IPI (see the trace calls below) */
+ trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
+ inc_irq_stat(irq_call_count); /* same counter as smp_call_function_interrupt() */
+ generic_smp_call_function_single_interrupt(); /* the actual work is done by this helper, which is not defined in this file */
+ trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
+ exiting_irq();
+}
+
+static int __init nonmi_ipi_setup(char *str)
+{
+ smp_no_nmi_ipi = true; /* makes native_stop_other_cpus() skip its NMI fallback; @str is not looked at */
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup); /* boot option "nonmi_ipi" */
+
+struct smp_ops smp_ops = { /* SMP operations table filled with the native_* implementations */
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .smp_cpus_done = native_smp_cpus_done,
+
+ .stop_other_cpus = native_stop_other_cpus,
+#if defined(CONFIG_KEXEC_CORE)
+ .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, /* the only hook here that is not a native_* function */
+#endif
+ .smp_send_reschedule = native_smp_send_reschedule,
+
+ .cpu_up = native_cpu_up,
+ .cpu_die = native_cpu_die,
+ .cpu_disable = native_cpu_disable,
+ .play_dead = native_play_dead,
+
+ .send_call_func_ipi = native_send_call_func_ipi,
+ .send_call_func_single_ipi = native_send_call_func_single_ipi,
+};
+EXPORT_SYMBOL_GPL(smp_ops); /* the table itself is exported, GPL-only */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
new file mode 100644
index 0000000..f02ecaf
--- /dev/null
+++ b/arch/x86/kernel/smpboot.c
@@ -0,0 +1,1699 @@
+ /*
+ * x86 SMP booting functions
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ * Copyright 2001 Andi Kleen, SuSE Labs.
+ *
+ * Much of the core SMP work is based on previous work by Thomas Radke, to
+ * whom a great many thanks are extended.
+ *
+ * Thanks to Intel for making available several different Pentium,
+ * Pentium Pro and Pentium-II/Xeon MP machines.
+ * Original development of Linux SMP code supported by Caldera.
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ *
+ * Fixes
+ * Felix Koop : NR_CPUS used properly
+ * Jose Renau : Handle single CPU case.
+ * Alan Cox : By repeated request 8) - Total BogoMIPS report.
+ * Greg Wright : Fix for kernel stacks panic.
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Matthias Sattler : Changes for 2.1 kernel map.
+ * Michel Lespinasse : Changes for 2.1 kernel map.
+ * Michael Chastain : Change trampoline.S to gnu as.
+ * Alan Cox : Dumb bug: 'B' step PPro's are fine
+ * Ingo Molnar : Added APIC timers, based on code
+ * from Jose Renau
+ * Ingo Molnar : various cleanups and rewrites
+ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs
+ * Andi Kleen : Changed for SMP boot into long mode.
+ * Martin J. Bligh : Added support for multi-quad systems
+ * Dave Jones : Report invalid combinations of Athlon CPUs.
+ * Rusty Russell : Hacked into shape for new "hotplug" boot process.
+ * Andi Kleen : Converted to new state machine.
+ * Ashok Raj : CPU hotplug support
+ * Glauber Costa : i386 and x86_64 integration
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/percpu.h>
+#include <linux/bootmem.h>
+#include <linux/err.h>
+#include <linux/nmi.h>
+#include <linux/tboot.h>
+#include <linux/stackprotector.h>
+#include <linux/gfp.h>
+#include <linux/cpuidle.h>
+
+#include <asm/acpi.h>
+#include <asm/desc.h>
+#include <asm/nmi.h>
+#include <asm/irq.h>
+#include <asm/realmode.h>
+#include <asm/cpu.h>
+#include <asm/numa.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mtrr.h>
+#include <asm/mwait.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/fpu/internal.h>
+#include <asm/setup.h>
+#include <asm/uv/uv.h>
+#include <linux/mc146818rtc.h>
+#include <asm/i8259.h>
+#include <asm/misc.h>
+#include <asm/qspinlock.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/hw_irq.h>
+
+/* representing HT siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
+EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* presumably the storage behind cpu_llc_shared_mask() - confirm; not exported */
+
+/* Per CPU bogomips and other parameters */
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+/* Logical package management. We might want to allocate that dynamically */
+unsigned int __max_logical_packages __read_mostly;
+EXPORT_SYMBOL(__max_logical_packages);
+static unsigned int logical_packages __read_mostly; /* next logical id; post-incremented in topology_update_package_map() */
+
+/* Maximum number of SMT threads on any online core */
+int __read_mostly __max_smt_threads = 1; /* only ever raised, in set_cpu_sibling_map() */
+
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update; /* read and cleared by arch_update_cpu_topology() */
+
+int arch_update_cpu_topology(void)
+{
+	const int pending = x86_topology_update;	/* flag value as found on entry */
+
+	x86_topology_update = false;	/* cleared unconditionally */
+	return pending;
+}
+
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags); /* CMOS access is done under rtc_lock with interrupts off */
+ CMOS_WRITE(0xa, 0xf); /* presumably value 0xa into CMOS register 0xf - confirm CMOS_WRITE() argument order */
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4; /* @start_eip split in two 16-bit stores: everything above the low 4 bits ... */
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf; /* ... and the low 4 bits */
+}
+
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ unsigned long flags;
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values (undoes smpboot_setup_warm_reset_vector()).
+ */
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; /* one 32-bit store; NOTE(review): presumably covers both 16-bit words the setup helper wrote - confirm */
+}
+
+/*
+ * Report back to the Boot Processor during boot time or to the caller processor
+ * during CPU online.
+ */
+static void smp_callin(void)
+{
+ int cpuid, phys_id;
+
+ /*
+ * If waken up by an INIT in an 82489DX configuration
+ * cpu_callout_mask guarantees we don't get here before
+ * an INIT_deassert IPI reaches our local APIC, so it is
+ * now safe to touch our local APIC.
+ */
+ cpuid = smp_processor_id();
+
+ /*
+ * (This works even if the APIC is not enabled.)
+ */
+ phys_id = read_apic_id(); /* NOTE(review): phys_id is not used again in this function */
+
+ /*
+ * the boot CPU has finished the init stage and is spinning
+ * on cpu_callin_mask until we finish. We are free to set up this
+ * CPU, first the APIC. (this is probably redundant on most
+ * boards)
+ */
+ apic_ap_setup();
+
+ /*
+ * Save our processor parameters. Note: this information
+ * is needed for clock calibration.
+ */
+ smp_store_cpu_info(cpuid);
+
+ /*
+ * The topology information must be up to date before
+ * calibrate_delay() and notify_cpu_starting().
+ */
+ set_cpu_sibling_map(raw_smp_processor_id());
+
+ /*
+ * Get our bogomips.
+ * Update loops_per_jiffy in cpu_data. Previous call to
+ * smp_store_cpu_info() stored a value that is close but not as
+ * accurate as the value just calculated.
+ */
+ calibrate_delay();
+ cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
+ pr_debug("Stack at about %p\n", &cpuid); /* address of a local, as a rough stack position */
+
+ wmb(); /* NOTE(review): presumably orders the stores above before the notifications below - confirm */
+
+ notify_cpu_starting(cpuid);
+
+ /*
+ * Allow the master to continue.
+ */
+ cpumask_set_cpu(cpuid, cpu_callin_mask);
+}
+
+static int cpu0_logical_apicid; /* not referenced in the part of the file reviewed here */
+static int enable_start_cpu0; /* tested in wakeup_cpu0_nmi(), cleared in start_secondary() */
+/*
+ * Activate a secondary processor.
+ */
+static void notrace start_secondary(void *unused)
+{
+ /*
+ * Don't put *anything* except direct CPU state initialization
+ * before cpu_init(), SMP booting is too fragile that we want to
+ * limit the things done here to the most necessary things.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ __write_cr4(__read_cr4() | X86_CR4_PCIDE);
+
+#ifdef CONFIG_X86_32
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+ __flush_tlb_all();
+#endif
+ load_current_idt();
+ cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
+ preempt_disable();
+ smp_callin(); /* sets this CPU's bit in cpu_callin_mask as its last step */
+
+ enable_start_cpu0 = 0; /* cleared by every CPU that comes up through this function */
+
+ /* otherwise gcc will move up smp_processor_id before the cpu_init */
+ barrier();
+ /*
+ * Check TSC synchronization with the boot CPU:
+ */
+ check_tsc_sync_target();
+
+ speculative_store_bypass_ht_init();
+
+ /*
+ * Lock vector_lock, set CPU online and bring the vector
+ * allocator online. Online must be set with vector_lock held
+ * to prevent a concurrent irq setup/teardown from seeing a
+ * half valid vector space.
+ */
+ lock_vector_lock();
+ set_cpu_online(smp_processor_id(), true);
+ lapic_online();
+ unlock_vector_lock();
+ cpu_set_state_online(smp_processor_id());
+ x86_platform.nmi_init();
+
+ /* enable local interrupts */
+ local_irq_enable();
+
+ /* to prevent fake stack check failure in clock setup */
+ boot_init_stack_canary();
+
+ x86_cpuinit.setup_percpu_clockev();
+
+ wmb();
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* last statement; nothing follows it in this function */
+}
+
+/**
+ * topology_is_primary_thread - Check whether CPU is the primary SMT thread
+ * @cpu: CPU to check; the answer is derived from its x86_cpu_to_apicid entry
+ */
+bool topology_is_primary_thread(unsigned int cpu)
+{
+ return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
+}
+
+/**
+ * topology_smt_supported - Check whether SMT is supported by the CPUs
+ */
+bool topology_smt_supported(void)
+{
+ return smp_num_siblings > 1; /* same test as has_smt in set_cpu_sibling_map() */
+}
+
+/**
+ * topology_phys_to_logical_pkg - Look up the logical id of a physical package
+ * @phys_pkg:	physical package id to search for
+ *
+ * Returns the logical id of the first initialized possible CPU in @phys_pkg, or -1
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (cpu_data(i).initialized &&
+		    cpu_data(i).phys_proc_id == phys_pkg)
+			return cpu_data(i).logical_proc_id;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+/**
+ * topology_update_package_map - Update the physical to logical package map
+ * @pkg:	The physical package id as retrieved via CPUID
+ * @cpu:	The cpu for which this is updated
+ *
+ * Reuses the logical id already assigned to @pkg, else takes the next free one.
+ * Always returns 0.
+ */
+int topology_update_package_map(unsigned int pkg, unsigned int cpu)
+{
+	int logical = topology_phys_to_logical_pkg(pkg);
+
+	if (logical < 0) {
+		/* Package not seen on any initialized CPU: allocate a new id */
+		logical = logical_packages++;
+		if (logical != pkg) {
+			pr_info("CPU %u Converting physical %u to logical package %d\n",
+				cpu, pkg, logical);
+		}
+	}
+	cpu_data(cpu).logical_proc_id = logical;
+	return 0;
+}
+
+void __init smp_store_boot_cpu_info(void)
+{
+ int id = 0; /* CPU 0 */
+ struct cpuinfo_x86 *c = &cpu_data(id);
+
+ *c = boot_cpu_data; /* seed CPU 0's cpu_data() entry with a copy of boot_cpu_data */
+ c->cpu_index = id;
+ topology_update_package_map(c->phys_proc_id, id);
+ c->initialized = true; /* set after the map update: topology_phys_to_logical_pkg() only considers initialized entries */
+}
+
+/*
+ * Record the parameters of CPU @id in its cpu_data() entry: start from
+ * boot_cpu_data on the first bringup, then run identify_secondary_cpu().
+ */
+void smp_store_cpu_info(int id)
+{
+	struct cpuinfo_x86 *info = &cpu_data(id);
+
+	/* An already initialized entry is kept, not reset to boot_cpu_data */
+	if (!info->initialized)
+		*info = boot_cpu_data;
+	info->cpu_index = id;
+	/*
+	 * CPU0 has this set up already during boot; this path serves APs
+	 * and a CPU0 that was offlined and is brought up again.
+	 */
+	identify_secondary_cpu(info);
+	info->initialized = true;
+}
+
+static bool
+topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	/* True when both CPUs, identified by their cpu_index, */
+	/* map to the same cpu_to_node() id. */
+	return cpu_to_node(c->cpu_index) == cpu_to_node(o->cpu_index);
+}
+
+static bool
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
+{
+	/* Sane means same node; a mismatch goes through WARN_ONCE() and gives false */
+	return !WARN_ONCE(!topology_same_node(c, o),
+		"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+		"[node: %d != %d]. Ignoring dependency.\n",
+		c->cpu_index, name, o->cpu_index,
+		cpu_to_node(c->cpu_index), cpu_to_node(o->cpu_index));
+}
+/* link_mask: set c1 in mfunc(c2) and c2 in mfunc(c1); mfunc(cpu) must yield that CPU's cpumask */
+#define link_mask(mfunc, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), mfunc(c2)); \
+ cpumask_set_cpu((c2), mfunc(c1)); \
+} while (0)
+
+static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	bool siblings;
+
+	/* Both rule sets below require the two CPUs to be in the same package */
+	if (c->phys_proc_id != o->phys_proc_id)
+		return false;
+
+	if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+		/* Without TOPOEXT the core id alone decides */
+		siblings = c->cpu_core_id == o->cpu_core_id;
+	} else if (per_cpu(cpu_llc_id, c->cpu_index) !=
+		   per_cpu(cpu_llc_id, o->cpu_index)) {
+		siblings = false;
+	} else {
+		/* TOPOEXT: same core id, or equal cu_id with neither being 0xff */
+		siblings = c->cpu_core_id == o->cpu_core_id ||
+			   (c->cu_id != 0xff && o->cu_id != 0xff &&
+			    c->cu_id == o->cu_id);
+	}
+
+	return siblings && topology_sane(c, o, "smt");
+}
+
+/*
+ * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ *
+ * These are Intel CPUs that enumerate an LLC that is shared by
+ * multiple NUMA nodes. The LLC on these systems is shared for
+ * off-package data access but private to the NUMA node (half
+ * of the package) for on-package access.
+ *
+ * CPUID (the source of the information about the LLC) can only
+ * enumerate the cache as being shared *or* unshared, but not
+ * this particular configuration. The CPU in this case enumerates
+ * the cache to be shared across the entire package (spanning both
+ * NUMA nodes).
+ */
+
+static const struct x86_cpu_id snc_cpu[] = { /* consulted by match_llc() through x86_match_cpu() */
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+ {} /* presumably the empty terminator x86_match_cpu() expects - confirm */
+};
+
+static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	const int a = c->cpu_index;
+	const int b = o->cpu_index;
+
+	/*
+	 * No match when 'c' has no valid LLC id (BAD_APICID) or when the
+	 * two LLC ids differ.
+	 */
+	if (per_cpu(cpu_llc_id, a) == BAD_APICID ||
+	    per_cpu(cpu_llc_id, a) != per_cpu(cpu_llc_id, b))
+		return false;
+
+	/*
+	 * SNC topology is allowed without a warning: 'c' is simply reported
+	 * as not sharing the LLC of 'o', which is what userspace will see.
+	 */
+	if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+		return false;
+
+	return topology_sane(c, o, "llc");
+}
+
+/*
+ * Two CPUs match at this level when their phys_proc_id is equal.
+ *
+ * Unlike the other levels, a multicore group is not forced to stay
+ * inside one NUMA node here. If it does span nodes, the MC level of
+ * the topology is discarded later.
+ */
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	return c->phys_proc_id == o->phys_proc_id;
+}
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; /* extra flag only while the ITMT sysctl is enabled */
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags(); /* flags callback of the MC entries in the topology tables below */
+}
+#endif /* CONFIG_SCHED_MC */
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+ return cpu_smt_flags() | x86_sched_itmt_flags(); /* flags callback of the SMT entries in the topology tables below */
+}
+#endif /* CONFIG_SCHED_SMT */
+#endif /* CONFIG_SCHED_SMT || CONFIG_SCHED_MC */
+
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = { /* same levels as x86_topology minus DIE */
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { NULL, }, /* NULL mask entry closes the table */
+};
+
+static struct sched_domain_topology_level x86_topology[] = { /* SMT and MC levels (when configured) plus DIE */
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, /* the only entry without a flags callback */
+ { NULL, }, /* NULL mask entry closes the table */
+};
+
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package; /* set (never cleared) in set_cpu_sibling_map() */
+
+void set_cpu_sibling_map(int cpu)
+{
+ bool has_smt = smp_num_siblings > 1; /* links @cpu into the sibling, LLC and core masks of the CPUs set up so far */
+ bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i, threads;
+
+ cpumask_set_cpu(cpu, cpu_sibling_setup_mask); /* @cpu itself is part of both scans below */
+
+ if (!has_mp) { /* no SMT and a single core: each mask gets just @cpu */
+ cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+ c->booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(topology_sibling_cpumask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_llc(c, o)))
+ link_mask(cpu_llc_shared_mask, cpu, i);
+
+ }
+
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * topology_sibling_cpumask links to be set-up.
+ */
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mp && match_die(c, o))) {
+ link_mask(topology_core_cpumask, cpu, i);
+
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpumask_weight(
+ topology_sibling_cpumask(cpu)) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (cpumask_first(
+ topology_sibling_cpumask(i)) == i)
+ c->booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ cpu_data(i).booted_cores++;
+ } else if (i != cpu && !c->booted_cores) /* @cpu joins an existing core: copy the count from a package peer */
+ c->booted_cores = cpu_data(i).booted_cores;
+ }
+ if (match_die(c, o) && !topology_same_node(c, o)) /* same package but different node */
+ x86_has_numa_in_package = true;
+ }
+
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+}
+
+/* Mask for the multi-core (MC) sched domain level of @cpu: whatever cpu_llc_shared_mask() returns for it */
+const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+ return cpu_llc_shared_mask(cpu);
+}
+
+static void impress_friends(void)
+{
+	unsigned long lpj_sum = 0;
+	int i;
+
+	/* Add up loops_per_jiffy of all called-out CPUs and log it as BogoMIPS */
+	pr_debug("Before bogomips\n");
+	for_each_possible_cpu(i) {
+		if (!cpumask_test_cpu(i, cpu_callout_mask))
+			continue;
+		lpj_sum += cpu_data(i).loops_per_jiffy;
+	}
+	pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
+		num_online_cpus(), lpj_sum / (500000 / HZ),
+		(lpj_sum / (5000 / HZ)) % 100);
+
+	pr_debug("Before bogocount - setting activated=1\n");
+}
+
+void __inquire_remote_apic(int apicid)
+{
+	/* Registers fetched with a remote read; names[] labels them in the log */
+	unsigned idx, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+	const char * const names[] = { "ID", "VERSION", "SPIV" };
+	int polls;
+	u32 rr;
+
+	/* Log one line per register of the APIC identified by @apicid */
+	pr_info("Inquiring remote APIC 0x%x...\n", apicid);
+
+	for (idx = 0; idx < ARRAY_SIZE(regs); idx++) {
+		pr_info("... APIC 0x%x %s: ", apicid, names[idx]);
+
+		/* Wait for idle; a non-zero result is only logged */
+		if (safe_apic_wait_icr_idle())
+			pr_cont("a previous APIC delivery may have failed\n");
+
+		apic_icr_write(APIC_DM_REMRD | regs[idx], apicid);
+
+		/* Poll the remote-read status in 100us steps while it is in progress */
+		polls = 0;
+		for (;;) {
+			udelay(100);
+			rr = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+			if (rr != APIC_ICR_RR_INPROG || polls++ >= 1000)
+				break;
+		}
+
+		if (rr == APIC_ICR_RR_VALID) {
+			rr = apic_read(APIC_RRR);
+			pr_cont("%08x\n", rr);
+		} else {
+			pr_cont("failed\n");
+		}
+	}
+}
+
+/*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "cpu_init_udelay=" is available to over-ride this delay.
+ * Modern processor families are quirked to remove the delay entirely.
+ */
+#define UDELAY_10MS_DEFAULT 10000
+
+static unsigned int init_udelay = UINT_MAX; /* UINT_MAX means "not set"; resolved in smp_quirk_init_udelay() */
+
+static int __init cpu_init_udelay(char *str)
+{
+ get_option(&str, &init_udelay); /* NOTE(review): init_udelay is unsigned int - check against get_option()'s pointer parameter type; result is not checked */
+
+ return 0;
+}
+early_param("cpu_init_udelay", cpu_init_udelay);
+
+static void __init smp_quirk_init_udelay(void)
+{
+	const struct cpuinfo_x86 *bsp = &boot_cpu_data;
+	bool no_delay;
+
+	/* Anything other than the UINT_MAX default came from the cmdline: keep it */
+	if (init_udelay != UINT_MAX)
+		return;
+
+	/* Intel family 6 and AMD family >= 0xF: no delay; all others: legacy 10ms */
+	no_delay = (bsp->x86_vendor == X86_VENDOR_INTEL && bsp->x86 == 6) ||
+		   (bsp->x86_vendor == X86_VENDOR_AMD && bsp->x86 >= 0xF);
+
+	init_udelay = no_delay ? 0 : UDELAY_10MS_DEFAULT;
+}
+
+/*
+ * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
+ * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
+ * won't ... remember to clear down the APIC, etc later.
+ */
+int
+wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
+{
+ unsigned long send_status, accept_status = 0; /* note: @start_eip is not used in this function */
+ int maxlvt;
+
+ /* Target chip */
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ }
+ pr_debug("NMI sent\n");
+
+ if (send_status)
+ pr_err("APIC never delivered???\n");
+ if (accept_status)
+ pr_err("APIC delivery error (%lx)\n", accept_status);
+
+ return (send_status | accept_status); /* 0 only when both the send and the accept status are clean */
+}
+
+static int
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int maxlvt, num_starts, j;
+
+ maxlvt = lapic_get_maxlvt();
+
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+
+ pr_debug("Asserting INIT\n");
+
+ /*
+ * Turn INIT on target chip
+ */
+ /*
+ * Send IPI
+ */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
+ phys_apicid);
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle(); /* NOTE(review): overwritten by the de-assert wait below without being checked */
+
+ udelay(init_udelay); /* 0 on quirked "modern" CPUs, see smp_quirk_init_udelay() */
+
+ pr_debug("Deasserting INIT\n");
+
+ /* Target chip */
+ /* Send IPI */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle(); /* with an integrated APIC this is overwritten again inside the STARTUP loop */
+
+ mb();
+
+ /*
+ * Should we send STARTUP IPIs ?
+ *
+ * Determine this based on the APIC version.
+ * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
+ num_starts = 2;
+ else
+ num_starts = 0;
+
+ /*
+ * Run STARTUP IPI loop.
+ */
+ pr_debug("#startup loops: %d\n", num_starts);
+
+ for (j = 1; j <= num_starts; j++) {
+ pr_debug("Sending STARTUP #%d\n", j);
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ pr_debug("After apic_write\n");
+
+ /*
+ * STARTUP IPI
+ */
+
+ /* Target chip */
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
+ phys_apicid);
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(300);
+
+ pr_debug("Startup point 1\n");
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(200);
+
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ if (send_status || accept_status) /* stop after the first STARTUP that reports a problem */
+ break;
+ }
+ pr_debug("After Startup\n");
+
+ if (send_status)
+ pr_err("APIC never delivered???\n");
+ if (accept_status)
+ pr_err("APIC delivery error (%lx)\n", accept_status);
+
+ return (send_status | accept_status); /* 0 only when the last send and accept status are both clean */
+}
+
+/* reduce the number of lines printed when booting a large cpu count system */
+static void announce_cpu(int cpu, int apicid)
+{
+ static int current_node = -1; /* node of the line currently being printed; -1 before the first one */
+ int node = early_cpu_to_node(cpu);
+ static int width, node_width; /* column widths, computed once on first use */
+
+ if (!width)
+ width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+ if (!node_width)
+ node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+ if (cpu == 1)
+ printk(KERN_INFO "x86: Booting SMP configuration:\n");
+
+ if (system_state < SYSTEM_RUNNING) { /* before SYSTEM_RUNNING: compact form, one line per node */
+ if (node != current_node) {
+ if (current_node > (-1))
+ pr_cont("\n");
+ current_node = node;
+
+ printk(KERN_INFO ".... node %*s#%d, CPUs: ",
+ node_width - num_digits(node), " ", node);
+ }
+
+ /* Add padding for the BSP */
+ if (cpu == 1)
+ pr_cont("%*s", width + 1, " ");
+
+ pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
+
+ } else /* otherwise: one full line per CPU, the only place @apicid is printed */
+ pr_info("Booting Node %d Processor %d APIC 0x%x\n",
+ node, cpu, apicid);
+}
+
+/* NMI handler: claim the NMI only on an offline CPU0 with enable_start_cpu0 set. */
+static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+
+	if (this_cpu != 0 || cpu_online(this_cpu) || !enable_start_cpu0)
+		return NMI_DONE;
+
+	return NMI_HANDLED;
+}
+
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ *
+ * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
+ * boot-strap code which is not a desired behavior for waking up BSP. To
+ * avoid the boot-strap code, wake up CPU0 by NMI instead.
+ *
+ * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
+ * (i.e. physically hot removed and then hot added), NMI won't wake it up.
+ * We'll change this code in the future to wake up hard offlined CPU0 if
+ * real platform and request are available.
+ *
+ * Returns 0 on success, else the error from the wake-up or NMI registration.
+ */
+static int
+wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
+ int *cpu0_nmi_registered)
+{
+ int id;
+ int boot_error;
+
+ preempt_disable();
+
+ /* Any CPU other than 0 is an AP: use the INIT, INIT, STARTUP sequence. */
+ if (cpu) {
+ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ goto out;
+ }
+
+ /*
+ * Wake up BSP by nmi.
+ *
+ * Register a NMI handler to help wake up CPU0.
+ */
+ boot_error = register_nmi_handler(NMI_LOCAL,
+ wakeup_cpu0_nmi, 0, "wake_cpu0");
+
+ if (!boot_error) {
+ enable_start_cpu0 = 1;
+ *cpu0_nmi_registered = 1; /* native_cpu_up() unregisters "wake_cpu0" when this is set */
+ if (apic->dest_logical == APIC_DEST_LOGICAL)
+ id = cpu0_logical_apicid;
+ else
+ id = apicid;
+ boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
+ }
+
+out:
+ preempt_enable();
+
+ return boot_error;
+}
+
+void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
+
+ per_cpu(current_task, cpu) = idle; /* @idle becomes @cpu's current task */
+
+#ifdef CONFIG_X86_32
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ irq_ctx_init(cpu);
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
+#else
+ initial_gs = per_cpu_offset(cpu); /* 64-bit: only the per-cpu offset is published here */
+#endif
+}
+
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, -1 if it gave no sign of life within 10s,
+ * else the error code from the wake-up method (e.g. ->wakeup_secondary_cpu).
+ */
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+ int *cpu0_nmi_registered)
+{
+ volatile u32 *trampoline_status =
+ (volatile u32 *) __va(real_mode_header->trampoline_status);
+ /* start_ip had better be page-aligned! */
+ unsigned long start_ip = real_mode_header->trampoline_start;
+
+ unsigned long boot_error = 0;
+ unsigned long timeout;
+
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ initial_code = (unsigned long)start_secondary;
+ initial_stack = idle->thread.sp;
+
+ /* Enable the espfix hack for this CPU */
+ init_espfix_ap(cpu);
+
+ /* So we see what's up */
+ announce_cpu(cpu, apicid);
+
+ /*
+ * This grunge runs the startup process for
+ * the targeted processor.
+ */
+
+ if (x86_platform.legacy.warm_reset) {
+
+ pr_debug("Setting warm reset code and vector.\n");
+
+ smpboot_setup_warm_reset_vector(start_ip);
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+ }
+
+ /*
+ * AP might wait on cpu_callout_mask in cpu_init() with
+ * cpu_initialized_mask set if previous attempt to online
+ * it timed-out. Clear cpu_initialized_mask so that after
+ * INIT/SIPI it could start with a clean state.
+ */
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
+ smp_mb();
+
+ /*
+ * Wake up a CPU in different cases:
+ * - Use the method in the APIC driver if it's defined
+ * Otherwise,
+ * - Use an INIT boot APIC message for APs or NMI for BSP.
+ */
+ if (apic->wakeup_secondary_cpu)
+ boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+ else
+ boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
+ cpu0_nmi_registered);
+
+ if (!boot_error) {
+ /*
+ * Wait 10s total for first sign of life from AP
+ */
+ boot_error = -1; /* stays -1 if the loop below times out */
+ timeout = jiffies + 10*HZ;
+ while (time_before(jiffies, timeout)) {
+ if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
+ /*
+ * Tell AP to proceed with initialization
+ */
+ cpumask_set_cpu(cpu, cpu_callout_mask);
+ boot_error = 0;
+ break;
+ }
+ schedule();
+ }
+ }
+
+ if (!boot_error) {
+ /*
+ * Wait till AP completes initial initialization
+ */
+ while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
+ /*
+ * Allow other tasks to run while we wait for the
+ * AP to come online. This also gives a chance
+ * for the MTRR work(triggered by the AP coming online)
+ * to be completed in the stop machine context.
+ */
+ schedule();
+ }
+ }
+
+ /* mark "stuck" area as not stuck */
+ *trampoline_status = 0;
+
+ if (x86_platform.legacy.warm_reset) {
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+ }
+
+ return boot_error; /* unsigned long narrowed to int; only 0, -1 or an int error is stored */
+}
+
+/*
+ * Boot @cpu with @tidle as its idle task and wait until it is online.
+ * Returns 0, -EINVAL (unusable APIC ID), -ENOSYS (CPU already called in),
+ * -EIO (do_boot_cpu() failed) or a non-EBUSY cpu_check_up_prepare() error.
+ */
+int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+ int apicid = apic->cpu_present_to_apicid(cpu);
+ int cpu0_nmi_registered = 0;
+ unsigned long flags;
+ int err, ret = 0;
+
+ lockdep_assert_irqs_enabled();
+
+ pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
+
+ if (apicid == BAD_APICID ||
+ !physid_isset(apicid, phys_cpu_present_map) ||
+ !apic->apic_id_valid(apicid)) {
+ pr_err("%s: bad cpu %d\n", __func__, cpu);
+ return -EINVAL;
+ }
+
+ /* Already booted CPU? */
+ if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
+ pr_debug("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
+
+ /*
+ * Save current MTRR state in case it was changed since early boot
+ * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+ */
+ mtrr_save_state();
+
+ /* x86 CPUs take themselves offline, so delayed offline is OK. */
+ err = cpu_check_up_prepare(cpu);
+ if (err && err != -EBUSY)
+ return err;
+
+ /* the FPU context is blank, nobody can own it */
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+
+ common_cpu_up(cpu, tidle);
+
+ err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
+ if (err) {
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
+ ret = -EIO;
+ goto unreg_nmi;
+ }
+
+ /* Check TSC synchronization with the AP (keep irqs disabled while doing so): */
+ local_irq_save(flags);
+ check_tsc_sync_source(cpu);
+ local_irq_restore(flags);
+
+ while (!cpu_online(cpu)) {
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+
+unreg_nmi:
+ /*
+ * Clean up the nmi handler. Do this after the callin and callout sync
+ * to avoid impact of possible long unregister time.
+ */
+ if (cpu0_nmi_registered)
+ unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+
+ return ret;
+}
+
+/**
+ * arch_disable_smp_support() - disable x86 SMP support at runtime by disabling IO-APIC support
+ */
+void arch_disable_smp_support(void)
+{
+ disable_ioapic_support();
+}
+
+/*
+ * Fall back to non SMP mode after errors: only CPU 0 stays present/possible.
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
+ */
+static __init void disable_smp(void)
+{
+ pr_info("SMP disabled\n");
+
+ disable_ioapic_support();
+
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ if (smp_found_config) /* use the boot CPU's physical APIC ID when an SMP config was found */
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+ else
+ physid_set_mask_of_physid(0, &phys_cpu_present_map);
+ cpumask_set_cpu(0, topology_sibling_cpumask(0)); /* CPU 0 is in its own sibling mask */
+ cpumask_set_cpu(0, topology_core_cpumask(0)); /* ...and in its own core mask */
+}
+
+/*
+ * Various sanity checks: 8-CPU cap on 32-bit without BIGSMP, boot CPU listed as present.
+ */
+static void __init smp_sanity_check(void)
+{
+ preempt_disable();
+
+#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
+ if (def_to_bigsmp && nr_cpu_ids > 8) {
+ unsigned int cpu;
+ unsigned nr;
+
+ pr_warn("More than 8 CPUs detected - skipping them\n"
+ "Use CONFIG_X86_BIGSMP\n");
+
+ nr = 0;
+ for_each_present_cpu(cpu) { /* keep the first 8 present CPUs only */
+ if (nr >= 8)
+ set_cpu_present(cpu, false);
+ nr++;
+ }
+
+ nr = 0;
+ for_each_possible_cpu(cpu) { /* same cap for the possible mask */
+ if (nr >= 8)
+ set_cpu_possible(cpu, false);
+ nr++;
+ }
+
+ nr_cpu_ids = 8;
+ }
+#endif
+
+ if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
+ pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
+ hard_smp_processor_id());
+
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
+
+ /*
+ * Should not be necessary because the MP table should list the boot
+ * CPU too, but we do it for the sake of robustness anyway.
+ */
+ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
+ pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
+ boot_cpu_physical_apicid);
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
+ preempt_enable();
+}
+
+/*
+ * Preset cpu_index of every possible CPU to nr_cpu_ids, which marks all of
+ * them for hotplug.
+ */
+static void __init smp_cpu_index_default(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		cpu_data(cpu).cpu_index = nr_cpu_ids;
+}
+
+/* Record CPU0's logical APIC ID from APIC_LDR; x2apic mode stores the raw register value. */
+static void __init smp_get_logical_apicid(void)
+{
+	u32 ldr = apic_read(APIC_LDR);
+
+	cpu0_logical_apicid = x2apic_mode ? ldr : GET_APIC_LOGICAL_ID(ldr);
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support (not read by this function).
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned int i;
+
+ smp_cpu_index_default();
+
+ /*
+ * Setup boot CPU information
+ */
+ smp_store_boot_cpu_info(); /* Final full version of the data */
+ cpumask_copy(cpu_callin_mask, cpumask_of(0));
+ mb();
+
+ for_each_possible_cpu(i) { /* NOTE(review): allocation results are not checked here */
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
+
+ /*
+ * Set 'default' x86 topology, this matches default_topology() in that
+ * it has NUMA nodes as a topology level. See also
+ * native_smp_cpus_done().
+ *
+ * Must be done before set_cpus_sibling_map() is ran.
+ */
+ set_sched_topology(x86_topology);
+
+ set_cpu_sibling_map(0);
+
+ smp_sanity_check();
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ disable_smp();
+ return;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ disable_smp();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+ return;
+ case APIC_VIRTUAL_WIRE:
+ case APIC_SYMMETRIC_IO:
+ break; /* SMP stays enabled; continue with the setup below */
+ }
+
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+
+ smp_get_logical_apicid();
+
+ pr_info("CPU0: ");
+ print_cpu_info(&cpu_data(0));
+
+ native_pv_lock_init();
+
+ uv_system_init();
+
+ set_mtrr_aps_delayed_init();
+
+ smp_quirk_init_udelay();
+
+ speculative_store_bypass_ht_init();
+}
+
+void arch_enable_nonboot_cpus_begin(void)
+{
+ set_mtrr_aps_delayed_init(); /* counterpart: mtrr_aps_init() in arch_enable_nonboot_cpus_end() */
+}
+
+void arch_enable_nonboot_cpus_end(void)
+{
+ mtrr_aps_init(); /* counterpart of set_mtrr_aps_delayed_init() in arch_enable_nonboot_cpus_begin() */
+}
+
+/*
+ * Early setup to make printk work: new GDT, callout mask bit and online state for this CPU.
+ */
+void __init native_smp_prepare_boot_cpu(void)
+{
+ int me = smp_processor_id();
+ switch_to_new_gdt(me);
+ /* already set me in cpu_online_mask in boot_cpu_init() */
+ cpumask_set_cpu(me, cpu_callout_mask);
+ cpu_set_state_online(me);
+}
+
+void __init calculate_max_logical_packages(void)
+{
+ int ncpus;
+
+ /*
+ * Today neither Intel nor AMD support heterogeneous systems so
+ * extrapolate the boot cpu's data to all packages.
+ */
+ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); /* CPU0's booted cores x max SMT threads */
+ __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
+}
+
+void __init native_smp_cpus_done(unsigned int max_cpus) /* @max_cpus is not read here */
+{
+ pr_debug("Boot done\n");
+
+ calculate_max_logical_packages();
+
+ if (x86_has_numa_in_package) /* replaces the default x86_topology set in native_smp_prepare_cpus() */
+ set_sched_topology(x86_numa_in_package_topology);
+
+ nmi_selftest();
+ impress_friends();
+ mtrr_aps_init();
+}
+
+static int __initdata setup_possible_cpus = -1; /* -1: "possible_cpus=" was not given */
+static int __init _setup_possible_cpus(char *str)
+{
+ get_option(&str, &setup_possible_cpus); /* result ignored; value is left as-is if nothing is parsed (presumably) */
+ return 0;
+}
+early_param("possible_cpus", _setup_possible_cpus);
+
+
+/*
+ * cpu_possible_mask should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and don't expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_mask on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with possible_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+__init void prefill_possible_map(void)
+{
+ int i, possible;
+
+ /* No boot processor was found in mptable or ACPI MADT */
+ if (!num_processors) {
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ int apicid = boot_cpu_physical_apicid;
+ int cpu = hard_smp_processor_id();
+
+ pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
+
+ /* Make sure boot cpu is enumerated */
+ if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
+ apic->apic_id_valid(apicid))
+ generic_processor_info(apicid, boot_cpu_apic_version);
+ }
+
+ if (!num_processors)
+ num_processors = 1;
+ }
+
+ i = setup_max_cpus ?: 1; /* from here until the final loop, i is the max_cpus limit (at least 1) */
+ if (setup_possible_cpus == -1) {
+ possible = num_processors;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (setup_max_cpus)
+ possible += disabled_cpus;
+#else
+ if (possible > i)
+ possible = i;
+#endif
+ } else
+ possible = setup_possible_cpus;
+
+ total_cpus = max_t(int, possible, num_processors + disabled_cpus);
+
+ /* nr_cpu_ids could be reduced via nr_cpus= */
+ if (possible > nr_cpu_ids) {
+ pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
+ possible, nr_cpu_ids);
+ possible = nr_cpu_ids;
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+ if (!setup_max_cpus) /* with hotplug, the clamp below applies only when setup_max_cpus is 0 */
+#endif
+ if (possible > i) {
+ pr_warn("%d Processors exceeds max_cpus limit of %u\n",
+ possible, setup_max_cpus);
+ possible = i;
+ }
+
+ nr_cpu_ids = possible;
+
+ pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
+ possible, max_t(int, possible - num_processors, 0));
+
+ reset_cpu_possible_mask();
+
+ for (i = 0; i < possible; i++) /* i is reused as a plain CPU index here */
+ set_cpu_possible(i, true);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* On offline: set __max_smt_threads to the largest sibling count of any online CPU. */
+static void recompute_smt_state(void)
+{
+	int widest = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		int nr_siblings = cpumask_weight(topology_sibling_cpumask(cpu));
+
+		if (nr_siblings > widest)
+			widest = nr_siblings;
+	}
+	__max_smt_threads = widest;
+}
+
+/* Drop @cpu from every core/sibling/LLC mask that contains it and reset its topology data. */
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+ cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
+ /* last thread sibling in this cpu core going down */
+ if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
+ cpu_data(sibling).booted_cores--;
+ }
+
+ for_each_cpu(sibling, topology_sibling_cpumask(cpu))
+ cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ /* @cpu is gone from the other CPUs' masks; now empty its own */
+ cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(topology_sibling_cpumask(cpu));
+ cpumask_clear(topology_core_cpumask(cpu));
+ c->cpu_core_id = 0;
+ c->booted_cores = 0;
+ cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+ recompute_smt_state();
+}
+
+static void remove_cpu_from_maps(int cpu)
+{
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, cpu_callout_mask); /* undo the bring-up handshake bits... */
+ cpumask_clear_cpu(cpu, cpu_callin_mask); /* ...which do_boot_cpu() sets or waits on */
+ /* was set by cpu_init() */
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
+ numa_remove_cpu(cpu);
+}
+
+void cpu_disable_common(void)
+{
+ int cpu = smp_processor_id(); /* operates on the CPU it runs on */
+
+ remove_siblinginfo(cpu);
+
+ /* It's now safe to remove this processor from the online map */
+ lock_vector_lock(); /* map removal is done under the vector lock */
+ remove_cpu_from_maps(cpu);
+ unlock_vector_lock();
+ fixup_irqs();
+ lapic_offline();
+}
+
+/* Disable the calling CPU; a non-zero lapic_can_unplug_cpu() result is returned unchanged. */
+int native_cpu_disable(void)
+{
+	int err = lapic_can_unplug_cpu();
+
+	if (err)
+		return err;
+
+	clear_local_APIC();
+	cpu_disable_common();
+
+	return 0;
+}
+
+/*
+ * Wait, via cpu_wait_death(cpu, 5), for @cpu to report its own death.
+ * Returns 0 once it has, -1 if it did not.
+ */
+int common_cpu_die(unsigned int cpu)
+{
+	/* We don't do anything here: idle task is faking death itself. */
+	/* They ack this in play_dead() by setting CPU_DEAD */
+	if (!cpu_wait_death(cpu, 5)) {
+		pr_err("CPU %u didn't die...\n", cpu);
+		return -1;
+	}
+
+	if (system_state == SYSTEM_RUNNING)
+		pr_info("CPU %u is now offline\n", cpu);
+	return 0;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ common_cpu_die(cpu); /* return value is dropped: this function returns void */
+}
+
+void play_dead_common(void)
+{
+ idle_task_exit();
+
+ /* Ack it: common_cpu_die() waits for this through cpu_wait_death() */
+ (void)cpu_report_death();
+
+ /*
+ * With physical CPU hotplug, we should halt the cpu
+ */
+ local_irq_disable(); /* callers go on to park the CPU with interrupts off */
+}
+
+/* True when running on CPU 0 while enable_start_cpu0 is set. */
+static bool wakeup_cpu0(void)
+{
+	bool on_cpu0 = smp_processor_id() == 0;
+
+	return on_cpu0 && enable_start_cpu0;
+}
+
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ *
+ * Returns only if one of the up-front checks rules MWAIT out; else loops forever.
+ */
+static inline void mwait_play_dead(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned int highest_cstate = 0;
+	unsigned int highest_subcstate = 0;
+	void *mwait_ptr;
+	int i;
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return;
+	if (!this_cpu_has(X86_FEATURE_MWAIT))
+		return;
+	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
+		return;
+	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
+		return;
+
+	eax = CPUID_MWAIT_LEAF;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	/*
+	 * eax will be 0 if EDX enumeration is not valid.
+	 * Initialized below to cstate, sub_cstate value when EDX is valid.
+	 */
+	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+		eax = 0;
+	} else {
+		edx >>= MWAIT_SUBSTATE_SIZE;
+		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+			if (edx & MWAIT_SUBSTATE_MASK) {
+				highest_cstate = i;
+				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+			}
+		}
+		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+			(highest_subcstate - 1);
+	}
+
+	/*
+	 * This should be a memory location in a cache line which is
+	 * unlikely to be touched by other processors. The actual
+	 * content is immaterial as it is not actually modified in any way.
+	 */
+	mwait_ptr = &current_thread_info()->flags;
+
+	wbinvd();
+
+	while (1) {
+		/*
+		 * The CLFLUSH is a workaround for erratum AAI65 for
+		 * the Xeon 7400 series. It's not clear it is actually
+		 * needed, but it should be harmless in either case.
+		 * The WBINVD is insufficient due to the spurious-wakeup
+		 * case where we return around the loop.
+		 */
+		mb();
+		clflush(mwait_ptr);
+		mb();
+		__monitor(mwait_ptr, 0, 0);
+		mb();
+		__mwait(eax, 0);
+		/* If NMI wants to wake up CPU0, start CPU0. */
+		if (wakeup_cpu0())
+			start_cpu0();
+	}
+}
+
+/* Park this CPU in a native_halt() loop forever; never returns. */
+void hlt_play_dead(void)
+{
+ /* wbinvd() only when cpu_info.x86 >= 4 (presumably the CPU family - confirm) */
+ if (__this_cpu_read(cpu_info.x86) >= 4)
+ wbinvd();
+
+ while (1) {
+ native_halt();
+ /* If NMI wants to wake up CPU0, start CPU0. */
+ if (wakeup_cpu0())
+ start_cpu0();
+ }
+}
+
+void native_play_dead(void)
+{
+ play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
+
+ mwait_play_dead(); /* Only returns on failure */
+ if (cpuidle_play_dead()) /* non-zero result: fall back to the HLT loop */
+ hlt_play_dead();
+}
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int native_cpu_disable(void)
+{
+ return -ENOSYS; /* !CONFIG_HOTPLUG_CPU stub: disabling a CPU is never allowed */
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable, so reaching this !CONFIG_HOTPLUG_CPU stub is a bug */
+ BUG();
+}
+
+void native_play_dead(void)
+{
+ BUG(); /* !CONFIG_HOTPLUG_CPU stub: must never be called */
+}
+
+#endif
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
new file mode 100644
index 0000000..7627455
--- /dev/null
+++ b/arch/x86/kernel/stacktrace.c
@@ -0,0 +1,229 @@
+/*
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/stacktrace.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+/*
+ * Record @addr in @trace unless it is filtered (@nosched and a scheduler
+ * function) or consumed by trace->skip. Returns -1 when @trace is full, else 0.
+ */
+static int save_stack_address(struct stack_trace *trace, unsigned long addr,
+			      bool nosched)
+{
+	if (!nosched || !in_sched_functions(addr)) {
+		if (trace->skip > 0)
+			trace->skip--;
+		else if (trace->nr_entries < trace->max_entries)
+			trace->entries[trace->nr_entries++] = addr;
+		else
+			return -1;
+	}
+	return 0;
+}
+
+static void noinline __save_stack_trace(struct stack_trace *trace,
+ struct task_struct *task, struct pt_regs *regs,
+ bool nosched)
+{
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (regs) /* when @regs is given, its ip is offered as the first entry */
+ save_stack_address(trace, regs->ip, nosched);
+
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || save_stack_address(trace, addr, nosched))
+ break; /* no return address, or @trace is full */
+ }
+
+ if (trace->nr_entries < trace->max_entries) /* ULONG_MAX terminator, if there is room */
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+/*
+ * Save the current task's stack-backtrace addresses into a stack_trace buffer.
+ */
+void save_stack_trace(struct stack_trace *trace)
+{
+ trace->skip++; /* one extra skipped entry - presumably this function's own frame */
+ __save_stack_trace(trace, current, NULL, false);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+ __save_stack_trace(trace, current, regs, false); /* unwind current starting from @regs */
+}
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+ if (!try_get_task_stack(tsk)) /* no stack to walk: @trace is left untouched */
+ return;
+
+ if (tsk == current)
+ trace->skip++;
+ __save_stack_trace(trace, tsk, NULL, true); /* nosched=true filters scheduler functions */
+
+ put_task_stack(tsk);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+
+/* Unwind @task into @trace; returns 0 only if the walk ended reliably, else -EINVAL. */
+static int __always_inline
+__save_stack_trace_reliable(struct stack_trace *trace,
+ struct task_struct *task)
+{
+ struct unwind_state state;
+ struct pt_regs *regs;
+ unsigned long addr;
+
+ for (unwind_start(&state, task, NULL, NULL);
+ !unwind_done(&state) && !unwind_error(&state);
+ unwind_next_frame(&state)) {
+
+ regs = unwind_get_entry_regs(&state, NULL);
+ if (regs) {
+ /* Success path for user tasks */
+ if (user_mode(regs))
+ goto success;
+
+ /*
+ * Kernel mode registers on the stack indicate an
+ * in-kernel interrupt or exception (e.g., preemption
+ * or a page fault), which can make frame pointers
+ * unreliable.
+ */
+ if (IS_ENABLED(CONFIG_FRAME_POINTER))
+ return -EINVAL;
+ }
+
+ addr = unwind_get_return_address(&state);
+
+ /*
+ * A NULL or invalid return address probably means there's some
+ * generated code which __kernel_text_address() doesn't know
+ * about.
+ */
+ if (!addr)
+ return -EINVAL;
+
+ if (save_stack_address(trace, addr, false))
+ return -EINVAL;
+ }
+
+ /* Check for stack corruption */
+ if (unwind_error(&state))
+ return -EINVAL;
+
+ /* Success path for non-user tasks, i.e. kthreads and idle tasks */
+ if (!(task->flags & (PF_KTHREAD | PF_IDLE)))
+ return -EINVAL;
+
+success:
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+
+ return 0;
+}
+
+/*
+ * This function returns an error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is reliable.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int save_stack_trace_tsk_reliable(struct task_struct *tsk,
+ struct stack_trace *trace)
+{
+ int ret;
+
+ /*
+ * If the task doesn't have a stack (e.g., a zombie), the stack is
+ * "reliably" empty.
+ */
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ ret = __save_stack_trace_reliable(trace, tsk);
+
+ put_task_stack(tsk); /* drop the reference taken by try_get_task_stack() above */
+
+ return ret;
+}
+#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame_user {
+ const void __user *next_fp; /* followed as the next frame pointer by __save_stack_trace_user() */
+ unsigned long ret_addr; /* recorded in the trace when non-zero */
+};
+
+/* Copy one frame record from user address @fp with page faults disabled; 1 = ok, 0 = failed. */
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
+{
+	unsigned long uncopied;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	/* the _inatomic copy is bracketed by pagefault_disable()/enable() */
+	pagefault_disable();
+	uncopied = __copy_from_user_inatomic(frame, fp, sizeof(*frame));
+	pagefault_enable();
+
+	return uncopied ? 0 : 1;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+ const struct pt_regs *regs = task_pt_regs(current);
+ const void __user *fp = (const void __user *)regs->bp; /* walk starts at the saved bp */
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip; /* first entry: the saved ip */
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame_user frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp) /* frame pointer below the saved sp: stop */
+ break;
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] =
+ frame.ret_addr;
+ }
+ if (fp == frame.next_fp) /* frame links to itself: stop instead of looping */
+ break;
+ fp = frame.next_fp;
+ }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm) {
+ __save_stack_trace_user(trace);
+ }
+ if (trace->nr_entries < trace->max_entries) /* ULONG_MAX terminator, if there is room */
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
new file mode 100644
index 0000000..60d2c37
--- /dev/null
+++ b/arch/x86/kernel/step.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 single-step support code, common to 32-bit and 64-bit.
+ */
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/ptrace.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+/* Turn the cs:ip in @regs into a linear address for @child; -1L for a bogus LDT selector. */
+unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
+{
+ unsigned long addr, seg;
+
+ addr = regs->ip;
+ seg = regs->cs;
+ if (v8086_mode(regs)) {
+ addr = (addr & 0xffff) + (seg << 4);
+ return addr;
+ }
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * We'll assume that the code segments in the GDT are all zero-based.
+ * That is largely true: the TLS segments are used for data, and the
+ * PNPBIOS and APM bios ones we just ignore here.
+ */
+ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ struct desc_struct *desc;
+ unsigned long base;
+
+ seg >>= 3; /* selector -> LDT entry index */
+
+ mutex_lock(&child->mm->context.lock);
+ if (unlikely(!child->mm->context.ldt ||
+ seg >= child->mm->context.ldt->nr_entries))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = &child->mm->context.ldt->entries[seg];
+ base = get_desc_base(desc);
+
+ /* 16-bit code segment? */
+ if (!desc->d)
+ addr &= 0xffff;
+ addr += base;
+ }
+ mutex_unlock(&child->mm->context.lock);
+ }
+#endif
+
+ return addr;
+}
+
+/* Return 1 if the instruction at @child's ip is popf or iret (after any prefixes), else 0. */
+static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
+{
+ int i, copied;
+ unsigned char opcode[15];
+ unsigned long addr = convert_ip_to_linear(child, regs);
+
+ copied = access_process_vm(child, addr, opcode, sizeof(opcode),
+ FOLL_FORCE);
+ for (i = 0; i < copied; i++) {
+ switch (opcode[i]) {
+ /* popf and iret */
+ case 0x9d: case 0xcf:
+ return 1;
+
+ /* CHECKME: 64 65 */
+
+ /* opcode and address size prefixes */
+ case 0x66: case 0x67:
+ continue;
+ /* irrelevant prefixes (segment overrides and repeats) */
+ case 0x26: case 0x2e:
+ case 0x36: case 0x3e:
+ case 0x64: case 0x65:
+ case 0xf0: case 0xf2: case 0xf3:
+ continue;
+
+#ifdef CONFIG_X86_64
+ case 0x40 ... 0x4f:
+ if (!user_64bit_mode(regs))
+ /* 32-bit mode: register increment */
+ return 0;
+ /* 64-bit mode: REX prefix */
+ continue;
+#endif
+
+ /* CHECKME: f2, f3 */
+
+ /*
+ * pushf: NOTE! We should probably not let the user see
+ * the TF bit being set. But it's more pain than it's
+ * worth to avoid it, and a debugger could emulate this
+ * all in user space if it _really_ cares.
+ */
+ case 0x9c:
+ default:
+ return 0;
+ }
+ }
+ return 0; /* nothing was read, or every byte read was a prefix */
+}
+
+/*
+ * Enable single-stepping. Return nonzero if user mode is not using TF itself.
+ */
+static int enable_single_step(struct task_struct *child)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ unsigned long oflags;
+
+ /*
+ * If we stepped into a sysenter/syscall insn, it trapped in
+ * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+ * If user-mode had set TF itself, then it's still clear from
+ * do_debug() and we need to set it again to restore the user
+ * state so we don't wrongly set TIF_FORCED_TF below.
+ * If enable_single_step() was used last and that is what
+ * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
+ * already set and our bookkeeping is fine.
+ */
+ if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
+ regs->flags |= X86_EFLAGS_TF;
+
+ /*
+ * Always set TIF_SINGLESTEP - this guarantees that
+ * we single-step system calls etc.. This will also
+ * cause us to set TF when returning to user mode.
+ */
+ set_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+ oflags = regs->flags; /* flags before TF is forced on below */
+
+ /* Set TF on the kernel stack.. */
+ regs->flags |= X86_EFLAGS_TF;
+
+ /*
+ * ..but if TF is changed by the instruction we will trace,
+ * don't mark it as being "us" that set it, so that we
+ * won't clear it by hand later.
+ *
+ * Note that if we don't actually execute the popf because
+ * of a signal arriving right now or suchlike, we will lose
+ * track of the fact that it really was "us" that set it.
+ */
+ if (is_setting_trap_flag(child, regs)) {
+ clear_tsk_thread_flag(child, TIF_FORCED_TF);
+ return 0;
+ }
+
+ /*
+ * If TF was already set, check whether it was us who set it.
+ * If not, we should never attempt a block step.
+ */
+ if (oflags & X86_EFLAGS_TF)
+ return test_tsk_thread_flag(child, TIF_FORCED_TF);
+
+ set_tsk_thread_flag(child, TIF_FORCED_TF); /* TF was clear before: record that we set it */
+
+ return 1;
+}
+
+/* Set or clear TIF_BLOCKSTEP for @task; the debugctl MSR is written only if @task is current. */
+void set_task_blockstep(struct task_struct *task, bool on)
+{
+ unsigned long debugctl;
+
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Both TIF_BLOCKSTEP and debugctl should be changed atomically wrt preemption.
+ *
+ * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
+ * task is current or it can't be running, otherwise we can race
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but
+ * PTRACE_KILL is not safe.
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
+}
+
+/*
+ * Enable single step (@block false) or block step (@block true) for @child.
+ */
+static void enable_step(struct task_struct *child, bool block)
+{
+ /*
+ * Make sure block stepping (BTF) is not enabled unless it should be.
+ * Note that we don't try to worry about any is_setting_trap_flag()
+ * instructions after the first when using block stepping.
+ * So no one should try to use debugger block stepping in a program
+ * that uses user-mode single stepping itself.
+ */
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
+}
+
+void user_enable_single_step(struct task_struct *child)
+{
+ enable_step(child, 0); /* block = false: plain single-step */
+}
+
+void user_enable_block_step(struct task_struct *child)
+{
+ enable_step(child, 1); /* block = true: request block stepping */
+}
+
+/*
+ * Turn off single and block stepping for @child.
+ */
+void user_disable_single_step(struct task_struct *child)
+{
+ /* Make sure block stepping (BTF) is disabled. */
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
+
+ /* Always clear TIF_SINGLESTEP... */
+ clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+ /* But touch TF only if it was set by us.. */
+ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
+ task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
+}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
new file mode 100644
index 0000000..6a78d4b
--- /dev/null
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+
+#include <asm/elf.h>
+#include <asm/ia32.h>
+#include <asm/syscalls.h>
+#include <asm/mpx.h>
+
+/*
+ * Mask that aligns a virtual address to avoid I$ aliasing on AMD F15h, or 0.
+ */
+static unsigned long get_align_mask(void)
+{
+	/* one test covers 32- and 64-bit: (2 - mmap_is_ia32()) is the flag bit */
+	bool enabled = va_align.flags >= 0 &&
+		       (va_align.flags & (2 - mmap_is_ia32()));
+
+	if (!enabled || !(current->flags & PF_RANDOMIZE))
+		return 0;
+
+	return va_align.mask;
+}
+
+/*
+ * Per-boot ASLR for AMD F15h: instead of being zeroed, the address bits
+ * covered by va_align.bits, [12:upper_bit), carry a random value that is
+ * computed once per boot, which still avoids aliasing in the I$.
+ *
+ * Callers either add the value to info.align_offset before calling
+ * vm_unmapped_area() or OR it directly into the address.
+ */
+static unsigned long get_align_bits(void)
+{
+	unsigned long mask = get_align_mask();
+
+	return mask & va_align.bits;
+}
+
+unsigned long align_vdso_addr(unsigned long addr)
+{
+	const unsigned long mask = get_align_mask();
+	/* round up to the mask boundary, then OR in the per-boot bits */
+	return ((addr + mask) & ~mask) | get_align_bits();
+}
+
+/*
+ * "align_va_addr" boot parameter: 32, 64, on or off; 0 means unrecognised.
+ */
+static int __init control_va_addr_alignment(char *str)
+{
+	/* flags < 0 guards against enabling this on other CPU families */
+	if (va_align.flags < 0 || *str == 0)
+		return 1;
+
+	if (*str == '=')
+		str++;
+
+	if (strcmp(str, "32") == 0)
+		va_align.flags = ALIGN_VA_32;
+	else if (strcmp(str, "64") == 0)
+		va_align.flags = ALIGN_VA_64;
+	else if (strcmp(str, "off") == 0)
+		va_align.flags = 0;
+	else if (strcmp(str, "on") == 0)
+		va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+	else
+		return 0;
+
+	return 1;
+}
+__setup("align_va_addr", control_va_addr_alignment);
+
+/*
+ * mmap(2) entry point: @off is a byte offset that must be page aligned; it
+ * is passed to ksys_mmap_pgoff() as a page offset.
+ */
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, off)
+{
+	if (off & ~PAGE_MASK)
+		return -EINVAL;
+
+	return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+}
+
+/* Choose the *begin and *end limits for a bottom-up address-space search. */
+static void find_start_end(unsigned long addr, unsigned long flags,
+		unsigned long *begin, unsigned long *end)
+{
+	const bool compat = in_compat_syscall();
+
+	if (compat || !(flags & MAP_32BIT)) {
+		*begin = get_mmap_base(1);
+		*end = compat ? task_size_32bit() :
+				task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+		return;
+	}
+
+	/*
+	 * MAP_32BIT outside compat mode: this is usually needed to map code
+	 * in the small model, so it has to be in the first 31 bits.  The
+	 * unmapped base therefore moves down, which can conflict with the
+	 * heap, but glibc malloc is assumed to know how to fall back to mmap.
+	 * Give it 1GB of playground for now. -AK
+	 */
+	*begin = 0x40000000;
+	*end = 0x80000000;
+	if (current->flags & PF_RANDOMIZE)
+		*begin = randomize_page(*begin, 0x02000000);
+}
+
+/*
+ * Bottom-up search for @len free bytes; a free @addr hint is used as is.
+ */
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct vm_unmapped_area_info info;
+	unsigned long begin, end;
+
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	/* an error from the MPX check and MAP_FIXED both return addr as is */
+	if (IS_ERR_VALUE(addr) || (flags & MAP_FIXED))
+		return addr;
+
+	find_start_end(addr, flags, &begin, &end);
+	if (len > end)
+		return -ENOMEM;
+
+	if (addr) {
+		struct vm_area_struct *vma;
+
+		/* try the page-aligned hint before searching */
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(current->mm, addr);
+		if (end - len >= addr &&
+		    (!vma || addr + len <= vm_start_gap(vma)))
+			return addr;
+	}
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = begin;
+	info.high_limit = end;
+	/* only file mappings get the alignment mask and the random bits */
+	info.align_mask = filp ? get_align_mask() : 0;
+	info.align_offset = pgoff << PAGE_SHIFT;
+	if (filp)
+		info.align_offset += get_align_bits();
+	return vm_unmapped_area(&info);
+}
+
+/*
+ * Top-down counterpart of arch_get_unmapped_area(); MAP_32BIT requests
+ * (outside compat) and a failed search fall back to the bottom-up function.
+ */
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
+
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ /* requested length too big for entire address space */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ /* No address checking. See comment at mmap_address_hint_valid() */
+ if (flags & MAP_FIXED)
+ return addr;
+
+ /* for MAP_32BIT mappings we force the legacy mmap base */
+ if (!in_compat_syscall() && (flags & MAP_32BIT))
+ goto bottomup;
+
+ /* requesting a specific address: use it if valid and currently free */
+ if (addr) {
+ addr &= PAGE_MASK;
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
+ vma = find_vma(mm, addr);
+ if (!vma || addr + len <= vm_start_gap(vma))
+ return addr;
+ }
+get_unmapped_area:
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = get_mmap_base(0);
+
+ /*
+ * A hint above DEFAULT_MAP_WINDOW opens up the full address space for
+ * the search; the !in_compat_syscall() check keeps x32 away from high
+ * addresses.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
+ info.align_mask = 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
+ addr = vm_unmapped_area(&info);
+ if (!(addr & ~PAGE_MASK)) /* a page-aligned result is returned as the address */
+ return addr;
+ VM_BUG_ON(addr != -ENOMEM); /* anything else is expected to be -ENOMEM */
+
+bottomup:
+ /*
+ * A failed mmap() very likely causes application failure, so fall back
+ * to the bottom-up function here.  This scenario can happen with large
+ * stack limits and large mmap() allocations.
+ */
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+}
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
new file mode 100644
index 0000000..160386e
--- /dev/null
+++ b/arch/x86/kernel/sysfb.c
@@ -0,0 +1,74 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * Simple-Framebuffer support for x86 systems
+ * Create a platform-device for any available boot framebuffer. The
+ * simple-framebuffer platform device is already available on DT systems, so
+ * this module parses the global "screen_info" object and creates a suitable
+ * platform device compatible with the "simple-framebuffer" DT object. If
+ * the framebuffer is incompatible, we instead create a legacy
+ * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
+ * pass the screen_info as platform_data. This allows legacy drivers
+ * to pick these devices up without messing with simple-framebuffer drivers.
+ * The global "screen_info" is still valid at all times.
+ *
+ * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
+ * platform devices, but only use legacy framebuffer devices for
+ * backwards compatibility.
+ *
+ * TODO: We set the dev_id field of all platform-devices to 0. This allows
+ * other x86 OF/DT parsers to create such devices, too. However, they must
+ * start at offset 1 for this to work.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+/* Create the platform device that describes the boot framebuffer. */
+static __init int sysfb_init(void)
+{
+	struct screen_info *si = &screen_info;
+	struct simplefb_platform_data mode;
+	struct platform_device *pd;
+	const char *name;
+
+	sysfb_apply_efi_quirks();
+
+	/* a successfully created simple-framebuffer device ends the job */
+	if (parse_mode(si, &mode) && !create_simplefb(si, &mode))
+		return 0;
+
+	/* otherwise create a legacy device named after the video type */
+	switch (si->orig_video_isVGA) {
+	case VIDEO_TYPE_EFI:
+		name = "efi-framebuffer";
+		break;
+	case VIDEO_TYPE_VLFB:
+		name = "vesa-framebuffer";
+		break;
+	default:
+		name = "platform-framebuffer";
+		break;
+	}
+
+	pd = platform_device_register_resndata(NULL, name, 0,
+					       NULL, 0, si, sizeof(*si));
+	return PTR_ERR_OR_ZERO(pd);
+}
+
+/* must execute after PCI subsystem for EFI quirks */
+device_initcall(sysfb_init);
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
new file mode 100644
index 0000000..623965e
--- /dev/null
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -0,0 +1,239 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * EFI Quirks
+ * Several EFI systems do not correctly advertise their boot framebuffers.
+ * Hence, we use this static table of known broken machines and fix up the
+ * information so framebuffer drivers can load correctly.
+ */
+
+#include <linux/dmi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/screen_info.h>
+#include <video/vga.h>
+#include <asm/sysfb.h>
+
+enum {
+	OVERRIDE_NONE	= 0,		/* DMI value used only where firmware gave 0 */
+	OVERRIDE_BASE	= 1 << 0,	/* each bit: DMI value replaces that field */
+	OVERRIDE_STRIDE	= 1 << 1,
+	OVERRIDE_HEIGHT	= 1 << 2,
+	OVERRIDE_WIDTH	= 1 << 3,
+};
+
+struct efifb_dmi_info efifb_dmi_list[] = { /* one entry per M_* id */
+ [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
+ [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
+ [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
+ [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
+ [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ /* 11" Macbook Air 3,1 passes the wrong stride, so only the stride is overridden */
+ [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
+ [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder: all zero, ignored by efifb_set_system() */
+ [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
+};
+
+/* Copy the geometry of each table entry named @opt (with base != 0) to @si. */
+void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
+{
+	const struct efifb_dmi_info *e;
+
+	for (e = efifb_dmi_list; e < &efifb_dmi_list[M_UNKNOWN]; e++) {
+		if (e->base == 0 || strcmp(opt, e->optname))
+			continue;
+		si->lfb_base = e->base;
+		si->lfb_linelength = e->stride;
+		si->lfb_width = e->width;
+		si->lfb_height = e->height;
+	}
+}
+
+/* choose_value(): @dmivalue if @flags has @field set or @fwvalue is 0,
+ * @fwvalue otherwise. */
+#define choose_value(dmivalue, fwvalue, field, flags) ({	\
+		typeof(fwvalue) _ret_ = (fwvalue);		\
+		if (((flags) & (field)) || (fwvalue) == 0)	\
+			_ret_ = (dmivalue);			\
+		_ret_;						\
+	})
+
+#if defined(CONFIG_PCI)
+/* Is screen_info.lfb_base inside a memory BAR of a VGA-class PCI device? */
+static bool __init efifb_base_on_vga_bar(void)
+{
+	struct pci_dev *dev = NULL;
+	bool found = false;
+
+	for_each_pci_dev(dev) {
+		int i;
+
+		if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+			continue;
+		for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+			unsigned long flags = pci_resource_flags(dev, i);
+
+			/* skip non-memory, unset and empty resources */
+			if (!(flags & IORESOURCE_MEM) ||
+			    (flags & IORESOURCE_UNSET) ||
+			    pci_resource_len(dev, i) == 0)
+				continue;
+
+			if (screen_info.lfb_base >= pci_resource_start(dev, i) &&
+			    screen_info.lfb_base < pci_resource_end(dev, i)) {
+				found = true;
+				break;
+			}
+		}
+	}
+
+	return found;
+}
+#endif
+
+/* DMI callback: patch screen_info from the table entry in id->driver_data. */
+static int __init efifb_set_system(const struct dmi_system_id *id)
+{
+	struct efifb_dmi_info *info = id->driver_data;
+
+	if (info->base == 0 && info->height == 0 && info->width == 0 &&
+	    info->stride == 0)
+		return 0;
+
+	/* Trust the bootloader over the DMI tables */
+	if (screen_info.lfb_base == 0 && info->base) {
+		screen_info.lfb_base = choose_value(info->base,
+			screen_info.lfb_base, OVERRIDE_BASE,
+			info->flags);
+#if defined(CONFIG_PCI)
+		/* the table address must actually be on a VGA device's BAR */
+		if (!efifb_base_on_vga_bar())
+			screen_info.lfb_base = 0;
+#endif
+	}
+
+	/* without a base address the remaining mode fields are cleared too */
+	if (!screen_info.lfb_base) {
+		screen_info.lfb_linelength = 0;
+		screen_info.lfb_width = 0;
+		screen_info.lfb_height = 0;
+		screen_info.orig_video_isVGA = 0;
+		return 0;
+	}
+
+	screen_info.lfb_linelength = choose_value(info->stride,
+		screen_info.lfb_linelength, OVERRIDE_STRIDE,
+		info->flags);
+	screen_info.lfb_width = choose_value(info->width,
+		screen_info.lfb_width, OVERRIDE_WIDTH,
+		info->flags);
+	screen_info.lfb_height = choose_value(info->height,
+		screen_info.lfb_height, OVERRIDE_HEIGHT,
+		info->flags);
+	if (screen_info.orig_video_isVGA == 0)
+		screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
+			 "(%dx%d, stride %d)\n", id->ident,
+			 screen_info.lfb_base, screen_info.lfb_width,
+			 screen_info.lfb_height, screen_info.lfb_linelength);
+
+	return 1;
+}
+
+/* Table entry pairing a @vendor/@name match with efifb_dmi_list[@enumid]. */
+#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid)		\
+	{							\
+		.callback = efifb_set_system, .ident = name,	\
+		.matches = {					\
+			DMI_MATCH(DMI_BIOS_VENDOR, vendor),	\
+			DMI_MATCH(DMI_PRODUCT_NAME, name)	\
+		},						\
+		.driver_data = &efifb_dmi_list[enumid]		\
+	}
+
+static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
+ /* Same model under both vendor strings; at least one is right, maybe both. */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
+ /* Same model under both vendor strings; at least one is right, maybe both. */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
+ /* Same model under both vendor strings; at least one is right, maybe both. */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
+ /* Same model under both vendor strings; at least one is right, maybe both. */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
+ {},
+};
+
+__init void sysfb_apply_efi_quirks(void)
+{
+	if (!(screen_info.orig_video_isVGA == VIDEO_TYPE_EFI &&
+	      (screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)))
+		dmi_check_system(efifb_dmi_system_table); /* run the DMI table */
+}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
new file mode 100644
index 0000000..85195d4
--- /dev/null
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -0,0 +1,115 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * simple-framebuffer probing
+ * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
+ * If the mode is incompatible, we return "false" and let the caller create
+ * legacy nodes instead.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static const char simplefb_resname[] = "BOOTFB"; /* name given to the framebuffer memory resource */
+static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; /* formats parse_mode() can match */
+
+/* Translate x86 screen_info into a simple-framebuffer mode; false if none fits. */
+__init bool parse_mode(const struct screen_info *si,
+		       struct simplefb_platform_data *mode)
+{
+	unsigned int i;
+
+	/* only the VLFB and EFI video types are considered */
+	if (si->orig_video_isVGA != VIDEO_TYPE_VLFB &&
+	    si->orig_video_isVGA != VIDEO_TYPE_EFI)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(formats); ++i) {
+		const struct simplefb_format *f = &formats[i];
+
+		if (si->lfb_depth != f->bits_per_pixel ||
+		    si->red_size != f->red.length ||
+		    si->red_pos != f->red.offset ||
+		    si->green_size != f->green.length ||
+		    si->green_pos != f->green.offset ||
+		    si->blue_size != f->blue.length ||
+		    si->blue_pos != f->blue.offset ||
+		    si->rsvd_size != f->transp.length ||
+		    si->rsvd_pos != f->transp.offset)
+			continue;
+		mode->format = f->name;
+		mode->width = si->lfb_width;
+		mode->height = si->lfb_height;
+		mode->stride = si->lfb_linelength;
+		return true;
+	}
+
+	return false;
+}
+
+/* Register a "simple-framebuffer" device for @mode; 0 or a negative errno. */
+__init int create_simplefb(const struct screen_info *si,
+			   const struct simplefb_platform_data *mode)
+{
+	struct platform_device *pd;
+	struct resource res;
+	u64 base, size;
+	u32 length;
+
+	/*
+	 * With the 64BIT_BASE capability, ext_lfb_base holds the upper half of
+	 * the base address.  The result must be non-zero and fit resource_size_t.
+	 */
+	base = si->lfb_base;
+	if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+		base |= (u64)si->ext_lfb_base << 32;
+	if (!base || (u64)(resource_size_t)base != base) {
+		printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * lfb_size may cover the entire VMEM and would need a huge mapping, so
+	 * only the part holding the framebuffer becomes the resource.  That
+	 * part must still fit the advertised VMEM: compare the exact byte
+	 * count, because a page-rounded one can exceed lfb_size by less than
+	 * a page and wrongly reject a valid mode.  In case of VBE, lfb_size is
+	 * shifted by 16 bits for historical reasons.
+	 */
+	size = si->lfb_size;
+	if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
+		size <<= 16;
+	length = mode->height * mode->stride;
+	if (length > size) {
+		printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
+		return -EINVAL;
+	}
+	length = PAGE_ALIGN(length);
+
+	/* setup IORESOURCE_MEM as framebuffer memory */
+	memset(&res, 0, sizeof(res));
+	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	res.name = simplefb_resname;
+	res.start = base;
+	res.end = res.start + length - 1;
+	if (res.end <= res.start)
+		return -EINVAL;
+
+	pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
+					       &res, 1, mode, sizeof(*mode));
+	return PTR_ERR_OR_ZERO(pd);
+}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 0000000..a2486f4
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,541 @@
+/*
+ * tboot.c: main implementation of helper functions used by kernel for
+ * runtime support of Intel(R) Trusted Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dma_remapping.h>
+#include <linux/init_task.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/tboot.h>
+#include <linux/debugfs.h>
+
+#include <asm/realmode.h>
+#include <asm/processor.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
+#include <asm/fixmap.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/e820/api.h>
+#include <asm/io.h>
+
+#include "../realmode/rm/wakeup.h"
+
+/* Global pointer to shared data, set by tboot_probe(); NULL means no measured launch. */
+struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
+
+/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
+#define AP_WAIT_TIMEOUT 1
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tboot: " fmt /* prefix for the pr_*() messages below */
+
+static u8 tboot_uuid[16] __initdata = TBOOT_UUID; /* compared with tboot->uuid in tboot_probe() */
+
+void __init tboot_probe(void)
+{
+	const u64 addr = boot_params.tboot_addr;
+
+	if (!addr)
+		return;		/* no shared page was advertised */
+
+	/*
+	 * A garbage address must not reach set_fixmap(): check its e820 type.
+	 */
+	if (!e820__mapped_any(addr, addr, E820_TYPE_RESERVED)) {
+		pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
+		return;
+	}
+
+	/* Map the page; reject it unless the UUID matches and version >= 5. */
+	set_fixmap(FIX_TBOOT_BASE, addr);
+	tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
+	if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+		pr_warning("tboot at 0x%llx is invalid\n", addr);
+		goto reject;
+	}
+	if (tboot->version < 5) {
+		pr_warning("tboot version is invalid: %u\n", tboot->version);
+		goto reject;
+	}
+
+	pr_info("found shared page at phys addr 0x%llx:\n", addr);
+	pr_debug("version: %d\n", tboot->version);
+	pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+	pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+	pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+	pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+	return;
+
+reject:
+	tboot = NULL;
+}
+
+static pgd_t *tboot_pg_dir; /* set by map_tboot_pages(); NULL makes tboot_shutdown() return early */
+static struct mm_struct tboot_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir, /* map_tboot_page() walks this pgd via pgd_offset(&tboot_mm, ...) */
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+};
+
+static inline void switch_to_tboot_pt(void)
+{
+ write_cr3(virt_to_phys(tboot_pg_dir)); /* load CR3 with the physical address of tboot_pg_dir */
+}
+
+/* Map @pfn at @vaddr with @prot in tboot_mm; -1 if a table allocation fails. */
+static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
+			  pgprot_t prot)
+{
+	pgd_t *pgd = pgd_offset(&tboot_mm, vaddr);
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
+	if (!p4d)
+		goto nomem;
+	pud = pud_alloc(&tboot_mm, p4d, vaddr);
+	if (!pud)
+		goto nomem;
+	pmd = pmd_alloc(&tboot_mm, pud, vaddr);
+	if (!pmd)
+		goto nomem;
+	pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+	if (!pte)
+		goto nomem;
+
+	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
+	pte_unmap(pte);
+
+	/*
+	 * PTI poisons low addresses in the kernel page tables to make them
+	 * unusable for userspace; executing code there needs the poison gone.
+	 * 'pgd' gets set in p4d_alloc() _or_ pud_alloc() (4/5-level paging).
+	 */
+	pgd->pgd &= ~_PAGE_NX;
+	return 0;
+
+nomem:
+	return -1;
+}
+
+static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
+			   unsigned long nr)
+{
+	/* Reuse the original kernel mapping (tboot_mm.pgd is swapper_pg_dir) */
+	tboot_pg_dir = pgd_alloc(&tboot_mm);
+	if (!tboot_pg_dir)
+		return -1;
+
+	while (nr--) {
+		if (map_tboot_page(vaddr, start_pfn++, PAGE_KERNEL_EXEC))
+			return -1;
+		vaddr += PAGE_SIZE;
+	}
+	return 0;
+}
+
+static void tboot_create_trampoline(void)
+{
+	/* Identity map for the tboot shutdown code: vaddr is pfn << PAGE_SHIFT. */
+	u32 first_pfn = PFN_DOWN(tboot->tboot_base);
+	u32 nr_pages = PFN_UP(tboot->tboot_size);
+
+	if (!map_tboot_pages(first_pfn << PAGE_SHIFT, first_pfn, nr_pages))
+		return;
+	panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
+	      first_pfn, nr_pages);
+}
+
+#ifdef CONFIG_ACPI_SLEEP
+
+/* Append [@start, @start + @size), rounded out to pages, as a MAC region. */
+static void add_mac_region(phys_addr_t start, unsigned long size)
+{
+	struct tboot_mac_region *mr;
+	phys_addr_t end = start + size;
+
+	if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+		panic("tboot: Too many MAC regions\n");
+	if (!start || !size)
+		return;		/* nothing is recorded for such a range */
+	mr = &tboot->mac_regions[tboot->num_mac_regions++];
+	mr->start = round_down(start, PAGE_SIZE);
+	mr->size = round_up(end, PAGE_SIZE) - mr->start;
+}
+
+/* Rebuild the MAC regions from the e820 table and set the S3 resume vector. */
+static int tboot_setup_sleep(void)
+{
+	int i;
+
+	tboot->num_mac_regions = 0;
+
+	for (i = 0; i < e820_table->nr_entries; i++) {
+		const struct e820_entry *entry = &e820_table->entries[i];
+
+		if (entry->type == E820_TYPE_RAM ||
+		    entry->type == E820_TYPE_RESERVED_KERN)
+			add_mac_region(entry->addr, entry->size);
+	}
+
+	tboot->acpi_sinfo.kernel_s3_resume_vector =
+		real_mode_header->wakeup_start;
+	return 0;
+}
+
+#else /* no CONFIG_ACPI_SLEEP */
+
+static int tboot_setup_sleep(void)
+{
+ /* S3 shutdown requested, but this kernel was built without CONFIG_ACPI_SLEEP */
+ BUG();
+ return -1;
+}
+
+#endif
+
+/* Hand control to tboot's shutdown entry point for @shutdown_type. */
+void tboot_shutdown(u32 shutdown_type)
+{
+	void (*shutdown)(void);
+
+	/*
+	 * Without tboot there is nothing to do.  Without the 1:1 mapping
+	 * (tboot_pg_dir still NULL) the call came too early, which should
+	 * only be due to a very early panic(); return and let the normal
+	 * shutdown happen.
+	 */
+	if (!tboot_enabled() || !tboot_pg_dir)
+		return;
+
+	/* if this is S3 then set regions to MAC */
+	if (shutdown_type == TB_SHUTDOWN_S3 && tboot_setup_sleep())
+		return;
+
+	tboot->shutdown_type = shutdown_type;
+
+	/* CR3 is switched to tboot_pg_dir before the entry point is called */
+	switch_to_tboot_pt();
+
+	/* call the entry point published in the shared page */
+	shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
+	shutdown();
+
+	/* should not reach here */
+	for (;;)
+		halt();
+}
+
+/* Mirror the FADT PM1 blocks and the FACS wake vector address into tboot. */
+static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
+{
+#define TB_COPY_GAS(tbg, g)				\
+	do {						\
+		tbg.space_id = g.space_id;		\
+		tbg.bit_width = g.bit_width;		\
+		tbg.bit_offset = g.bit_offset;		\
+		tbg.access_width = g.access_width;	\
+		tbg.address = g.address;		\
+	} while (0)
+	TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
+	TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
+	TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
+	TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
+
+	/*
+	 * FACS is ioremap'ed: no virt_to_phys(), use FACS phys addr + offset.
+	 */
+	tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+		offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+/* ACPI prepare-sleep hook: pass the PM1 values to tboot and shut down via it. */
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+	static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+		-1, -1, -1,		/* S0, S1, S2: no tboot shutdown type */
+		TB_SHUTDOWN_S3, TB_SHUTDOWN_S4, TB_SHUTDOWN_S5,
+	};
+
+	if (!tboot_enabled())
+		return 0;
+
+	tboot_copy_fadt(&acpi_gbl_FADT);
+	tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+	tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+	/* we always use the 32b wakeup vector */
+	tboot->acpi_sinfo.vector_width = 32;
+
+	if (sleep_state < ACPI_S_STATE_COUNT &&
+	    acpi_shutdown_map[sleep_state] != -1) {
+		tboot_shutdown(acpi_shutdown_map[sleep_state]);
+		return 0;
+	}
+
+	pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+	return -1;
+}
+
+/* Extended-sleep hook: returns -ENODEV (after a warning) when tboot is in use. */
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+	if (!tboot_enabled())
+		return 0;
+	pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)\n");
+	return -ENODEV;
+}
+
+static atomic_t ap_wfs_count; /* incremented by every tboot_dying_cpu() call */
+
+/* Poll (1ms steps, AP_WAIT_TIMEOUT*HZ tries) for @num_aps APs; 0 on success. */
+static int tboot_wait_for_aps(int num_aps)
+{
+	unsigned long timeout = AP_WAIT_TIMEOUT*HZ;
+
+	while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+	       timeout) {
+		mdelay(1);
+		timeout--;
+	}
+
+	/* timeout reaches 0 only when the wait expired, so warn exactly then */
+	if (!timeout)
+		pr_warning("tboot wait for APs timeout\n");
+	return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+/* CPU "dying" callback: count the CPU; with one CPU online, wait for the APs. */
+static int tboot_dying_cpu(unsigned int cpu)
+{
+	atomic_inc(&ap_wfs_count);
+	if (num_online_cpus() != 1)
+		return 0;
+
+	return tboot_wait_for_aps(atomic_read(&ap_wfs_count)) ? -EBUSY : 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
+ 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 }
+
+#define TBOOT_SERIAL_LOG_ADDR 0x60000 /* address passed to ioremap_nocache() in tboot_log_read() */
+#define TBOOT_SERIAL_LOG_SIZE 0x08000 /* size of that mapping */
+#define LOG_MAX_SIZE_OFF 16 /* offset of the 32-bit max size read with readl() */
+#define LOG_BUF_OFF 24 /* offset at which the log data is read */
+
+static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID; /* compared with the first 16 bytes of the log */
+
+/*
+ * debugfs read handler for the tboot serial log.
+ *
+ * Maps the fixed log window at TBOOT_SERIAL_LOG_ADDR, checks that it starts
+ * with the expected UUID, and copies up to @count bytes of log data starting
+ * at *@ppos to @user_buf through a temporary kernel buffer.
+ *
+ * Returns the number of bytes copied (advancing *@ppos), 0 once *@ppos is at
+ * or past the size read from the log header, -ENOMEM if the bounce buffer
+ * cannot be allocated, and -EFAULT if the mapping fails, the UUID does not
+ * match, or the copy to user space faults.
+ */
+static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
+{
+ void __iomem *log_base;
+ u8 log_uuid[16];
+ u32 max_size;
+ void *kbuf;
+ int ret = -EFAULT;
+
+ log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ if (!log_base)
+ return ret;
+
+ /* the log header begins with a UUID; bail out if it is not tboot's */
+ memcpy_fromio(log_uuid, log_base, sizeof(log_uuid));
+ if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid)))
+ goto err_iounmap;
+
+ /*
+ * NOTE(review): max_size comes straight from the log header and is not
+ * checked against TBOOT_SERIAL_LOG_SIZE - confirm tboot guarantees the
+ * buffer fits inside the mapped window.
+ */
+ max_size = readl(log_base + LOG_MAX_SIZE_OFF);
+ if (*ppos >= max_size) {
+ ret = 0;
+ goto err_iounmap;
+ }
+
+ /* clamp the request to what is left of the log */
+ if (*ppos + count > max_size)
+ count = max_size - *ppos;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf) {
+ ret = -ENOMEM;
+ goto err_iounmap;
+ }
+
+ /* log data starts LOG_BUF_OFF bytes into the window */
+ memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count);
+ if (copy_to_user(user_buf, kbuf, count))
+ goto err_kfree;
+
+ *ppos += count;
+
+ ret = count;
+
+ /* the success path falls through both labels to release resources */
+err_kfree:
+ kfree(kbuf);
+
+err_iounmap:
+ iounmap(log_base);
+
+ return ret;
+}
+
+/* File operations of the "tboot_log" debugfs entry: seekable, read-only. */
+static const struct file_operations tboot_log_fops = {
+	.llseek	= default_llseek,
+	.read	= tboot_log_read,
+};
+
+#endif /* CONFIG_DEBUG_FS */
+
+/*
+ * Late initcall: when running under tboot, create the trampoline, register
+ * the CPU-hotplug "dying" callback, expose the tboot log in debugfs (only
+ * with CONFIG_DEBUG_FS) and install the ACPI prepare-sleep hooks.
+ *
+ * Does nothing when tboot is not enabled.  Always returns 0; the return
+ * values of the registration calls are not checked.
+ */
+static __init int tboot_late_init(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_create_trampoline();
+
+ /* reset the dying-CPU counter before the callback can run */
+ atomic_set(&ap_wfs_count, 0);
+ cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL,
+ tboot_dying_cpu);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_create_file("tboot_log", S_IRUSR,
+ arch_debugfs_dir, NULL, &tboot_log_fops);
+#endif
+
+ acpi_os_set_prepare_sleep(&tboot_sleep);
+ acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
+ return 0;
+}
+
+late_initcall(tboot_late_init);
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
+ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
+
+/* offsets from pub/priv config space */
+#define TXTCR_HEAP_BASE 0x0300
+#define TXTCR_HEAP_SIZE 0x0308
+
+#define SHA1_SIZE 20
+
+/* Raw SHA-1 digest, SHA1_SIZE bytes. */
+struct sha1_hash {
+ u8 hash[SHA1_SIZE];
+};
+
+/*
+ * Layout of the SinitMleData record found in the TXT heap.  The structure
+ * is packed because it mirrors a firmware-defined layout.  The code visible
+ * here only reads vtd_dmars_off (see tboot_get_dmar_table()).
+ */
+struct sinit_mle_data {
+ u32 version; /* currently 6 */
+ struct sha1_hash bios_acm_id;
+ u32 edx_senter_flags;
+ u64 mseg_valid;
+ struct sha1_hash sinit_hash;
+ struct sha1_hash mle_hash;
+ struct sha1_hash stm_hash;
+ struct sha1_hash lcp_policy_hash;
+ u32 lcp_policy_control;
+ u32 rlp_wakeup_addr;
+ u32 reserved;
+ u32 num_mdrs;
+ u32 mdrs_off;
+ u32 num_vtd_dmars;
+ u32 vtd_dmars_off; /* offset used to locate the saved DMAR table */
+} __packed;
+
+/*
+ * Return the DMAR table the IOMMU code should use.
+ *
+ * Without tboot this is simply @dmar_tbl.  With tboot enabled, the copy
+ * that SINIT saved in SinitMleData inside the TXT heap is located and
+ * returned instead.  Returns NULL if either the TXT config space or the
+ * TXT heap cannot be mapped.
+ *
+ * The heap mapping is deliberately left in place because the returned
+ * pointer refers into it.
+ *
+ * NOTE(review): the ioremap()ed regions are read through plain pointer
+ * dereferences rather than readq()-style accessors, and the sizes found in
+ * the heap are not validated - confirm this is acceptable for TXT heap
+ * contents.
+ */
+struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
+{
+ void *heap_base, *heap_ptr, *config;
+
+ if (!tboot_enabled())
+ return dmar_tbl;
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+ * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+ */
+
+ /* map config space in order to get heap addr */
+ config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
+ PAGE_SIZE);
+ if (!config)
+ return NULL;
+
+ /* now map TXT heap */
+ heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
+ *(u64 *)(config + TXTCR_HEAP_SIZE));
+ iounmap(config);
+ if (!heap_base)
+ return NULL;
+
+ /* walk heap to SinitMleData */
+ /* each section starts with a u64 size that is used to skip over it */
+ /* skip BiosData */
+ heap_ptr = heap_base + *(u64 *)heap_base;
+ /* skip OsMleData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* skip OsSinitData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* now points to SinitMleDataSize; set to SinitMleData */
+ heap_ptr += sizeof(u64);
+ /* get addr of DMAR table */
+ dmar_tbl = (struct acpi_table_header *)(heap_ptr +
+ ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
+ sizeof(u64));
+
+ /* don't unmap heap because dmar.c needs access to this */
+
+ return dmar_tbl;
+}
+
+/*
+ * Tell the caller whether tboot is active and, unless the
+ * intel_iommu_tboot_noforce override is set, force the Intel IOMMU on.
+ *
+ * Returns 0 when tboot is not enabled and 1 otherwise.  In the forcing
+ * case dmar_disabled, no_iommu and (with CONFIG_SWIOTLB) swiotlb are
+ * cleared, with a warning if any of them had been set.
+ */
+int tboot_force_iommu(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ /* override set: report tboot as active but leave the settings alone */
+ if (intel_iommu_tboot_noforce)
+ return 1;
+
+ if (no_iommu || swiotlb || dmar_disabled)
+ pr_warning("Forcing Intel-IOMMU to enabled\n");
+
+ dmar_disabled = 0;
+#ifdef CONFIG_SWIOTLB
+ swiotlb = 0;
+#endif
+ no_iommu = 0;
+
+ return 1;
+}
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
new file mode 100644
index 0000000..f386bad
--- /dev/null
+++ b/arch/x86/kernel/tce_64.c
@@ -0,0 +1,190 @@
+/*
+ * This file manages the translation entries for the IBM Calgary IOMMU.
+ *
+ * Derived from arch/powerpc/platforms/pseries/iommu.c
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/bootmem.h>
+#include <asm/tce.h>
+#include <asm/calgary.h>
+#include <asm/proto.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Push the TCE at 'tceaddr' out to main memory.  Uses clflush() when the
+ * boot CPU supports it and falls back to wbinvd() otherwise.
+ */
+static inline void flush_tce(void* tceaddr)
+{
+	if (!boot_cpu_has(X86_FEATURE_CLFLUSH)) {
+		wbinvd();
+		return;
+	}
+
+	/* a single tce can't cross a cache line */
+	clflush(tceaddr);
+}
+
+/*
+ * Fill 'npages' consecutive TCEs of 'tbl', starting at entry 'index', so
+ * that they map the pages beginning at kernel virtual address 'uaddr'.
+ * Every entry gets the read bit; the write bit is added unless 'direction'
+ * is DMA_TO_DEVICE.  Each entry is stored big-endian and flushed at once.
+ */
+void tce_build(struct iommu_table *tbl, unsigned long index,
+	unsigned int npages, unsigned long uaddr, int direction)
+{
+	u64 *entry = ((u64*)tbl->it_base) + index;
+	u64 tce = (1 << TCE_READ_SHIFT);
+	unsigned int i;
+
+	if (direction != DMA_TO_DEVICE)
+		tce |= (1 << TCE_WRITE_SHIFT);
+
+	for (i = 0; i < npages; i++) {
+		u64 rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
+
+		/* replace the page-number field, keep the access bits */
+		tce = (tce & ~TCE_RPN_MASK) | (rpn << TCE_RPN_SHIFT);
+
+		*entry = cpu_to_be64(tce);
+		flush_tce(entry);
+
+		entry++;
+		uaddr += PAGE_SIZE;
+	}
+}
+
+/*
+ * Clear 'npages' consecutive TCEs of 'tbl', starting at entry 'index'.
+ * Each entry is zeroed and flushed individually.
+ */
+void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
+{
+	u64 *entry = ((u64*)tbl->it_base) + index;
+	u64 *end = entry + npages;
+
+	for (; entry != end; entry++) {
+		*entry = cpu_to_be64(0);
+		flush_tce(entry);
+	}
+}
+
+/*
+ * Convert a table "size" (the order of the table, 0-7) into a number of
+ * entries.  Order 0 is the smallest table with 8K (1 << 13) entries, and
+ * every further order doubles that.
+ */
+static inline unsigned int table_size_to_number_of_entries(unsigned char size)
+{
+	return (1 << 13) << size;
+}
+
+/*
+ * Initialise the bookkeeping fields of 'tbl' for the bus 'dev' sits on:
+ * bus number, table size in entries, a zeroed allocation bitmap with one
+ * bit per entry, the allocation hint and the table lock.
+ *
+ * Returns 0 on success or -ENOMEM if the bitmap pages cannot be allocated.
+ */
+static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
+{
+	unsigned int bitmap_bytes;
+	unsigned long pages;
+
+	tbl->it_busno = dev->bus->number;
+
+	/* table size is measured in entries */
+	tbl->it_size = table_size_to_number_of_entries(specified_table_size);
+
+	/* one bit per entry, so this many bytes of bitmap */
+	bitmap_bytes = tbl->it_size / BITS_PER_BYTE;
+	pages = __get_free_pages(GFP_KERNEL, get_order(bitmap_bytes));
+	if (!pages) {
+		printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
+		return -ENOMEM;
+	}
+
+	tbl->it_map = (unsigned long*)pages;
+	memset(tbl->it_map, 0, bitmap_bytes);
+
+	tbl->it_hint = 0;
+	spin_lock_init(&tbl->it_lock);
+
+	return 0;
+}
+
+/*
+ * Allocate and set up the iommu_table for the bus of 'dev' and attach it
+ * to that bus.  'bbar' is stored in the new table.
+ *
+ * It is a BUG() for the bus to already have an IOMMU table.  Returns 0 on
+ * success, -ENOMEM if the table cannot be allocated, or the error from
+ * tce_table_setparms().
+ */
+int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
+{
+	struct iommu_table *table;
+	int err;
+
+	if (pci_iommu(dev->bus)) {
+		printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
+		       dev, pci_iommu(dev->bus));
+		BUG();
+	}
+
+	table = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
+	if (!table) {
+		printk(KERN_ERR "Calgary: error allocating iommu_table\n");
+		return -ENOMEM;
+	}
+
+	err = tce_table_setparms(dev, table);
+	if (err) {
+		kfree(table);
+		return err;
+	}
+
+	table->bbar = bbar;
+	set_pci_iommu(dev->bus, table);
+
+	return 0;
+}
+
+/*
+ * Allocate one TCE table from low bootmem.  The table holds the number of
+ * entries implied by specified_table_size and is aligned to its own size.
+ */
+void * __init alloc_tce_table(void)
+{
+	unsigned int bytes = table_size_to_number_of_entries(specified_table_size);
+
+	bytes *= TCE_ENTRY_SIZE;
+
+	return __alloc_bootmem_low(bytes, bytes, 0);
+}
+
+/*
+ * Give a table obtained from alloc_tce_table() back to bootmem.
+ * A NULL 'tbl' is ignored.
+ */
+void __init free_tce_table(void *tbl)
+{
+	unsigned int bytes;
+
+	if (!tbl)
+		return;
+
+	/* same size computation as at allocation time */
+	bytes = table_size_to_number_of_entries(specified_table_size);
+	bytes *= TCE_ENTRY_SIZE;
+
+	free_bootmem(__pa(tbl), bytes);
+}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 0000000..fddaefc
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 1991,1992,1995 Linus Torvalds
+ * Copyright (c) 1994 Alan Modra
+ * Copyright (c) 1995 Markus Kuhn
+ * Copyright (c) 1996 Ingo Molnar
+ * Copyright (c) 1998 Andrea Arcangeli
+ * Copyright (c) 2002,2006 Vojtech Pavlik
+ * Copyright (c) 2003 Andi Kleen
+ *
+ */
+
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/i8253.h>
+#include <linux/time.h>
+#include <linux/export.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/i8259.h>
+#include <asm/timer.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+#ifdef CONFIG_X86_64
+__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+#endif
+
+/*
+ * Return the program counter a profiling sample should be charged to.
+ *
+ * Normally this is the interrupted instruction pointer.  If the sample hit
+ * kernel code inside a lock function (in_lock_functions()), a return
+ * address is reported instead: read through the frame pointer when
+ * CONFIG_FRAME_POINTER is set, otherwise picked from the top two stack
+ * words.
+ */
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ unsigned long pc = instruction_pointer(regs);
+
+ if (!user_mode(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+ /* the return address is stored one word above the saved frame pointer */
+ return *(unsigned long *)(regs->bp + sizeof(long));
+#else
+ unsigned long *sp =
+ (unsigned long *)kernel_stack_pointer(regs);
+ /*
+ * Return address is either directly at stack pointer
+ * or above a saved flags. Eflags has bits 22-31 zero,
+ * kernel addresses don't.
+ */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+#endif
+ }
+ return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * Default timer interrupt handler for PIT/HPET
+ *
+ * Forwards the interrupt to the event handler of global_clock_event and
+ * always reports it as handled.  @irq and @dev_id are unused.
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+ global_clock_event->event_handler(global_clock_event);
+ return IRQ_HANDLED;
+}
+
+/* irqaction for the legacy timer; installed on IRQ 0 by setup_default_timer_irq(). */
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .name = "timer"
+};
+
+/*
+ * Install the legacy timer irqaction on IRQ 0.  A failure is only logged.
+ */
+static void __init setup_default_timer_irq(void)
+{
+	int err;
+
+	/*
+	 * Unconditionally register the legacy timer; even without legacy
+	 * PIC/PIT we need this for the HPET0 in legacy replacement mode.
+	 */
+	err = setup_irq(0, &irq0);
+	if (err)
+		pr_info("Failed to register legacy timer interrupt\n");
+}
+
+/*
+ * Default timer init function: try the HPET first, fall back to the PIT
+ * when hpet_enable() returns zero, then register the default timer IRQ.
+ */
+void __init hpet_time_init(void)
+{
+ if (!hpet_enable())
+ setup_pit_timer();
+ setup_default_timer_irq();
+}
+
+/*
+ * Deferred time setup, installed as late_time_init by time_init(): runs the
+ * platform timer init, selects the interrupt delivery mode and then
+ * initialises the TSC, in that order.
+ */
+static __init void x86_late_time_init(void)
+{
+ x86_init.timers.timer_init();
+ /*
+ * After PIT/HPET timers init, select and setup
+ * the final interrupt mode for delivering IRQs.
+ */
+ x86_init.irqs.intr_mode_init();
+ tsc_init();
+}
+
+/*
+ * Delay the timer and TSC initialisation to x86_late_time_init() so that
+ * ioremap works.  This function itself only installs the late_time_init
+ * hook.
+ */
+void __init time_init(void)
+{
+ late_time_init = x86_late_time_init;
+}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
new file mode 100644
index 0000000..a5b802a
--- /dev/null
+++ b/arch/x86/kernel/tls.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/user.h>
+#include <linux/regset.h>
+#include <linux/syscalls.h>
+
+#include <linux/uaccess.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/proto.h>
+
+#include "tls.h"
+
+/*
+ * Find a yet unused TLS descriptor slot of the current task.
+ *
+ * Returns the GDT entry number of the first empty slot
+ * (GDT_ENTRY_TLS_MIN + slot index), or -ESRCH when every slot in
+ * tls_array is already in use.
+ */
+static int get_free_idx(void)
+{
+	struct thread_struct *t = &current->thread;
+	int idx;
+
+	for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+		if (desc_empty(&t->tls_array[idx]))
+			return idx + GDT_ENTRY_TLS_MIN;
+	return -ESRCH;
+}
+
+/*
+ * Decide whether @info may be installed in a TLS slot.
+ *
+ * Returns true for "no segment" requests (LDT_empty() or LDT_zero()) and
+ * for present, 32-bit data segments; returns false for everything else.
+ */
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+ return true;
+}
+
+/*
+ * Install @n consecutive TLS descriptors into task @p, starting at GDT
+ * entry @idx, from the user_desc array @info.
+ *
+ * Entries that satisfy LDT_empty() or LDT_zero() clear the slot; all other
+ * entries are converted with fill_ldt().  If @p is the running task, the
+ * updated descriptors are loaded on the current CPU.
+ */
+static void set_tls_desc(struct task_struct *p, int idx,
+			 const struct user_desc *info, int n)
+{
+	struct thread_struct *t = &p->thread;
+	struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
+	int cpu;
+
+	/*
+	 * We must not get preempted while modifying the TLS.
+	 */
+	cpu = get_cpu();
+
+	while (n-- > 0) {
+		if (LDT_empty(info) || LDT_zero(info))
+			memset(desc, 0, sizeof(*desc));
+		else
+			fill_ldt(desc, info);
+		++info;
+		++desc;
+	}
+
+	if (t == &current->thread)
+		load_TLS(t, cpu);
+
+	put_cpu();
+}
+
+/*
+ * Set a given TLS descriptor:
+ *
+ * @p:            task whose TLS array is modified
+ * @idx:          GDT entry number, or -1 to take it from u_info->entry_number
+ * @u_info:       user-space descriptor to install
+ * @can_allocate: if non-zero, an entry number of -1 asks for a free slot,
+ *                whose number is written back to u_info->entry_number
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed, -EINVAL
+ * for a rejected descriptor or an out-of-range index, or the error from
+ * get_free_idx() when no slot is free.
+ */
+int do_set_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *u_info,
+ int can_allocate)
+{
+ struct user_desc info;
+ unsigned short __maybe_unused sel, modified_sel;
+
+ if (copy_from_user(&info, u_info, sizeof(info)))
+ return -EFAULT;
+
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
+ if (idx == -1)
+ idx = info.entry_number;
+
+ /*
+ * index -1 means the kernel should try to find and
+ * allocate an empty descriptor:
+ */
+ if (idx == -1 && can_allocate) {
+ idx = get_free_idx();
+ if (idx < 0)
+ return idx;
+ if (put_user(idx, &u_info->entry_number))
+ return -EFAULT;
+ }
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ set_tls_desc(p, idx, &info, 1);
+
+ /*
+ * If DS, ES, FS, or GS points to the modified segment, forcibly
+ * refresh it. Only needed on x86_64 because x86_32 reloads them
+ * on return to user mode.
+ */
+ /* selector for GDT entry idx with the low two bits set to 3 */
+ modified_sel = (idx << 3) | 3;
+
+ if (p == current) {
+#ifdef CONFIG_X86_64
+ savesegment(ds, sel);
+ if (sel == modified_sel)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if (sel == modified_sel)
+ loadsegment(es, sel);
+
+ savesegment(fs, sel);
+ if (sel == modified_sel)
+ loadsegment(fs, sel);
+
+ savesegment(gs, sel);
+ if (sel == modified_sel)
+ load_gs_index(sel);
+#endif
+
+#ifdef CONFIG_X86_32_LAZY_GS
+ savesegment(gs, sel);
+ if (sel == modified_sel)
+ loadsegment(gs, sel);
+#endif
+ } else {
+#ifdef CONFIG_X86_64
+ /* another task: update its saved FS/GS base instead of live registers */
+ if (p->thread.fsindex == modified_sel)
+ p->thread.fsbase = info.base_addr;
+
+ if (p->thread.gsindex == modified_sel)
+ p->thread.gsbase = info.base_addr;
+#endif
+ }
+
+ return 0;
+}
+
+/*
+ * set_thread_area() system call: install a TLS descriptor for the calling
+ * task, taking the index from @u_info and allowing allocation of a free slot.
+ */
+SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info)
+{
+ return do_set_thread_area(current, -1, u_info, 1);
+}
+
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+/*
+ * Translate the hardware descriptor @desc into the user-visible
+ * struct user_desc @info, recording @idx as its entry number.
+ * @info is zeroed first, so fields not set below read as 0.
+ */
+static void fill_user_desc(struct user_desc *info, int idx,
+ const struct desc_struct *desc)
+
+{
+ memset(info, 0, sizeof(*info));
+ info->entry_number = idx;
+ info->base_addr = get_desc_base(desc);
+ info->limit = get_desc_limit(desc);
+ info->seg_32bit = desc->d;
+ /* "contents" is the descriptor type shifted right by two bits */
+ info->contents = desc->type >> 2;
+ /* bit 1 of the type clear means read/exec only */
+ info->read_exec_only = !(desc->type & 2);
+ info->limit_in_pages = desc->g;
+ info->seg_not_present = !desc->p;
+ info->useable = desc->avl;
+#ifdef CONFIG_X86_64
+ info->lm = desc->l;
+#endif
+}
+
+/*
+ * Copy one TLS descriptor of task @p to user space.
+ *
+ * @idx is the GDT entry number; -1 means "read it from
+ * u_info->entry_number".  Returns 0 on success, -EFAULT if user memory
+ * cannot be accessed, or -EINVAL if the index is outside the TLS range.
+ */
+int do_get_thread_area(struct task_struct *p, int idx,
+		       struct user_desc __user *u_info)
+{
+	const struct desc_struct *slot;
+	struct user_desc out;
+
+	if (idx == -1) {
+		if (get_user(idx, &u_info->entry_number))
+			return -EFAULT;
+	}
+
+	if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+		return -EINVAL;
+
+	slot = &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN];
+	fill_user_desc(&out, idx, slot);
+
+	return copy_to_user(u_info, &out, sizeof(out)) ? -EFAULT : 0;
+}
+
+/*
+ * get_thread_area() system call: read back a TLS descriptor of the calling
+ * task, taking the index from @u_info.
+ */
+SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info)
+{
+ return do_get_thread_area(current, -1, u_info);
+}
+
+/*
+ * Report how many TLS slots of @target are "active": the index of the last
+ * non-empty slot plus one, or 0 when all slots are empty.  @regset is unused.
+ */
+int regset_tls_active(struct task_struct *target,
+		      const struct user_regset *regset)
+{
+	struct thread_struct *t = &target->thread;
+	int n;
+
+	/* scan down from the top until a used slot is found */
+	for (n = GDT_ENTRY_TLS_ENTRIES; n > 0; n--)
+		if (!desc_empty(&t->tls_array[n - 1]))
+			break;
+
+	return n;
+}
+
+/*
+ * regset "get" for the TLS descriptors of @target.
+ *
+ * @pos and @count are byte offsets/lengths and must be multiples of
+ * sizeof(struct user_desc); @pos must lie inside the TLS array.  The
+ * descriptors are written to @kbuf if it is non-NULL, otherwise to @ubuf.
+ *
+ * Returns 0 on success, -EINVAL for a bad @pos/@count, or -EFAULT if a
+ * copy to @ubuf faults.
+ */
+int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   void *kbuf, void __user *ubuf)
+{
+	const struct desc_struct *tls;
+	struct user_desc __user *u_info = ubuf;
+	unsigned int i;
+
+	if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+	    (pos % sizeof(struct user_desc)) != 0 ||
+	    (count % sizeof(struct user_desc)) != 0)
+		return -EINVAL;
+
+	/* from here on, pos and count are in units of descriptors */
+	pos /= sizeof(struct user_desc);
+	count /= sizeof(struct user_desc);
+
+	tls = &target->thread.tls_array[pos];
+
+	if (kbuf) {
+		struct user_desc *out = kbuf;
+
+		for (i = 0; i < count; i++)
+			fill_user_desc(&out[i], GDT_ENTRY_TLS_MIN + pos + i,
+				       &tls[i]);
+		return 0;
+	}
+
+	for (i = 0; i < count; i++) {
+		struct user_desc info;
+
+		fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos + i, &tls[i]);
+		if (__copy_to_user(&u_info[i], &info, sizeof(info)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * regset "set" for the TLS descriptors of @target.
+ *
+ * @pos and @count are byte offsets/lengths and must be multiples of
+ * sizeof(struct user_desc); @pos must lie inside the TLS array.  The new
+ * descriptors come from @kbuf if it is non-NULL, otherwise from @ubuf.
+ * Every descriptor is checked with tls_desc_okay() before any is installed.
+ *
+ * Returns 0 on success, -EINVAL for a bad @pos/@count or a rejected
+ * descriptor, or -EFAULT if the copy from @ubuf faults.
+ */
+int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
+		   unsigned int pos, unsigned int count,
+		   const void *kbuf, const void __user *ubuf)
+{
+	struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
+	const struct user_desc *info = kbuf;
+	unsigned int nr, i;
+
+	if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+	    (pos % sizeof(struct user_desc)) != 0 ||
+	    (count % sizeof(struct user_desc)) != 0)
+		return -EINVAL;
+
+	if (!kbuf) {
+		if (__copy_from_user(infobuf, ubuf, count))
+			return -EFAULT;
+		info = infobuf;
+	}
+
+	nr = count / sizeof(struct user_desc);
+
+	for (i = 0; i < nr; i++)
+		if (!tls_desc_okay(&info[i]))
+			return -EINVAL;
+
+	set_tls_desc(target,
+		     GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
+		     info, nr);
+
+	return 0;
+}
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644
index 0000000..2f083a2
--- /dev/null
+++ b/arch/x86/kernel/tls.h
@@ -0,0 +1,21 @@
+/*
+ * Internal declarations for x86 TLS implementation functions.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ */
+
+#ifndef _ARCH_X86_KERNEL_TLS_H
+/* define the guard macro so a second inclusion is skipped */
+#define _ARCH_X86_KERNEL_TLS_H
+
+#include <linux/regset.h>
+
+/* regset callbacks for the TLS descriptors */
+extern user_regset_active_fn regset_tls_active;
+extern user_regset_get_fn regset_tls_get;
+extern user_regset_set_fn regset_tls_set;
+
+#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 0000000..738bf42
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,175 @@
+/*
+ * Populate sysfs with topology information
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/nodemask.h>
+#include <linux/export.h>
+#include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <asm/cpu.h>
+
+static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
+static int cpu0_hotpluggable = 1;
+#else
+static int cpu0_hotpluggable;
+/*
+ * Handler for the "cpu0_hotplug" boot parameter: marks CPU0 as
+ * hotpluggable.  @str is ignored.  Returns 1.
+ */
+static int __init enable_cpu0_hotplug(char *str)
+{
+ cpu0_hotpluggable = 1;
+ return 1;
+}
+
+__setup("cpu0_hotplug", enable_cpu0_hotplug);
+#endif
+
+#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
+/*
+ * Debug helper for the CPU offline/online feature: take @cpu down
+ * (@action == 0) or bring it back up (@action == 1).  It allows a CPU to be
+ * offlined as early as possible so that userspace boots without it; the
+ * CPU can be onlined again by the user after boot.
+ *
+ * On success the device's offline flag is updated and a uevent is sent.
+ * Returns 0 on success, -EINVAL if @cpu is not hotpluggable or @action is
+ * unknown, or the error from cpu_down()/cpu_up().
+ */
+int _debug_hotplug_cpu(int cpu, int action)
+{
+	struct device *dev = get_cpu_device(cpu);
+	int ret;
+
+	if (!cpu_is_hotpluggable(cpu))
+		return -EINVAL;
+
+	lock_device_hotplug();
+
+	if (action == 0) {
+		ret = cpu_down(cpu);
+		if (ret) {
+			pr_debug("Can't offline CPU%d.\n", cpu);
+		} else {
+			pr_info("CPU %u is now offline\n", cpu);
+			dev->offline = true;
+			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+		}
+	} else if (action == 1) {
+		ret = cpu_up(cpu);
+		if (ret) {
+			pr_debug("Can't online CPU%d.\n", cpu);
+		} else {
+			dev->offline = false;
+			kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+		}
+	} else {
+		ret = -EINVAL;
+	}
+
+	unlock_device_hotplug();
+
+	return ret;
+}
+
+/*
+ * Initcall for CONFIG_DEBUG_HOTPLUG_CPU0: try to offline CPU0.  The result
+ * of the attempt is ignored; always returns 0.
+ */
+static int __init debug_hotplug_cpu(void)
+{
+ _debug_hotplug_cpu(0, 0);
+ return 0;
+}
+
+late_initcall_sync(debug_hotplug_cpu);
+#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
+
+/*
+ * Register CPU @num with the CPU subsystem (CONFIG_HOTPLUG_CPU variant).
+ *
+ * Every CPU other than CPU0 is marked hotpluggable.  CPU0 is only marked
+ * hotpluggable if cpu0_hotpluggable survives the vendor, Xen PV and PIC
+ * interrupt checks below.  Returns the result of register_cpu().
+ */
+int arch_register_cpu(int num)
+{
+ struct cpuinfo_x86 *c = &cpu_data(num);
+
+ /*
+ * Currently CPU0 is only hotpluggable on Intel platforms. Other
+ * vendors can add hotplug support later.
+ * Xen PV guests don't support CPU0 hotplug at all.
+ */
+ if (c->x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_has(X86_FEATURE_XENPV))
+ cpu0_hotpluggable = 0;
+
+ /*
+ * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
+ * depends on BSP. PIC interrupts depend on BSP.
+ *
+ * If the BSP dependencies are under control, one can tell kernel to
+ * enable BSP hotplug. This basically adds a control file and
+ * one can attempt to offline BSP.
+ */
+ if (num == 0 && cpu0_hotpluggable) {
+ unsigned int irq;
+ /*
+ * We won't take down the boot processor on i386 if some
+ * interrupts only are able to be serviced by the BSP in PIC.
+ */
+ for_each_active_irq(irq) {
+ if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) {
+ cpu0_hotpluggable = 0;
+ break;
+ }
+ }
+ }
+ if (num || cpu0_hotpluggable)
+ per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+
+ return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+}
+EXPORT_SYMBOL(arch_register_cpu);
+
+/* Unregister the per-CPU device of CPU @num. */
+void arch_unregister_cpu(int num)
+{
+ unregister_cpu(&per_cpu(cpu_devices, num).cpu);
+}
+EXPORT_SYMBOL(arch_unregister_cpu);
+#else /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * Register CPU @num (variant without CONFIG_HOTPLUG_CPU): no hotplug
+ * bookkeeping, just register_cpu().
+ */
+static int __init arch_register_cpu(int num)
+{
+ return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * Subsys initcall: register every online NUMA node (CONFIG_NUMA only) and
+ * every present CPU.  Return values of the registration calls are not
+ * checked; always returns 0.
+ */
+static int __init topology_init(void)
+{
+ int i;
+
+#ifdef CONFIG_NUMA
+ for_each_online_node(i)
+ register_one_node(i);
+#endif
+
+ for_each_present_cpu(i)
+ arch_register_cpu(i);
+
+ return 0;
+}
+subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
new file mode 100644
index 0000000..b8e7abe
--- /dev/null
+++ b/arch/x86/kernel/trace_clock.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * X86 trace clocks
+ */
+#include <asm/trace_clock.h>
+#include <asm/barrier.h>
+#include <asm/msr.h>
+
+/*
+ * trace_clock_x86_tsc(): A clock that is just the cycle counter.
+ *
+ * Unlike the other clocks, this is not in nanoseconds.
+ *
+ * Returns the value of rdtsc_ordered().
+ */
+u64 notrace trace_clock_x86_tsc(void)
+{
+ return rdtsc_ordered();
+}
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
new file mode 100644
index 0000000..5bd30c4
--- /dev/null
+++ b/arch/x86/kernel/tracepoint.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for supporting irq vector tracepoints.
+ *
+ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/atomic.h>
+
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
+
+DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
+
+/* Take one reference on the trace_pagefault_key static key.  Returns 0. */
+int trace_pagefault_reg(void)
+{
+ static_branch_inc(&trace_pagefault_key);
+ return 0;
+}
+
+/* Drop one reference on the trace_pagefault_key static key. */
+void trace_pagefault_unreg(void)
+{
+ static_branch_dec(&trace_pagefault_key);
+}
+
+#ifdef CONFIG_SMP
+
+DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+
+/* Take one reference on the trace_resched_ipi_key static key.  Returns 0. */
+int trace_resched_ipi_reg(void)
+{
+ static_branch_inc(&trace_resched_ipi_key);
+ return 0;
+}
+
+/* Drop one reference on the trace_resched_ipi_key static key. */
+void trace_resched_ipi_unreg(void)
+{
+ static_branch_dec(&trace_resched_ipi_key);
+}
+
+#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
new file mode 100644
index 0000000..e6db475
--- /dev/null
+++ b/arch/x86/kernel/traps.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/context_tracking.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kgdb.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <linux/atomic.h>
+#include <asm/text-patching.h>
+#include <asm/ftrace.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/mce.h>
+#include <asm/fixmap.h>
+#include <asm/mach_traps.h>
+#include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/trace/mpx.h>
+#include <asm/mpx.h>
+#include <asm/vm86.h>
+#include <asm/umip.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/x86_init.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#else
+#include <asm/processor-flags.h>
+#include <asm/setup.h>
+#include <asm/proto.h>
+#endif
+
+DECLARE_BITMAP(system_vectors, NR_VECTORS);
+
+/*
+ * Enable local interrupts, but only if the interrupted context described
+ * by @regs had them enabled (X86_EFLAGS_IF set in regs->flags).
+ */
+static inline void cond_local_irq_enable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+/*
+ * Counterpart of cond_local_irq_enable(): disable local interrupts again
+ * if X86_EFLAGS_IF is set in regs->flags.
+ */
+static inline void cond_local_irq_disable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
+/*
+ * In IST context, we explicitly disable preemption. This serves two
+ * purposes: it makes it much less likely that we would accidentally
+ * schedule in IST context and it will force a warning if we somehow
+ * manage to schedule by accident.
+ *
+ * @regs describes the interrupted context.  When it is kernel mode, RCU is
+ * notified through rcu_nmi_enter(); for user mode the entry code is
+ * expected to have done that already.  Must be paired with ist_exit().
+ */
+void ist_enter(struct pt_regs *regs)
+{
+ if (user_mode(regs)) {
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ } else {
+ /*
+ * We might have interrupted pretty much anything. In
+ * fact, if we're a machine check, we can even interrupt
+ * NMI processing. We don't want in_nmi() to return true,
+ * but we need to notify RCU.
+ */
+ rcu_nmi_enter();
+ }
+
+ preempt_disable();
+
+ /* This code is a bit fragile. Test it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
+}
+
+/*
+ * Undo ist_enter(): re-enable preemption without rescheduling and, if the
+ * interrupted context was kernel mode, balance the rcu_nmi_enter() call.
+ */
+void ist_exit(struct pt_regs *regs)
+{
+ preempt_enable_no_resched();
+
+ if (!user_mode(regs))
+ rcu_nmi_exit();
+}
+
+/**
+ * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
+ * @regs: regs passed to the IST exception handler
+ *
+ * IST exception handlers normally cannot schedule. As a special
+ * exception, if the exception interrupted userspace code (i.e.
+ * user_mode(regs) would return true) and the exception was not
+ * a double fault, it can be safe to schedule. ist_begin_non_atomic()
+ * begins a non-atomic section within an ist_enter()/ist_exit() region.
+ * Callers are responsible for enabling interrupts themselves inside
+ * the non-atomic section, and callers must call ist_end_non_atomic()
+ * before ist_exit().
+ *
+ * It is a BUG() to call this for a kernel-mode @regs or while not on the
+ * normal thread stack.
+ */
+void ist_begin_non_atomic(struct pt_regs *regs)
+{
+ BUG_ON(!user_mode(regs));
+
+ /*
+ * Sanity check: we need to be on the normal thread stack. This
+ * will catch asm bugs and any attempt to use this function
+ * from double_fault.
+ */
+ BUG_ON(!on_thread_stack());
+
+ preempt_enable_no_resched();
+}
+
+/**
+ * ist_end_non_atomic() - end a non-atomic section in an IST exception
+ *
+ * Ends a non-atomic section started with ist_begin_non_atomic() by
+ * disabling preemption again.
+ */
+void ist_end_non_atomic(void)
+{
+ preempt_disable();
+}
+
+/*
+ * Check whether @addr can be the address of a BUG()/WARN() trap.
+ *
+ * Returns non-zero only if @addr is at or above TASK_SIZE_MAX, can be read,
+ * and the two bytes found there are the UD0 or UD2 opcode.
+ */
+int is_valid_bugaddr(unsigned long addr)
+{
+	unsigned short opcode;
+
+	if (addr < TASK_SIZE_MAX ||
+	    probe_kernel_address((unsigned short *)addr, opcode))
+		return 0;
+
+	switch (opcode) {
+	case INSN_UD0:
+	case INSN_UD2:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Try to recover from an invalid-opcode trap that was raised by a WARN().
+ *
+ * Only X86_TRAP_UD is considered.  If report_bug() classifies the trap as
+ * a warning, regs->ip is advanced past the UD2 instruction and 1 is
+ * returned so execution can continue.  Returns 0 in every other case.
+ */
+int fixup_bug(struct pt_regs *regs, int trapnr)
+{
+	if (trapnr != X86_TRAP_UD)
+		return 0;
+
+	/* BUG()s and unrecognised traps are left to the caller */
+	if (report_bug(regs->ip, regs) != BUG_TRAP_TYPE_WARN)
+		return 0;
+
+	regs->ip += LEN_UD2;
+	return 1;
+}
+
+/*
+ * Handle the parts of a trap that do not involve sending a signal.
+ *
+ * Returns 0 if the trap has been dealt with completely (forwarded to vm86
+ * or fixed up through the exception tables) and -1 if the caller still has
+ * to deliver a signal.  An unfixable kernel-mode trap records the error
+ * code and trap number in @tsk and calls die().
+ */
+static nokprobe_inline int
+do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
+		  struct pt_regs *regs, long error_code)
+{
+	if (v8086_mode(regs)) {
+		/*
+		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+		 * On nmi (interrupt 2), do_trap should not be called.
+		 */
+		if (trapnr < X86_TRAP_UD &&
+		    !handle_vm86_trap((struct kernel_vm86_regs *) regs,
+				      error_code, trapnr))
+			return 0;
+		return -1;
+	}
+
+	if (user_mode(regs))
+		return -1;
+
+	if (fixup_exception(regs, trapnr))
+		return 0;
+
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_nr = trapnr;
+	die(str, regs, error_code);
+
+	return -1;
+}
+
+/*
+ * Build the siginfo for trap @trapnr in @info.
+ *
+ * Divide error, invalid opcode and alignment check get a specific si_code
+ * (and, for the first two, the trap address from uprobe_get_trap_addr());
+ * @info is then filled in and returned.  For every other trap @info is
+ * left untouched and SEND_SIG_PRIV is returned.
+ */
+static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
+				 siginfo_t *info)
+{
+	unsigned long siaddr = 0;
+	int sicode;
+
+	switch (trapnr) {
+	case X86_TRAP_DE:
+		sicode = FPE_INTDIV;
+		siaddr = uprobe_get_trap_addr(regs);
+		break;
+	case X86_TRAP_UD:
+		sicode = ILL_ILLOPN;
+		siaddr = uprobe_get_trap_addr(regs);
+		break;
+	case X86_TRAP_AC:
+		/* no fault address is reported for alignment checks */
+		sicode = BUS_ADRALN;
+		break;
+	default:
+		return SEND_SIG_PRIV;
+	}
+
+	info->si_signo = signr;
+	info->si_errno = 0;
+	info->si_code = sicode;
+	info->si_addr = (void __user *)siaddr;
+	return info;
+}
+
+static void
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
+ /*
+ * We want error_code and trap_nr set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_trap);
+
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr)
+{
+ siginfo_t info;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+ /*
+ * WARN*()s end up here; fix them up before we call the
+ * notifier chain.
+ */
+ if (!user_mode(regs) && fixup_bug(regs, trapnr))
+ return;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ cond_local_irq_enable(regs);
+ clear_siginfo(&info);
+ do_trap(trapnr, signr, str, regs, error_code,
+ fill_trap_info(regs, signr, trapnr, &info));
+ }
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ do_error_trap(regs, error_code, str, trapnr, signr); \
+}
+
+DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+
+#ifdef CONFIG_VMAP_STACK
+/* Report a hit on the stack guard page, then die() and, as a backstop, panic(). */
+__visible void __noreturn handle_stack_overflow(const char *message,
+		struct pt_regs *regs, unsigned long fault_address)
+{
+	printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+		 (void *)fault_address, current->stack,
+		 (char *)current->stack + THREAD_SIZE - 1);
+	die(message, regs, 0);
+
+	/* Be absolutely certain we don't return; @message is data, not a format. */
+	panic("%s", message);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+/* Runs on IST stack */
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+{
+ static const char str[] = "double fault";
+ struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long cr2;
+#endif
+
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
+ *
+ * No need for ist_enter here because we don't use RCU.
+ */
+ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ *
+ */
+ memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interrupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ */
+ regs->ip = (unsigned long)general_protection;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
+
+ return;
+ }
+#endif
+
+ ist_enter(regs);
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_DF;
+
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF)"):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * delivered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ cr2 = read_cr2();
+ if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+ handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
+#ifdef CONFIG_DOUBLEFAULT
+ df_debug(regs, error_code);
+#endif
+ /*
+ * This is always a kernel trap and never fixable (and thus must
+ * never return).
+ */
+ for (;;)
+ die(str, regs, error_code);
+}
+#endif
+
+dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
+{
+ const struct mpx_bndcsr *bndcsr;
+ siginfo_t *info;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ if (notify_die(DIE_TRAP, "bounds", regs, error_code,
+ X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
+ return;
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs))
+ die("bounds", regs, error_code);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
+ /* The exception is not from Intel MPX */
+ goto exit_trap;
+ }
+
+ /*
+ * We need to look at BNDSTATUS to resolve this exception.
+ * A NULL here might mean that it is in its 'init state',
+ * which is all zeros which indicates MPX was not
+ * responsible for the exception.
+ */
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+ if (!bndcsr)
+ goto exit_trap;
+
+ trace_bounds_exception_mpx(bndcsr);
+ /*
+ * The error code field of the BNDSTATUS register communicates status
+ * information of a bound range exception #BR or operation involving
+ * bound directory.
+ */
+ switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
+ case 2: /* Bound directory has invalid entry. */
+ if (mpx_handle_bd_fault())
+ goto exit_trap;
+ break; /* Success, it was handled */
+ case 1: /* Bound violation. */
+ info = mpx_generate_siginfo(regs);
+ if (IS_ERR(info)) {
+ /*
+ * We failed to decode the MPX instruction. Act as if
+ * the exception was not caused by MPX.
+ */
+ goto exit_trap;
+ }
+ /*
+ * Success, we decoded the instruction and retrieved
+ * an 'info' containing the address being accessed
+ * which caused the exception. This information
+ * allows an application to possibly handle the
+ * #BR exception itself.
+ */
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
+ kfree(info);
+ break;
+ case 0: /* No exception caused by Intel MPX operations. */
+ goto exit_trap;
+ default:
+ die("bounds", regs, error_code);
+ }
+
+ return;
+
+exit_trap:
+ /*
+ * This path out is for all the cases where we could not
+ * handle the exception in some way (like allocating a
+ * table or telling userspace about it). We will also end
+ * up here if the kernel has MPX turned off at compile
+ * time.
+ */
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
+}
+
+dotraplinkage void
+do_general_protection(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ cond_local_irq_enable(regs);
+
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ return;
+ }
+
+ if (v8086_mode(regs)) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ return;
+ }
+
+ tsk = current;
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, X86_TRAP_GP))
+ return;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+ if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
+ die("general protection fault", regs, error_code);
+ return;
+ }
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_general_protection);
+
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * ftrace must be first, everything else may cause a recursive crash.
+ * See note by declaration of modifying_ftrace_code in ftrace.c
+ */
+ if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+ ftrace_int3_handler(regs))
+ return;
+#endif
+ if (poke_int3_handler(regs))
+ return;
+
+ /*
+ * Use ist_enter despite the fact that we don't use an IST stack.
+ * We can be called from a kprobe in non-CONTEXT_KERNEL kernel
+ * mode or even during context tracking state changes.
+ *
+ * This means that we can't schedule. That's okay.
+ */
+ ist_enter(regs);
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
+#ifdef CONFIG_KPROBES
+ if (kprobe_int3_handler(regs))
+ goto exit;
+#endif
+
+ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
+
+ cond_local_irq_enable(regs);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
+ cond_local_irq_disable(regs);
+
+exit:
+ ist_exit(regs);
+}
+NOKPROBE_SYMBOL(do_int3);
+
+#ifdef CONFIG_X86_64
+/*
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
+ */
+asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ if (regs != eregs)
+ *regs = *eregs;
+ return regs;
+}
+NOKPROBE_SYMBOL(sync_regs);
+
+struct bad_iret_stack {
+ void *error_entry_ret;
+ struct pt_regs regs;
+};
+
+asmlinkage __visible notrace
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
+ */
+ struct bad_iret_stack *new_stack =
+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ BUG_ON(!user_mode(&new_stack->regs));
+ return new_stack;
+}
+NOKPROBE_SYMBOL(fixup_bad_iret);
+#endif
+
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+ /*
+ * We don't try for precision here. If we're anywhere in the region of
+ * code that can be single-stepped in the SYSENTER entry path, then
+ * assume that this is a useless single-step trap due to SYSENTER
+ * being invoked with TF set. (We don't know in advance exactly
+ * which instructions will be hit because BTF could plausibly
+ * be set.)
+ */
+#ifdef CONFIG_X86_32
+ return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+ (unsigned long)__end_SYSENTER_singlestep_region -
+ (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+ return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+ (unsigned long)__end_entry_SYSENTER_compat -
+ (unsigned long)entry_SYSENTER_compat;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ *
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ *
+ * May run on IST stack.
+ */
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk = current;
+ int user_icebp = 0;
+ unsigned long dr6;
+ int si_code;
+
+ ist_enter(regs);
+
+ get_debugreg(dr6, 6);
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3. The remaining
+ * contents of the DR6 register are never cleared by the
+ * processor. To avoid confusion in identifying debug
+ * exceptions, debug handlers should clear the register before
+ * returning to the interrupted task.
+ *
+ * Keep it simple: clear DR6 immediately.
+ */
+ set_debugreg(0, 6);
+
+ /* Filter out all the reserved bits which are preset to 1 */
+ dr6 &= ~DR6_RESERVED;
+
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
+ if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
+ is_sysenter_singlestep(regs))) {
+ dr6 &= ~DR_STEP;
+ if (!dr6)
+ goto exit;
+ /*
+ * else we might have gotten a single-step trap and hit a
+ * watchpoint at the same time, in which case we should fall
+ * through and handle the watchpoint.
+ */
+ }
+
+ /*
+ * If dr6 has no reason to give us about the origin of this trap,
+ * then it's very likely the result of an icebp/int01 trap.
+ * User wants a sigtrap for that.
+ */
+ if (!dr6 && user_mode(regs))
+ user_icebp = 1;
+
+ /* Store the virtualized DR6 value */
+ tsk->thread.debugreg6 = dr6;
+
+#ifdef CONFIG_KPROBES
+ if (kprobe_debug_handler(regs))
+ goto exit;
+#endif
+
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
+
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
+
+ /* It's safe to allow irq's after DR6 has been saved */
+ cond_local_irq_enable(regs);
+
+ if (v8086_mode(regs)) {
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+ X86_TRAP_DB);
+ cond_local_irq_disable(regs);
+ debug_stack_usage_dec();
+ goto exit;
+ }
+
+ if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
+ /*
+ * Historical junk that used to handle SYSENTER single-stepping.
+ * This should be unreachable now. If we survive for a while
+ * without anyone hitting this warning, we'll turn this into
+ * an oops.
+ */
+ tsk->thread.debugreg6 &= ~DR_STEP;
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->flags &= ~X86_EFLAGS_TF;
+ }
+ si_code = get_si_code(tsk->thread.debugreg6);
+ if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
+ send_sigtrap(tsk, regs, error_code, si_code);
+ cond_local_irq_disable(regs);
+ debug_stack_usage_dec();
+
+exit:
+ ist_exit(regs);
+}
+NOKPROBE_SYMBOL(do_debug);
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+static void math_error(struct pt_regs *regs, int error_code, int trapnr)
+{
+ struct task_struct *task = current;
+ struct fpu *fpu = &task->thread.fpu;
+ siginfo_t info;
+ char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+ "simd exception";
+
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr))
+ return;
+
+ task->thread.error_code = error_code;
+ task->thread.trap_nr = trapnr;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code,
+ trapnr, SIGFPE) != NOTIFY_STOP)
+ die(str, regs, error_code);
+ return;
+ }
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ fpu__save(fpu);
+
+ task->thread.trap_nr = trapnr;
+ task->thread.error_code = error_code;
+ clear_siginfo(&info);
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
+
+ info.si_code = fpu__exception_code(fpu, trapnr);
+
+ /* Retry when we get spurious exceptions: */
+ if (!info.si_code)
+ return;
+
+ force_sig_info(SIGFPE, &info, task);
+}
+
+dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ math_error(regs, error_code, X86_TRAP_MF);
+}
+
+dotraplinkage void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ math_error(regs, error_code, X86_TRAP_XF);
+}
+
+dotraplinkage void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+{
+ cond_local_irq_enable(regs);
+}
+
+dotraplinkage void
+do_device_not_available(struct pt_regs *regs, long error_code)
+{
+ unsigned long cr0;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
+ struct math_emu_info info = { };
+
+ cond_local_irq_enable(regs);
+
+ info.regs = regs;
+ math_emulate(&info);
+ return;
+ }
+#endif
+
+ /* This should not happen. */
+ cr0 = read_cr0();
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, error_code);
+ }
+}
+NOKPROBE_SYMBOL(do_device_not_available);
+
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+{
+ siginfo_t info;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ local_irq_enable();
+
+ clear_siginfo(&info);
+ info.si_signo = SIGILL;
+ info.si_errno = 0;
+ info.si_code = ILL_BADSTK;
+ info.si_addr = NULL;
+ if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ &info);
+ }
+}
+#endif
+
+void __init trap_init(void)
+{
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
+
+ idt_setup_traps();
+
+ /*
+ * Set the IDT descriptor to a fixed read-only location, so that the
+ * "sidt" instruction will not leak the location of the kernel, and
+ * to defend the IDT against arbitrary memory write vulnerabilities.
+ * It will be reloaded in cpu_init() */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+
+ /*
+ * Should be a barrier for any external CPU state:
+ */
+ cpu_init();
+
+ idt_setup_ist_traps();
+
+ x86_init.irqs.trap_init();
+
+ idt_setup_debugidt_traps();
+}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 0000000..6d5dc5d
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,1511 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/timer.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/cpufreq.h>
+#include <linux/delay.h>
+#include <linux/clocksource.h>
+#include <linux/percpu.h>
+#include <linux/timex.h>
+#include <linux/static_key.h>
+
+#include <asm/hpet.h>
+#include <asm/timer.h>
+#include <asm/vgtod.h>
+#include <asm/time.h>
+#include <asm/delay.h>
+#include <asm/hypervisor.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/geode.h>
+#include <asm/apic.h>
+#include <asm/intel-family.h>
+#include <asm/i8259.h>
+#include <asm/uv/uv.h>
+
+unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+
+unsigned int __read_mostly tsc_khz;
+EXPORT_SYMBOL(tsc_khz);
+
+#define KHZ 1000
+
+/*
+ * TSC can be unstable due to cpufreq or due to unsynced TSCs
+ */
+static int __read_mostly tsc_unstable;
+
+static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+
+int tsc_clocksource_reliable;
+
+static u32 art_to_tsc_numerator;
+static u32 art_to_tsc_denominator;
+static u64 art_to_tsc_offset;
+struct clocksource *art_related_clocksource;
+
+struct cyc2ns {
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
+
+}; /* fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+ int seq, idx;
+
+ preempt_disable_notrace();
+
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
+
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
+
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
+}
+
+void __always_inline cyc2ns_read_end(void)
+{
+ preempt_enable_notrace();
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data data;
+ unsigned long long ns;
+
+ cyc2ns_read_begin(&data);
+
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ cyc2ns_read_end();
+
+ return ns;
+}
+
+static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
+
+ ns_now = cycles_2_ns(tsc_now);
+
+ /*
+ * Compute a new multiplier as per the above comment and ensure our
+ * time function is continuous; see the comment near struct
+ * cyc2ns_data.
+ */
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
+ NSEC_PER_MSEC, 0);
+
+ /*
+ * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
+ * not expected to be greater than 31 due to the original published
+ * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
+ * value) - refer perf_event_mmap_page documentation in perf_event.h.
+ */
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
+ }
+
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
+}
+
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (khz)
+ __set_cyc2ns_scale(khz, cpu, tsc_now);
+
+ sched_clock_idle_wakeup_event();
+ local_irq_restore(flags);
+}
+
+/*
+ * Initialize cyc2ns for boot cpu
+ */
+static void __init cyc2ns_init_boot_cpu(void)
+{
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+
+ seqcount_init(&c2n->seq);
+ __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
+}
+
+/*
+ * Secondary CPUs do not run through tsc_init(), so copy the boot CPU's
+ * scale factors to every other possible CPU, assuming the same speed.
+ * (cpufreq notifiers will fix this up if their speed diverges.) Each
+ * target CPU's own seqcount is initialized before its data is written.
+ */
+static void __init cyc2ns_init_secondary_cpus(void)
+{
+	unsigned int cpu, this_cpu = smp_processor_id();
+	struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+	struct cyc2ns_data *data = c2n->data;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu != this_cpu) {
+			c2n = per_cpu_ptr(&cyc2ns, cpu);
+			seqcount_init(&c2n->seq);
+			c2n->data[0] = data[0];
+			c2n->data[1] = data[1];
+		}
+	}
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+u64 native_sched_clock(void)
+{
+ if (static_branch_likely(&__use_tsc)) {
+ u64 tsc_now = rdtsc();
+
+ /* return the value in ns */
+ return cycles_2_ns(tsc_now);
+ }
+
+ /*
+ * Fall back to jiffies if there's no TSC available:
+ * ( But note that we still use it if the TSC is marked
+ * unstable. We do this because unlike Time Of Day,
+ * the scheduler clock tolerates small errors and it's
+ * very important for it to be as fast as the platform
+ * can achieve it. )
+ */
+
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+}
+
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
+}
+
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+
+bool using_native_sched_clock(void)
+{
+ return pv_time_ops.sched_clock == native_sched_clock;
+}
+#else
+unsigned long long
+sched_clock(void) __attribute__((alias("native_sched_clock")));
+
+bool using_native_sched_clock(void) { return true; }
+#endif
+
+int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
+#ifdef CONFIG_X86_TSC
+int __init notsc_setup(char *str)
+{
+ mark_tsc_unstable("boot parameter notsc");
+ return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+int __init notsc_setup(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_TSC);
+ return 1;
+}
+#endif
+
+__setup("notsc", notsc_setup);
+
+static int no_sched_irq_time;
+
+static int __init tsc_setup(char *str)
+{
+ if (!strcmp(str, "reliable"))
+ tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
+ return 1;
+}
+
+__setup("tsc=", tsc_setup);
+
+#define MAX_RETRIES 5
+#define SMI_TRESHOLD 50000
+
+/*
+ * Read TSC and the reference counters. Take care of SMI disturbance
+ */
+static u64 tsc_read_refs(u64 *p, int hpet)
+{
+ u64 t1, t2;
+ int i;
+
+ for (i = 0; i < MAX_RETRIES; i++) {
+ t1 = get_cycles();
+ if (hpet)
+ *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ else
+ *p = acpi_pm_read_early();
+ t2 = get_cycles();
+ if ((t2 - t1) < SMI_TRESHOLD)
+ return t2;
+ }
+ return ULLONG_MAX;
+}
+
+/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+ u64 tmp;
+
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tmp, 1000000);
+ deltatsc = div64_u64(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference (ULONG_MAX if unusable)
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+	u64 tmp;
+
+	if (pm1 == pm2)	/* both reads 0, or no ticks elapsed: avoid dividing by 0 */
+		return ULONG_MAX;
+
+	if (pm2 < pm1)	/* the counter wrapped between the two reads */
+		pm2 += (u64)ACPI_PM_OVRRUN;
+	pm2 -= pm1;
+	tmp = pm2 * 1000000000LL;
+	do_div(tmp, PMTMR_TICKS_PER_SEC);	/* elapsed ticks -> nanoseconds */
+	deltatsc = div64_u64(deltatsc, tmp);	/* tmp may exceed 32 bits */
+
+	return (unsigned long) deltatsc;
+}
+
+#define CAL_MS 10
+#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS 1000
+
+#define CAL2_MS 50
+#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS 5000
+
+
+/*
+ * Try to calibrate the TSC against the Programmable Interrupt Timer:
+ * load PIT channel 2 with @latch, poll until bit 5 of port 0x61 is set
+ * and return the elapsed TSC cycles divided by @ms (the TSC rate in kHz).
+ *
+ * Return ULONG_MAX on failure to calibrate.
+ */
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
+{
+	u64 tsc, t1, t2, delta;
+	unsigned long tscmin, tscmax;
+	int pitcnt;
+
+	if (!has_legacy_pic()) {
+		/*
+		 * Relies on tsc_early_delay_calibrate() to have given us semi
+		 * usable udelay(), wait for the same 50ms we would have with
+		 * the PIT loop below.
+		 */
+		udelay(10 * USEC_PER_MSEC);
+		udelay(10 * USEC_PER_MSEC);
+		udelay(10 * USEC_PER_MSEC);
+		udelay(10 * USEC_PER_MSEC);
+		udelay(10 * USEC_PER_MSEC);
+		return ULONG_MAX;
+	}
+
+	/* Set the Gate high, disable speaker */
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	/*
+	 * Setup CTC channel 2 for mode 0 (interrupt on terminal
+	 * count mode), binary count. Load the latch register with
+	 * @latch (LSB then MSB) to begin countdown.
+	 */
+	outb(0xb0, 0x43);
+	outb(latch & 0xff, 0x42);
+	outb(latch >> 8, 0x42);
+
+	tsc = t1 = t2 = get_cycles();
+
+	pitcnt = 0;
+	tscmax = 0;
+	tscmin = ULONG_MAX;
+	while ((inb(0x61) & 0x20) == 0) {
+		t2 = get_cycles();
+		delta = t2 - tsc;
+		tsc = t2;
+		if ((unsigned long) delta < tscmin)
+			tscmin = (unsigned long) delta;
+		if ((unsigned long) delta > tscmax)
+			tscmax = (unsigned long) delta;
+		pitcnt++;
+	}
+
+	/*
+	 * Sanity checks (tscmin/tscmax hold untruncated deltas):
+	 *
+	 * If we were not able to read the PIT more than loopmin
+	 * times, then we have been hit by a massive SMI
+	 *
+	 * If the maximum is 10 times larger than the minimum,
+	 * then we got hit by an SMI as well.
+	 */
+	if (pitcnt < loopmin || tscmax > 10 * tscmin)
+		return ULONG_MAX;
+
+	/* Calculate the PIT value */
+	delta = t2 - t1;
+	do_div(delta, ms);
+	return delta;
+}
+
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ * - the PIT is running at roughly 1.19MHz
+ *
+ * - each IO is going to take about 1us on real hardware,
+ * but we allow it to be much faster (by a factor of 10) or
+ * _slightly_ slower (i.e. we allow up to a 2us read+counter
+ * update) - anything else implies an unacceptably slow CPU
+ * or PIT for the fast calibration to work.
+ *
+ * - with 256 PIT ticks to read the value, we have 214us to
+ * see the same MSB (and overhead like doing a single TSC
+ * read per MSB value etc).
+ *
+ * - We're doing 2 reads per loop (LSB, MSB), and we expect
+ * them each to take about a microsecond on real hardware.
+ * So we expect a count value of around 100. But we'll be
+ * generous, and accept anything over 50.
+ *
+ * - if the PIT is stuck, and we see *many* more reads, we
+ * return early (and the next caller of pit_expect_msb()
+ * then consider it a failure when they don't see the
+ * next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequency.
+ */
+/*
+ * Read the PIT counter from port 0x42 (two byte reads, LSB first) and
+ * report whether the MSB equals @val. The LSB is read only to be thrown
+ * away.
+ */
+static inline int pit_verify_msb(unsigned char val)
+{
+	/* First read yields the LSB, which is of no interest here */
+	(void)inb(0x42);
+
+	/* Second read yields the MSB */
+	return inb(0x42) == val;
+}
+
+/*
+ * Poll the PIT for as long as its MSB equals @val (at most 50000 reads).
+ *
+ * @val: MSB value expected to be seen
+ * @tscp: receives the TSC value read after the last poll that still saw
+ * @val
+ * @deltap: receives the distance between the TSC read preceding that
+ * one and a final TSC read taken after the loop ended
+ *
+ * Returns non-zero if @val was seen on more than 5 polls.
+ */
+static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
+{
+ int count;
+ u64 tsc = 0, prev_tsc = 0;
+
+ for (count = 0; count < 50000; count++) {
+ if (!pit_verify_msb(val))
+ break;
+ /* Keep the last two TSC reads taken while the MSB matched */
+ prev_tsc = tsc;
+ tsc = get_cycles();
+ }
+ *deltap = get_cycles() - prev_tsc;
+ *tscp = tsc;
+
+ /*
+ * We require _some_ success, but the quality control
+ * will be based on the error terms on the TSC values.
+ */
+ return count > 5;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for
+ * a maximum error rate of 500ppm (in practice the
+ * real error is much smaller), but refuse to spend
+ * more than 50ms on it.
+ */
+#define MAX_QUICK_PIT_MS 50
+#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+/*
+ * Fast TSC calibration against PIT channel 2: watch the MSB of the PIT
+ * counter decrement and take TSC readings at the transitions.
+ *
+ * Returns the TSC frequency in kHz, or 0 if there is no legacy PIC or
+ * the measurement did not reach the required accuracy.
+ */
+static unsigned long quick_pit_calibrate(void)
+{
+ int i;
+ u64 tsc, delta;
+ unsigned long d1, d2;
+
+ if (!has_legacy_pic())
+ return 0;
+
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Counter 2, mode 0 (one-shot), binary count
+ *
+ * NOTE! Mode 2 decrements by two (and then the
+ * output is flipped each time, giving the same
+ * final output frequency as a decrement-by-one),
+ * so mode 0 is much better when looking at the
+ * individual counts.
+ */
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ /*
+ * The PIT starts counting at the next edge, so we
+ * need to delay for a microsecond. The easiest way
+ * to do that is to just read back the 16-bit counter
+ * once from the PIT.
+ */
+ pit_verify_msb(0);
+
+ /*
+ * 'tsc'/'d1' describe the end of the 0xff MSB period; each loop
+ * iteration yields 'delta'/'d2' for the end of the 0xff-i period.
+ */
+ if (pit_expect_msb(0xff, &tsc, &d1)) {
+ for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+ if (!pit_expect_msb(0xff-i, &delta, &d2))
+ break;
+
+ /* TSC cycles elapsed over 'i' MSB periods */
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
+ /*
+ * Iterate until the error is less than 500 ppm
+ */
+ if (d1+d2 >= delta >> 11)
+ continue;
+
+ /*
+ * Check the PIT one more time to verify that
+ * all TSC reads were stable wrt the PIT.
+ *
+ * This also guarantees serialization of the
+ * last cycle read ('d2') in pit_expect_msb.
+ */
+ if (!pit_verify_msb(0xfe - i))
+ break;
+ goto success;
+ }
+ }
+ pr_info("Fast TSC calibration failed\n");
+ return 0;
+
+success:
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement 'i' times, and the
+ * error has shrunk to less than 500 ppm.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable (within the error).
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
+ */
+ delta *= PIT_TICK_RATE;
+ do_div(delta, i*256*1000);
+ pr_info("Fast TSC calibration using PIT\n");
+ return delta;
+}
+
+/**
+ * native_calibrate_tsc - derive the TSC frequency from CPUID leaf 0x15
+ *
+ * Only attempted on Intel CPUs whose maximum CPUID level covers leaf 0x15.
+ * When the leaf does not report a crystal frequency, a fixed value is
+ * picked for a handful of known models.
+ *
+ * On success X86_FEATURE_TSC_KNOWN_FREQ is forced, and on Goldmont
+ * X86_FEATURE_TSC_RELIABLE as well.
+ *
+ * Return: crystal kHz * numerator / denominator, or 0 if the frequency
+ * cannot be determined this way.
+ */
+unsigned long native_calibrate_tsc(void)
+{
+	unsigned int eax_denominator = 0, ebx_numerator = 0;
+	unsigned int ecx_hz = 0, edx = 0;
+	unsigned int crystal_khz;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+	    boot_cpu_data.cpuid_level < 0x15)
+		return 0;
+
+	/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+	cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+
+	if (!ebx_numerator || !eax_denominator)
+		return 0;
+
+	crystal_khz = ecx_hz / 1000;
+
+	/* No crystal frequency enumerated: fall back to per-model values */
+	if (!crystal_khz) {
+		switch (boot_cpu_data.x86_model) {
+		case INTEL_FAM6_SKYLAKE_MOBILE:
+		case INTEL_FAM6_SKYLAKE_DESKTOP:
+		case INTEL_FAM6_KABYLAKE_MOBILE:
+		case INTEL_FAM6_KABYLAKE_DESKTOP:
+			crystal_khz = 24000;	/* 24.0 MHz */
+			break;
+		case INTEL_FAM6_ATOM_DENVERTON:
+			crystal_khz = 25000;	/* 25.0 MHz */
+			break;
+		case INTEL_FAM6_ATOM_GOLDMONT:
+			crystal_khz = 19200;	/* 19.2 MHz */
+			break;
+		}
+	}
+
+	if (!crystal_khz)
+		return 0;
+
+	/*
+	 * A frequency obtained from CPUID is "hardware reported" and the
+	 * most accurate one available so far, hence it is treated as a
+	 * known frequency.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+	/*
+	 * For Atom SoCs TSC is the only reliable clocksource.
+	 * Mark TSC reliable so no watchdog on it.
+	 */
+	if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+		setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+	return crystal_khz * ebx_numerator / eax_denominator;
+}
+
+/*
+ * Read the CPU base frequency from CPUID leaf 0x16 (Intel only).
+ *
+ * Returns the EAX value of the leaf multiplied by 1000, or 0 when the
+ * CPU is not from Intel or does not implement the leaf.
+ */
+static unsigned long cpu_khz_from_cpuid(void)
+{
+	unsigned int eax_base_mhz = 0, ebx_max_mhz = 0;
+	unsigned int ecx_bus_mhz = 0, edx = 0;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+	    boot_cpu_data.cpuid_level < 0x16)
+		return 0;
+
+	cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+
+	return eax_base_mhz * 1000;
+}
+
+/*
+ * calibrate cpu using pit, hpet, and ptimer methods. They are available
+ * later in boot after acpi is initialized.
+ *
+ * Returns the calibrated frequency in kHz, or 0 if neither the PIT nor
+ * the HPET/PMTIMER reference produced a usable value.
+ */
+static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
+{
+ u64 tsc1, tsc2, delta, ref1, ref2;
+ unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
+ unsigned long flags, latch, ms;
+ int hpet = is_hpet_enabled(), i, loopmin;
+
+ /*
+ * Run up to 3 calibration loops to get the lowest frequency value
+ * (the best estimate). We use two different calibration modes
+ * here:
+ *
+ * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
+ * load a timeout of CAL_MS (CAL2_MS after repeated PIT
+ * failures). We read the time right after we
+ * started the timer and wait until the PIT count down reaches
+ * zero. In each wait loop iteration we read the TSC and check
+ * the delta to the previous read. We keep track of the min
+ * and max values of that delta. The delta is mostly defined
+ * by the IO time of the PIT access, so we can detect when a
+ * SMI/SMM disturbance happened between the two reads. If the
+ * maximum time is significantly larger than the minimum time,
+ * then we discard the result and have another try.
+ *
+ * 2) Reference counter. If available we use the HPET or the
+ * PMTIMER as a reference to check the sanity of that value.
+ * We use separate TSC readouts and check inside of the
+ * reference read for a SMI/SMM disturbance. We discard
+ * disturbed values here as well. We do that around the PIT
+ * calibration delay loop as we have to wait for a certain
+ * amount of time anyway.
+ */
+
+ /* Preset PIT loop values */
+ latch = CAL_LATCH;
+ ms = CAL_MS;
+ loopmin = CAL_PIT_LOOPS;
+
+ for (i = 0; i < 3; i++) {
+ unsigned long tsc_pit_khz;
+
+ /*
+ * Read the start value and the reference count of
+ * hpet/pmtimer when available. Then do the PIT
+ * calibration, which is set up for 'ms' milliseconds, and
+ * read the end value.
+ */
+ local_irq_save(flags);
+ tsc1 = tsc_read_refs(&ref1, hpet);
+ tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+ tsc2 = tsc_read_refs(&ref2, hpet);
+ local_irq_restore(flags);
+
+ /* Pick the lowest PIT TSC calibration so far */
+ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
+
+ /* hpet or pmtimer available ? */
+ if (ref1 == ref2)
+ continue;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
+ continue;
+
+ /* tsc2 is reused: first the scaled TSC delta, then the result */
+ tsc2 = (tsc2 - tsc1) * 1000000LL;
+ if (hpet)
+ tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+ else
+ tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
+
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 10% window
+ * then we can be sure, that the calibration
+ * succeeded. We break out of the loop right away. We
+ * use the reference value, as it is more precise.
+ */
+ if (delta >= 90 && delta <= 110) {
+ pr_info("PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
+ return tsc_ref_min;
+ }
+
+ /*
+ * Check whether PIT failed more than once. This
+ * happens in virtualized environments. We need to
+ * give the virtual PC a slightly longer timeframe for
+ * the HPET/PMTIMER to make the result precise.
+ */
+ if (i == 1 && tsc_pit_min == ULONG_MAX) {
+ latch = CAL2_LATCH;
+ ms = CAL2_MS;
+ loopmin = CAL2_PIT_LOOPS;
+ }
+ }
+
+ /*
+ * Now check the results.
+ */
+ if (tsc_pit_min == ULONG_MAX) {
+ /* PIT gave no useful value */
+ pr_warn("Unable to calibrate against PIT\n");
+
+ /* We don't have an alternative source, disable TSC */
+ if (!hpet && !ref1 && !ref2) {
+ pr_notice("No reference (HPET/PMTIMER) available\n");
+ return 0;
+ }
+
+ /* The alternative source failed as well, disable TSC */
+ if (tsc_ref_min == ULONG_MAX) {
+ pr_warn("HPET/PMTIMER calibration failed\n");
+ return 0;
+ }
+
+ /* Use the alternative source */
+ pr_info("using %s reference calibration\n",
+ hpet ? "HPET" : "PMTIMER");
+
+ return tsc_ref_min;
+ }
+
+ /* We don't have an alternative source, use the PIT calibration value */
+ if (!hpet && !ref1 && !ref2) {
+ pr_info("Using PIT calibration value\n");
+ return tsc_pit_min;
+ }
+
+ /* The alternative source failed, use the PIT calibration value */
+ if (tsc_ref_min == ULONG_MAX) {
+ pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
+ return tsc_pit_min;
+ }
+
+ /*
+ * The calibration values differ too much. In doubt, we use
+ * the PIT value as we know that there are PMTIMERs around
+ * running at double speed. At least we let the user know:
+ */
+ pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+ pr_info("Using PIT calibration value\n");
+ return tsc_pit_min;
+}
+
+/**
+ * native_calibrate_cpu_early - can calibrate the cpu early in boot
+ *
+ * Tries, in this order, CPUID, the MSR based method and the quick PIT
+ * calibration (the latter with interrupts disabled) and returns the first
+ * non-zero result, or 0 if all of them failed.
+ */
+unsigned long native_calibrate_cpu_early(void)
+{
+	unsigned long khz, flags;
+
+	khz = cpu_khz_from_cpuid();
+	if (khz)
+		return khz;
+
+	khz = cpu_khz_from_msr();
+	if (khz)
+		return khz;
+
+	local_irq_save(flags);
+	khz = quick_pit_calibrate();
+	local_irq_restore(flags);
+
+	return khz;
+}
+
+
+/**
+ * native_calibrate_cpu - calibrate the cpu
+ *
+ * Uses the early calibration methods first and only falls back to the
+ * PIT/HPET/PM timer calibration when those return 0.
+ */
+static unsigned long native_calibrate_cpu(void)
+{
+	unsigned long early_khz = native_calibrate_cpu_early();
+
+	return early_khz ? early_khz : pit_hpet_ptimer_calibrate_cpu();
+}
+
+/*
+ * Re-run the platform CPU and TSC calibration and rescale the
+ * loops_per_jiffy value of CPU 0 from the old to the new cpu_khz.
+ * The function body is compiled out when CONFIG_SMP is set.
+ */
+void recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+	const unsigned long prev_cpu_khz = cpu_khz;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC))
+		return;
+
+	cpu_khz = x86_platform.calibrate_cpu();
+	tsc_khz = x86_platform.calibrate_tsc();
+
+	if (!tsc_khz) {
+		/* No TSC specific result, use the CPU frequency */
+		tsc_khz = cpu_khz;
+	} else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) {
+		/* cpu_khz is more than 10% off, prefer tsc_khz */
+		cpu_khz = tsc_khz;
+	}
+
+	cpu_data(0).loops_per_jiffy =
+		cpufreq_scale(cpu_data(0).loops_per_jiffy,
+			      prev_cpu_khz, cpu_khz);
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+
+/* sched_clock() value recorded by tsc_save_sched_clock_state() */
+static unsigned long long cyc2ns_suspend;
+
+/*
+ * Remember the current sched_clock() value so that
+ * tsc_restore_sched_clock_state() can continue from it. Nothing is
+ * recorded while the sched clock is not stable.
+ */
+void tsc_save_sched_clock_state(void)
+{
+	if (sched_clock_stable())
+		cyc2ns_suspend = sched_clock();
+}
+
+/*
+ * Even on processors with invariant TSC, the TSC gets reset in some of the
+ * ACPI system sleep states, and on some systems the BIOS seems to
+ * reinitialize the TSC to an arbitrary value (still synchronized across
+ * CPUs) during resume from such sleep states. To cope with this, recompute
+ * the cyc2ns_offset for each CPU so that sched_clock() continues from the
+ * point where it was left off during suspend.
+ */
+void tsc_restore_sched_clock_state(void)
+{
+	unsigned long long resume_offset;
+	unsigned long irq_flags;
+	int cpu;
+
+	if (!sched_clock_stable())
+		return;
+
+	local_irq_save(irq_flags);
+
+	/*
+	 * We're coming out of suspend, there's no concurrency yet; don't
+	 * bother being nice about the RCU stuff, just write to both
+	 * data fields.
+	 */
+	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
+	/* sched_clock() is sampled after the local offsets were cleared */
+	resume_offset = cyc2ns_suspend - sched_clock();
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = resume_offset;
+		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = resume_offset;
+	}
+
+	local_irq_restore(irq_flags);
+}
+
+#ifdef CONFIG_CPU_FREQ
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ * changes.
+ *
+ * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
+ * not that important because current Opteron setups do not support
+ * scaling on SMP anyroads.
+ *
+ * Should fix up last_tsc too. Currently gettimeofday in the
+ * first tick after the change will be slightly wrong.
+ */
+
+/* Reference values, latched on the first cpufreq notification */
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
+
+/*
+ * cpufreq transition notifier: rescale loops_per_jiffy, tsc_khz and the
+ * cyc2ns conversion relative to the latched reference values. The update
+ * is done before a frequency increase and after a frequency decrease.
+ * Always returns 0.
+ */
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct cpufreq_freqs *freq = data;
+	unsigned long *lpj = &boot_cpu_data.loops_per_jiffy;
+	bool raising, lowering;
+
+#ifdef CONFIG_SMP
+	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
+#endif
+
+	if (!ref_freq) {
+		ref_freq = freq->old;
+		loops_per_jiffy_ref = *lpj;
+		tsc_khz_ref = tsc_khz;
+	}
+
+	raising = (val == CPUFREQ_PRECHANGE && freq->old < freq->new);
+	lowering = (val == CPUFREQ_POSTCHANGE && freq->old > freq->new);
+	if (!raising && !lowering)
+		return 0;
+
+	*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+	tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+
+	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+		mark_tsc_unstable("cpufreq changes");
+
+	set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
+
+	return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+	.notifier_call = time_cpufreq_notifier,
+};
+
+/*
+ * Hook time_cpufreq_notifier() into the cpufreq transition chain. This is
+ * skipped when there is no TSC or when the TSC is flagged as constant.
+ */
+static int __init cpufreq_register_tsc_scaling(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_TSC) ||
+	    boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return 0;
+
+	cpufreq_register_notifier(&time_cpufreq_notifier_block,
+				  CPUFREQ_TRANSITION_NOTIFIER);
+	return 0;
+}
+
+core_initcall(cpufreq_register_tsc_scaling);
+
+#endif /* CONFIG_CPU_FREQ */
+
+#define ART_CPUID_LEAF (0x15)
+#define ART_MIN_DENOMINATOR (1)
+
+
+/*
+ * If ART is present, read the numerator:denominator pair used to convert
+ * ART to TSC from CPUID and the offset from MSR_IA32_TSC_ADJUST, then
+ * force X86_FEATURE_ART.
+ */
+static void __init detect_art(void)
+{
+	unsigned int ecx_unused, edx_unused;
+
+	if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
+		return;
+
+	/* Don't enable ART in a VM */
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return;
+
+	/* Non-stop TSC and TSC_ADJUST are required */
+	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
+	    !boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		return;
+
+	/* TSC counter resets must not occur asynchronously */
+	if (tsc_async_resets)
+		return;
+
+	cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+	      &art_to_tsc_numerator, &ecx_unused, &edx_unused);
+
+	if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+		return;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+
+	/* Make this sticky over multiple CPU init calls */
+	setup_force_cpu_cap(X86_FEATURE_ART);
+}
+
+
+/* clocksource code */
+
+/*
+ * ->resume callback of both TSC clocksources defined below: re-runs
+ * tsc_verify_tsc_adjust() with its argument set to true. @cs is unused.
+ */
+static void tsc_resume(struct clocksource *cs)
+{
+ tsc_verify_tsc_adjust(true);
+}
+
+/*
+ * ->read callback of both TSC clocksources defined below: returns the
+ * value of rdtsc_ordered(). @cs is unused.
+ *
+ * We used to compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slightly behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the time keeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ *
+ * This sanity check is now done in the core timekeeping code.
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
+ */
+static u64 read_tsc(struct clocksource *cs)
+{
+ return (u64)rdtsc_ordered();
+}
+
+/*
+ * ->mark_unstable callback of the TSC clocksources. The first call sets
+ * tsc_unstable, clears the stable state of the native sched clock,
+ * disables sched clock irqtime and logs a message; later calls do
+ * nothing.
+ */
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+	if (!tsc_unstable) {
+		tsc_unstable = 1;
+
+		if (using_native_sched_clock())
+			clear_sched_clock_stable();
+
+		disable_sched_clock_irqtime();
+		pr_info("Marking TSC unstable due to clocksource watchdog\n");
+	}
+}
+
+/*
+ * ->tick_stable callback of the TSC clocksources: forwards to
+ * sched_clock_tick_stable() as long as the TSC is not marked unstable and
+ * the native sched clock is in use.
+ */
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+	if (!tsc_unstable && using_native_sched_clock())
+		sched_clock_tick_stable();
+}
+
+/*
+ * Early TSC clocksource, registered by tsc_init() and unregistered again
+ * by init_tsc_clocksource() or tsc_refine_calibration_work().
+ *
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
+static struct clocksource clocksource_tsc_early = {
+ .name = "tsc-early",
+ .rating = 299, /* one below clocksource_tsc */
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_MUST_VERIFY, /* no VALID_FOR_HRES, unlike clocksource_tsc */
+ .archdata = { .vclock_mode = VCLOCK_TSC },
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc_early.list),
+};
+
+/*
+ * Final TSC clocksource.
+ *
+ * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
+ * this one will immediately take over. We will only register if TSC has
+ * been found good.
+ *
+ * init_tsc_clocksource() may still adjust .flags before registration.
+ */
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64), /* see comment above read_tsc() */
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_VALID_FOR_HRES |
+ CLOCK_SOURCE_MUST_VERIFY,
+ .archdata = { .vclock_mode = VCLOCK_TSC },
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc.list),
+};
+
+/*
+ * Mark the TSC unstable for the given @reason. The first call sets
+ * tsc_unstable, clears the stable state of the native sched clock,
+ * disables sched clock irqtime, logs @reason and marks both TSC
+ * clocksources unstable; later calls do nothing.
+ */
+void mark_tsc_unstable(char *reason)
+{
+	if (!tsc_unstable) {
+		tsc_unstable = 1;
+
+		if (using_native_sched_clock())
+			clear_sched_clock_stable();
+
+		disable_sched_clock_irqtime();
+		pr_info("Marking TSC unstable due to %s\n", reason);
+
+		clocksource_mark_unstable(&clocksource_tsc_early);
+		clocksource_mark_unstable(&clocksource_tsc);
+	}
+}
+
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+/*
+ * Set tsc_clocksource_reliable when the TSC is known to be reliable:
+ * either a Geode LX with the RTSC_SUSP bit set in
+ * MSR_GEODE_BUSCONT_CONF0 (only checked on the matching kernel
+ * configurations), or any CPU with X86_FEATURE_TSC_RELIABLE.
+ */
+static void __init check_system_tsc_reliable(void)
+{
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+ if (is_geode_lx()) {
+ /* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+ unsigned long res_low, res_high;
+
+ /* NOTE(review): the rdmsr_safe() return value is not checked */
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
+ if (res_low & RTSC_SUSP)
+ tsc_clocksource_reliable = 1;
+ }
+#endif
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ tsc_clocksource_reliable = 1;
+}
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ *
+ * Returns 1 if the TSC has to be assumed unsynchronized, 0 otherwise.
+ */
+int unsynchronized_tsc(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
+		return 1;
+
+#ifdef CONFIG_SMP
+	if (apic_is_clustered_box())
+		return 1;
+#endif
+
+	/* A constant or explicitly reliable TSC is trusted */
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) || tsc_clocksource_reliable)
+		return 0;
+
+	/*
+	 * Intel systems are normally all synchronized, exceptions must mark
+	 * the TSC as unstable. For every other vendor, assume that a system
+	 * with more than one possible CPU is not synchronized.
+	 */
+	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+	       num_possible_cpus() > 1;
+}
+
+/*
+ * Convert ART to TSC given numerator/denominator found in detect_art()
+ *
+ * The quotient and the remainder of art / denominator are scaled by the
+ * numerator separately, then art_to_tsc_offset is added.
+ */
+struct system_counterval_t convert_art_to_tsc(u64 art)
+{
+	struct system_counterval_t result = {
+		.cs = art_related_clocksource,
+	};
+	u64 quot = art;
+	u64 rem, frac;
+
+	/* do_div() leaves the quotient in 'quot' and returns the remainder */
+	rem = do_div(quot, art_to_tsc_denominator);
+
+	frac = rem * art_to_tsc_numerator;
+	do_div(frac, art_to_tsc_denominator);
+
+	result.cycles = quot * art_to_tsc_numerator + frac + art_to_tsc_offset;
+
+	return result;
+}
+EXPORT_SYMBOL(convert_art_to_tsc);
+
+/**
+ * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
+ * @art_ns: ART (Always Running Timer) in unit of nanoseconds
+ *
+ * PTM requires all timestamps to be in units of nanoseconds. When user
+ * software requests a cross-timestamp, this function converts system
+ * timestamp to TSC.
+ *
+ * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
+ * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
+ * that this flag is set before conversion to TSC is attempted.
+ *
+ * Return:
+ * struct system_counterval_t - system counter value with the pointer to the
+ *	corresponding clocksource
+ *	@cycles:	System counter value
+ *	@cs:		Clocksource corresponding to system counter value. Used
+ *			by timekeeping code to verify comparability of two
+ *			cycle values.
+ */
+
+struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
+{
+	struct system_counterval_t result = {
+		.cs = art_related_clocksource,
+	};
+	u64 quot = art_ns;
+	u64 rem, frac;
+
+	/* do_div() leaves the quotient in 'quot' and returns the remainder */
+	rem = do_div(quot, USEC_PER_SEC);
+
+	frac = rem * tsc_khz;
+	do_div(frac, USEC_PER_SEC);
+
+	result.cycles = quot * tsc_khz + frac;
+
+	return result;
+}
+EXPORT_SYMBOL(convert_art_ns_to_tsc);
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
+ * This function uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc),
+ * or the refined calibration is off by 1% of the fast early
+ * calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+ /* State kept between the first (start) and the second (stop) run */
+ static u64 tsc_start = -1, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+ int cpu;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (tsc_unstable)
+ goto unreg;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ *
+ * NOTE(review): for a u64, -1 equals ULLONG_MAX, which is also
+ * what tsc_read_refs() returns for a disturbed sample, so a
+ * disturbed start sample makes the next run take this path again.
+ */
+ if (tsc_start == -1) {
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (ref_start == ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+ goto out;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+ /* Inform the TSC deadline clockevent devices about the recalibration */
+ lapic_update_tsc_freq();
+
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
+out:
+ /* Reached with either the refined or the unchanged tsc_khz */
+ if (tsc_unstable)
+ goto unreg;
+
+ if (boot_cpu_has(X86_FEATURE_ART))
+ art_related_clocksource = &clocksource_tsc;
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
+ /* The early clocksource is dropped on every path that ends here */
+ clocksource_unregister(&clocksource_tsc_early);
+}
+
+
+/*
+ * Register the final "tsc" clocksource, either right away when the TSC
+ * frequency is known or via the refined calibration work otherwise.
+ * An unstable TSC only gets its early clocksource unregistered.
+ * Always returns 0.
+ */
+static int __init init_tsc_clocksource(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
+		return 0;
+
+	if (tsc_unstable) {
+		clocksource_unregister(&clocksource_tsc_early);
+		return 0;
+	}
+
+	if (tsc_clocksource_reliable)
+		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+
+	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
+		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
+
+	/* Unknown frequency: let the delayed work refine and register */
+	if (!boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+		schedule_delayed_work(&tsc_irqwork, 0);
+		return 0;
+	}
+
+	/*
+	 * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+	 * the refined calibration and directly register it as a clocksource.
+	 */
+	if (boot_cpu_has(X86_FEATURE_ART))
+		art_related_clocksource = &clocksource_tsc;
+	clocksource_register_khz(&clocksource_tsc, tsc_khz);
+	clocksource_unregister(&clocksource_tsc_early);
+
+	return 0;
+}
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
+
+/*
+ * Establish cpu_khz and tsc_khz and report them in the kernel log.
+ *
+ * @early: true  - use the x86_platform calibrate_cpu()/calibrate_tsc()
+ *		   hooks
+ *	   false - only re-run the PIT/HPET/PM timer based CPU calibration
+ *
+ * Returns true if a non-zero tsc_khz was established, false otherwise.
+ */
+static bool __init determine_cpu_tsc_frequencies(bool early)
+{
+	/* Make sure that cpu and tsc are not already calibrated */
+	WARN_ON(cpu_khz || tsc_khz);
+
+	if (early) {
+		cpu_khz = x86_platform.calibrate_cpu();
+		tsc_khz = x86_platform.calibrate_tsc();
+	} else {
+		/* We should not be here with non-native cpu calibration */
+		WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
+		cpu_khz = pit_hpet_ptimer_calibrate_cpu();
+	}
+
+	/*
+	 * Trust non-zero tsc_khz as authoritative,
+	 * and use it to sanity check cpu_khz,
+	 * which will be off if system timer is off.
+	 */
+	if (tsc_khz == 0)
+		tsc_khz = cpu_khz;
+	else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+		cpu_khz = tsc_khz;
+
+	if (tsc_khz == 0)
+		return false;
+
+	pr_info("Detected %lu.%03lu MHz processor\n",
+		(unsigned long)cpu_khz / KHZ,
+		(unsigned long)cpu_khz % KHZ);
+
+	if (cpu_khz != tsc_khz) {
+		/* Terminate the line, like every other message in here */
+		pr_info("Detected %lu.%03lu MHz TSC\n",
+			(unsigned long)tsc_khz / KHZ,
+			(unsigned long)tsc_khz % KHZ);
+	}
+	return true;
+}
+
+/* Returns tsc_khz * KHZ / HZ, computed in 64 bit. */
+static unsigned long __init get_loops_per_jiffy(void)
+{
+	u64 cycles = (u64)tsc_khz * KHZ;
+
+	do_div(cycles, HZ);
+
+	return cycles;
+}
+
+/*
+ * Switch the sched clock over to the TSC: sanitize TSC_ADJUST,
+ * initialize cyc2ns for the boot CPU and enable the __use_tsc static
+ * branch, in that order.
+ */
+static void __init tsc_enable_sched_clock(void)
+{
+ /* Sanitize TSC ADJUST before cyc2ns gets initialized */
+ tsc_store_and_check_tsc_adjust(true);
+ cyc2ns_init_boot_cpu();
+ static_branch_enable(&__use_tsc);
+}
+
+/*
+ * Early TSC setup: determine the CPU/TSC frequencies with the early
+ * calibration methods, derive loops_per_jiffy from the result and enable
+ * the TSC based sched clock. Nothing is done without a TSC, on early UV
+ * systems or when the calibration fails.
+ */
+void __init tsc_early_init(void)
+{
+	/*
+	 * The UV check is there to not change UV TSC multi-chassis
+	 * synchronization.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_TSC) ||
+	    is_early_uv_system() ||
+	    !determine_cpu_tsc_frequencies(true))
+		return;
+
+	loops_per_jiffy = get_loops_per_jiffy();
+
+	tsc_enable_sched_clock();
+}
+
+/*
+ * Main TSC setup: retry the frequency calibration if the early attempt
+ * left tsc_khz at zero, set up cyc2ns for the secondary CPUs, switch the
+ * delay loop to the TSC and register the "tsc-early" clocksource unless
+ * the TSC has to be considered unsynchronized.
+ */
+void __init tsc_init(void)
+{
+ /*
+ * native_calibrate_cpu_early can only calibrate using methods that are
+ * available early in boot.
+ */
+ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
+ x86_platform.calibrate_cpu = native_calibrate_cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+
+ if (!tsc_khz) {
+ /* We failed to determine frequencies earlier, try again */
+ if (!determine_cpu_tsc_frequencies(false)) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+ /* Not yet done by tsc_early_init() in this case */
+ tsc_enable_sched_clock();
+ }
+
+ cyc2ns_init_secondary_cpus();
+
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
+
+ lpj_fine = get_loops_per_jiffy();
+ use_tsc_delay();
+
+ /* May set tsc_clocksource_reliable, which unsynchronized_tsc() reads */
+ check_system_tsc_reliable();
+
+ if (unsynchronized_tsc()) {
+ mark_tsc_unstable("TSCs unsynchronized");
+ return;
+ }
+
+ clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
+ detect_art();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ *
+ * Returns the loops_per_jiffy value of such a sibling, or 0 if the
+ * calibration cannot be skipped.
+ */
+unsigned long calibrate_delay_is_known(void)
+{
+	int cpu = smp_processor_id();
+	int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
+	const struct cpumask *mask = topology_core_cpumask(cpu);
+	int sibling;
+
+	if (!constant_tsc || !mask)
+		return 0;
+
+	/* Any CPU in the mask other than the current one */
+	sibling = cpumask_any_but(mask, cpu);
+
+	return sibling < nr_cpu_ids ? cpu_data(sibling).loops_per_jiffy : 0;
+}
+#endif
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 0000000..27ef714
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TSC frequency enumeration via MSR
+ *
+ * Copyright (C) 2013, 2018 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ */
+
+#include <linux/kernel.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include <asm/param.h>
+#include <asm/tsc.h>
+
+#define MAX_NUM_FREQS 9
+
+/*
+ * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant to this,
+ * so we need to manually differentiate SoC families. This is what the
+ * field msr_plat does.
+ *
+ * freqs[] is indexed by the frequency ID read from MSR_FSB_FREQ and holds
+ * the matching reference clock frequency in KHz (0 entries presumably mark
+ * IDs without a known frequency).
+ */
+struct freq_desc {
+ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
+ u32 freqs[MAX_NUM_FREQS];
+};
+
+/*
+ * Penwell and Clovertrail use spread spectrum clock,
+ * so the freq number is not exactly the same as reported
+ * by MSR based on SDM.
+ */
+static const struct freq_desc freq_desc_pnw = {
+	.msr_plat = 0,
+	.freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 },
+};
+
+static const struct freq_desc freq_desc_clv = {
+	.msr_plat = 0,
+	.freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 },
+};
+
+static const struct freq_desc freq_desc_byt = {
+	.msr_plat = 1,
+	.freqs = { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 },
+};
+
+static const struct freq_desc freq_desc_cht = {
+	.msr_plat = 1,
+	.freqs = { 83300, 100000, 133300, 116700, 80000, 93300, 90000,
+		   88900, 87500 },
+};
+
+static const struct freq_desc freq_desc_tng = {
+	.msr_plat = 1,
+	.freqs = { 0, 100000, 133300, 0, 0, 0, 0, 0 },
+};
+
+static const struct freq_desc freq_desc_ann = {
+	.msr_plat = 1,
+	.freqs = { 83300, 100000, 133300, 100000, 0, 0, 0, 0 },
+};
+
+/*
+ * CPU models handled by cpu_khz_from_msr(), each paired with the frequency
+ * descriptor to use for it. The table is passed to x86_match_cpu() and the
+ * matching entry's driver_data is used as a struct freq_desc pointer
+ * (presumably stored there by INTEL_CPU_FAM6() - confirm against the macro).
+ * The empty entry terminates the table.
+ */
+static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
+ INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw),
+ INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt),
+ INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht),
+ INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng),
+ INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann),
+ {}
+};
+
+/*
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
+ *
+ * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
+ * Return processor base frequency in KHz, or 0 on failure (the CPU model is
+ * not listed in tsc_msr_cpu_ids, or MSR_FSB_FREQ reports a frequency ID for
+ * which no frequency is known).
+ */
+unsigned long cpu_khz_from_msr(void)
+{
+	u32 lo, hi, ratio, freq, index;
+	const struct freq_desc *freq_desc;
+	const struct x86_cpu_id *id;
+	unsigned long res;
+
+	id = x86_match_cpu(tsc_msr_cpu_ids);
+	if (!id)
+		return 0;
+
+	freq_desc = (const struct freq_desc *)id->driver_data;
+	if (freq_desc->msr_plat) {
+		rdmsr(MSR_PLATFORM_INFO, lo, hi);
+		ratio = (lo >> 8) & 0xff;
+	} else {
+		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+		ratio = (hi >> 8) & 0x1f;
+	}
+
+	/* Get FSB FREQ ID */
+	rdmsr(MSR_FSB_FREQ, lo, hi);
+
+	/*
+	 * Cherry Trail encodes the frequency ID in the 4 lowest bits of
+	 * MSR_FSB_FREQ (it has 9 possible frequencies), the other SoCs only
+	 * use the 3 lowest bits. Masking Cherry Trail with 0x7 would make
+	 * its 9th frequency (ID 8) be taken for ID 0.
+	 */
+	index = lo & (freq_desc == &freq_desc_cht ? 0x0f : 0x07);
+
+	/*
+	 * Map CPU reference clock freq ID to CPU reference clock freq(KHz).
+	 * IDs beyond the table are treated like IDs with no known frequency.
+	 */
+	freq = index < MAX_NUM_FREQS ? freq_desc->freqs[index] : 0;
+	if (!freq)
+		pr_err("Error MSR_FSB_FREQ index %u is unknown\n", index);
+
+	/* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+	res = freq * ratio;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	lapic_timer_frequency = (freq * 1000) / HZ;
+#endif
+
+	/*
+	 * TSC frequency determined by MSR is always considered "known"
+	 * because it is reported by HW.
+	 * Another fact is that on MSR capable platforms, PIT/HPET is
+	 * generally not available so calibration won't work at all.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+	/*
+	 * Unfortunately there is no way for hardware to tell whether the
+	 * TSC is reliable. We were told by silicon design team that TSC
+	 * on Atom SoCs are always "reliable". TSC is also the only
+	 * reliable clocksource on these SoCs (HPET is either not present
+	 * or not functional) so mark TSC reliable which removes the
+	 * requirement for a watchdog clocksource.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+	return res;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644
index 0000000..ec534f9
--- /dev/null
+++ b/arch/x86/kernel/tsc_sync.c
@@ -0,0 +1,494 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ * protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/topology.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+struct tsc_adjust {
+ s64 bootval; /* TSC_ADJUST MSR value read when the CPU was brought up */
+ s64 adjusted; /* value the MSR is expected to hold; written back on mismatch */
+ unsigned long nextcheck; /* jiffies value before which the MSR is not re-read */
+ bool warned; /* a mismatch has already been reported for this CPU */
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+
+/*
+ * TSC's on different sockets may be reset asynchronously.
+ * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
+ */
+bool __read_mostly tsc_async_resets;
+
+void mark_tsc_async_resets(char *reason)
+{
+	/* Set the flag and log the reason only on the first call. */
+	if (!tsc_async_resets) {
+		tsc_async_resets = true;
+		pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
+	}
+}
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+	struct tsc_adjust *state = this_cpu_ptr(&tsc_adjust);
+	s64 msr_val;
+
+	/*
+	 * Nothing to verify without the MSR, and no point in emitting
+	 * error messages once the TSC is already unstable.
+	 */
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST) || check_tsc_unstable())
+		return;
+
+	/* Unless resuming, rate limit the MSR read via nextcheck. */
+	if (!resume && time_before(jiffies, state->nextcheck))
+		return;
+	state->nextcheck = jiffies + HZ;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, msr_val);
+	if (msr_val == state->adjusted)
+		return;
+
+	/* The MSR no longer holds our value: put it back. */
+	wrmsrl(MSR_IA32_TSC_ADJUST, state->adjusted);
+
+	/* Report the first mismatch only, but always report on resume. */
+	if (resume || !state->warned) {
+		pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+			smp_processor_id(), state->adjusted, msr_val);
+		state->warned = true;
+	}
+}
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+				   unsigned int cpu, bool bootcpu)
+{
+	/*
+	 * The first online CPU of a package records its boot value as the
+	 * adjustment value. The sync mechanism may change it later; if that
+	 * fails, inconsistent boot values can still be reported.
+	 *
+	 * Only the boot CPU gets a non-zero ADJUST value forced to 0. Non
+	 * boot CPUs are left alone because physical hotplug should have set
+	 * the ADJUST register to a value > 0 to bring the TSC in sync with
+	 * the CPUs which are already running.
+	 *
+	 * The value is not forced to 0 either when the system arch declared
+	 * that sockets may be reset asynchronously with each other, since
+	 * socket 0 may then legitimately have a non-zero TSC ADJUST value.
+	 */
+	if (bootcpu && bootval) {
+		if (unlikely(tsc_async_resets)) {
+			pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
+				cpu, bootval);
+		} else {
+			pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
+				cpu, bootval);
+			wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+			bootval = 0;
+		}
+	}
+
+	cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+	struct tsc_adjust *state = this_cpu_ptr(&tsc_adjust);
+	s64 msr_val;
+
+	/*
+	 * Bail out without the MSR, or when the TSC is already unstable
+	 * (which avoids unnecessary error messages).
+	 */
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST) || check_tsc_unstable())
+		return false;
+
+	/* Record the boot value and arm the rate limit for later checks. */
+	rdmsrl(MSR_IA32_TSC_ADJUST, msr_val);
+	state->bootval = msr_val;
+	state->nextcheck = jiffies + HZ;
+
+	tsc_sanitize_first_cpu(state, msr_val, smp_processor_id(), bootcpu);
+
+	/* The UP variant returns false on every path. */
+	return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+	struct tsc_adjust *self = this_cpu_ptr(&tsc_adjust);
+	unsigned int this_cpu = smp_processor_id();
+	struct tsc_adjust *peer_adj;
+	struct cpumask *siblings;
+	unsigned int peer;
+	s64 bootval;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		return false;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+	self->bootval = bootval;
+	self->nextcheck = jiffies + HZ;
+	self->warned = false;
+
+	/*
+	 * When a non-zero value may be valid for socket 0, the default
+	 * adjusted value cannot be assumed to be zero either.
+	 */
+	if (tsc_async_resets)
+		self->adjusted = bootval;
+
+	/*
+	 * Look for another CPU of this package. If there is none, this CPU
+	 * is the first of its package to come up and its boot value is not
+	 * compared against another package: the package might have been
+	 * physically hotplugged, where TSC_ADJUST is expected to differ.
+	 * On the boot CPU topology_core_cpumask() might not be available
+	 * yet, which is handled like "no other CPU".
+	 */
+	siblings = topology_core_cpumask(this_cpu);
+	if (siblings)
+		peer = cpumask_any_but(siblings, this_cpu);
+	else
+		peer = nr_cpu_ids;
+
+	if (peer >= nr_cpu_ids) {
+		tsc_sanitize_first_cpu(self, bootval, smp_processor_id(),
+				       bootcpu);
+		return false;
+	}
+
+	peer_adj = per_cpu_ptr(&tsc_adjust, peer);
+
+	/* Complain when the boot values differ within the package. */
+	if (peer_adj->bootval != bootval)
+		printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
+
+	/*
+	 * All TSC_ADJUST values of a package must be the same. Take over
+	 * the adjustment value of the already online CPU if the boot value
+	 * of this newly upcoming CPU differs from it.
+	 */
+	if (peer_adj->adjusted != bootval) {
+		self->adjusted = peer_adj->adjusted;
+		wrmsrl(MSR_IA32_TSC_ADJUST, peer_adj->adjusted);
+	}
+
+	/*
+	 * The TSCs of this package are now forced to be in sync, so the
+	 * caller can skip the sync test.
+	 */
+	return true;
+}
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static atomic_t start_count; /* CPUs which arrived at the start of a run */
+static atomic_t stop_count; /* CPUs which finished the measurement loop */
+static atomic_t skip_test; /* raised by the target when it skips the test */
+static atomic_t test_runs; /* remaining test runs, set by the source CPU */
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+static cycles_t last_tsc; /* most recent TSC value read under sync_lock */
+static cycles_t max_warp; /* largest backwards jump observed so far */
+static int nr_warps; /* number of observed warps */
+static int random_warps; /* warps counted as bouncing between the CPUs */
+
+/*
+ * TSC-warp measurement loop running on both CPUs. This is not called
+ * if there is no TSC.
+ *
+ * @timeout: duration of the measurement in msecs (scaled by tsc_khz).
+ *
+ * Returns the value of the global max_warp as of the last warp observed
+ * by this CPU, or 0 if this CPU did not observe any warp.
+ */
+static cycles_t check_tsc_warp(unsigned int timeout)
+{
+ cycles_t start, now, prev, end, cur_max_warp = 0;
+ int i, cur_warps = 0;
+
+ start = rdtsc_ordered();
+ /*
+ * The measurement runs for 'timeout' msecs:
+ */
+ end = start + (cycles_t) tsc_khz * timeout;
+ now = start;
+
+ for (i = 0; ; i++) {
+ /*
+ * We take the global lock, measure TSC, save the
+ * previous TSC that was measured (possibly on
+ * another CPU) and update the previous TSC timestamp.
+ */
+ arch_spin_lock(&sync_lock);
+ prev = last_tsc;
+ now = rdtsc_ordered();
+ last_tsc = now;
+ arch_spin_unlock(&sync_lock);
+
+ /*
+ * Be nice every now and then (and also check whether
+ * measurement is done [we also insert a 10 million
+ * loops safety exit, so we don't lock up in case the
+ * TSC readout is totally broken]):
+ */
+ if (unlikely(!(i & 7))) {
+ if (now > end || i > 10000000)
+ break;
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+ /*
+ * Outside the critical section we can now see whether
+ * we saw a time-warp of the TSC going backwards:
+ */
+ if (unlikely(prev > now)) {
+ arch_spin_lock(&sync_lock);
+ max_warp = max(max_warp, prev - now);
+ cur_max_warp = max_warp;
+ /*
+ * Check whether this bounces back and forth. Only
+ * one CPU should observe time going backwards.
+ */
+ if (cur_warps != nr_warps)
+ random_warps++;
+ nr_warps++;
+ cur_warps = nr_warps;
+ arch_spin_unlock(&sync_lock);
+ }
+ }
+ WARN(!(now-start),
+ "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
+ now-start, end-start);
+ return cur_max_warp;
+}
+
+/*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the bios, TSC sync test for smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket don't get reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+	/* More than one CPU in the core mask: the short timeout suffices. */
+	if (cpumask_weight(topology_core_cpumask(cpu)) > 1)
+		return 2;
+
+	return 20;
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ *
+ * @cpu: the target CPU; used to select the measurement timeout and in
+ * the printed messages.
+ */
+void check_tsc_sync_source(int cpu)
+{
+ int cpus = 2;
+
+ /*
+ * No need to check if we already know that the TSC is not
+ * synchronized or if we have no TSC.
+ */
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Set the maximum number of test runs to
+ * 1 if the CPU does not provide the TSC_ADJUST MSR
+ * 3 if the MSR is available, so the target can try to adjust
+ */
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ atomic_set(&test_runs, 1);
+ else
+ atomic_set(&test_runs, 3);
+retry:
+ /*
+ * Wait for the target to start or to skip the test:
+ */
+ while (atomic_read(&start_count) != cpus - 1) {
+ if (atomic_read(&skip_test) > 0) {
+ atomic_set(&skip_test, 0);
+ return;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * Trigger the target to continue into the measurement too:
+ */
+ atomic_inc(&start_count);
+
+ check_tsc_warp(loop_timeout(cpu));
+
+ while (atomic_read(&stop_count) != cpus-1)
+ cpu_relax();
+
+ /*
+ * If the test was successful set the number of runs to zero and
+ * stop. If not, decrement the number of runs and check if we can
+ * retry. In case of random warps no retry is attempted.
+ */
+ if (!nr_warps) {
+ atomic_set(&test_runs, 0);
+
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+ smp_processor_id(), cpu);
+
+ } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ /* Force it to 0 if random warps brought us here */
+ atomic_set(&test_runs, 0);
+
+ pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ smp_processor_id(), cpu);
+ pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
+ if (random_warps)
+ pr_warning("TSC warped randomly between CPUs\n");
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ }
+
+ /*
+ * Reset it - just in case we boot another CPU later:
+ */
+ atomic_set(&start_count, 0);
+ random_warps = 0;
+ nr_warps = 0;
+ max_warp = 0;
+ last_tsc = 0;
+
+ /*
+ * Let the target continue with the bootup:
+ */
+ atomic_inc(&stop_count);
+
+ /*
+ * Retry, if there is a chance to do so.
+ */
+ if (atomic_read(&test_runs) > 0)
+ goto retry;
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ *
+ * Counterpart of check_tsc_sync_source(). If test runs remain after a
+ * measurement, the observed warp is added to this CPU's TSC_ADJUST value
+ * and the measurement is repeated.
+ */
+void check_tsc_sync_target(void)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int cpu = smp_processor_id();
+ cycles_t cur_max_warp, gbl_max_warp;
+ int cpus = 2;
+
+ /* Also aborts if there is no TSC. */
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Store, verify and sanitize the TSC adjust register. If
+ * successful skip the test.
+ *
+ * The test is also skipped when the TSC is marked reliable. This
+ * is true for SoCs which have no fallback clocksource. On these
+ * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
+ * register might have been wrecked by the BIOS..
+ */
+ if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
+ atomic_inc(&skip_test);
+ return;
+ }
+
+retry:
+ /*
+ * Register this CPU's participation and wait for the
+ * source CPU to start the measurement:
+ */
+ atomic_inc(&start_count);
+ while (atomic_read(&start_count) != cpus)
+ cpu_relax();
+
+ cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+ /*
+ * Store the maximum observed warp value for a potential retry:
+ */
+ gbl_max_warp = max_warp;
+
+ /*
+ * Ok, we are done:
+ */
+ atomic_inc(&stop_count);
+
+ /*
+ * Wait for the source CPU to print stuff:
+ */
+ while (atomic_read(&stop_count) != cpus)
+ cpu_relax();
+
+ /*
+ * Reset it for the next sync test:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Check the number of remaining test runs. If not zero, the test
+ * failed and a retry with adjusted TSC is possible. If zero the
+ * test was either successful or failed terminally.
+ */
+ if (!atomic_read(&test_runs))
+ return;
+
+ /*
+ * If the warp value of this CPU is 0, then the other CPU
+ * observed time going backwards so this TSC was ahead and
+ * needs to move backwards.
+ */
+ if (!cur_max_warp)
+ cur_max_warp = -gbl_max_warp;
+
+ /*
+ * Add the result to the previous adjustment value.
+ *
+ * The adjustment value is slightly off by the overhead of the
+ * sync mechanism (observed values are ~200 TSC cycles), but this
+ * really depends on CPU, node distance and frequency. So
+ * compensating for this is hard to get right. Experiments show
+ * that the warp is no longer detectable when the observed warp
+ * value is used. In the worst case the adjustment needs to go
+ * through a 3rd run for fine tuning.
+ */
+ cur->adjusted += cur_max_warp;
+
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
+
+ wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ goto retry;
+
+}
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 0000000..ff20b35
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,429 @@
+/*
+ * umip.c Emulation for instruction protected by the Intel User-Mode
+ * Instruction Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * The feature User-Mode Instruction Prevention present in recent Intel
+ * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str)
+ * from being executed with CPL > 0. Otherwise, a general protection fault is
+ * issued.
+ *
+ * Rather than relaying to the user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), the fault
+ * can be trapped and the result of such instructions emulated with dummy values.
+ * This allows to both conserve the current kernel behavior and not reveal the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (sgdt and sidt) and those which return a
+ * value (sldt, str and smsw).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that lies close to the top of the kernel memory. The limit for the GDT
+ * and the IDT are set to zero.
+ *
+ * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * or DOSEMU2, they are not emulated.
+ *
+ * The instruction smsw is emulated to return the value that the register CR0
+ * has at boot time as set in the head_32.
+ *
+ * Also, emulation is provided only for 32-bit processes; 64-bit processes
+ * that attempt to use the instructions that UMIP protects will receive the
+ * SIGSEGV signal issued as a consequence of the general protection fault.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
+
+#define UMIP_DUMMY_GDT_BASE 0xfffe0000
+#define UMIP_DUMMY_IDT_BASE 0xffff0000
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes, the
+ * only processes for which emulation is provided, X has a value of 4.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 2 /* 0F 01 /4 */
+#define UMIP_INST_SLDT 3 /* 0F 00 /0 */
+#define UMIP_INST_STR 4 /* 0F 00 /1 */
+
+/* Instruction mnemonics for messages, indexed by the UMIP_INST_* constants. */
+const char * const umip_insns[5] = {
+ [UMIP_INST_SGDT] = "SGDT",
+ [UMIP_INST_SIDT] = "SIDT",
+ [UMIP_INST_SMSW] = "SMSW",
+ [UMIP_INST_SLDT] = "SLDT",
+ [UMIP_INST_STR] = "STR",
+};
+
+/* Wrappers which call umip_printk() with the KERN_ERR / KERN_WARNING level. */
+#define umip_pr_err(regs, fmt, ...) \
+ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
+#define umip_pr_warning(regs, fmt, ...) \
+ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
+
+/**
+ * umip_printk() - Print a rate-limited message
+ * @regs: Register set with the context in which the warning is printed
+ * @log_level: Kernel log level to print the message
+ * @fmt: The text string to print
+ *
+ * Print the text contained in @fmt. The print rate is limited to bursts of 5
+ * messages every two minutes. The purpose of this customized version of
+ * printk() is to print messages when user space processes use any of the
+ * UMIP-protected instructions. Thus, the printed text is prepended with the
+ * task name and process ID number of the current task as well as the
+ * instruction and stack pointers in @regs as seen when entering kernel mode.
+ *
+ * Returns:
+ *
+ * None.
+ */
+static __printf(3, 4)
+void umip_printk(const struct pt_regs *regs, const char *log_level,
+		 const char *fmt, ...)
+{
+	/* Bursts of 5 messages every two minutes */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5);
+	struct va_format vaf;
+	va_list ap;
+
+	/* Print only while the rate limit allows it. */
+	if (__ratelimit(&ratelimit)) {
+		va_start(ap, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &ap;
+		/* Prefix: task name, pid and the ip/sp found in @regs. */
+		printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level,
+		       current->comm, task_pid_nr(current), regs->ip,
+		       regs->sp, &vaf);
+		va_end(ap);
+	}
+}
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when not an UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+	int reg;
+
+	/* By getting modrm we also get the opcode. */
+	insn_get_modrm(insn);
+
+	/*
+	 * A ModRM byte is required, and all the instructions of interest
+	 * start with 0x0f.
+	 */
+	if (!insn->modrm.nbytes || insn->opcode.bytes[0] != 0xf)
+		return -EINVAL;
+
+	reg = X86_MODRM_REG(insn->modrm.value);
+
+	switch (insn->opcode.bytes[1]) {
+	case 0x1:
+		/* 0F 01 /0, /1 and /4 */
+		if (reg == 0)
+			return UMIP_INST_SGDT;
+		if (reg == 1)
+			return UMIP_INST_SIDT;
+		if (reg == 4)
+			return UMIP_INST_SMSW;
+		break;
+	case 0x0:
+		/* 0F 00 /0 and /1 */
+		if (reg == 0)
+			return UMIP_INST_SLDT;
+		if (reg == 1)
+			return UMIP_INST_STR;
+		break;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. Caller is responsible
+ * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+			     unsigned char *data, int *data_size)
+{
+	unsigned long dummy_base_addr, dummy_value;
+	unsigned short dummy_limit = 0;
+	bool reg_operand;
+
+	if (!data || !data_size || !insn)
+		return -EINVAL;
+
+	/* ModRM.mod == 3 selects a register operand. */
+	reg_operand = X86_MODRM_MOD(insn->modrm.value) == 3;
+
+	switch (umip_inst) {
+	case UMIP_INST_SGDT:
+	case UMIP_INST_SIDT:
+		/*
+		 * SGDT and SIDT return the base address and limit of the
+		 * global and interrupt descriptor table, respectively.
+		 * According to the Intel Software Development manual, the
+		 * base address can be 24-bit, 32-bit or 64-bit while the
+		 * limit is always 16-bit. With a 16-bit operand size the
+		 * base address is supposed to be a zero-extended 24-bit
+		 * number. However, it seems that a 32-bit number is always
+		 * returned irrespective of the operand size.
+		 */
+
+		/* SGDT and SIDT do not use register operands. */
+		if (reg_operand)
+			return -EINVAL;
+
+		dummy_base_addr = (umip_inst == UMIP_INST_SGDT) ?
+				  UMIP_DUMMY_GDT_BASE : UMIP_DUMMY_IDT_BASE;
+
+		*data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+
+		/* Result layout: 2 bytes of limit followed by the base. */
+		memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+		memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+		break;
+
+	case UMIP_INST_SMSW:
+		dummy_value = CR0_STATE;
+
+		/*
+		 * Even though the CR0 register has 4 bytes, the number of
+		 * bytes copied to the result buffer depends on the operand:
+		 * as many bytes as the operand size for a register, only
+		 * the two least significant bytes of CR0 for memory.
+		 */
+		*data_size = reg_operand ? insn->opnd_bytes : 2;
+
+		memcpy(data, &dummy_value, *data_size);
+		break;
+
+	default:
+		/* STR and SLDT are not emulated */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	siginfo_t si;
+
+	/* Record the failure as a user-mode write page fault at @addr. */
+	task->thread.trap_nr = X86_TRAP_PF;
+	task->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+	task->thread.cr2 = (unsigned long)addr;
+
+	clear_siginfo(&si);
+	si.si_signo = SIGSEGV;
+	si.si_errno = 0;
+	si.si_code = SEGV_MAPERR;
+	si.si_addr = addr;
+	force_sig_info(SIGSEGV, &si, task);
+
+	/* Log only if unhandled signals are to be shown and this is one. */
+	if (show_unhandled_signals && unhandled_signal(task, SIGSEGV))
+		umip_pr_err(regs, "segfault in emulation. error%x\n",
+			    X86_PF_USER | X86_PF_WRITE);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). If the offending
+ * user-space process is not in long mode, this function fixes the exception
+ * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
+ * fixed up. Also long mode user-space processes are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed by eIP using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
+ unsigned long seg_base = 0, *reg_addr;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ void __user *uaddr;
+ struct insn insn;
+ int seg_defs;
+
+ if (!regs)
+ return false;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs))
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+
+ if (seg_base == -1L)
+ return false;
+
+ /* Fetch up to MAX_INSN_SIZE instruction bytes; a partial copy is fine. */
+ not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+ sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+
+ /*
+ * The copy_from_user above could have failed if user code is protected
+ * by a memory protection key. Give up on emulation in such a case.
+ * Should we issue a page fault?
+ */
+ if (!nr_copied)
+ return false;
+
+ insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
+
+ /*
+ * Override the default operand and address sizes with what is specified
+ * in the code segment descriptor. The instruction decoder only sets
+ * the address size to either 4 or 8 address bytes and does nothing
+ * for the operand bytes. This is OK for most of the cases, but we could
+ * have special cases where, for instance, a 16-bit code segment
+ * descriptor is used.
+ * If there is an address override prefix, the instruction decoder
+ * correctly updates these values, even for 16-bit defaults.
+ */
+ seg_defs = insn_get_code_seg_params(regs);
+ if (seg_defs == -EINVAL)
+ return false;
+
+ insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+ insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+ /* Give up if fewer bytes were fetched than the instruction is long. */
+ insn_get_length(&insn);
+ if (nr_copied < insn.length)
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ umip_pr_warning(regs, "%s instruction cannot be used by applications.\n",
+ umip_insns[umip_inst]);
+
+ /* Do not emulate SLDT, STR or user long mode processes. */
+ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs))
+ return false;
+
+ umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n");
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ /*
+ * nr_copied is reused here for the return value of
+ * copy_to_user(); a value > 0 is treated as a failed copy.
+ */
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 0000000..3dc26f9
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,416 @@
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/interrupt.h>
+#include <asm/sections.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+/*
+ * Report the text address recorded for the frame the unwinder is currently on.
+ *
+ * Returns 0 once the unwind has finished, or when __kernel_text_address()
+ * rejects the saved ip.
+ */
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	if (!unwind_done(state) && __kernel_text_address(state->ip))
+		return state->ip;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+/*
+ * Locate the stack slot that holds the current frame's return address: the
+ * ip field of the saved regs when the frame is a regs frame, otherwise the
+ * word right after the frame pointer. NULL once the unwind is done.
+ */
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return NULL;
+
+	if (state->regs)
+		return &state->regs->ip;
+
+	return state->bp + 1;
+}
+
+/*
+ * Print a one-time raw dump of the stack words reachable from state->orig_sp,
+ * following get_stack_info()'s next_sp links from stack to stack. Called from
+ * the bad-frame warning paths of unwind_next_frame().
+ */
+static void unwind_dump(struct unwind_state *state)
+{
+ /* Latched on the first call so that every later call returns at once. */
+ static bool dumped_before = false;
+ bool prev_zero, zero = false;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ /*
+ * Outer loop: one iteration per stack. It uses a private stack_info and
+ * visit_mask so the caller's unwind state is left untouched.
+ */
+ for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
+
+ /* Inner loop: one iteration per word up to the end of this stack. */
+ for (; sp < stack_info.end; sp++) {
+
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ /* A run of zero words is collapsed into a single "..." line. */
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %0*x ...\n",
+ sp, BITS_PER_LONG/4, 0);
+ continue;
+ }
+
+ printk_deferred("%p: %0*lx (%pB)\n",
+ sp, BITS_PER_LONG/4, word, (void *)word);
+ }
+ }
+}
+
+/*
+ * Size in bytes of the pt_regs area that @regs actually occupies on the stack.
+ */
+static size_t regs_size(struct pt_regs *regs)
+{
+	size_t size = sizeof(*regs);
+
+	/* On x86_32, regs saved from kernel mode are two words shorter: */
+	if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
+		size -= 2*sizeof(long);
+
+	return size;
+}
+
+/*
+ * True when @ip lies inside either the entry text range or the irqentry text
+ * range (both delimited by the __*_text_start/__*_text_end symbols).
+ */
+static bool in_entry_code(unsigned long ip)
+{
+	char *addr = (char *)ip;
+	bool in_entry = addr >= __entry_text_start &&
+			addr < __entry_text_end;
+	bool in_irqentry = addr >= __irqentry_text_start &&
+			   addr < __irqentry_text_end;
+
+	return in_entry || in_irqentry;
+}
+
+/*
+ * Address at which the final frame pointer of a task is expected: two words
+ * below the task's pt_regs.
+ */
+static inline unsigned long *last_frame(struct unwind_state *state)
+{
+	unsigned long *regs_base = (unsigned long *)task_pt_regs(state->task);
+
+	return regs_base - 2;
+}
+
+/* True when the current frame pointer sits exactly at last_frame(). */
+static bool is_last_frame(struct unwind_state *state)
+{
+	unsigned long *expected = last_frame(state);
+
+	return state->bp == expected;
+}
+
+/*
+ * Number of words by which the last frame is displaced when the compiler
+ * realigned the stack (see is_last_aligned_frame()).
+ */
+#ifdef CONFIG_X86_32
+#define GCC_REALIGN_WORDS 3
+#else
+#define GCC_REALIGN_WORDS 1
+#endif
+
+/* Position of the last frame when it was shifted by a stack realignment. */
+static inline unsigned long *last_aligned_frame(struct unwind_state *state)
+{
+	unsigned long *unaligned = last_frame(state);
+
+	return unaligned - GCC_REALIGN_WORDS;
+}
+
+/*
+ * Detect a last frame that was displaced by a stack realignment in the
+ * prologue of a function called by head/entry code.
+ */
+static bool is_last_aligned_frame(struct unwind_state *state)
+{
+	unsigned long *last_bp = last_frame(state);
+	unsigned long *aligned_bp = last_aligned_frame(state);
+
+	/*
+	 * GCC can occasionally decide to realign the stack pointer and change
+	 * the offset of the stack frame in the prologue of a function called
+	 * by head/entry code. Examples:
+	 *
+	 * <start_secondary>:
+	 *      push   %edi
+	 *      lea    0x8(%esp),%edi
+	 *      and    $0xfffffff8,%esp
+	 *      pushl  -0x4(%edi)
+	 *      push   %ebp
+	 *      mov    %esp,%ebp
+	 *
+	 * <x86_64_start_kernel>:
+	 *      lea    0x8(%rsp),%r10
+	 *      and    $0xfffffffffffffff0,%rsp
+	 *      pushq  -0x8(%r10)
+	 *      push   %rbp
+	 *      mov    %rsp,%rbp
+	 *
+	 * After aligning the stack, it pushes a duplicate copy of the return
+	 * address before pushing the frame pointer.
+	 */
+	if (state->bp != aligned_bp)
+		return false;
+
+	/* The duplicated return address must match the original one. */
+	return aligned_bp[1] == last_bp[1];
+}
+
+/*
+ * Detect the last frame as it looks when unwinding from an ftrace handler of
+ * a function called by entry code.
+ */
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+	unsigned long *bp = state->bp;
+
+	/*
+	 * When unwinding from an ftrace handler of a function called by entry
+	 * code, the stack layout of the last frame is:
+	 *
+	 *   bp
+	 *   parent ret addr
+	 *   bp
+	 *   function ret addr
+	 *   parent ret addr
+	 *   pt_regs
+	 *   -----------------
+	 */
+	if (bp != last_frame(state) - 3)
+		return false;
+
+	/* Both bp copies and both parent return addresses must agree. */
+	return bp[0] == bp[2] && bp[1] == bp[4];
+}
+
+/*
+ * True when the current frame matches any of the recognized last-frame
+ * layouts: plain, realigned, or ftrace.
+ */
+static bool is_last_task_frame(struct unwind_state *state)
+{
+	if (is_last_frame(state))
+		return true;
+
+	if (is_last_aligned_frame(state))
+		return true;
+
+	return is_last_ftrace_frame(state);
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ *
+ * Returns the decoded pt_regs pointer, or NULL when @bp is not encoded.
+ */
+#ifdef CONFIG_X86_64
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+	unsigned long encoded = (unsigned long)bp;
+
+	/* 64-bit: an encoded pointer has bit 0 set; clear it to decode. */
+	return (encoded & 0x1) ? (struct pt_regs *)(encoded & ~0x1) : NULL;
+}
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+	unsigned long encoded = (unsigned long)bp;
+
+	/* 32-bit: an encoded pointer has the top bit clear; set it to decode. */
+	return (encoded & 0x80000000) ? NULL :
+		(struct pt_regs *)(encoded | 0x80000000);
+}
+#endif
+
+/*
+ * Number of bytes of a pt_regs frame that update_stack_state() checks against
+ * the stack bounds: on 32-bit this is two words less than the full struct.
+ */
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
+
+/*
+ * Try to move @state to the frame designated by @next_bp, which is either a
+ * plain frame pointer or an encoded pointer to pt_regs.
+ *
+ * On success state->bp/state->regs/state->ip describe the new frame and true
+ * is returned. False is returned when the frame is not on any reachable
+ * stack, overlaps the previous frame, or (32-bit user regs) is not fully
+ * accessible. On failure state->stack_info may already have been advanced.
+ */
+static bool update_stack_state(struct unwind_state *state,
+ unsigned long *next_bp)
+{
+ struct stack_info *info = &state->stack_info;
+ enum stack_type prev_type = info->type;
+ struct pt_regs *regs;
+ unsigned long *frame, *prev_frame_end, *addr_p, addr;
+ size_t len;
+
+ /* End of the frame we are leaving; used for the overlap check below. */
+ if (state->regs)
+ prev_frame_end = (void *)state->regs + regs_size(state->regs);
+ else
+ prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
+
+ /* Is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ frame = (unsigned long *)regs;
+ len = KERNEL_REGS_SIZE;
+ state->got_irq = true;
+ } else {
+ frame = next_bp;
+ len = FRAME_HEADER_SIZE;
+ }
+
+ /*
+ * If the next bp isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the next bp
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, frame, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ /* Make sure it only unwinds up and doesn't overlap the prev frame: */
+ if (state->orig_sp && state->stack_info.type == prev_type &&
+ frame < prev_frame_end)
+ return false;
+
+ /*
+ * On 32-bit with user mode regs, make sure the last two regs are safe
+ * to access:
+ */
+ if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+ !on_stack(info, frame, len + 2*sizeof(long)))
+ return false;
+
+ /* Move state to the next frame: */
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
+ /* Save the return address: */
+ if (state->regs && user_mode(state->regs))
+ state->ip = 0;
+ else {
+ addr_p = unwind_get_return_address_ptr(state);
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, addr_p);
+ }
+
+ /* Save the original stack pointer for unwind_dump(): */
+ if (!state->orig_sp)
+ state->orig_sp = frame;
+
+ return true;
+}
+
+/*
+ * Advance the frame-pointer unwinder by one frame.
+ *
+ * Returns true when @state now describes a valid next frame. Returns false
+ * when the unwind is over; in that case the stack type is set to
+ * STACK_TYPE_UNKNOWN, and state->error is additionally set when the next
+ * frame pointer could not be validated.
+ */
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct pt_regs *regs;
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread_tls(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * do_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ state->ip = 0;
+ return true;
+ }
+
+ /* Get the next frame pointer: */
+ if (state->regs)
+ next_bp = (unsigned long *)state->regs->bp;
+ else
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+
+ /* Move to the next frame if it's safe: */
+ if (!update_stack_state(state, next_bp))
+ goto bad_address;
+
+ return true;
+
+bad_address:
+ /* Record the failure first; the checks below only decide whether to warn. */
+ state->error = true;
+
+ /*
+ * When unwinding a non-current task, the task might actually be
+ * running on another CPU, in which case it could be modifying its
+ * stack while we're reading it. This is generally not a problem and
+ * can be ignored as long as the caller understands that unwinding
+ * another task will not always succeed.
+ */
+ if (state->task != current)
+ goto the_end;
+
+ /*
+ * Don't warn if the unwinder got lost due to an interrupt in entry
+ * code or in the C handler before the first frame pointer got set up:
+ */
+ if (state->got_irq && in_entry_code(state->ip))
+ goto the_end;
+ if (state->regs &&
+ state->regs->sp >= (unsigned long)last_aligned_frame(state) &&
+ state->regs->sp < (unsigned long)task_pt_regs(state->task))
+ goto the_end;
+
+ /*
+ * There are some known frame pointer issues on 32-bit. Disable
+ * unwinder warnings on 32-bit until it gets objtool support.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto the_end;
+
+ /* Warn once, naming whichever kind of frame held the bad value. */
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ }
+the_end:
+ /* Marks the unwind as finished for unwind_done(). */
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+/*
+ * Initialize @state for a frame-pointer unwind of @task.
+ *
+ * @regs, when non-NULL, supplies the starting registers; user-mode regs are
+ * refused and leave @state marked as done. @first_frame is the stack address
+ * of the first frame the caller is interested in; earlier frames are skipped.
+ */
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ unsigned long *bp;
+
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+ /* Starting from regs counts as having seen an interrupt frame. */
+ state->got_irq = (regs);
+
+ /* Don't even attempt to start from user mode regs: */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ bp = get_frame_pointer(task, regs);
+
+ /* Initialize stack info and make sure the frame data is accessible: */
+ get_stack_info(bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, bp);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->bp < first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 0000000..4f0e17b
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,72 @@
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+/*
+ * Read the word at the current stack position and pass it through
+ * ftrace_graph_ret_addr(). Returns 0 once the unwind is done.
+ */
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	unsigned long *slot;
+	unsigned long word;
+
+	if (unwind_done(state))
+		return 0;
+
+	slot = state->sp;
+	word = READ_ONCE_NOCHECK(*slot);
+
+	return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+				     word, slot);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+/*
+ * This unwinder variant never reports a return-address slot: the function
+ * unconditionally returns NULL.
+ */
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
+/*
+ * Scan upwards from the current stack position for the next word that
+ * __kernel_text_address() accepts, moving on to the following stack (via
+ * info->next_sp) whenever the current one is exhausted.
+ *
+ * Returns true with state->sp on the matching word, or false when the unwind
+ * was already done or no further stack is available.
+ */
+bool unwind_next_frame(struct unwind_state *state)
+{
+	struct stack_info *info = &state->stack_info;
+
+	if (unwind_done(state))
+		return false;
+
+	do {
+		/* Step past the current word, then test each following one. */
+		while (++state->sp < info->end) {
+			unsigned long word = READ_ONCE_NOCHECK(*state->sp);
+
+			if (__kernel_text_address(word))
+				return true;
+		}
+
+		state->sp = PTR_ALIGN(info->next_sp, sizeof(long));
+
+	} while (!get_stack_info(state->sp, state->task, info,
+				 &state->stack_mask));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+/*
+ * Initialize @state for a scan of @task's stack starting at @first_frame.
+ * @regs is not used by this unwinder variant.
+ */
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+		    struct pt_regs *regs, unsigned long *first_frame)
+{
+	memset(state, 0, sizeof(*state));
+
+	state->task = task;
+	state->sp   = PTR_ALIGN(first_frame, sizeof(long));
+
+	get_stack_info(first_frame, state->task, &state->stack_info,
+		       &state->stack_mask);
+
+	/*
+	 * The caller can provide the address of the first frame directly
+	 * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+	 * to start unwinding at. Skip ahead until we reach it.
+	 */
+	if (unwind_done(state))
+		return;
+
+	/* Already sitting on an accessible text address: nothing to skip. */
+	if (on_stack(&state->stack_info, first_frame, sizeof(long)) &&
+	    __kernel_text_address(*first_frame))
+		return;
+
+	unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 0000000..26038ea
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,624 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+
+/* One-shot deferred warning; every message is prefixed with "WARNING: ". */
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+/* Bounds of the .orc_unwind_ip and .orc_unwind tables (defined elsewhere). */
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+/*
+ * The cur_orc_* pointers tell the sort callbacks which pair of tables is
+ * being sorted; sort_mutex is held around the module-table sort that
+ * repoints them.
+ */
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+/* Number of entries in the fast lookup table; set by unwind_init(). */
+unsigned int lookup_num_blocks;
+/* Set to true once unwind_init() has completed successfully. */
+bool orc_init;
+
+/*
+ * Resolve an ip-table entry to an address: each entry stores an offset
+ * relative to its own location.
+ */
+static inline unsigned long orc_ip(const int *ip)
+{
+	unsigned long entry_addr = (unsigned long)ip;
+
+	return entry_addr + *ip;
+}
+
+/*
+ * Binary-search @ip_table (@num_entries entries) for the last entry whose
+ * address is <= @ip and return the entry at the same index in @u_table.
+ *
+ * Returns NULL for an empty table. When every entry is above @ip the first
+ * entry is returned.
+ */
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+				    unsigned int num_entries, unsigned long ip)
+{
+	int *lo, *hi, *best;
+
+	if (!num_entries)
+		return NULL;
+
+	lo = ip_table;
+	hi = ip_table + num_entries - 1;
+	best = ip_table;
+
+	/*
+	 * Do a binary range search to find the rightmost duplicate of a given
+	 * starting address.  Some entries are section terminators which are
+	 * "weak" entries for ensuring there are no gaps.  They should be
+	 * ignored when they conflict with a real entry.
+	 */
+	while (lo <= hi) {
+		int *probe = lo + ((hi - lo) / 2);
+
+		if (orc_ip(probe) > ip) {
+			hi = probe - 1;
+			continue;
+		}
+
+		best = probe;
+		lo = probe + 1;
+	}
+
+	return u_table + (best - ip_table);
+}
+
+/*
+ * Look @ip up in the ORC tables of the module that contains it. Returns NULL
+ * when no module owns the address, when the module has no ORC tables, or
+ * when module support is compiled out.
+ */
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+	struct module *mod = __module_address(ip);
+
+	if (!mod)
+		return NULL;
+
+	if (!mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+		return NULL;
+
+	return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+			  mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+	return NULL;
+}
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static struct orc_entry *orc_find(unsigned long ip);
+
+/*
+ * Ftrace dynamic trampolines do not have ORC entries of their own.
+ * But they are copies of the ftrace entries that are static and
+ * defined in ftrace_*.S, which do have ORC entries.
+ *
+ * If the unwinder comes across an ftrace trampoline, then find the
+ * ftrace function that was used to create it, and use that ftrace
+ * function's ORC entry, as the placement of the return code in
+ * the stack will be identical.
+ *
+ * Returns NULL when @ip does not belong to a trampoline.
+ */
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+	struct ftrace_ops *ops = ftrace_ops_trampoline(ip);
+	unsigned long caller;
+
+	if (!ops)
+		return NULL;
+
+	caller = (ops->flags & FTRACE_OPS_FL_SAVE_REGS) ?
+			(unsigned long)ftrace_regs_call :
+			(unsigned long)ftrace_call;
+
+	/* Prevent unlikely recursion */
+	if (ip == caller)
+		return NULL;
+
+	return orc_find(caller);
+}
+#else
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * Find the ORC entry that describes @ip.
+ *
+ * Lookup order: the fast block lookup table for addresses in
+ * [LOOKUP_START_IP, LOOKUP_STOP_IP), a search of the whole vmlinux table for
+ * init text, then module tables, then ftrace trampolines.
+ *
+ * Returns NULL when the unwinder has not been initialized, when the lookup
+ * table is inconsistent, or when no table covers @ip.
+ */
+static struct orc_entry *orc_find(unsigned long ip)
+{
+	/*
+	 * Must be an automatic variable: this function can run on several
+	 * CPUs at once, and a shared static result slot would be a data race.
+	 */
+	struct orc_entry *orc;
+
+	if (!orc_init)
+		return NULL;
+
+	/* For non-init vmlinux addresses, use the fast lookup table: */
+	if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+		unsigned int idx, start, stop;
+
+		idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+		if (unlikely((idx >= lookup_num_blocks-1))) {
+			orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
+				 idx, lookup_num_blocks, (void *)ip);
+			return NULL;
+		}
+
+		/* Search only the entries between this block and the next one. */
+		start = orc_lookup[idx];
+		stop = orc_lookup[idx + 1] + 1;
+
+		if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+			     (__start_orc_unwind + stop > __stop_orc_unwind))) {
+			orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
+				 idx, lookup_num_blocks, start, stop, (void *)ip);
+			return NULL;
+		}
+
+		return __orc_find(__start_orc_unwind_ip + start,
+				  __start_orc_unwind + start, stop - start, ip);
+	}
+
+	/* vmlinux .init slow lookup: */
+	if (init_kernel_text(ip))
+		return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+	/* Module lookup: */
+	orc = orc_module_find(ip);
+	if (orc)
+		return orc;
+
+	return orc_ftrace_find(ip);
+}
+
+/*
+ * sort() swap callback: exchange two .orc_unwind_ip entries together with
+ * their .orc_unwind counterparts in the tables named by cur_orc_ip_table and
+ * cur_orc_table. @size is unused.
+ */
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ struct orc_entry orc_tmp;
+ int *a = _a, *b = _b, tmp;
+ /* Byte distance between the two slots. */
+ int delta = _b - _a;
+
+ /*
+ * Entries are offsets relative to their own location (see orc_ip()), so
+ * a value moved to the other slot is adjusted by the slot distance to
+ * keep designating the same address.
+ */
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ orc_tmp = *orc_a;
+ *orc_a = *orc_b;
+ *orc_b = orc_tmp;
+}
+
+/*
+ * sort() comparison callback: order ip-table entries by the address they
+ * resolve to. For equal addresses the result depends only on the first
+ * argument's ORC entry.
+ */
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+	const int *a = _a, *b = _b;
+	unsigned long a_val = orc_ip(a);
+	unsigned long b_val = orc_ip(b);
+	struct orc_entry *orc_a;
+
+	if (a_val != b_val)
+		return a_val > b_val ? 1 : -1;
+
+	/*
+	 * The "weak" section terminator entries need to always be on the left
+	 * to ensure the lookup code skips them in favor of real entries.
+	 * These terminator entries exist to handle any gaps created by
+	 * whitelisted .o files which didn't get objtool generation.
+	 */
+	orc_a = cur_orc_table + (a - cur_orc_ip_table);
+	if (orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end)
+		return -1;
+
+	return 1;
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * Sort a module's ORC tables and record them in @mod->arch.
+ *
+ * @_orc_ip/@orc_ip_size: the module's .orc_unwind_ip table and its byte size.
+ * @_orc/@orc_size: the module's .orc_unwind table and its byte size.
+ *
+ * Inconsistent sizes only trigger a one-time warning; the tables are sorted
+ * and registered regardless.
+ */
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ /* Both tables must hold whole entries and the same number of them. */
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+/*
+ * Boot-time setup of the ORC unwinder: validate and sort the vmlinux ORC
+ * tables, then fill in the fast lookup table. orc_init is only set to true
+ * when every step succeeded; on any failure a warning is emitted and the
+ * function returns with orc_init still false.
+ */
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ /* Both tables must be non-empty, hold whole entries, and match in count. */
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /* Sort the .orc_unwind and .orc_unwind_ip tables: */
+ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+ orc_sort_swap);
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ /* Slot i holds the table index of the entry found for block i's start. */
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
+
+/*
+ * Report the text address recorded for the current frame, or 0 when the
+ * unwind is done or __kernel_text_address() rejects the saved ip.
+ */
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+	if (!unwind_done(state) && __kernel_text_address(state->ip))
+		return state->ip;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+/*
+ * Locate the slot holding the current frame's return address: the ip field
+ * of the saved regs for a regs frame, otherwise the word just below
+ * state->sp. NULL when the unwind is done or no stack pointer is recorded.
+ */
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+	if (unwind_done(state))
+		return NULL;
+
+	if (state->regs)
+		return &state->regs->ip;
+
+	return state->sp ? (unsigned long *)state->sp - 1 : NULL;
+}
+
+/*
+ * Check that [@_addr, @_addr + @len) may be read. If the range is not on the
+ * stack currently described by state->stack_info, get_stack_info() is asked
+ * about the address, which updates state->stack_info.
+ */
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
+			    size_t len)
+{
+	struct stack_info *info = &state->stack_info;
+	void *addr = (void *)_addr;
+
+	if (on_stack(info, addr, len))
+		return true;
+
+	/* get_stack_info() returns nonzero on failure. */
+	return !get_stack_info(addr, state->task, info, &state->stack_mask);
+}
+
+/*
+ * Read one word from stack address @addr into *@val.
+ * Returns false, leaving *@val untouched, when the address is not accessible.
+ */
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+			    unsigned long *val)
+{
+	if (stack_access_ok(state, addr, sizeof(long))) {
+		*val = READ_ONCE_NOCHECK(*(unsigned long *)addr);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Treat @addr as a full pt_regs on the stack and fetch its ip and sp.
+ * Returns false, leaving the outputs untouched, when the structure is not
+ * fully accessible.
+ */
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+			     unsigned long *ip, unsigned long *sp)
+{
+	struct pt_regs *regs = (struct pt_regs *)addr;
+
+	/* x86-32 support will be more complicated due to the &regs->sp hack */
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
+
+	if (stack_access_ok(state, addr, sizeof(struct pt_regs))) {
+		*ip = regs->ip;
+		*sp = regs->sp;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Treat @addr as an IRET frame and fetch its ip and sp. Only the
+ * IRET_FRAME_SIZE bytes starting at @addr are checked for accessibility; the
+ * pt_regs pointer is derived by stepping back IRET_FRAME_OFFSET bytes.
+ */
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+				  unsigned long *ip, unsigned long *sp)
+{
+	struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
+
+	if (stack_access_ok(state, addr, IRET_FRAME_SIZE)) {
+		*ip = regs->ip;
+		*sp = regs->sp;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Advance the ORC unwinder by one frame.
+ *
+ * Looks up the ORC entry for the current ip, derives the previous frame's
+ * stack pointer from it, then loads the caller's ip, sp, optional regs and bp.
+ *
+ * Returns true when @state describes a valid next frame. Returns false when
+ * the unwind is over; the stack type is then STACK_TYPE_UNKNOWN, and
+ * state->error is additionally set when the unwind stopped because of
+ * missing/invalid ORC data or an inaccessible stack address.
+ */
+bool unwind_next_frame(struct unwind_state *state)
+{
+	unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp;
+	enum stack_type prev_type = state->stack_info.type;
+	struct orc_entry *orc;
+	bool indirect = false;
+
+	if (unwind_done(state))
+		return false;
+
+	/* Don't let modules unload while we're reading their ORC data. */
+	preempt_disable();
+
+	/* End-of-stack check for user tasks: */
+	if (state->regs && user_mode(state->regs))
+		goto the_end;
+
+	/*
+	 * Find the orc_entry associated with the text address.
+	 *
+	 * Decrement call return addresses by one so they work for sibling
+	 * calls and calls to noreturn functions.
+	 */
+	orc = orc_find(state->signal ? state->ip : state->ip - 1);
+	if (!orc)
+		goto err;
+
+	/* End-of-stack check for kernel threads: */
+	if (orc->sp_reg == ORC_REG_UNDEFINED) {
+		if (!orc->end)
+			goto err;
+
+		goto the_end;
+	}
+
+	/* Find the previous frame's stack: */
+	switch (orc->sp_reg) {
+	case ORC_REG_SP:
+		sp = state->sp + orc->sp_offset;
+		break;
+
+	case ORC_REG_BP:
+		sp = state->bp + orc->sp_offset;
+		break;
+
+	case ORC_REG_SP_INDIRECT:
+		sp = state->sp + orc->sp_offset;
+		indirect = true;
+		break;
+
+	case ORC_REG_BP_INDIRECT:
+		sp = state->bp + orc->sp_offset;
+		indirect = true;
+		break;
+
+	case ORC_REG_R10:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg R10 at ip %pB\n",
+				 (void *)state->ip);
+			goto err;
+		}
+		sp = state->regs->r10;
+		break;
+
+	case ORC_REG_R13:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg R13 at ip %pB\n",
+				 (void *)state->ip);
+			goto err;
+		}
+		sp = state->regs->r13;
+		break;
+
+	case ORC_REG_DI:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg DI at ip %pB\n",
+				 (void *)state->ip);
+			goto err;
+		}
+		sp = state->regs->di;
+		break;
+
+	case ORC_REG_DX:
+		if (!state->regs || !state->full_regs) {
+			orc_warn("missing regs for base reg DX at ip %pB\n",
+				 (void *)state->ip);
+			goto err;
+		}
+		sp = state->regs->dx;
+		break;
+
+	default:
+		orc_warn("unknown SP base reg %d for ip %pB\n",
+			 orc->sp_reg, (void *)state->ip);
+		goto err;
+	}
+
+	/* For the *_INDIRECT bases the computed address holds the real sp. */
+	if (indirect) {
+		if (!deref_stack_reg(state, sp, &sp))
+			goto err;
+	}
+
+	/* Find IP, SP and possibly regs: */
+	switch (orc->type) {
+	case ORC_TYPE_CALL:
+		ip_p = sp - sizeof(long);
+
+		if (!deref_stack_reg(state, ip_p, &state->ip))
+			goto err;
+
+		state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+						  state->ip, (void *)ip_p);
+
+		state->sp = sp;
+		state->regs = NULL;
+		state->signal = false;
+		break;
+
+	case ORC_TYPE_REGS:
+		if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
+			orc_warn("can't dereference registers at %p for ip %pB\n",
+				 (void *)sp, (void *)orig_ip);
+			goto err;
+		}
+
+		state->regs = (struct pt_regs *)sp;
+		state->full_regs = true;
+		state->signal = true;
+		break;
+
+	case ORC_TYPE_REGS_IRET:
+		if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
+			orc_warn("can't dereference iret registers at %p for ip %pB\n",
+				 (void *)sp, (void *)orig_ip);
+			goto err;
+		}
+
+		state->regs = (void *)sp - IRET_FRAME_OFFSET;
+		state->full_regs = false;
+		state->signal = true;
+		break;
+
+	default:
+		/*
+		 * An unknown entry type means ip/sp were not updated for this
+		 * frame, so the unwind cannot continue: treat it as an error
+		 * instead of falling through with stale state.
+		 */
+		orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
+			 orc->type, (void *)orig_ip);
+		goto err;
+	}
+
+	/* Find BP: */
+	switch (orc->bp_reg) {
+	case ORC_REG_UNDEFINED:
+		if (state->regs && state->full_regs)
+			state->bp = state->regs->bp;
+		break;
+
+	case ORC_REG_PREV_SP:
+		if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+			goto err;
+		break;
+
+	case ORC_REG_BP:
+		if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+			goto err;
+		break;
+
+	default:
+		orc_warn("unknown BP base reg %d for ip %pB\n",
+			 orc->bp_reg, (void *)orig_ip);
+		goto err;
+	}
+
+	/* Prevent a recursive loop due to bad ORC data: */
+	if (state->stack_info.type == prev_type &&
+	    on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+	    state->sp <= prev_sp) {
+		orc_warn("stack going in the wrong direction? ip=%pB\n",
+			 (void *)orig_ip);
+		goto err;
+	}
+
+	preempt_enable();
+	return true;
+
+err:
+	state->error = true;
+
+the_end:
+	preempt_enable();
+	/* Marks the unwind as finished for unwind_done(). */
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+/*
+ * Initialize @state for an ORC unwind of @task.
+ *
+ * The starting ip/sp/bp come from @regs when given, from the live registers
+ * when @task is current, and otherwise from the inactive task frame at
+ * task->thread.sp. User-mode regs and tasks running on another CPU are
+ * refused and leave @state marked as done. @first_frame is the stack address
+ * of the first frame the caller is interested in; it is only used when no
+ * @regs were given.
+ */
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto done;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto done;
+
+ state->ip = regs->ip;
+ state->sp = kernel_stack_pointer(regs);
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
+ /* Capture the current ip, sp and bp directly from the CPU. */
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ /* Read bp and the return address from the saved switch frame. */
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp;
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp <= (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+done:
+ /* Marks the unwind as finished for unwind_done(). */
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 0000000..deb576b
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,1102 @@
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+
+#include <linux/kdebug.h>
+#include <asm/processor.h>
+#include <asm/insn.h>
+#include <asm/mmu_context.h>
+
+/* Post-execution fixups. */
+
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBE_FIX_IP 0x01
+
+/* Adjust the return address of a call insn */
+#define UPROBE_FIX_CALL 0x02
+
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x04
+
+#define UPROBE_FIX_RIP_SI 0x08 /* rip-relative insn was rewritten to use si as scratch */
+#define UPROBE_FIX_RIP_DI 0x10 /* rip-relative insn was rewritten to use di as scratch */
+#define UPROBE_FIX_RIP_BX 0x20 /* rip-relative insn was rewritten to use bx as scratch */
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
+
+#define UPROBE_TRAP_NR UINT_MAX /* sentinel kept in thread.trap_nr while stepping out of line */
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
+
+/* Pack sixteen 0/1 opcode flags (b0..bf) into the u32 half selected by row. */
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) << ((row) % 32))
+
+/*
+ * Good-instruction tables for 32-bit apps. This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc think only *(unsigned long*) is used.
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
+ * (why do we support bound (62) then? it's similar, and similarly unused...)
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * 07,17,1f - pop es/ss/ds
+ * Normally not used in userspace, but would execute if used.
+ * Can cause GP or stack exception if they try to load a wrong segment descriptor.
+ * We hesitate to run them under single step since kernel's handling
+ * of userspace single-stepping (TF flag) is fragile.
+ * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
+ * on the same grounds that they are never used.
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static volatile u32 good_insns_32[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_32 NULL
+#endif
+
+/* Good-instruction tables for 64-bit apps (a set bit marks an accepted 1-byte opcode).
+ *
+ * Genuinely invalid opcodes:
+ * 06,07 - formerly push/pop es
+ * 0e - formerly push cs
+ * 16,17 - formerly push/pop ss
+ * 1e,1f - formerly push/pop ds
+ * 27,2f,37,3f - formerly daa/das/aaa/aas
+ * 60,61 - formerly pusha/popa
+ * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
+ * 82 - formerly redundant encoding of Group1
+ * 9a - formerly call seg:ofs
+ * ce - formerly into
+ * d4,d5 - formerly aam/aad
+ * d6 - formerly undocumented salc
+ * ea - formerly jmp seg:ofs
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_64)
+static volatile u32 good_insns_64[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_64 NULL
+#endif
+
+/* Good 2-byte (0f xx) opcodes, indexed by xx. Using this for both 64-bit and 32-bit apps.
+ * Opcodes we don't support:
+ * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
+ * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
+ * Also encodes tons of other system insns if mod=11.
+ * Some are in fact non-system: xend, xtest, rdtscp, maybe more
+ * 0f 05 - syscall
+ * 0f 06 - clts (CPL0 insn)
+ * 0f 07 - sysret
+ * 0f 08 - invd (CPL0 insn)
+ * 0f 09 - wbinvd (CPL0 insn)
+ * 0f 0b - ud2
+ * 0f 30 - wrmsr (CPL0 insn) (then why is rdmsr allowed? it's also a CPL0 insn)
+ * 0f 34 - sysenter
+ * 0f 35 - sysexit
+ * 0f 37 - getsec
+ * 0f 78 - vmread (Intel VMX. CPL0 insn)
+ * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
+ * Note: with prefixes, these two opcodes are
+ * extrq/insertq/AVX512 convert vector ops.
+ * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
+ * {rd,wr}{fs,gs}base,{s,l,m}fence.
+ * Why? They are all user-executable.
+ */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+/*
+ * opcodes we may need to refine support for:
+ *
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field. On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ *
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes. These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ *
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ *
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+	int idx;
+
+	/* Reject the insn if any of its prefixes is es:, cs:, ds:, ss: or lock. */
+	for (idx = 0; idx < insn->prefixes.nbytes; idx++) {
+		insn_attr_t pfx_attr =
+			inat_get_opcode_attribute(insn->prefixes.bytes[idx]);
+
+		if (pfx_attr == INAT_MAKE_PREFIX(INAT_PFX_ES) ||
+		    pfx_attr == INAT_MAKE_PREFIX(INAT_PFX_CS) ||
+		    pfx_attr == INAT_MAKE_PREFIX(INAT_PFX_DS) ||
+		    pfx_attr == INAT_MAKE_PREFIX(INAT_PFX_SS) ||
+		    pfx_attr == INAT_MAKE_PREFIX(INAT_PFX_LOCK))
+			return true;
+	}
+
+	return false;
+}
+
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
+{
+	u32 volatile *good_insns;
+
+	/* Decode the copied insn; getting the length processes the entire insn. */
+	insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
+	insn_get_length(insn);
+	if (!insn_complete(insn))
+		return -ENOEXEC;
+
+	/*
+	 * Refuse the prefixes rejected by is_prefix_bad() and the exception
+	 * masking instructions, which we should not singlestep.
+	 */
+	if (is_prefix_bad(insn) || insn_masking_exception(insn))
+		return -ENOTSUPP;
+
+	/* Check the first opcode byte against the table for the task's mode. */
+	good_insns = x86_64 ? good_insns_64 : good_insns_32;
+	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
+		return 0;
+
+	/*
+	 * NOTE(review): both 1-byte tables set bit 0x0f, so insns with
+	 * OPCODE1() == 0x0f seem to be accepted above already - confirm.
+	 */
+	if (insn->opcode.nbytes == 2 &&
+	    test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+		return 0;
+
+	return -ENOTSUPP;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
+ * immediately. Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register. Set
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area. At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ *
+ * - There's always a modrm byte with bit layout "00 reg 101".
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 *cursor; /* byte of auprobe->insn currently being patched */
+ u8 reg; /* modrm.reg of the insn */
+ u8 reg2; /* vex.vvvv register, later the chosen scratch register */
+
+ if (!insn_rip_relative(insn))
+ return; /* nothing to rewrite */
+
+ /*
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
+ * Clear REX.b bit (extension of MODRM.rm field):
+ * we want to encode low numbered reg, not r8+.
+ */
+ if (insn->rex_prefix.nbytes) {
+ cursor = auprobe->insn + insn_offset_rex_prefix(insn);
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes >= 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x60;
+ }
+
+ /*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions are ever lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv (all ones when there is no vex prefix) */
+ if (insn->vex_prefix.nbytes)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
+ * Point cursor at the modrm byte. The next 4 bytes are the
+ * displacement. Beyond the displacement, for some instructions,
+ * is the immediate operand.
+ */
+ cursor = auprobe->insn + insn_offset_modrm(insn);
+ /*
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
+ */
+ *cursor = 0x80 | (reg << 3) | reg2;
+}
+
+/* Return the pt_regs slot of the scratch register recorded in defparam.fixups. */
+static inline unsigned long *scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+		return &regs->si;
+	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+		return &regs->di;
+	return &regs->bx;	/* otherwise bx (UPROBE_FIX_RIP_BX) */
+}
+
+/*
+ * Stash the scratch register and point it at the address following the probed insn.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+	unsigned long *scratch;
+
+	if (!(auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK))
+		return;	/* no scratch register in use */
+	scratch = scratch_reg(auprobe, regs);
+	utask->autask.saved_scratch_register = *scratch;
+	*scratch = utask->vaddr + auprobe->defparam.ilen;
+}
+
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	/* Nothing to undo unless a scratch register was picked for this insn. */
+	if (!(auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK))
+		return;
+
+	/* Put back the value stashed in the uprobe task state. */
+	*scratch_reg(auprobe, regs) = current->utask->autask.saved_scratch_register;
+}
+#else /* 32-bit: */
+/*
+ * No RIP-relative addressing on 32-bit, so all three riprel hooks are no-ops.
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+#endif /* CONFIG_X86_64 */
+
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *); /* optional; true if the insn was emulated */
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); /* optional; nonzero aborts arch_uprobe_pre_xol() */
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *); /* optional; run by arch_uprobe_post_xol() */
+ void (*abort)(struct arch_uprobe *, struct pt_regs *); /* optional; run by arch_uprobe_abort_xol() */
+};
+
+static inline int sizeof_long(void)
+{
+ return in_ia32_syscall() ? 4 : 8; /* user word size; NOTE(review): is in_ia32_syscall() valid outside syscalls? confirm */
+}
+
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs); /* load the scratch register, if one is used */
+ return 0; /* always succeeds */
+}
+
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
+{
+	void __user *slot = (void __user *)(regs->sp - sizeof_long());
+
+	/* Store val just below ->sp; commit the new ->sp only on success. */
+	if (copy_to_user(slot, &val, sizeof_long()))
+		return -EFAULT;
+	regs->sp = (unsigned long)slot;
+	return 0;
+}
+
+/*
+ * Undo the side effects of having single-stepped a copy of the insn in the
+ * XOL slot instead of the original:
+ *
+ * FIX_RIP_reg: a rip-relative insn such as "movl %edx,0xnnnn(%rip)" was run
+ *	as "movl %edx,0xnnnn(%rsi)"; restore the borrowed scratch register.
+ * FIX_IP: the new ip is relative to the copy; rebase it onto the original
+ *	insn. Returns and absolute or indirect jumps/calls do not need this.
+ * FIX_CALL: a call pushed the address following the copy; replace it with
+ *	the address following the original insn.
+ *
+ * Returns 0, or -ERESTART if the corrected return address cannot be pushed.
+ */
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+	unsigned long ret_addr;
+
+	riprel_post_xol(auprobe, regs);
+
+	if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+		/* Rebase ->ip from the XOL slot onto the probed address. */
+		regs->ip += utask->vaddr - utask->xol_vaddr;
+	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+		ret_addr = utask->vaddr + auprobe->defparam.ilen;
+		regs->sp += sizeof_long();	/* Pop incorrect return address */
+		if (emulate_push_stack(regs, ret_addr))
+			return -ERESTART;
+	}
+
+	/* popf; tell the caller to not touch TF */
+	if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+		utask->autask.saved_tf = true;
+
+	return 0;
+}
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_post_xol(auprobe, regs); /* restore the scratch register, if one was used */
+}
+
+static const struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op, /* loads the rip-relative scratch register */
+ .post_xol = default_post_xol_op, /* applies the UPROBE_FIX_* fixups */
+ .abort = default_abort_op, /* restores the scratch register */
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
+{
+ return auprobe->branch.opc1 == 0xe8; /* e8 is "call relative" */
+}
+
+#define CASE_COND \
+ COND(70, 71, XF(OF)) /* jo / jno */ \
+ COND(72, 73, XF(CF)) /* jb / jae */ \
+ COND(74, 75, XF(ZF)) /* je / jne */ \
+ COND(78, 79, XF(SF)) /* js / jns */ \
+ COND(7a, 7b, XF(PF)) /* jp / jnp */ \
+ COND(76, 77, XF(CF) || XF(ZF)) /* jbe / ja */ \
+ COND(7c, 7d, XF(SF) != XF(OF)) /* jl / jge */ \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) /* jle / jg */
+
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
+
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
+/* True if opcode is one of the short conditional jmp opcodes listed in CASE_COND. */
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
+}
+/* Evaluate the condition of the probed branch (->branch.opc1) against regs->flags. */
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long flags = regs->flags;
+
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
+
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
+
+#undef XF
+#undef COND
+#undef CASE_COND
+
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	unsigned long next_ip, disp = (long)auprobe->branch.offs;
+
+	/* Advance ->ip past the insn first: the offset is applied on top of it. */
+	regs->ip += auprobe->branch.ilen;
+	next_ip = regs->ip;
+
+	if (branch_is_call(auprobe)) {
+		/*
+		 * If the push fails, the insn (mangled, see the comment in
+		 * branch_clear_offset) is run out-of-line instead. That should
+		 * normally trap, so the task dies or restarts the insn after
+		 * handling the signal; ->post_xol() covers the corner case.
+		 */
+		if (emulate_push_stack(regs, next_ip))
+			return false;
+	} else if (!check_jmp_cond(auprobe, regs)) {
+		disp = 0;	/* condition not met: continue at the next insn */
+	}
+
+	regs->ip = next_ip + disp;
+	return true;
+}
+
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	unsigned long *reg = (void *)regs + auprobe->push.reg_offset;
+	bool pushed = !emulate_push_stack(regs, *reg);
+
+	if (pushed)	/* the push was emulated: step over the insn */
+		regs->ip += auprobe->push.ilen;
+	return pushed;
+}
+
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ BUG_ON(!branch_is_call(auprobe)); /* branch_emulate_op() can only fail for a call */
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long(); /* drop the return address pushed by the mangled call */
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+	u8 *imm = auprobe->insn + insn_offset_immediate(insn);
+
+	/*
+	 * Zero the immediate so that the insn becomes "call 1f; 1:". This is
+	 * what we will execute out-of-line if ->emulate() fails. We only
+	 * need it to generate a trap, so that the probed task receives the
+	 * correct signal with the properly filled siginfo.
+	 *
+	 * In the unlikely case it succeeds (see the comment in ->post_xol()),
+	 * the new ->ip must not fall into the non-canonical area and
+	 * trigger #GP.
+	 *
+	 * Turning it into (say) "pushf" would require divorcing ->insn[] from
+	 * ->ixol[]: the 1st byte of ->insn[] must survive for set_orig_insn().
+	 */
+	memset(imm, 0, insn->immediate.nbytes);
+}
+
+static const struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op, /* the branch is emulated instead of single-stepped */
+ .post_xol = branch_post_xol_op, /* only reached after a failed call emulation */
+};
+
+static const struct uprobe_xol_ops push_xol_ops = {
+ .emulate = push_emulate_op, /* NOTE(review): no ->post_xol, so ->ip is not rebased if a failed emulation is stepped - confirm */
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn, -ENOTSUPP if it is rejected */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+	u8 opc1 = OPCODE1(insn);
+	int i;
+
+	switch (opc1) {
+	case 0xeb:	/* jmp 8 */
+	case 0xe9:	/* jmp 32 */
+		break;
+	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */
+		/* REX.B + 90 is "xchg %r8,%rax", not a nop: leave it to XOL. */
+		if (insn->rex_prefix.nbytes && (insn->rex_prefix.bytes[0] & 0x01))
+			return -ENOSYS;
+		goto setup;	/* "66 90" is a valid 2-byte nop: skip the 0x66 check */
+	case 0xe8:	/* call relative */
+		branch_clear_offset(auprobe, insn);
+		break;
+	case 0x0f:
+		if (insn->opcode.nbytes != 2)
+			return -ENOSYS;
+		/* Near jcc: OPCODE2() - 0x10 is OPCODE1() of the matching short jcc. */
+		opc1 = OPCODE2(insn) - 0x10;
+		/* fall through */
+	default:
+		if (!is_cond_jmp_opcode(opc1))
+			return -ENOSYS;
+	}
+
+	/*
+	 * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported: no one
+	 * uses them, and Intel (which ignores 66 here) and AMD differ in 64-bit mode.
+	 */
+	for (i = 0; i < insn->prefixes.nbytes; i++) {
+		if (insn->prefixes.bytes[i] == 0x66)
+			return -ENOTSUPP;
+	}
+
+setup:
+	auprobe->branch.opc1 = opc1;
+	auprobe->branch.ilen = insn->length;
+	auprobe->branch.offs = insn->immediate.value;
+
+	auprobe->ops = &branch_xol_ops;
+	return 0;
+}
+
+/* Set up emulation of a 1-byte "push reg"; returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+ if (opc1 < 0x50 || opc1 > 0x57) /* only opcodes 50..57 are handled */
+ return -ENOSYS;
+
+ if (insn->length > 2) /* the opcode byte plus at most one prefix byte */
+ return -ENOSYS;
+ if (insn->length == 2) {
+ /* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+ if (insn->rex_prefix.nbytes != 1 ||
+ insn->rex_prefix.bytes[0] != 0x41)
+ return -ENOSYS;
+
+ switch (opc1) { /* with rex 0x41 the opcodes select r8..r15 */
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, r8);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, r9);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, r10);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, r11);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, r12);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, r13);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, r14);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, r15);
+ break;
+ }
+#else
+ return -ENOSYS;
+#endif
+ } else {
+ switch (opc1) { /* no prefix: ax, cx, dx, bx, sp, bp, si, di */
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, ax);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, cx);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, dx);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, bx);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, sp);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, bp);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, si);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, di);
+ break;
+ }
+ }
+
+ auprobe->push.reg_offset = reg_offset; /* read back by push_emulate_op() */
+ auprobe->push.ilen = insn->length;
+ auprobe->ops = &push_xol_ops;
+ return 0;
+}
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @auprobe: the probepoint information.
+ * @mm: the probed address space.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+{
+ struct insn insn;
+ u8 fix_ip_or_call = UPROBE_FIX_IP; /* default: rebase ->ip after the XOL step */
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ ret = branch_setup_xol_ops(auprobe, &insn); /* -ENOSYS: not a branch it handles */
+ if (ret != -ENOSYS)
+ return ret;
+
+ ret = push_setup_xol_ops(auprobe, &insn); /* -ENOSYS: not a push it handles */
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ /* fall through */
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+	int err = 0;
+
+	if (auprobe->ops->pre_xol)
+		err = auprobe->ops->pre_xol(auprobe, regs);
+	if (err)
+		return err;
+
+	/* Resume in the XOL slot; park the sentinel in trap_nr to spot a trap. */
+	regs->ip = utask->xol_vaddr;
+	utask->autask.saved_trap_nr = current->thread.trap_nr;
+	current->thread.trap_nr = UPROBE_TRAP_NR;
+	/* Remember whether TF was already set, then force a single step. */
+	utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+	regs->flags |= X86_EFLAGS_TF;
+	if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+		set_task_blockstep(current, false);
+	return 0;
+}
+
+/*
+ * If xol insn itself traps and generates a signal (say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
+ * instruction jumps back to its own address. It is assumed that anything
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, so
+ * it is enough to check that ->trap_nr no longer holds the
+ * UPROBE_TRAP_NR == -1 value set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+	/*
+	 * Any other value presumably means a trap handler overwrote it.
+	 */
+	return t->thread.trap_nr != UPROBE_TRAP_NR;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ * This function prepares to resume execution after the single-step. It returns
+ * 0, or the ->post_xol() error; -ERESTART is turned into 0 with ->ip restored.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf; /* TF state recorded by arch_uprobe_pre_xol() */
+ int err = 0;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
+ /*
+ * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
+ * so we can get an extra SIGTRAP if we do not clear TF. We need
+ * to examine the opcode to make it right.
+ */
+ if (send_sigtrap)
+ send_sig(SIGTRAP, current, 0);
+
+ if (!utask->autask.saved_tf) /* may have been set by ->post_xol() for popf */
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ return err;
+}
+
+/*
+ * Callback routine for handling exceptions: hands user-mode int3 and debug
+ * events to the uprobes notifiers. Returns NOTIFY_STOP when the notifier
+ * reports the event as handled, NOTIFY_DONE otherwise.
+ */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+	struct die_args *args = data;
+	struct pt_regs *regs = args->regs;
+	bool handled;
+
+	/* We are only interested in userspace traps */
+	if (regs && !user_mode(regs))
+		return NOTIFY_DONE;
+
+	/*
+	 * DIE_INT3 is routed to the pre-single-step notifier and DIE_DEBUG
+	 * to the post-single-step one; any other event is ignored.
+	 */
+	if (val == DIE_INT3)
+		handled = uprobe_pre_sstep_notifier(regs);
+	else if (val == DIE_DEBUG)
+		handled = uprobe_post_sstep_notifier(regs);
+	else
+		handled = false;
+
+	return handled ? NOTIFY_STOP : NOTIFY_DONE;
+}
+
+/*
+ * This function gets called when XOL instruction either gets trapped or
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->abort) /* the hook is optional */
+ auprobe->ops->abort(auprobe, regs);
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr; /* undo arch_uprobe_pre_xol() */
+ regs->ip = utask->vaddr;
+ /* clear TF if it was set by us in arch_uprobe_pre_xol() */
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+}
+
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	/* Without an ->emulate() hook the insn cannot be skipped. */
+	return auprobe->ops->emulate &&
+	       auprobe->ops->emulate(auprobe, regs);
+}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs); /* true if the insn was emulated */
+ if (ret && (regs->flags & X86_EFLAGS_TF)) /* TF is set in the user flags: send the SIGTRAP ourselves */
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+	void __user *ra_slot = (void __user *)regs->sp;
+	unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+	int rasize = sizeof_long(), nleft;
+
+	if (copy_from_user(&orig_ret_vaddr, ra_slot, rasize))
+		return -1;
+
+	/* check whether address has been already hijacked */
+	if (orig_ret_vaddr == trampoline_vaddr)
+		return orig_ret_vaddr;
+
+	nleft = copy_to_user(ra_slot, &trampoline_vaddr, rasize);
+	if (unlikely(nleft)) {
+		/* A partial write clobbered the return address: force SIGSEGV. */
+		if (nleft != rasize) {
+			pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+			       current->pid, regs->sp, regs->ip);
+			force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
+		}
+		return -1;
+	}
+	return orig_ret_vaddr;
+}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+				struct pt_regs *regs)
+{
+	/* sp was just decremented by "call" insn, hence the strict compare */
+	if (ctx == RP_CHECK_CALL)
+		return regs->sp < ret->stack;
+	return regs->sp <= ret->stack;
+}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
new file mode 100644
index 0000000..3d3c2f7
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu.S
@@ -0,0 +1,142 @@
+/*
+ *
+ * verify_cpu.S - Code for cpu long mode and SSE verification. This
+ * code has been borrowed from boot/setup.S and was introduced by
+ * Andi Kleen.
+ *
+ * Copyright (c) 2007 Andi Kleen (ak@suse.de)
+ * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ *
+ * This is common code for verifying whether the CPU supports
+ * long mode and SSE. It is not called directly; instead this
+ * file is included at various places and compiled in that context.
+ * This file is expected to run in 32bit code. Currently:
+ *
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verification
+ * arch/x86/kernel/head_32.S: processor startup
+ *
+ * verify_cpu, returns the status of longmode and SSE in register %eax.
+ * 0: Success 1: Failure
+ *
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
+ * The caller needs to check for the error code and take the action
+ * appropriately. Either display a message or halt.
+ */
+
+#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
+
+/*
+ * verify_cpu: returns 0 in %eax when every required feature is present,
+ * 1 otherwise. The caller's flags are saved on entry and restored on both
+ * exit paths. %di is used as the "vendor is AMD" flag throughout.
+ */
+ENTRY(verify_cpu)
+ pushf # Save caller passed flags
+ push $0 # Kill any dangerous flags
+ popf
+
+#ifndef __x86_64__
+ /*
+  * Flip bit 0x200000 in the flags register and read it back; if the
+  * value is unchanged the bit cannot be toggled and cpuid is unavailable.
+  */
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
+#endif
+
+ movl $0x0,%eax # See if cpuid 1 is implemented
+ cpuid
+ cmpl $0x1,%eax
+ jb .Lverify_cpu_no_longmode # no cpuid 1
+
+ /* %di = 0 until the vendor string compares equal to "AuthenticAMD". */
+ xor %di,%di
+ cmpl $0x68747541,%ebx # AuthenticAMD
+ jnz .Lverify_cpu_noamd
+ cmpl $0x69746e65,%edx
+ jnz .Lverify_cpu_noamd
+ cmpl $0x444d4163,%ecx
+ jnz .Lverify_cpu_noamd
+ mov $1,%di # cpu is from AMD
+ jmp .Lverify_cpu_check
+
+.Lverify_cpu_noamd:
+ cmpl $0x756e6547,%ebx # GenuineIntel?
+ jnz .Lverify_cpu_check
+ cmpl $0x49656e69,%edx
+ jnz .Lverify_cpu_check
+ cmpl $0x6c65746e,%ecx
+ jnz .Lverify_cpu_check
+
+ # only call IA32_MISC_ENABLE when:
+ # family > 6 || (family == 6 && model >= 0xd)
+ movl $0x1, %eax # check CPU family and model
+ cpuid
+ movl %eax, %ecx
+
+ andl $0x0ff00f00, %eax # mask family and extended family
+ shrl $8, %eax
+ cmpl $6, %eax
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
+
+ andl $0x000f00f0, %ecx # mask model and extended model
+ shrl $4, %ecx
+ cmpl $0xd, %ecx
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
+
+.Lverify_cpu_clear_xd:
+ /* btrl sets CF to the old bit value, so jnc skips the write when it was already clear. */
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ rdmsr
+ btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
+ wrmsr
+
+.Lverify_cpu_check:
+ /* and + xor with the same mask leaves zero only if every masked bit was set. */
+ movl $0x1,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK0,%edx
+ xorl $REQUIRED_MASK0,%edx
+ jnz .Lverify_cpu_no_longmode
+
+ movl $0x80000000,%eax # See if extended cpuid is implemented
+ cpuid
+ cmpl $0x80000001,%eax
+ jb .Lverify_cpu_no_longmode # no extended cpuid
+
+ movl $0x80000001,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz .Lverify_cpu_no_longmode
+
+.Lverify_cpu_sse_test:
+ /*
+  * If the SSE bits are missing and the vendor flag in %di is set, clear
+  * bit 15 in the low half of MSR_K7_HWCR and test once more. %di is
+  * zeroed before the retry so a second failure falls through to the
+  * error exit instead of looping.
+  */
+ movl $1,%eax
+ cpuid
+ andl $SSE_MASK,%edx
+ cmpl $SSE_MASK,%edx
+ je .Lverify_cpu_sse_ok
+ test %di,%di
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
+ movl $MSR_K7_HWCR,%ecx
+ rdmsr
+ btr $15,%eax # enable SSE
+ wrmsr
+ xor %di,%di # don't loop
+ jmp .Lverify_cpu_sse_test # try again
+
+.Lverify_cpu_no_longmode:
+ popf # Restore caller passed flags
+ movl $1,%eax
+ ret
+.Lverify_cpu_sse_ok:
+ popf # Restore caller passed flags
+ xorl %eax, %eax
+ ret
+ENDPROC(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644
index 0000000..1c03e4a
--- /dev/null
+++ b/arch/x86/kernel/vm86_32.c
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
+ * stack - Manfred Spraul <manfred@colorfullife.com>
+ *
+ * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
+ * them correctly. Now the emulation will be in a
+ * consistent state after stackfaults - Kasper Dupont
+ * <kasperd@daimi.au.dk>
+ *
+ * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
+ * <kasperd@daimi.au.dk>
+ *
+ * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
+ * caused by Kasper Dupont's changes - Stas Sergeev
+ *
+ * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
+ * Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
+ * Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ * 9 apr 2002 - Changed stack access macros to jump to a label
+ * instead of returning to userspace. This simplifies
+ * do_int, and is needed by handle_vm86_fault. Kasper
+ * Dupont <kasperd@daimi.au.dk>
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/audit.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+
+#include <linux/uaccess.h>
+#include <asm/io.h>
+#include <asm/tlbflush.h>
+#include <asm/irq.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
+
+/*
+ * Known problems:
+ *
+ * Interrupt handling is not guaranteed:
+ * - a real x86 will disable all interrupts for one instruction
+ * after a "mov ss,xx" to make stack handling atomic even without
+ * the 'lss' instruction. We can't guarantee this in v86 mode,
+ * as the next instruction might result in a page fault or similar.
+ * - a real x86 will have interrupts disabled for one instruction
+ * past the 'sti' that enables them. We don't bother with all the
+ * details yet.
+ *
+ * Let's hope these problems do not actually matter for anything.
+ */
+
+
+/*
+ * 8- and 16-bit register defines..
+ */
+/*
+ * AL/AH address bytes 0 and 1 of the saved ax; IP/SP address the
+ * unsigned short stored at the start of the saved ip/sp. All four
+ * expand to lvalues.
+ */
+#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0])
+#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1])
+#define IP(regs) (*(unsigned short *)&((regs)->pt.ip))
+#define SP(regs) (*(unsigned short *)&((regs)->pt.sp))
+
+/*
+ * virtual flags (16 and 32-bit versions)
+ *
+ * Both read current->thread.vm86, so they may only be used once that
+ * pointer is known to be non-NULL.
+ */
+#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS (current->thread.vm86->veflags)
+
+/*
+ * Replace the bits of X selected by mask with the corresponding bits of
+ * new; bits outside mask are left as they were. X is evaluated twice.
+ */
+#define set_flags(X, new, mask) \
+((X) = ((X) & ~(mask)) | ((new) & (mask)))
+
+/*
+ * SAFE_MASK: flag bits that are taken from the vm86 side (used in
+ * do_sys_vm86() and set_vflags_*()). RETURN_MASK: flag bits of the real
+ * flags register reported by get_vflags().
+ * NOTE(review): 0xDD5 appears to be CF|PF|AF|ZF|SF|TF|DF|OF - confirm
+ * against the EFLAGS layout.
+ */
+#define SAFE_MASK (0xDD5)
+#define RETURN_MASK (0xDFF)
+
+/*
+ * save_v86_state - copy the vm86 register state out and restore 32-bit state
+ * @regs:   the vm86 register frame to save
+ * @retval: value stored in the ax slot of the restored frame
+ *
+ * Writes the registers in @regs and the screen bitmap to the user
+ * structure recorded in vm86->user_vm86, restores sp0 and sysenter_cs,
+ * and then overwrites @regs->pt with the register set saved in
+ * vm86->regs32.
+ *
+ * If there is no vm86 state or user structure, or the user structure
+ * cannot be written, the task is terminated with do_exit(SIGSEGV).
+ */
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
+{
+	struct task_struct *tsk = current;
+	struct vm86plus_struct __user *user;
+	struct vm86 *vm86 = current->thread.vm86;
+	long err = 0;
+
+	/*
+	 * This gets called from entry.S with interrupts disabled, but
+	 * from process context. Enable interrupts here, before trying
+	 * to access user space.
+	 */
+	local_irq_enable();
+
+	if (!vm86 || !vm86->user_vm86) {
+		pr_alert("no user_vm86: BAD\n");
+		do_exit(SIGSEGV);
+	}
+	/* Fold the virtual flags into the flags image handed back to userspace. */
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+	user = vm86->user_vm86;
+
+	/* The put_user_ex() calls below rely on this range check. */
+	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct))) {
+		pr_alert("could not access userspace vm86 info\n");
+		do_exit(SIGSEGV);
+	}
+
+	put_user_try {
+		put_user_ex(regs->pt.bx, &user->regs.ebx);
+		put_user_ex(regs->pt.cx, &user->regs.ecx);
+		put_user_ex(regs->pt.dx, &user->regs.edx);
+		put_user_ex(regs->pt.si, &user->regs.esi);
+		put_user_ex(regs->pt.di, &user->regs.edi);
+		put_user_ex(regs->pt.bp, &user->regs.ebp);
+		put_user_ex(regs->pt.ax, &user->regs.eax);
+		put_user_ex(regs->pt.ip, &user->regs.eip);
+		put_user_ex(regs->pt.cs, &user->regs.cs);
+		put_user_ex(regs->pt.flags, &user->regs.eflags);
+		put_user_ex(regs->pt.sp, &user->regs.esp);
+		put_user_ex(regs->pt.ss, &user->regs.ss);
+		put_user_ex(regs->es, &user->regs.es);
+		put_user_ex(regs->ds, &user->regs.ds);
+		put_user_ex(regs->fs, &user->regs.fs);
+		put_user_ex(regs->gs, &user->regs.gs);
+
+		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
+	} put_user_catch(err);
+	if (err) {
+		pr_alert("could not access userspace vm86 info\n");
+		do_exit(SIGSEGV);
+	}
+
+	/* Undo the sp0/sysenter_cs changes made on entry; saved_sp0 == 0 marks "not in vm86". */
+	preempt_disable();
+	tsk->thread.sp0 = vm86->saved_sp0;
+	tsk->thread.sysenter_cs = __KERNEL_CS;
+	update_task_stack(tsk);
+	refresh_sysenter_cs(&tsk->thread);
+	vm86->saved_sp0 = 0;
+	preempt_enable();
+
+	/* Switch the frame back to the 32-bit register set saved on entry. */
+	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
+
+	lazy_load_gs(vm86->regs32.gs);
+
+	regs->pt.ax = retval;
+}
+
+/*
+ * Write-protect every present PTE among the 32 pages starting at
+ * address 0xA0000 in @mm, then flush the TLB for that range.
+ *
+ * The page-table walk is done with mmap_sem held for writing. If any
+ * level of the walk is missing (or bad), the PTE loop is skipped, but
+ * the TLB flush at the end is still performed.
+ */
+static void mark_screen_rdonly(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i;
+
+ down_write(&mm->mmap_sem);
+ pgd = pgd_offset(mm, 0xA0000);
+ if (pgd_none_or_clear_bad(pgd))
+ goto out;
+ p4d = p4d_offset(pgd, 0xA0000);
+ if (p4d_none_or_clear_bad(p4d))
+ goto out;
+ pud = pud_offset(p4d, 0xA0000);
+ if (pud_none_or_clear_bad(pud))
+ goto out;
+ pmd = pmd_offset(pud, 0xA0000);
+
+ /* A huge mapping has no PTEs to edit, so split it before descending. */
+ if (pmd_trans_huge(*pmd)) {
+ vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
+ if (pmd_none_or_clear_bad(pmd))
+ goto out;
+ pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
+ for (i = 0; i < 32; i++) {
+ if (pte_present(*pte))
+ set_pte(pte, pte_wrprotect(*pte));
+ pte++;
+ }
+ /* NOTE(review): pte has been advanced 32 entries past the mapped one here. */
+ pte_unmap_unlock(pte, ptl);
+out:
+ up_write(&mm->mmap_sem);
+ flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
+}
+
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
+
+/* vm86old(2): enter vm86 mode using the plain (non-"plus") user structure. */
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
+{
+	struct vm86plus_struct __user *as_plus;
+
+	as_plus = (struct vm86plus_struct __user *) user_vm86;
+	return do_sys_vm86(as_plus, false);
+}
+
+
+/*
+ * vm86(2): IRQ subfunctions are forwarded to do_vm86_irq_handling(), the
+ * install check returns 0, and every other command enters vm86 mode with
+ * @arg interpreted as a pointer to the user's vm86plus_struct.
+ */
+SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
+{
+	if (cmd == VM86_REQUEST_IRQ || cmd == VM86_FREE_IRQ ||
+	    cmd == VM86_GET_IRQ_BITS || cmd == VM86_GET_AND_RESET_IRQ)
+		return do_vm86_irq_handling(cmd, (int)arg);
+
+	if (cmd == VM86_PLUS_INSTALL_CHECK) {
+		/*
+		 * NOTE: on old vm86 stuff this will return the error
+		 *  from access_ok(), because the subfunction is
+		 *  interpreted as (invalid) address to vm86_struct.
+		 *  So the installation check works.
+		 */
+		return 0;
+	}
+
+	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
+}
+
+
+/*
+ * do_sys_vm86 - common entry path for vm86old(2) and vm86(2)
+ * @user_vm86: user pointer to the vm86 (or vm86plus) structure
+ * @plus:      true when the caller passed a struct vm86plus_struct
+ *
+ * Reads the requested vm86 register state from userspace, saves the
+ * current 32-bit register set in vm86->regs32, and replaces the task's
+ * pt_regs with the vm86 register set so that the return to userspace
+ * lands in vm86 mode.
+ *
+ * Returns the ax value of the new register set on success, -EPERM if
+ * mapping address 0 is not permitted or the task is already in vm86 mode
+ * (saved_sp0 != 0), -ENOMEM if the per-task state cannot be allocated,
+ * -EFAULT on a bad user pointer, or the error reported by the
+ * get_user_ex() block.
+ */
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
+{
+	struct task_struct *tsk = current;
+	struct vm86 *vm86 = tsk->thread.vm86;
+	struct kernel_vm86_regs vm86regs;
+	struct pt_regs *regs = current_pt_regs();
+	unsigned long err = 0;
+
+	err = security_mmap_addr(0);
+	if (err) {
+		/*
+		 * vm86 cannot virtualize the address space, so vm86 users
+		 * need to manage the low 1MB themselves using mmap.  Given
+		 * that BIOS places important data in the first page, vm86
+		 * is essentially useless if mmap_min_addr != 0.  DOSEMU,
+		 * for example, won't even bother trying to use vm86 if it
+		 * can't map a page at virtual address 0.
+		 *
+		 * To reduce the available kernel attack surface, simply
+		 * disallow vm86(old) for users who cannot mmap at va 0.
+		 *
+		 * The implementation of security_mmap_addr will allow
+		 * suitably privileged users to map va 0 even if
+		 * vm.mmap_min_addr is set above 0, and we want this
+		 * behavior for vm86 as well, as it ensures that legacy
+		 * tools like vbetool will not fail just because of
+		 * vm.mmap_min_addr.
+		 */
+		pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+			     current->comm, task_pid_nr(current),
+			     from_kuid_munged(&init_user_ns, current_uid()));
+		return -EPERM;
+	}
+
+	/* The per-task vm86 state is allocated lazily on first use. */
+	if (!vm86) {
+		if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+			return -ENOMEM;
+		tsk->thread.vm86 = vm86;
+	}
+	/* A non-zero saved_sp0 means vm86 mode has been entered and not yet left. */
+	if (vm86->saved_sp0)
+		return -EPERM;
+
+	/*
+	 * Check the range that is actually read below: the "plus" variant
+	 * also reads the trailing vm86plus member. This matches the size
+	 * selection in save_v86_state().
+	 */
+	if (!access_ok(VERIFY_READ, user_vm86, plus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct)))
+		return -EFAULT;
+
+	memset(&vm86regs, 0, sizeof(vm86regs));
+	get_user_try {
+		unsigned short seg;
+		get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+		get_user_ex(seg, &user_vm86->regs.cs);
+		vm86regs.pt.cs = seg;
+		get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+		get_user_ex(seg, &user_vm86->regs.ss);
+		vm86regs.pt.ss = seg;
+		get_user_ex(vm86regs.es, &user_vm86->regs.es);
+		get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+		get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+		get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+		get_user_ex(vm86->flags, &user_vm86->flags);
+		get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
+	} get_user_catch(err);
+	if (err)
+		return err;
+
+	if (copy_from_user(&vm86->int_revectored,
+			   &user_vm86->int_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (copy_from_user(&vm86->int21_revectored,
+			   &user_vm86->int21_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (plus) {
+		if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+				   sizeof(struct vm86plus_info_struct)))
+			return -EFAULT;
+		vm86->vm86plus.is_vm86pus = 1;
+	} else
+		memset(&vm86->vm86plus, 0,
+		       sizeof(struct vm86plus_info_struct));
+
+	/* Remember the 32-bit register set; save_v86_state() restores it. */
+	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+	vm86->user_vm86 = user_vm86;
+
+/*
+ * The flags register is also special: we cannot trust that the user
+ * has set it up safely, so this makes sure interrupt etc flags are
+ * inherited from protected mode.
+ */
+	VEFLAGS = vm86regs.pt.flags;
+	vm86regs.pt.flags &= SAFE_MASK;
+	vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= X86_VM_MASK;
+
+	vm86regs.pt.orig_ax = regs->orig_ax;
+
+	/* Select which virtual flag bits exist for the CPU type requested by the user. */
+	switch (vm86->cpu_type) {
+	case CPU_286:
+		vm86->veflags_mask = 0;
+		break;
+	case CPU_386:
+		vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		break;
+	case CPU_486:
+		vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		break;
+	default:
+		vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		break;
+	}
+
+/*
+ * Save old state
+ */
+	vm86->saved_sp0 = tsk->thread.sp0;
+	lazy_save_gs(vm86->regs32.gs);
+
+	/* make room for real-mode segments */
+	preempt_disable();
+	tsk->thread.sp0 += 16;
+
+	if (static_cpu_has(X86_FEATURE_SEP)) {
+		tsk->thread.sysenter_cs = 0;
+		refresh_sysenter_cs(&tsk->thread);
+	}
+
+	update_task_stack(tsk);
+	preempt_enable();
+
+	if (vm86->flags & VM86_SCREEN_BITMAP)
+		mark_screen_rdonly(tsk->mm);
+
+	/* Install the vm86 register set in place of the 32-bit one. */
+	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+	force_iret();
+	return regs->ax;
+}
+
+/* Set the virtual interrupt flag in the task's virtual flags; @regs is unused. */
+static inline void set_IF(struct kernel_vm86_regs *regs)
+{
+	VEFLAGS = VEFLAGS | X86_EFLAGS_VIF;
+}
+
+/* Clear the virtual interrupt flag in the task's virtual flags; @regs is unused. */
+static inline void clear_IF(struct kernel_vm86_regs *regs)
+{
+	VEFLAGS = VEFLAGS & ~X86_EFLAGS_VIF;
+}
+
+/* Clear the trap flag in the saved flags of @regs. */
+static inline void clear_TF(struct kernel_vm86_regs *regs)
+{
+	regs->pt.flags = regs->pt.flags & ~X86_EFLAGS_TF;
+}
+
+/* Clear the alignment-check flag in the saved flags of @regs. */
+static inline void clear_AC(struct kernel_vm86_regs *regs)
+{
+	regs->pt.flags = regs->pt.flags & ~X86_EFLAGS_AC;
+}
+
+/*
+ * It is correct to call set_IF(regs) from the set_vflags_*
+ * functions. However someone forgot to call clear_IF(regs)
+ * in the opposite case.
+ * After the command sequence CLI PUSHF STI POPF you should
+ * end up with interrupts disabled, but you ended up with
+ * interrupts enabled.
+ * ( I was testing my own changes, but the only bug I
+ * could find was in a function I had not changed. )
+ * [KD]
+ */
+
+/*
+ * Load a 32-bit flags image: the veflags_mask bits go to the virtual
+ * flags, the SAFE_MASK bits go to the saved flags, and the virtual
+ * interrupt flag follows IF of the new image.
+ */
+static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
+{
+	set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
+	set_flags(regs->pt.flags, flags, SAFE_MASK);
+
+	if (!(flags & X86_EFLAGS_IF))
+		clear_IF(regs);
+	else
+		set_IF(regs);
+}
+
+/*
+ * 16-bit counterpart of set_vflags_long(): only the low 16 bits of the
+ * virtual flags (VFLAGS) are updated from the new image.
+ */
+static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
+{
+	set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
+	set_flags(regs->pt.flags, flags, SAFE_MASK);
+
+	if (!(flags & X86_EFLAGS_IF))
+		clear_IF(regs);
+	else
+		set_IF(regs);
+}
+
+/*
+ * Build the flags image the vm86 program sees: the RETURN_MASK bits of
+ * the saved flags, IOPL always set, IF mirroring the virtual interrupt
+ * flag, plus the virtual flag bits selected by veflags_mask.
+ */
+static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
+{
+	unsigned long image = (regs->pt.flags & RETURN_MASK) | X86_EFLAGS_IOPL;
+
+	if (VEFLAGS & X86_EFLAGS_VIF)
+		image |= X86_EFLAGS_IF;
+
+	image |= VEFLAGS & current->thread.vm86->veflags_mask;
+	return image;
+}
+
+/* Non-zero when bit @nr is set in the revectoring bitmap. */
+static inline int is_revectored(int nr, struct revectored_struct *bitmap)
+{
+	int hit = test_bit(nr, bitmap->__map);
+
+	return hit;
+}
+
+/* Byte n of the object val, as an lvalue. val must itself be an lvalue. */
+#define val_byte(val, n) (((__u8 *)&val)[n])
+
+/*
+ * Stack access helpers for the emulated 16-bit stack.
+ *
+ * base is a user pointer and ptr an integer offset that the macros
+ * modify in place: push* decrement ptr before each byte is stored,
+ * pop* increment it after each byte is loaded. Every access is done
+ * one byte at a time; if any of them fails, control jumps to err_label
+ * with ptr left at whatever value it had reached.
+ */
+
+/* Push one byte. */
+#define pushb(base, ptr, val, err_label) \
+ do { \
+ __u8 __val = val; \
+ ptr--; \
+ if (put_user(__val, base + ptr) < 0) \
+ goto err_label; \
+ } while (0)
+
+/* Push a 16-bit value, byte 1 first so that byte 0 ends up at the lower address. */
+#define pushw(base, ptr, val, err_label) \
+ do { \
+ __u16 __val = val; \
+ ptr--; \
+ if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+ goto err_label; \
+ } while (0)
+
+/* Push a 32-bit value, byte 3 first so that byte 0 ends up at the lowest address. */
+#define pushl(base, ptr, val, err_label) \
+ do { \
+ __u32 __val = val; \
+ ptr--; \
+ if (put_user(val_byte(__val, 3), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 2), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+ goto err_label; \
+ } while (0)
+
+/* Pop one byte; evaluates to the byte read. */
+#define popb(base, ptr, err_label) \
+ ({ \
+ __u8 __res; \
+ if (get_user(__res, base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+/* Pop a 16-bit value, byte 0 first; evaluates to the value read. */
+#define popw(base, ptr, err_label) \
+ ({ \
+ __u16 __res; \
+ if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+/* Pop a 32-bit value, byte 0 first; evaluates to the value read. */
+#define popl(base, ptr, err_label) \
+ ({ \
+ __u32 __res; \
+ if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 2), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 3), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+/* There are so many possible reasons for this function to return
+ * VM86_INTx, so adding another doesn't bother me. We can expect
+ * userspace programs to be able to handle it. (Getting a problem
+ * in userspace is always better than an Oops anyway.) [KD]
+ */
+/*
+ * Emulate a software interrupt @i for the vm86 task.
+ *
+ * @ssp/@sp give the base and offset of the vm86 stack. On success the
+ * flags, cs and ip are pushed on that stack, cs:ip is loaded from the
+ * vector read at user address (i << 2), and TF, the virtual IF and AC
+ * are cleared. In every case that cannot be emulated here, vm86 mode is
+ * left via save_v86_state() with VM86_INTx + (i << 8).
+ */
+static void do_int(struct kernel_vm86_regs *regs, int i,
+ unsigned char __user *ssp, unsigned short sp)
+{
+ unsigned long __user *intr_ptr;
+ unsigned long segoffs;
+ struct vm86 *vm86 = current->thread.vm86;
+
+ if (regs->pt.cs == BIOSSEG)
+ goto cannot_handle;
+ if (is_revectored(i, &vm86->int_revectored))
+ goto cannot_handle;
+ /* int 0x21 has a second bitmap, indexed by the function number in AH. */
+ if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
+ goto cannot_handle;
+ /* Each vector is 4 bytes: offset in the low 16 bits, segment in the high 16. */
+ intr_ptr = (unsigned long __user *) (i << 2);
+ if (get_user(segoffs, intr_ptr))
+ goto cannot_handle;
+ if ((segoffs >> 16) == BIOSSEG)
+ goto cannot_handle;
+ /* pushw() works on the local copy of sp; SP(regs) is adjusted once below. */
+ pushw(ssp, sp, get_vflags(regs), cannot_handle);
+ pushw(ssp, sp, regs->pt.cs, cannot_handle);
+ pushw(ssp, sp, IP(regs), cannot_handle);
+ regs->pt.cs = segoffs >> 16;
+ SP(regs) -= 6;
+ IP(regs) = segoffs & 0xffff;
+ clear_TF(regs);
+ clear_IF(regs);
+ clear_AC(regs);
+ return;
+
+cannot_handle:
+ save_v86_state(regs, VM86_INTx + (i << 8));
+}
+
+/*
+ * Handle trap @trapno raised while in vm86 mode.
+ *
+ * Returns 0 when the trap was dealt with here and 1 when the caller has
+ * to handle it. Without vm86plus only trap 1 is handled, by sending
+ * SIGTRAP. With vm86plus, traps 1 and 3 leave vm86 mode with VM86_TRAP
+ * and every other trap is reflected into the vm86 task through do_int().
+ */
+int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
+{
+	struct vm86 *vm86 = current->thread.vm86;
+
+	if (!vm86->vm86plus.is_vm86pus) {
+		if (trapno != 1)
+			return 1; /* we let this handle by the calling routine */
+		current->thread.trap_nr = trapno;
+		current->thread.error_code = error_code;
+		force_sig(SIGTRAP, current);
+		return 0;
+	}
+
+	if (trapno == 1 || trapno == 3)
+		save_v86_state(regs, VM86_TRAP + (trapno << 8));
+	else
+		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
+
+	return 0;
+}
+
+/*
+ * handle_vm86_fault - emulate the instruction that faulted in vm86 mode
+ * @regs:       vm86 register frame at the time of the fault
+ * @error_code: fault error code (not used by this function)
+ *
+ * Decodes the instruction at cs:ip, skipping any prefix bytes, and
+ * emulates pushf, popf, int xx, iret, cli and sti against the virtual
+ * flags. Any other opcode, and any failed access to the vm86 code or
+ * stack, leaves vm86 mode via save_v86_state() with VM86_UNKNOWN.
+ */
+void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
+{
+	unsigned char opcode;
+	unsigned char __user *csp;
+	unsigned char __user *ssp;
+	unsigned short ip, sp, orig_flags;
+	int data32, pref_done;
+	struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
+
+#define CHECK_IF_IN_TRAP \
+	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
+		newflags |= X86_EFLAGS_TF
+
+	/* Low 16 bits of the flags as they were before any emulation. */
+	orig_flags = *(unsigned short *)&regs->pt.flags;
+
+	/* Real-mode style addressing: linear base = segment << 4. */
+	csp = (unsigned char __user *) (regs->pt.cs << 4);
+	ssp = (unsigned char __user *) (regs->pt.ss << 4);
+	sp = SP(regs);
+	ip = IP(regs);
+
+	/* Consume prefix bytes; only the operand-size prefix changes the emulation. */
+	data32 = 0;
+	pref_done = 0;
+	do {
+		switch (opcode = popb(csp, ip, simulate_sigsegv)) {
+		case 0x66:      /* 32-bit data */     data32 = 1; break;
+		case 0x67:      /* 32-bit address */  break;
+		case 0x2e:      /* CS */              break;
+		case 0x3e:      /* DS */              break;
+		case 0x26:      /* ES */              break;
+		case 0x36:      /* SS */              break;
+		case 0x65:      /* GS */              break;
+		case 0x64:      /* FS */              break;
+		case 0xf2:      /* repnz */       break;
+		case 0xf3:      /* rep */             break;
+		default: pref_done = 1;
+		}
+	} while (!pref_done);
+
+	switch (opcode) {
+
+	/* pushf */
+	case 0x9c:
+		if (data32) {
+			pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
+			SP(regs) -= 4;
+		} else {
+			pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
+			SP(regs) -= 2;
+		}
+		IP(regs) = ip;
+		goto vm86_fault_return;
+
+	/* popf */
+	case 0x9d:
+		{
+		unsigned long newflags;
+		if (data32) {
+			newflags = popl(ssp, sp, simulate_sigsegv);
+			SP(regs) += 4;
+		} else {
+			newflags = popw(ssp, sp, simulate_sigsegv);
+			SP(regs) += 2;
+		}
+		IP(regs) = ip;
+		CHECK_IF_IN_TRAP;
+		if (data32)
+			set_vflags_long(newflags, regs);
+		else
+			set_vflags_short(newflags, regs);
+
+		goto check_vip;
+		}
+
+	/* int xx */
+	case 0xcd: {
+		int intno = popb(csp, ip, simulate_sigsegv);
+		IP(regs) = ip;
+		/* The debugger can claim individual interrupts through a per-vector bitmap. */
+		if (vmpi->vm86dbg_active) {
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+				save_v86_state(regs, VM86_INTx + (intno << 8));
+				return;
+			}
+		}
+		do_int(regs, intno, ssp, sp);
+		return;
+	}
+
+	/* iret */
+	case 0xcf:
+		{
+		unsigned long newip;
+		unsigned long newcs;
+		unsigned long newflags;
+		if (data32) {
+			newip = popl(ssp, sp, simulate_sigsegv);
+			newcs = popl(ssp, sp, simulate_sigsegv);
+			newflags = popl(ssp, sp, simulate_sigsegv);
+			SP(regs) += 12;
+		} else {
+			newip = popw(ssp, sp, simulate_sigsegv);
+			newcs = popw(ssp, sp, simulate_sigsegv);
+			newflags = popw(ssp, sp, simulate_sigsegv);
+			SP(regs) += 6;
+		}
+		IP(regs) = newip;
+		regs->pt.cs = newcs;
+		CHECK_IF_IN_TRAP;
+		if (data32) {
+			set_vflags_long(newflags, regs);
+		} else {
+			set_vflags_short(newflags, regs);
+		}
+		goto check_vip;
+		}
+
+	/* cli */
+	case 0xfa:
+		IP(regs) = ip;
+		clear_IF(regs);
+		goto vm86_fault_return;
+
+	/* sti */
+	/*
+	 * Damn. This is incorrect: the 'sti' instruction should actually
+	 * enable interrupts after the /next/ instruction. Not good.
+	 *
+	 * Probably needs some horsing around with the TF flag. Aiee..
+	 */
+	case 0xfb:
+		IP(regs) = ip;
+		set_IF(regs);
+		goto check_vip;
+
+	default:
+		save_v86_state(regs, VM86_UNKNOWN);
+	}
+
+	return;
+
+check_vip:
+	/* Virtual interrupts are enabled and one is pending: return to the monitor. */
+	if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) ==
+	    (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) {
+		save_v86_state(regs, VM86_STI);
+		return;
+	}
+
+vm86_fault_return:
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+		save_v86_state(regs, VM86_PICRETURN);
+		return;
+	}
+	/* TF was set before the emulated instruction: deliver the debug trap now. */
+	if (orig_flags & X86_EFLAGS_TF)
+		handle_vm86_trap(regs, 0, X86_TRAP_DB);
+	return;
+
+simulate_sigsegv:
+	/* FIXME: After a long discussion with Stas we finally
+	 *        agreed, that this is wrong. Here we should
+	 *        really send a SIGSEGV to the user program.
+	 *        But how do we create the correct context? We
+	 *        are inside a general protection fault handler
+	 *        and has just returned from a page fault handler.
+	 *        The correct context for the signal handler
+	 *        should be a mixture of the two, but how do we
+	 *        get the information? [KD]
+	 */
+	save_v86_state(regs, VM86_UNKNOWN);
+}
+
+/* ---------------- vm86 special IRQ passing stuff ----------------- */
+
+/* Name passed to request_irq() for IRQs claimed on behalf of vm86 tasks. */
+#define VM86_IRQNAME "vm86irq"
+
+/*
+ * Per-IRQ ownership table, indexed by IRQ number: the task that
+ * requested the IRQ (NULL when unclaimed) and the signal to send it
+ * when the IRQ fires (0 = no signal).
+ */
+static struct vm86_irqs {
+ struct task_struct *tsk;
+ int sig;
+} vm86_irqs[16];
+
+/* irqbits has one bit per IRQ that has fired and not yet been collected. */
+static DEFINE_SPINLOCK(irqbits_lock);
+static int irqbits;
+
+/* Bitmask, indexed by signal number, of the signals a task may request. */
+#define ALLOWED_SIGS (1 /* 0 = don't send a signal */ \
+ | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
+ | (1 << SIGUNUSED))
+
+/*
+ * Interrupt handler for IRQs claimed by a vm86 task. If the IRQ is owned
+ * and not already pending, mark it pending in irqbits, signal the owner
+ * (when a signal was requested) and disable the IRQ line. Returns
+ * IRQ_HANDLED in that case and IRQ_NONE otherwise.
+ */
+static irqreturn_t irq_handler(int intno, void *dev_id)
+{
+	irqreturn_t rc = IRQ_NONE;
+	unsigned long flags;
+	int mask = 1 << intno;
+
+	spin_lock_irqsave(&irqbits_lock, flags);
+	if (!(irqbits & mask) && vm86_irqs[intno].tsk) {
+		irqbits |= mask;
+		if (vm86_irqs[intno].sig)
+			send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
+		/*
+		 * IRQ will be re-enabled when user asks for the irq (whether
+		 * polling or as a result of the signal)
+		 */
+		disable_irq_nosync(intno);
+		rc = IRQ_HANDLED;
+	}
+	spin_unlock_irqrestore(&irqbits_lock, flags);
+
+	return rc;
+}
+
+/* Release @irqnumber: free the IRQ, drop its owner and clear its pending bit. */
+static inline void free_vm86_irq(int irqnumber)
+{
+	const int mask = 1 << irqnumber;
+	unsigned long lockflags;
+
+	free_irq(irqnumber, NULL);
+	vm86_irqs[irqnumber].tsk = NULL;
+
+	spin_lock_irqsave(&irqbits_lock, lockflags);
+	irqbits &= ~mask;
+	spin_unlock_irqrestore(&irqbits_lock, lockflags);
+}
+
+/* Free every vm86 IRQ currently owned by @task. */
+void release_vm86_irqs(struct task_struct *task)
+{
+	int irq = FIRST_VM86_IRQ;
+
+	while (irq <= LAST_VM86_IRQ) {
+		if (vm86_irqs[irq].tsk == task)
+			free_vm86_irq(irq);
+		irq++;
+	}
+}
+
+/*
+ * Collect a pending IRQ for the current task. Returns 1 and re-enables
+ * the IRQ line if @irqnumber was pending; returns 0 if it was not
+ * pending, is not a valid vm86 IRQ, or is not owned by current.
+ */
+static inline int get_and_reset_irq(int irqnumber)
+{
+	unsigned long flags;
+	int pending;
+
+	if (invalid_vm86_irq(irqnumber) || vm86_irqs[irqnumber].tsk != current)
+		return 0;
+
+	spin_lock_irqsave(&irqbits_lock, flags);
+	pending = irqbits & (1 << irqnumber);
+	irqbits &= ~pending;
+	if (pending)
+		enable_irq(irqnumber);
+	spin_unlock_irqrestore(&irqbits_lock, flags);
+
+	return pending ? 1 : 0;
+}
+
+
+/*
+ * Dispatch the IRQ-related vm86(2) subfunctions.
+ *
+ * For VM86_REQUEST_IRQ, @irqnumber carries the IRQ in its low 8 bits and
+ * the signal to deliver in the bits above; the IRQ number is returned on
+ * success. Unknown subfunctions return -EINVAL.
+ */
+static int do_vm86_irq_handling(int subfunction, int irqnumber)
+{
+	int sig, irq, ret;
+
+	switch (subfunction) {
+	case VM86_GET_AND_RESET_IRQ:
+		return get_and_reset_irq(irqnumber);
+
+	case VM86_GET_IRQ_BITS:
+		return irqbits;
+
+	case VM86_REQUEST_IRQ:
+		sig = irqnumber >> 8;
+		irq = irqnumber & 255;
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if (!((1 << sig) & ALLOWED_SIGS))
+			return -EPERM;
+		if (invalid_vm86_irq(irq))
+			return -EPERM;
+		/* Already claimed by some task. */
+		if (vm86_irqs[irq].tsk)
+			return -EPERM;
+		ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
+		if (ret)
+			return ret;
+		vm86_irqs[irq].sig = sig;
+		vm86_irqs[irq].tsk = current;
+		return irq;
+
+	case VM86_FREE_IRQ:
+		if (invalid_vm86_irq(irqnumber))
+			return -EPERM;
+		/* Freeing an unclaimed IRQ is a no-op. */
+		if (!vm86_irqs[irqnumber].tsk)
+			return 0;
+		if (vm86_irqs[irqnumber].tsk != current)
+			return -EPERM;
+		free_vm86_irq(irqnumber);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644
index 0000000..5dd3317
--- /dev/null
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -0,0 +1,437 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define an absolute symbol unless you know that its value should
+ * remain constant even if the kernel image is relocated at run time:
+ * absolute symbols are not relocated. If a symbol's value should change
+ * when the kernel is relocated, make the symbol section-relative by
+ * putting it inside the section definition.
+ */
+
+#ifdef CONFIG_X86_32
+#define LOAD_OFFSET __PAGE_OFFSET
+#else
+#define LOAD_OFFSET __START_KERNEL_map
+#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386 /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+#if defined(CONFIG_X86_64)
+/*
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
+ */
+#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(HPAGE_SIZE); \
+ __end_rodata_hpage_align = .; \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+
+#else
+
+#define X86_ALIGN_RODATA_BEGIN
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(PAGE_SIZE); \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
+
+#endif
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(6); /* RW_ */
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(6); /* RW_ */
+#endif
+ init PT_LOAD FLAGS(7); /* RWE */
+#endif
+ note PT_NOTE FLAGS(0); /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
+#else
+ . = __START_KERNEL;
+ phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
+#endif
+
+ /* Text and read-only data */
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
+ _text = .;
+ _stext = .;
+ /* bootstrapping code */
+ HEAD_TEXT
+ TEXT_TEXT
+ SCHED_TEXT
+ CPUIDLE_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ ALIGN_ENTRY_TEXT_BEGIN
+ ENTRY_TEXT
+ IRQENTRY_TEXT
+ ALIGN_ENTRY_TEXT_END
+ SOFTIRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ __entry_trampoline_start = .;
+ _entry_trampoline = .;
+ *(.entry_trampoline)
+ . = ALIGN(PAGE_SIZE);
+ __entry_trampoline_end = .;
+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
+#ifdef CONFIG_RETPOLINE
+ __indirect_thunk_start = .;
+ *(.text.__x86.indirect_thunk)
+ __indirect_thunk_end = .;
+#endif
+
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
+
+ NOTES :text :note
+
+ EXCEPTION_TABLE(16) :text = 0x9090
+
+ /* .text should occupy whole number of pages */
+ . = ALIGN(PAGE_SIZE);
+ X86_ALIGN_RODATA_BEGIN
+ RO_DATA(PAGE_SIZE)
+ X86_ALIGN_RODATA_END
+
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
+ /* Start of data section */
+ _sdata = .;
+
+ /* init_task */
+ INIT_TASK_DATA(THREAD_SIZE)
+
+#ifdef CONFIG_X86_32
+ /* 32 bit has nosave before _edata */
+ NOSAVE_DATA
+#endif
+
+ PAGE_ALIGNED_DATA(PAGE_SIZE)
+
+ CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
+
+ DATA_DATA
+ CONSTRUCTORS
+
+ /* rarely changed data like cpu maps */
+ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
+
+ /* End of data section */
+ _edata = .;
+ } :data
+
+ BUG_TABLE
+
+ ORC_UNWIND_TABLE
+
+ . = ALIGN(PAGE_SIZE);
+ __vvar_page = .;
+
+ .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+ /* work around gold bug 13023 */
+ __vvar_beginning_hack = .;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) \
+ . = __vvar_beginning_hack + offset; \
+ *(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ /*
+ * Pad the rest of the page with zeros. Otherwise the loader
+ * can leave garbage here.
+ */
+ . = __vvar_beginning_hack + PAGE_SIZE;
+ } :data
+
+ . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
+ }
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - .init.text - should
+ * start another segment - init.
+ */
+ PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
+ ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
+ "per-CPU data too large - increase CONFIG_PHYSICAL_START")
+#endif
+
+ INIT_TEXT_SECTION(PAGE_SIZE)
+#ifdef CONFIG_X86_64
+ :init
+#endif
+
+ /*
+ * Section for code used exclusively before alternatives are run. All
+ * references to such code must be patched out by alternatives, normally
+ * by using X86_FEATURE_ALWAYS CPU feature bit.
+ *
+ * See static_cpu_has() for an example.
+ */
+ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
+ *(.altinstr_aux)
+ }
+
+ INIT_DATA_SECTION(16)
+
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
+ }
+
+#ifdef CONFIG_X86_INTEL_MID
+ .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+ LOAD_OFFSET) {
+ __x86_intel_mid_dev_start = .;
+ *(.x86_intel_mid_dev.init)
+ __x86_intel_mid_dev_end = .;
+ }
+#endif
+
+ /*
+ * start address and size of operations which during runtime
+ * can be patched with virtualization friendly instructions or
+ * baremetal native ones. Think page table operations.
+ * Details in paravirt_types.h
+ */
+ . = ALIGN(8);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
+
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ */
+ . = ALIGN(8);
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ }
+
+ /*
+ * And here are the replacement instructions. The linker emits them
+ * as binary blobs; .altinstructions holds enough data to find their
+ * address and length so that the kernel can be patched safely.
+ */
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
+
+ /*
+ * struct iommu_table_entry entries are injected in this section.
+ * It is an array of IOMMUs which during run time gets sorted depending
+ * on its dependency order. After rootfs_initcall is complete
+ * this section can be safely removed.
+ */
+ .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
+ __iommu_table = .;
+ *(.iommu_table)
+ __iommu_table_end = .;
+ }
+
+ . = ALIGN(8);
+ .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+ __apicdrivers = .;
+ *(.apicdrivers);
+ __apicdrivers_end = .;
+ }
+
+ . = ALIGN(8);
+ /*
+ * .exit.text is discarded at runtime, not link time, to deal with
+ * references from .altinstructions and .eh_frame
+ */
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ EXIT_TEXT
+ }
+
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+ EXIT_DATA
+ }
+
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
+ PERCPU_SECTION(INTERNODE_CACHE_BYTES)
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+
+ /* freed after init ends here */
+ .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+ __init_end = .;
+ }
+
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ . = ALIGN(PAGE_SIZE);
+ __smp_locks_end = .;
+ }
+
+#ifdef CONFIG_X86_64
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ NOSAVE_DATA
+ }
+#endif
+
+ /* BSS */
+ . = ALIGN(PAGE_SIZE);
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __bss_start = .;
+ *(.bss..page_aligned)
+ *(.bss)
+ BSS_DECRYPTED
+ . = ALIGN(PAGE_SIZE);
+ __bss_stop = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ __brk_base = .;
+ . += 64 * 1024; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = .;
+ }
+
+ . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
+ _end = .;
+
+ STABS_DEBUG
+ DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
+ /DISCARD/ : {
+ *(.eh_frame)
+ }
+}
+
+
+#ifdef CONFIG_X86_32
+/*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+
+#ifdef CONFIG_SMP
+. = ASSERT((irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC_CORE
+#include <asm/kexec.h>
+
+. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big");
+#endif
+
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
new file mode 100644
index 0000000..44685fb
--- /dev/null
+++ b/arch/x86/kernel/vsmp_64.c
@@ -0,0 +1,226 @@
+/*
+ * vSMPowered(tm) systems specific initialization
+ * Copyright (C) 2005 ScaleMP Inc.
+ *
+ * Use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Ravikiran Thirumalai <kiran@scalemp.com>,
+ * Shai Fultheim <shai@scalemp.com>
+ * Paravirt ops integration: Glauber de Oliveira Costa <gcosta@redhat.com>,
+ * Ravikiran Thirumalai <kiran@scalemp.com>
+ */
+
+#include <linux/init.h>
+#include <linux/pci_ids.h>
+#include <linux/pci_regs.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/pci-direct.h>
+#include <asm/io.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
+/*
+ * Interrupt control on vSMPowered systems:
+ * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
+ * and vice versa.
+ */
+
+/* Read EFLAGS, reporting IF as clear whenever AC (the ~IF shadow) is set. */
+asmlinkage __visible unsigned long vsmp_save_fl(void)
+{
+	unsigned long eflags = native_save_fl();
+
+	/* Clearing an already-clear IF is a no-op, so only AC needs testing. */
+	return (eflags & X86_EFLAGS_AC) ? (eflags & ~X86_EFLAGS_IF) : eflags;
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
+
+/* Load @flags into EFLAGS after forcing AC to be the inverse of IF. */
+__visible void vsmp_restore_fl(unsigned long flags)
+{
+	unsigned long ac = (flags & X86_EFLAGS_IF) ? 0 : X86_EFLAGS_AC;
+
+	/* Drop the caller's AC bit and substitute the one derived from IF. */
+	native_restore_fl((flags & ~X86_EFLAGS_AC) | ac);
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
+
+/* Disable interrupts: clear IF and set its inverted shadow, AC. */
+asmlinkage __visible void vsmp_irq_disable(void)
+{
+	unsigned long eflags = native_save_fl() & ~X86_EFLAGS_IF;
+	native_restore_fl(eflags | X86_EFLAGS_AC);
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
+
+/* Enable interrupts: set IF and clear its inverted shadow, AC. */
+asmlinkage __visible void vsmp_irq_enable(void)
+{
+	unsigned long eflags = native_save_fl() | X86_EFLAGS_IF;
+	native_restore_fl(eflags & ~X86_EFLAGS_AC);
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
+
+/*
+ * Patch hook: the four IRQ-flag ops overridden in set_vsmp_pv_ops() go
+ * through paravirt_patch_default(); every other site gets native_patch().
+ */
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+				  unsigned long addr, unsigned len)
+{
+	if (type == PARAVIRT_PATCH(pv_irq_ops.irq_enable) ||
+	    type == PARAVIRT_PATCH(pv_irq_ops.irq_disable) ||
+	    type == PARAVIRT_PATCH(pv_irq_ops.save_fl) ||
+	    type == PARAVIRT_PATCH(pv_irq_ops.restore_fl))
+		return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+	return native_patch(type, clobbers, ibuf, addr, len);
+}
+
+/*
+ * Negotiate features through the vSMP control register; if the IRQ fastpath
+ * bit is offered, install the vsmp_* IRQ-flag ops defined above.
+ */
+static void __init set_vsmp_pv_ops(void)
+{
+	void __iomem *address;
+	unsigned int cap, ctl, cfg;
+
+	/* set vSMP magic bits to indicate vSMP capable kernel */
+	cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+	address = early_ioremap(cfg, 8);
+	if (WARN_ON(!address))	/* mapping failed: same guard as vsmp_cap_cpus() */
+		return;
+	cap = readl(address);
+	ctl = readl(address + 4);
+	pr_info("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl);
+	/* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+	if (cap & ctl & BIT(8)) {
+		ctl &= ~BIT(8);
+#ifdef CONFIG_PROC_FS
+		no_irq_affinity = 1;	/* no irq affinity changes via procfs */
+#endif
+	}
+#endif
+	if (cap & ctl & BIT(4)) {
+		/* Setup irq ops and turn on vSMP IRQ fastpath handling */
+		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+		pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
+		pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
+		pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
+		pv_init_ops.patch = vsmp_patch;
+		ctl &= ~BIT(4);
+	}
+	writel(ctl, address + 4);
+	ctl = readl(address + 4);
+	pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
+	early_iounmap(address, 8);
+}
+#else
+/* Stub for builds lacking CONFIG_PCI or CONFIG_PARAVIRT. */
+static void __init set_vsmp_pv_ops(void) { }
+
+#endif
+
+#ifdef CONFIG_PCI
+static int is_vsmp = -1;	/* -1: not probed yet, 0: no, 1: vSMP box */
+
+/* Record in is_vsmp whether PCI config space shows the vSMP CTL device. */
+static void __init detect_vsmp_box(void)
+{
+	const u32 vsmp_id = PCI_VENDOR_ID_SCALEMP |
+			    (PCI_DEVICE_ID_SCALEMP_VSMP_CTL << 16);
+
+	is_vsmp = 0;
+	if (!early_pci_allowed())
+		return;
+	if (read_pci_config(0, 0x1f, 0, PCI_VENDOR_ID) == vsmp_id)
+		is_vsmp = 1;
+}
+
+/*
+ * Result of detect_vsmp_box(); warns once and answers 0 if it has not run.
+ */
+static int is_vsmp_box(void)
+{
+	if (WARN_ON_ONCE(is_vsmp == -1))
+		return 0;
+	return is_vsmp;
+}
+
+#else
+/* Without CONFIG_PCI nothing is probed and no vSMP box is ever reported. */
+static void __init detect_vsmp_box(void) { }
+
+static int is_vsmp_box(void)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Without CONFIG_X86_VSMP, cap setup_max_cpus to the CPUs of the first
+ * board as read from the vSMP topology register, unless setup_max_cpus was
+ * already changed from NR_CPUS.  Empty on !CONFIG_SMP or VSMP-enabled builds.
+ */
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+	unsigned int bar, topo, shift, ncpus;
+	void __iomem *reg;
+
+	/* An explicit limit is already in place: leave it alone. */
+	if (setup_max_cpus != NR_CPUS)
+		return;
+
+	/* Read the vSMP Foundation topology register */
+	bar = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+	reg = early_ioremap(bar + TOPOLOGY_REGISTER_OFFSET, 4);
+	if (WARN_ON(!reg))
+		return;
+	topo = readl(reg);
+
+	/* Bits 18:16 hold the node shift; the value 0 is decoded as 8. */
+	shift = (topo >> 16) & 0x7;
+	if (shift == 0)
+		shift = 8;
+	ncpus = (topo & ((1 << shift) - 1)) + 1;
+	pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+		ncpus);
+	setup_max_cpus = ncpus;
+	early_iounmap(reg, 4);
+#endif
+}
+
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb; /* @initial_apic_id is not used */
+}
+
+static void vsmp_apic_post_init(void)
+{
+ /* need to update phys_pkg_id: use the hard_smp_processor_id() based helper above */
+ apic->phys_pkg_id = apicid_phys_pkg_id;
+}
+
+/*
+ * Probe for a vSMP box; only when one is found, hook the APIC post-init
+ * callback, cap the CPU count and set up the paravirt IRQ ops.
+ */
+void __init vsmp_init(void)
+{
+	detect_vsmp_box();
+	if (is_vsmp_box()) {
+		x86_platform.apic_post_init = vsmp_apic_post_init;
+		vsmp_cap_cpus();
+		set_vsmp_pv_ops();
+	}
+}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 0000000..2792b55
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+
+#include <asm/acpi.h>
+#include <asm/bios_ebda.h>
+#include <asm/paravirt.h>
+#include <asm/pci_x86.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820/api.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+#include <asm/pat.h>
+#include <asm/tsc.h>
+#include <asm/iommu.h>
+#include <asm/mach_traps.h>
+
+void x86_init_noop(void) { } /* shared default for void (*)(void) hooks */
+void __init x86_init_uint_noop(unsigned int unused) { } /* default .mpc_record */
+static int __init iommu_init_noop(void) { return 0; } /* default .iommu_init */
+static void iommu_shutdown_noop(void) { } /* default .iommu_shutdown */
+static bool __init bool_x86_init_noop(void) { return false; } /* default .x2apic_available */
+static void x86_op_int_noop(int cpu) { } /* default .hyper.pin_vcpu */
+static u64 u64_x86_init_noop(void) { return 0; } /* default .get_root_pointer */
+
+/*
+ * Boot-time platform hooks (__initdata), preset with the default
+ * implementations for standard PC hardware.
+ */
+struct x86_init_ops x86_init __initdata = {
+
+	.resources = {
+		.probe_roms		= probe_roms,
+		.reserve_resources	= reserve_standard_io_resources,
+		.memory_setup		= e820__memory_setup_default,
+	},
+
+	.mpparse = {
+		.mpc_record		= x86_init_uint_noop,
+		.setup_ioapic_ids	= x86_init_noop,
+		.mpc_apic_id		= default_mpc_apic_id,
+		.smp_read_mpc_oem	= default_smp_read_mpc_oem,
+		.mpc_oem_bus_info	= default_mpc_oem_bus_info,
+		.find_smp_config	= default_find_smp_config,
+		.get_smp_config		= default_get_smp_config,
+	},
+
+	.irqs = {
+		.pre_vector_init	= init_ISA_irqs,
+		.intr_init		= native_init_IRQ,
+		.trap_init		= x86_init_noop,
+		.intr_mode_init		= apic_intr_mode_init,
+	},
+
+	.oem = {
+		.arch_setup		= x86_init_noop,
+		.banner			= default_banner,
+	},
+
+	.paging = {
+		.pagetable_init		= native_pagetable_init,
+	},
+
+	.timers = {
+		.setup_percpu_clockev	= setup_boot_APIC_clock,
+		.timer_init		= hpet_time_init,
+		.wallclock_init		= x86_init_noop,
+	},
+
+	.iommu = {
+		.iommu_init		= iommu_init_noop,
+	},
+
+	.pci = {
+		.init			= x86_default_pci_init,
+		.init_irq		= x86_default_pci_init_irq,
+		.fixup_irqs		= x86_default_pci_fixup_irqs,
+	},
+
+	.hyper = {
+		.init_platform		= x86_init_noop,
+		.guest_late_init	= x86_init_noop,
+		.x2apic_available	= bool_x86_init_noop,
+		.init_mem_mapping	= x86_init_noop,
+		.init_after_bootmem	= x86_init_noop,
+	},
+
+	.acpi = {
+		.get_root_pointer	= u64_x86_init_noop,
+		.reduced_hw_early_init	= acpi_generic_reduced_hw_init,
+	},
+};
+
+struct x86_cpuinit_ops x86_cpuinit = {
+ .early_percpu_clock_init = x86_init_noop, /* empty function by default */
+ .setup_percpu_clockev = setup_secondary_APIC_clock,
+};
+
+static void default_nmi_init(void) { }	/* default x86_platform.nmi_init: no-op */
+
+struct x86_platform_ops x86_platform __ro_after_init = {
+ .calibrate_cpu = native_calibrate_cpu_early,
+ .calibrate_tsc = native_calibrate_tsc,
+ .get_wallclock = mach_get_cmos_time,
+ .set_wallclock = mach_set_rtc_mmss,
+ .iommu_shutdown = iommu_shutdown_noop, /* empty function by default */
+ .is_untracked_pat_range = is_ISA_range,
+ .nmi_init = default_nmi_init, /* empty function by default */
+ .get_nmi_reason = default_get_nmi_reason,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .hyper.pin_vcpu = x86_op_int_noop, /* empty function by default */
+};
+
+EXPORT_SYMBOL_GPL(x86_platform);
+
+#if defined(CONFIG_PCI_MSI)
+struct x86_msi_ops x86_msi __ro_after_init = { /* called through the arch_*msi* hooks below */
+ .setup_msi_irqs = native_setup_msi_irqs,
+ .teardown_msi_irq = native_teardown_msi_irq,
+ .teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
+};
+
+/* MSI arch specific hooks: each one forwards to the matching x86_msi callback */
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+
+void arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.restore_msi_irqs(dev);
+}
+#endif
+
+struct x86_apic_ops x86_apic_ops __ro_after_init = { /* preset with the native_* implementations */
+ .io_apic_read = native_io_apic_read,
+ .restore = native_restore_boot_irq_mode,
+};