Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 84373dc..5864219 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -7,29 +7,32 @@
KASAN_SANITIZE_mem_encrypt.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
+# Disable KCSAN entirely, because otherwise we get warnings that some functions
+# reference __initdata sections.
+KCSAN_SANITIZE := n
+
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
-obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
+obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
+ pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
+
+obj-y += pat/
# Make sure __phys_addr has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_physaddr.o := $(nostackp)
-CFLAGS_setup_nx.o := $(nostackp)
-CFLAGS_mem_encrypt_identity.o := $(nostackp)
+CFLAGS_physaddr.o := -fno-stack-protector
+CFLAGS_setup_nx.o := -fno-stack-protector
+CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
-obj-$(CONFIG_X86_PAT) += pat_rbtree.o
-
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o
-obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
+obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
@@ -45,7 +48,6 @@
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index d964364..6c2f1b7 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -4,9 +4,9 @@
#include <linux/percpu.h>
#include <linux/kallsyms.h>
#include <linux/kcore.h>
+#include <linux/pgtable.h>
#include <asm/cpu_entry_area.h>
-#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
@@ -17,7 +17,12 @@
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
#endif
-struct cpu_entry_area *get_cpu_entry_area(int cpu)
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
+#endif
+
+/* Is called from entry code, so must be noinstr */
+noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
@@ -103,12 +108,24 @@
*/
cea_map_stack(DF);
cea_map_stack(NMI);
- cea_map_stack(DB1);
cea_map_stack(DB);
cea_map_stack(MCE);
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
+ cea_map_stack(VC);
+ cea_map_stack(VC2);
+ }
+ }
}
#else
-static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
+static inline void percpu_setup_exception_stacks(unsigned int cpu)
+{
+ struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+ cea_map_percpu_pages(&cea->doublefault_stack,
+ &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
+}
#endif
/* Setup the fixmap mappings only once per-processor */
@@ -161,6 +178,14 @@
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ /*
+ * VMX changes the host TR limit to 0x67 after a VM exit. This is
+ * okay, since 0x67 covers the size of struct x86_hw_tss. Make sure
+ * that this is correct.
+ */
+ BUILD_BUG_ON(offsetof(struct tss_struct, x86_tss) != 0);
+ BUILD_BUG_ON(sizeof(struct x86_hw_tss) != 0x68);
+
cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 39001a4..092ea43 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -3,11 +3,11 @@
#include <linux/efi.h>
#include <linux/module.h>
#include <linux/seq_file.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
static int ptdump_show(struct seq_file *m, void *v)
{
- ptdump_walk_pgd_level_debugfs(m, NULL, false);
+ ptdump_walk_pgd_level_debugfs(m, &init_mm, false);
return 0;
}
@@ -15,11 +15,8 @@
static int ptdump_curknl_show(struct seq_file *m, void *v)
{
- if (current->mm->pgd) {
- down_read(¤t->mm->mmap_sem);
- ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
- up_read(¤t->mm->mmap_sem);
- }
+ if (current->mm->pgd)
+ ptdump_walk_pgd_level_debugfs(m, current->mm, false);
return 0;
}
@@ -28,11 +25,8 @@
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static int ptdump_curusr_show(struct seq_file *m, void *v)
{
- if (current->mm->pgd) {
- down_read(¤t->mm->mmap_sem);
- ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
- up_read(¤t->mm->mmap_sem);
- }
+ if (current->mm->pgd)
+ ptdump_walk_pgd_level_debugfs(m, current->mm, true);
return 0;
}
@@ -43,7 +37,7 @@
static int ptdump_efi_show(struct seq_file *m, void *v)
{
if (efi_mm.pgd)
- ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false);
+ ptdump_walk_pgd_level_debugfs(m, &efi_mm, false);
return 0;
}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index ab67822..e1b599e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -16,9 +16,9 @@
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/pci.h>
+#include <linux/ptdump.h>
#include <asm/e820/types.h>
-#include <asm/pgtable.h>
/*
* The dumper groups pagetable entries of the same type into one, and for
@@ -26,16 +26,18 @@
* when a "break" in the continuity is found.
*/
struct pg_state {
+ struct ptdump_state ptdump;
int level;
- pgprot_t current_prot;
+ pgprotval_t current_prot;
pgprotval_t effective_prot;
+ pgprotval_t prot_levels[5];
unsigned long start_address;
- unsigned long current_address;
const struct addr_marker *marker;
unsigned long lines;
bool to_dmesg;
bool check_wx;
unsigned long wx_pages;
+ struct seq_file *seq;
};
struct addr_marker {
@@ -174,11 +176,10 @@
/*
* Print a readable form of a pgprot_t to the seq_file
*/
-static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
+static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
{
- pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
- { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
+ { "pgd", "p4d", "pud", "pmd", "pte" };
if (!(pr & _PAGE_PRESENT)) {
/* Not present */
@@ -202,12 +203,12 @@
pt_dump_cont_printf(m, dmsg, " ");
/* Bit 7 has a different meaning on level 3 vs 4 */
- if (level <= 4 && pr & _PAGE_PSE)
+ if (level <= 3 && pr & _PAGE_PSE)
pt_dump_cont_printf(m, dmsg, "PSE ");
else
pt_dump_cont_printf(m, dmsg, " ");
- if ((level == 5 && pr & _PAGE_PAT) ||
- ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
+ if ((level == 4 && pr & _PAGE_PAT) ||
+ ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
pt_dump_cont_printf(m, dmsg, "PAT ");
else
pt_dump_cont_printf(m, dmsg, " ");
@@ -223,24 +224,11 @@
pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
-/*
- * On 64 bits, sign-extend the 48 bit address to 64 bit
- */
-static unsigned long normalize_addr(unsigned long u)
-{
- int shift;
- if (!IS_ENABLED(CONFIG_X86_64))
- return u;
-
- shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
- return (signed long)(u << shift) >> shift;
-}
-
-static void note_wx(struct pg_state *st)
+static void note_wx(struct pg_state *st, unsigned long addr)
{
unsigned long npages;
- npages = (st->current_address - st->start_address) / PAGE_SIZE;
+ npages = (addr - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
@@ -248,7 +236,7 @@
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
- st->current_address <= PAGE_OFFSET + BIOS_END) {
+ addr <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
@@ -260,27 +248,53 @@
(void *)st->start_address);
}
+static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
+{
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ pgprotval_t prot = val & PTE_FLAGS_MASK;
+ pgprotval_t effective;
+
+ if (level > 0) {
+ pgprotval_t higher_prot = st->prot_levels[level - 1];
+
+ effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
+ ((higher_prot | prot) & _PAGE_NX);
+ } else {
+ effective = prot;
+ }
+
+ st->prot_levels[level] = effective;
+}
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
* print what we collected so far.
*/
-static void note_page(struct seq_file *m, struct pg_state *st,
- pgprot_t new_prot, pgprotval_t new_eff, int level)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+ u64 val)
{
- pgprotval_t prot, cur, eff;
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ pgprotval_t new_prot, new_eff;
+ pgprotval_t cur, eff;
static const char units[] = "BKMGTPE";
+ struct seq_file *m = st->seq;
+
+ new_prot = val & PTE_FLAGS_MASK;
+ if (!val)
+ new_eff = 0;
+ else
+ new_eff = st->prot_levels[level];
/*
* If we have a "break" in the series, we need to flush the state that
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot);
- cur = pgprot_val(st->current_prot);
+ cur = st->current_prot;
eff = st->effective_prot;
- if (!st->level) {
+ if (st->level == -1) {
/* First entry */
st->current_prot = new_prot;
st->effective_prot = new_eff;
@@ -289,14 +303,14 @@
st->lines = 0;
pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
st->marker->name);
- } else if (prot != cur || new_eff != eff || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
+ } else if (new_prot != cur || new_eff != eff || level != st->level ||
+ addr >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
- note_wx(st);
+ note_wx(st, addr);
/*
* Now print the actual finished series
@@ -306,9 +320,9 @@
pt_dump_seq_printf(m, st->to_dmesg,
"0x%0*lx-0x%0*lx ",
width, st->start_address,
- width, st->current_address);
+ width, addr);
- delta = st->current_address - st->start_address;
+ delta = addr - st->start_address;
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
@@ -325,7 +339,7 @@
* such as the start of vmalloc space etc.
* This helps in the interpretation.
*/
- if (st->current_address >= st->marker[1].start_address) {
+ if (addr >= st->marker[1].start_address) {
if (st->marker->max_lines &&
st->lines > st->marker->max_lines) {
unsigned long nskip =
@@ -341,222 +355,41 @@
st->marker->name);
}
- st->start_address = st->current_address;
+ st->start_address = addr;
st->current_prot = new_prot;
st->effective_prot = new_eff;
st->level = level;
}
}
-static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
-{
- return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
- ((prot1 | prot2) & _PAGE_NX);
-}
-
-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- pte_t *pte;
- pgprotval_t prot, eff;
-
- for (i = 0; i < PTRS_PER_PTE; i++) {
- st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
- pte = pte_offset_map(&addr, st->current_address);
- prot = pte_flags(*pte);
- eff = effective_prot(eff_in, prot);
- note_page(m, st, __pgprot(prot), eff, 5);
- pte_unmap(pte);
- }
-}
-#ifdef CONFIG_KASAN
-
-/*
- * This is an optimization for KASAN=y case. Since all kasan page tables
- * eventually point to the kasan_early_shadow_page we could call note_page()
- * right away without walking through lower level page tables. This saves
- * us dozens of seconds (minutes for 5-level config) while checking for
- * W+X mapping or reading kernel_page_tables debugfs file.
- */
-static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
- void *pt)
-{
- if (__pa(pt) == __pa(kasan_early_shadow_pmd) ||
- (pgtable_l5_enabled() &&
- __pa(pt) == __pa(kasan_early_shadow_p4d)) ||
- __pa(pt) == __pa(kasan_early_shadow_pud)) {
- pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]);
- note_page(m, st, __pgprot(prot), 0, 5);
- return true;
- }
- return false;
-}
-#else
-static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
- void *pt)
-{
- return false;
-}
-#endif
-
-#if PTRS_PER_PMD > 1
-
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- pmd_t *start, *pmd_start;
- pgprotval_t prot, eff;
-
- pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
- st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
- if (!pmd_none(*start)) {
- prot = pmd_flags(*start);
- eff = effective_prot(eff_in, prot);
- if (pmd_large(*start) || !pmd_present(*start)) {
- note_page(m, st, __pgprot(prot), eff, 4);
- } else if (!kasan_page_table(m, st, pmd_start)) {
- walk_pte_level(m, st, *start, eff,
- P + i * PMD_LEVEL_MULT);
- }
- } else
- note_page(m, st, __pgprot(0), 0, 4);
- start++;
- }
-}
-
-#else
-#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
-#define pud_large(a) pmd_large(__pmd(pud_val(a)))
-#define pud_none(a) pmd_none(__pmd(pud_val(a)))
-#endif
-
-#if PTRS_PER_PUD > 1
-
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- pud_t *start, *pud_start;
- pgprotval_t prot, eff;
-
- pud_start = start = (pud_t *)p4d_page_vaddr(addr);
-
- for (i = 0; i < PTRS_PER_PUD; i++) {
- st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
- if (!pud_none(*start)) {
- prot = pud_flags(*start);
- eff = effective_prot(eff_in, prot);
- if (pud_large(*start) || !pud_present(*start)) {
- note_page(m, st, __pgprot(prot), eff, 3);
- } else if (!kasan_page_table(m, st, pud_start)) {
- walk_pmd_level(m, st, *start, eff,
- P + i * PUD_LEVEL_MULT);
- }
- } else
- note_page(m, st, __pgprot(0), 0, 3);
-
- start++;
- }
-}
-
-#else
-#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
-#define p4d_large(a) pud_large(__pud(p4d_val(a)))
-#define p4d_none(a) pud_none(__pud(p4d_val(a)))
-#endif
-
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- p4d_t *start, *p4d_start;
- pgprotval_t prot, eff;
-
- if (PTRS_PER_P4D == 1)
- return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);
-
- p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
-
- for (i = 0; i < PTRS_PER_P4D; i++) {
- st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
- if (!p4d_none(*start)) {
- prot = p4d_flags(*start);
- eff = effective_prot(eff_in, prot);
- if (p4d_large(*start) || !p4d_present(*start)) {
- note_page(m, st, __pgprot(prot), eff, 2);
- } else if (!kasan_page_table(m, st, p4d_start)) {
- walk_pud_level(m, st, *start, eff,
- P + i * P4D_LEVEL_MULT);
- }
- } else
- note_page(m, st, __pgprot(0), 0, 2);
-
- start++;
- }
-}
-
-#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
-#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
-
-static inline bool is_hypervisor_range(int idx)
-{
-#ifdef CONFIG_X86_64
- /*
- * A hole in the beginning of kernel address space reserved
- * for a hypervisor.
- */
- return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
- (idx < pgd_index(GUARD_HOLE_END_ADDR));
-#else
- return false;
-#endif
-}
-
-static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
+static void ptdump_walk_pgd_level_core(struct seq_file *m,
+ struct mm_struct *mm, pgd_t *pgd,
bool checkwx, bool dmesg)
{
- pgd_t *start = INIT_PGD;
- pgprotval_t prot, eff;
- int i;
- struct pg_state st = {};
-
- if (pgd) {
- start = pgd;
- st.to_dmesg = dmesg;
- }
-
- st.check_wx = checkwx;
- if (checkwx)
- st.wx_pages = 0;
-
- for (i = 0; i < PTRS_PER_PGD; i++) {
- st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
- if (!pgd_none(*start) && !is_hypervisor_range(i)) {
- prot = pgd_flags(*start);
-#ifdef CONFIG_X86_PAE
- eff = _PAGE_USER | _PAGE_RW;
+ const struct ptdump_range ptdump_ranges[] = {
+#ifdef CONFIG_X86_64
+ {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2},
+ {GUARD_HOLE_END_ADDR, ~0UL},
#else
- eff = prot;
+ {0, ~0UL},
#endif
- if (pgd_large(*start) || !pgd_present(*start)) {
- note_page(m, &st, __pgprot(prot), eff, 1);
- } else {
- walk_p4d_level(m, &st, *start, eff,
- i * PGD_LEVEL_MULT);
- }
- } else
- note_page(m, &st, __pgprot(0), 0, 1);
+ {0, 0}
+};
- cond_resched();
- start++;
- }
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .effective_prot = effective_prot,
+ .range = ptdump_ranges
+ },
+ .level = -1,
+ .to_dmesg = dmesg,
+ .check_wx = checkwx,
+ .seq = m
+ };
- /* Flush out the last page */
- st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
- note_page(m, &st, __pgprot(0), 0, 0);
+ ptdump_walk_pgd(&st.ptdump, mm, pgd);
+
if (!checkwx)
return;
if (st.wx_pages)
@@ -566,18 +399,20 @@
pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}
-void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
{
- ptdump_walk_pgd_level_core(m, pgd, false, true);
+ ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
}
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
+ bool user)
{
+ pgd_t *pgd = mm->pgd;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (user && boot_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
- ptdump_walk_pgd_level_core(m, pgd, false, false);
+ ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
@@ -592,13 +427,13 @@
pr_info("x86/mm: Checking user space page tables\n");
pgd = kernel_to_user_pgdp(pgd);
- ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+ ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false);
#endif
}
void ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+ ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
}
static int __init pt_dump_init(void)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 4d75bc6..b93d6cd 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -5,6 +5,7 @@
#include <xen/xen.h>
#include <asm/fpu/internal.h>
+#include <asm/sev-es.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
@@ -45,55 +46,6 @@
EXPORT_SYMBOL_GPL(ex_handler_fault);
/*
- * Handler for UD0 exception following a failed test against the
- * result of a refcount inc/dec/add/sub.
- */
-__visible bool ex_handler_refcount(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
-{
- /* First unconditionally saturate the refcount. */
- *(int *)regs->cx = INT_MIN / 2;
-
- /*
- * Strictly speaking, this reports the fixup destination, not
- * the fault location, and not the actually overflowing
- * instruction, which is the instruction before the "js", but
- * since that instruction could be a variety of lengths, just
- * report the location after the overflow, which should be close
- * enough for finding the overflow, as it's at least back in
- * the function, having returned from .text.unlikely.
- */
- regs->ip = ex_fixup_addr(fixup);
-
- /*
- * This function has been called because either a negative refcount
- * value was seen by any of the refcount functions, or a zero
- * refcount value was seen by refcount_dec().
- *
- * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
- * wrapped around) will be set. Additionally, seeing the refcount
- * reach 0 will set ZF (Zero Flag: result was zero). In each of
- * these cases we want a report, since it's a boundary condition.
- * The SF case is not reported since it indicates post-boundary
- * manipulations below zero or above INT_MAX. And if none of the
- * flags are set, something has gone very wrong, so report it.
- */
- if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
- bool zero = regs->flags & X86_EFLAGS_ZF;
-
- refcount_error_report(regs, zero ? "hit zero" : "overflow");
- } else if ((regs->flags & X86_EFLAGS_SF) == 0) {
- /* Report if none of OF, ZF, nor SF are set. */
- refcount_error_report(regs, "unexpected saturation");
- }
-
- return true;
-}
-EXPORT_SYMBOL(ex_handler_refcount);
-
-/*
* Handler for when we fail to restore a task's FPU state. We should never get
* here because the FPU state of a task using the FPU (task->thread.fpu.state)
* should always be valid. However, past bugs have allowed userspace to set
@@ -129,17 +81,17 @@
}
EXPORT_SYMBOL(ex_handler_uaccess);
-__visible bool ex_handler_ext(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr,
- unsigned long error_code,
- unsigned long fault_addr)
+__visible bool ex_handler_copy(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
{
- /* Special hack for uaccess_err */
- current->thread.uaccess_err = 1;
+ WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
regs->ip = ex_fixup_addr(fixup);
+ regs->ax = trapnr;
return true;
}
-EXPORT_SYMBOL(ex_handler_ext);
+EXPORT_SYMBOL(ex_handler_copy);
__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
@@ -186,17 +138,21 @@
}
EXPORT_SYMBOL(ex_handler_clear_fs);
-__visible bool ex_has_fault_handler(unsigned long ip)
+enum handler_type ex_get_fault_handler_type(unsigned long ip)
{
const struct exception_table_entry *e;
ex_handler_t handler;
e = search_exception_tables(ip);
if (!e)
- return false;
+ return EX_HANDLER_NONE;
handler = ex_fixup_handler(e);
-
- return handler == ex_handler_fault;
+ if (handler == ex_handler_fault)
+ return EX_HANDLER_FAULT;
+ else if (handler == ex_handler_uaccess || handler == ex_handler_copy)
+ return EX_HANDLER_UACCESS;
+ else
+ return EX_HANDLER_OTHER;
}
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
@@ -265,8 +221,19 @@
if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
return;
- if (fixup_bug(regs, trapnr))
- return;
+ if (trapnr == X86_TRAP_UD) {
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ /* Skip the ud2. */
+ regs->ip += LEN_UD2;
+ return;
+ }
+
+ /*
+ * If this was a BUG and report_bug returns or if this
+ * was just a normal #UD, we want to continue onward and
+ * crash.
+ */
+ }
fail:
early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c494c8c..9c1545c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -21,7 +21,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
-#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
@@ -29,6 +28,8 @@
#include <asm/efi.h> /* efi_recover_from_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
+#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
+#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -52,7 +53,7 @@
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
@@ -81,11 +82,7 @@
#ifdef CONFIG_X86_64
case 0x40:
/*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
+ * In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
@@ -97,7 +94,7 @@
return !instr_lo || (instr_lo>>1) == 1;
case 0x00:
/* Prefetch instruction is 0x0F0D or 0x0F18 */
- if (probe_kernel_address(instr, opcode))
+ if (get_kernel_nofault(opcode, instr))
return 0;
*prefetch = (instr_lo == 0xF) &&
@@ -125,20 +122,31 @@
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
- return 0;
+ /*
+ * This code has historically always bailed out if IP points to a
+ * not-present page (e.g. due to a race). No one has ever
+ * complained about this.
+ */
+ pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
- if (probe_kernel_address(instr, opcode))
- break;
+ if (user_mode(regs)) {
+ if (get_user(opcode, instr))
+ break;
+ } else {
+ if (get_kernel_nofault(opcode, instr))
+ break;
+ }
instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+ pagefault_enable();
return prefetch;
}
@@ -189,47 +197,19 @@
return pmd_k;
}
-static void vmalloc_sync(void)
-{
- unsigned long address;
-
- if (SHARED_KERNEL_PMD)
- return;
-
- for (address = VMALLOC_START & PMD_MASK;
- address >= TASK_SIZE_MAX && address < VMALLOC_END;
- address += PMD_SIZE) {
- struct page *page;
-
- spin_lock(&pgd_lock);
- list_for_each_entry(page, &pgd_list, lru) {
- spinlock_t *pgt_lock;
-
- /* the pgt_lock only for Xen */
- pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-
- spin_lock(pgt_lock);
- vmalloc_sync_one(page_address(page), address);
- spin_unlock(pgt_lock);
- }
- spin_unlock(&pgd_lock);
- }
-}
-
-void vmalloc_sync_mappings(void)
-{
- vmalloc_sync();
-}
-
-void vmalloc_sync_unmappings(void)
-{
- vmalloc_sync();
-}
-
/*
- * 32-bit:
- *
* Handle a fault on the vmalloc or module mapping area
+ *
+ * This is needed because there is a race condition between the time
+ * when the vmalloc mapping code updates the PMD to the point in time
+ * where it synchronizes this update with the other page-tables in the
+ * system.
+ *
+ * In this race window another thread/CPU can map an area on the same
+ * PMD, finds it already present and does not synchronize it with the
+ * rest of the system yet. As a result v[mz]alloc might return areas
+ * which are not mapped in every page-table in the system, causing an
+ * unhandled page-fault when they are accessed.
*/
static noinline int vmalloc_fault(unsigned long address)
{
@@ -264,6 +244,30 @@
}
NOKPROBE_SYMBOL(vmalloc_fault);
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = start & PMD_MASK;
+ addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
+ addr += PMD_SIZE) {
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ spinlock_t *pgt_lock;
+
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+ spin_lock(pgt_lock);
+ vmalloc_sync_one(page_address(page), addr);
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
@@ -328,96 +332,6 @@
#else /* CONFIG_X86_64: */
-void vmalloc_sync_mappings(void)
-{
- /*
- * 64-bit mappings might allocate new p4d/pud pages
- * that need to be propagated to all tasks' PGDs.
- */
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
-}
-
-void vmalloc_sync_unmappings(void)
-{
- /*
- * Unmappings never allocate or free p4d/pud pages.
- * No work is required here.
- */
-}
-
-/*
- * 64-bit:
- *
- * Handle a fault on the vmalloc area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- pgd_t *pgd, *pgd_k;
- p4d_t *p4d, *p4d_k;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- /*
- * Copy kernel mappings over when needed. This can also
- * happen within a race in page table update. In the later
- * case just flush:
- */
- pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
- pgd_k = pgd_offset_k(address);
- if (pgd_none(*pgd_k))
- return -1;
-
- if (pgtable_l5_enabled()) {
- if (pgd_none(*pgd)) {
- set_pgd(pgd, *pgd_k);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
- }
- }
-
- /* With 4-level paging, copying happens on the p4d level. */
- p4d = p4d_offset(pgd, address);
- p4d_k = p4d_offset(pgd_k, address);
- if (p4d_none(*p4d_k))
- return -1;
-
- if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
- set_p4d(p4d, *p4d_k);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
- }
-
- BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
-
- pud = pud_offset(p4d, address);
- if (pud_none(*pud))
- return -1;
-
- if (pud_large(*pud))
- return 0;
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return -1;
-
- if (pmd_large(*pmd))
- return 0;
-
- pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
- return -1;
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
@@ -440,7 +354,7 @@
{
unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long address)
@@ -553,21 +467,13 @@
return 0;
}
+/* Pentium F0 0F C7 C8 bug workaround: */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
- unsigned long nr;
-
- /*
- * Pentium F0 0F C7 C8 bug workaround:
- */
- if (boot_cpu_has_bug(X86_BUG_F00F)) {
- nr = (address - idt_descr.address) >> 3;
-
- if (nr == 6) {
- do_invalid_op(regs, 0);
- return 1;
- }
+ if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+ handle_invalid_op(regs);
+ return 1;
}
#endif
return 0;
@@ -589,7 +495,7 @@
return;
}
- if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+ if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
sizeof(struct ldttss_desc))) {
pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
name, index);
@@ -925,6 +831,8 @@
force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ local_irq_disable();
+
return;
}
@@ -950,7 +858,7 @@
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
}
@@ -1004,7 +912,7 @@
* 2. T1 : set PKRU to deny access to pkey=4, touches page
* 3. T1 : faults...
* 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
- * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 5. T1 : enters fault handler, takes mmap_lock, etc...
* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
* faulted on a pte with its pkey=4.
*/
@@ -1221,13 +1129,13 @@
return 1;
/* read, not present: */
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ if (unlikely(!vma_is_accessible(vma)))
return 1;
return 0;
}
-static int fault_in_kernel_space(unsigned long address)
+bool fault_in_kernel_space(unsigned long address)
{
/*
* On 64-bit systems, the vsyscall page is at an address above
@@ -1256,6 +1164,7 @@
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
+#ifdef CONFIG_X86_32
/*
* We can fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
@@ -1273,11 +1182,18 @@
* 3. A fault caused by a page-level protection violation.
* (A demand fault would be on a non-present page which
* would have X86_PF_PROT==0).
+ *
+ * This is only needed to close a race condition on x86-32 in
+ * the vmalloc mapping/unmapping code. See the comment above
+ * vmalloc_fault() for details. On x86-64 the race does not
+ * exist as the vmalloc mappings don't need to be synchronized
+ * there.
*/
if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
}
+#endif
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
@@ -1308,8 +1224,8 @@
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- vm_fault_t fault, major = 0;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ vm_fault_t fault;
+ unsigned int flags = FAULT_FLAG_DEFAULT;
tsk = current;
mm = tsk->mm;
@@ -1393,15 +1309,15 @@
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
* tables. But, an erroneous kernel fault occurring outside one of
- * those areas which also holds mmap_sem might deadlock attempting
+ * those areas which also holds mmap_lock might deadlock attempting
* to validate the fault against the address space.
*
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
- * 1. Failed to acquire mmap_sem, and
+ * 1. Failed to acquire mmap_lock, and
* 2. The access did not originate in userspace.
*/
- if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ if (unlikely(!mmap_read_trylock(mm))) {
if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
/*
* Fault from code in kernel from
@@ -1411,7 +1327,7 @@
return;
}
retry:
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
} else {
/*
* The above down_read_trylock() might have succeeded in
@@ -1451,84 +1367,46 @@
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
- * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
*
- * Note that handle_userfault() may also release and reacquire mmap_sem
+ * Note that handle_userfault() may also release and reacquire mmap_lock
* (and not return with VM_FAULT_RETRY), when returning to userland to
* repeat the page fault later with a VM_FAULT_NOPAGE retval
* (potentially after handling any pending signal during the return to
* userland). The return to userland is identified whenever
* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
*/
- fault = handle_mm_fault(vma, address, flags);
- major |= fault & VM_FAULT_MAJOR;
+ fault = handle_mm_fault(vma, address, flags, regs);
- /*
- * If we need to retry the mmap_sem has already been released,
- * and if there is a fatal signal pending there is no guarantee
- * that we made any progress. Handle this case first.
- */
- if (unlikely(fault & VM_FAULT_RETRY)) {
- /* Retry at most once */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- if (!fatal_signal_pending(tsk))
- goto retry;
- }
-
- /* User mode? Just return to handle the fatal exception */
- if (flags & FAULT_FLAG_USER)
- return;
-
- /* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ no_context(regs, hw_error_code, address, SIGBUS,
+ BUS_ADRERR);
return;
}
- up_read(&mm->mmap_sem);
+ /*
+ * If we need to retry the mmap_lock has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
+ */
+ if (unlikely((fault & VM_FAULT_RETRY) &&
+ (flags & FAULT_FLAG_ALLOW_RETRY))) {
+ flags |= FAULT_FLAG_TRIED;
+ goto retry;
+ }
+
+ mmap_read_unlock(mm);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, hw_error_code, address, fault);
return;
}
- /*
- * Major/minor page fault accounting. If any of the events
- * returned VM_FAULT_MAJOR, we account it as a major fault.
- */
- if (major) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
- }
-
check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
-/*
- * Explicitly marked noinline such that the function tracer sees this as the
- * page_fault entry point.
- */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
- unsigned long address)
-{
- prefetchw(¤t->mm->mmap_sem);
-
- if (unlikely(kmmio_fault(regs, address)))
- return;
-
- /* Was the fault on kernel-controlled part of the address space? */
- if (unlikely(fault_in_kernel_space(address)))
- do_kern_addr_fault(regs, hw_error_code, address);
- else
- do_user_addr_fault(regs, hw_error_code, address);
-}
-NOKPROBE_SYMBOL(__do_page_fault);
-
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
@@ -1542,14 +1420,77 @@
trace_page_fault_kernel(address, regs, error_code);
}
-dotraplinkage void
-do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+static __always_inline void
+handle_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
trace_page_fault_entries(regs, error_code, address);
- __do_page_fault(regs, error_code, address);
- exception_exit(prev_state);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
+
+ /* Was the fault on kernel-controlled part of the address space? */
+ if (unlikely(fault_in_kernel_space(address))) {
+ do_kern_addr_fault(regs, error_code, address);
+ } else {
+ do_user_addr_fault(regs, error_code, address);
+ /*
+ * User address page fault handling might have reenabled
+ * interrupts. Fixing up all potential exit points of
+ * do_user_addr_fault() and its leaf functions is just not
+ * doable w/o creating an unholy mess or turning the code
+ * upside down.
+ */
+ local_irq_disable();
+ }
}
-NOKPROBE_SYMBOL(do_page_fault);
+
+DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
+{
+ unsigned long address = read_cr2();
+ irqentry_state_t state;
+
+ prefetchw(¤t->mm->mmap_lock);
+
+ /*
+ * KVM uses #PF vector to deliver 'page not present' events to guests
+ * (asynchronous page fault mechanism). The event happens when a
+ * userspace task is trying to access some valid (from guest's point of
+ * view) memory which is not currently mapped by the host (e.g. the
+ * memory is swapped out). Note, the corresponding "page ready" event
+ * which is injected when the memory becomes available, is delived via
+ * an interrupt mechanism and not a #PF exception
+ * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
+ *
+ * We are relying on the interrupted context being sane (valid RSP,
+ * relevant locks not held, etc.), which is fine as long as the
+ * interrupted context had IF=1. We are also relying on the KVM
+ * async pf type field and CR2 being read consistently instead of
+ * getting values from real and async page faults mixed up.
+ *
+ * Fingers crossed.
+ *
+ * The async #PF handling code takes care of idtentry handling
+ * itself.
+ */
+ if (kvm_handle_async_pf(regs, (u32)address))
+ return;
+
+ /*
+ * Entry handling for valid #PF from kernel mode is slightly
+ * different: RCU is already watching and rcu_irq_enter() must not
+ * be invoked because a kernel fault on a user space address might
+ * sleep.
+ *
+ * In case the fault hit a RCU idle region the conditional entry
+ * code reenabled RCU to avoid subsequent wreckage which helps
+ * debugability.
+ */
+ state = irqentry_enter(regs);
+
+ instrumentation_begin();
+ handle_page_fault(regs, error_code, address);
+ instrumentation_end();
+
+ irqentry_exit(regs, state);
+}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 0a1898b..075fe51 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -4,44 +4,11 @@
#include <linux/swap.h> /* for totalram_pages */
#include <linux/memblock.h>
-void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-EXPORT_SYMBOL(kmap);
-
-void kunmap(struct page *page)
-{
- if (in_interrupt())
- BUG();
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-EXPORT_SYMBOL(kunmap);
-
-/*
- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
- * no global lock is needed and because the kmap code must perform a global TLB
- * invalidation when the kmap pool wraps.
- *
- * However when holding an atomic kmap it is not legal to sleep, so atomic
- * kmaps are appropriate for short, tight code paths only.
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
-
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -51,13 +18,7 @@
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic_prot);
-
-void *kmap_atomic(struct page *page)
-{
- return kmap_atomic_prot(page, kmap_prot);
-}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
@@ -69,7 +30,7 @@
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -99,11 +60,8 @@
BUG_ON(vaddr >= (unsigned long)high_memory);
}
#endif
-
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index fab0953..a0d023c 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -17,9 +17,7 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
-#include <asm/mpx.h>
#if 0 /* This is just for testing */
struct page *
@@ -151,10 +149,6 @@
if (len & ~huge_page_mask(h))
return -EINVAL;
- addr = mpx_unmapped_area_check(addr, len, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
if (len > TASK_SIZE)
return -ENOMEM;
@@ -186,28 +180,21 @@
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
- if (ps == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
- ps >> 20);
- return 0;
- }
- return 1;
+ if (size == PMD_SIZE)
+ return true;
+ else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES))
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
#ifdef CONFIG_CONTIG_ALLOC
static __init int gigantic_pages_init(void)
{
/* With compaction or CMA we can allocate gigantic pages at runtime */
- if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT))
+ if (boot_cpu_has(X86_FEATURE_GBPAGES))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index af352e2..c7a4760 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -25,6 +25,7 @@
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
+#include <asm/memtype.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -49,7 +50,7 @@
* Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
* (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
*/
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
[_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
@@ -57,9 +58,16 @@
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
-EXPORT_SYMBOL(__cachemode2pte_tbl);
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+ if (likely(pcm == 0))
+ return 0;
+ return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -69,7 +77,22 @@
[__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/* Check that the write-protect PAT entry is set for write-protect */
+bool x86_has_pat_wp(void)
+{
+ return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+ unsigned long masked;
+
+ masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+ if (likely(masked == 0))
+ return 0;
+ return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
@@ -170,6 +193,19 @@
static int page_size_mask;
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
static void __init probe_page_size_mask(void)
{
/*
@@ -465,7 +501,7 @@
* the physical memory. To access them they are temporarily mapped.
*/
unsigned long __ref init_memory_mapping(unsigned long start,
- unsigned long end)
+ unsigned long end, pgprot_t prot)
{
struct map_range mr[NR_RANGE_MR];
unsigned long ret = 0;
@@ -479,7 +515,8 @@
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
- mr[i].page_size_mask);
+ mr[i].page_size_mask,
+ prot);
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
@@ -519,7 +556,7 @@
*/
can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
- init_memory_mapping(start, end);
+ init_memory_mapping(start, end, PAGE_KERNEL);
mapped_ram_size += end - start;
can_use_brk_pgt = true;
}
@@ -644,6 +681,28 @@
}
}
+/*
+ * The real mode trampoline, which is required for bootstrapping CPUs
+ * occupies only a small area under the low 1MB. See reserve_real_mode()
+ * for details.
+ *
+ * If KASLR is disabled the first PGD entry of the direct mapping is copied
+ * to map the real mode trampoline.
+ *
+ * If KASLR is enabled, copy only the PUD which covers the low 1MB
+ * area. This limits the randomization granularity to 1GB for both 4-level
+ * and 5-level paging.
+ */
+static void __init init_trampoline(void)
+{
+#ifdef CONFIG_X86_64
+ if (!kaslr_memory_enabled())
+ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
+ else
+ init_trampoline_kaslr();
+#endif
+}
+
void __init init_mem_mapping(void)
{
unsigned long end;
@@ -659,7 +718,7 @@
#endif
/* the ISA range is always mapped regardless of memory holes */
- init_memory_mapping(0, ISA_END_ADDRESS);
+ init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
/* Init the trampoline, possibly with KASLR memory offset */
init_trampoline();
@@ -827,14 +886,13 @@
* used for the kernel image only. free_init_pages() will do the
* right thing for either kind of address.
*/
-void free_kernel_image_pages(void *begin, void *end)
+void free_kernel_image_pages(const char *what, void *begin, void *end)
{
unsigned long begin_ul = (unsigned long)begin;
unsigned long end_ul = (unsigned long)end;
unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
-
- free_init_pages("unused kernel image", begin_ul, end_ul);
+ free_init_pages(what, begin_ul, end_ul);
/*
* PTI maps some of the kernel into userspace. For performance,
@@ -855,15 +913,14 @@
set_memory_np_noalias(begin_ul, len_pages);
}
-void __weak mem_encrypt_free_decrypted_mem(void) { }
-
void __ref free_initmem(void)
{
e820__reallocate_tables();
mem_encrypt_free_decrypted_mem();
- free_kernel_image_pages(&__init_begin, &__init_end);
+ free_kernel_image_pages("unused kernel image (initmem)",
+ &__init_begin, &__init_end);
}
#ifdef CONFIG_BLK_DEV_INITRD
@@ -946,7 +1003,7 @@
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
@@ -954,7 +1011,6 @@
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0a74407..7c05525 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -35,7 +35,6 @@
#include <asm/bios_ebda.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
@@ -52,6 +51,8 @@
#include <asm/page_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/init.h>
+#include <asm/pgtable_areas.h>
+#include <asm/numa.h>
#include "mm_internal.h"
@@ -237,7 +238,11 @@
}
}
-static inline int is_kernel_text(unsigned long addr)
+/*
+ * The <linux/kallsyms.h> already defines is_kernel_text,
+ * using '__' prefix not to get in conflict.
+ */
+static inline int __is_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
@@ -252,7 +257,8 @@
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask,
+ pgprot_t prot)
{
int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
unsigned long last_map_addr = end;
@@ -327,8 +333,8 @@
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
- if (is_kernel_text(addr) ||
- is_kernel_text(addr2))
+ if (__is_kernel_text(addr) ||
+ __is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
@@ -353,7 +359,7 @@
*/
pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
- if (is_kernel_text(addr))
+ if (__is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
@@ -390,15 +396,6 @@
pte_t *kmap_pte;
-static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
-{
- pgd_t *pgd = pgd_offset_k(vaddr);
- p4d_t *p4d = p4d_offset(pgd, vaddr);
- pud_t *pud = pud_offset(p4d, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
- return pte_offset_kernel(pmd, vaddr);
-}
-
static void __init kmap_init(void)
{
unsigned long kmap_vstart;
@@ -407,28 +404,17 @@
* Cache the first kmap pte:
*/
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
- kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+ kmap_pte = virt_to_kpte(kmap_vstart);
}
#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
- unsigned long vaddr;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ unsigned long vaddr = PKMAP_BASE;
- vaddr = PKMAP_BASE;
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
- pgd = swapper_pg_dir + pgd_index(vaddr);
- p4d = p4d_offset(pgd, vaddr);
- pud = pud_offset(p4d, vaddr);
- pmd = pmd_offset(pud, vaddr);
- pte = pte_offset_kernel(pmd, vaddr);
- pkmap_page_table = pte;
+ pkmap_page_table = virt_to_kpte(vaddr);
}
void __init add_highpages_with_active_regions(int nid,
@@ -693,7 +679,6 @@
#endif
memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
- sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
@@ -733,7 +718,6 @@
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
@@ -752,7 +736,7 @@
__set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
- if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
+ if (copy_to_kernel_nofault((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
clear_fixmap(FIX_WP_TEST);
printk(KERN_CONT "Ok.\n");
return;
@@ -787,44 +771,6 @@
x86_init.hyper.init_after_bootmem();
mem_init_print_info(NULL);
- printk(KERN_INFO "virtual kernel memory layout:\n"
- " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
-#ifdef CONFIG_HIGHMEM
- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
-#endif
- " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
- " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
- " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
- FIXADDR_START, FIXADDR_TOP,
- (FIXADDR_TOP - FIXADDR_START) >> 10,
-
- CPU_ENTRY_AREA_BASE,
- CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
- CPU_ENTRY_AREA_MAP_SIZE >> 10,
-
-#ifdef CONFIG_HIGHMEM
- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
- (LAST_PKMAP*PAGE_SIZE) >> 10,
-#endif
-
- VMALLOC_START, VMALLOC_END,
- (VMALLOC_END - VMALLOC_START) >> 20,
-
- (unsigned long)__va(0), (unsigned long)high_memory,
- ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
-
- (unsigned long)&__init_begin, (unsigned long)&__init_end,
- ((unsigned long)&__init_end -
- (unsigned long)&__init_begin) >> 10,
-
- (unsigned long)&_etext, (unsigned long)&_edata,
- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
-
- (unsigned long)&_text, (unsigned long)&_etext,
- ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
/*
* Check boundaries twice: Some fundamental inconsistencies can
@@ -852,12 +798,24 @@
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+ int ret;
- return __add_pages(nid, start_pfn, nr_pages, restrictions);
+ /*
+ * The page tables were already mapped at boot so if the caller
+ * requests a different mapping type then we must change all the
+ * pages with __set_memory_prot().
+ */
+ if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
+ ret = __set_memory_prot(start, nr_pages, params->pgprot);
+ if (ret)
+ return ret;
+ }
+
+ return __add_pages(nid, start_pfn, nr_pages, params);
}
void arch_remove_memory(int nid, u64 start, u64 size,
@@ -872,34 +830,6 @@
int kernel_set_to_readonly __read_mostly;
-void set_kernel_text_rw(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read write\n",
- start, start+size);
-
- set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read only\n",
- start, start+size);
-
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
static void mark_nxdata_nx(void)
{
/*
@@ -908,7 +838,7 @@
*/
unsigned long start = PFN_ALIGN(_etext);
/*
- * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ * This comes from __is_kernel_text upper limit. Also HPAGE where used:
*/
unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a37ccaf..067ca92 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -37,7 +37,6 @@
#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
@@ -54,6 +53,7 @@
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include <asm/ftrace.h>
#include "mm_internal.h"
@@ -209,7 +209,7 @@
* When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
-void sync_global_pgds(unsigned long start, unsigned long end)
+static void sync_global_pgds(unsigned long start, unsigned long end)
{
if (pgtable_l5_enabled())
sync_global_pgds_l5(start, end);
@@ -298,7 +298,7 @@
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
- __flush_tlb_one_kernel(vaddr);
+ flush_tlb_one_kernel(vaddr);
}
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
@@ -367,7 +367,7 @@
pgprot_t prot;
pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
- pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
+ protval_4k_2_large(cachemode2protval(cache));
BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
@@ -585,7 +585,7 @@
*/
static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t _prot, bool init)
{
unsigned long pages = 0, paddr_next;
unsigned long paddr_last = paddr_end;
@@ -595,7 +595,7 @@
for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
pud_t *pud;
pmd_t *pmd;
- pgprot_t prot = PAGE_KERNEL;
+ pgprot_t prot = _prot;
vaddr = (unsigned long)__va(paddr);
pud = pud_page + pud_index(vaddr);
@@ -644,9 +644,12 @@
if (page_size_mask & (1<<PG_LEVEL_1G)) {
pages++;
spin_lock(&init_mm.page_table_lock);
+
+ prot = __pgprot(pgprot_val(prot) | __PAGE_KERNEL_LARGE);
+
set_pte_init((pte_t *)pud,
pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE),
+ prot),
init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
@@ -669,7 +672,7 @@
static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
{
unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
@@ -679,7 +682,7 @@
if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
- page_size_mask, init);
+ page_size_mask, prot, init);
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
p4d_t *p4d = p4d_page + p4d_index(vaddr);
@@ -702,13 +705,13 @@
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
- page_size_mask, init);
+ page_size_mask, prot, init);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
- page_size_mask, init);
+ page_size_mask, prot, init);
spin_lock(&init_mm.page_table_lock);
p4d_populate_init(&init_mm, p4d, pud, init);
@@ -722,7 +725,7 @@
__kernel_physical_mapping_init(unsigned long paddr_start,
unsigned long paddr_end,
unsigned long page_size_mask,
- bool init)
+ pgprot_t prot, bool init)
{
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -743,13 +746,13 @@
paddr_last = phys_p4d_init(p4d, __pa(vaddr),
__pa(vaddr_end),
page_size_mask,
- init);
+ prot, init);
continue;
}
p4d = alloc_low_page();
paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
- page_size_mask, init);
+ page_size_mask, prot, init);
spin_lock(&init_mm.page_table_lock);
if (pgtable_l5_enabled())
@@ -778,10 +781,10 @@
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
unsigned long paddr_end,
- unsigned long page_size_mask)
+ unsigned long page_size_mask, pgprot_t prot)
{
return __kernel_physical_mapping_init(paddr_start, paddr_end,
- page_size_mask, true);
+ page_size_mask, prot, true);
}
/*
@@ -796,7 +799,8 @@
unsigned long page_size_mask)
{
return __kernel_physical_mapping_init(paddr_start, paddr_end,
- page_size_mask, false);
+ page_size_mask, PAGE_KERNEL,
+ false);
}
#ifndef CONFIG_NUMA
@@ -808,7 +812,6 @@
void __init paging_init(void)
{
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
/*
@@ -818,8 +821,7 @@
* will not set it back.
*/
node_clear_state(0, N_MEMORY);
- if (N_MEMORY != N_NORMAL_MEMORY)
- node_clear_state(0, N_NORMAL_MEMORY);
+ node_clear_state(0, N_NORMAL_MEMORY);
zone_sizes_init();
}
@@ -844,11 +846,11 @@
}
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+ ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
@@ -859,14 +861,14 @@
}
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- init_memory_mapping(start, start + size);
+ init_memory_mapping(start, start + size, params->pgprot);
- return add_pages(nid, start_pfn, nr_pages, restrictions);
+ return add_pages(nid, start_pfn, nr_pages, params);
}
#define PAGE_INUSE 0xFD
@@ -1230,6 +1232,56 @@
#endif
}
+/*
+ * Pre-allocates page-table pages for the vmalloc area in the kernel page-table.
+ * Only the level which needs to be synchronized between all page-tables is
+ * allocated because the synchronization can be expensive.
+ */
+static void __init preallocate_vmalloc_pages(void)
+{
+ unsigned long addr;
+ const char *lvl;
+
+ for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+
+ lvl = "p4d";
+ p4d = p4d_alloc(&init_mm, pgd, addr);
+ if (!p4d)
+ goto failed;
+
+ if (pgtable_l5_enabled())
+ continue;
+
+ /*
+ * The goal here is to allocate all possibly required
+ * hardware page tables pointed to by the top hardware
+ * level.
+ *
+ * On 4-level systems, the P4D layer is folded away and
+ * the above code does no preallocation. Below, go down
+ * to the pud _software_ level to ensure the second
+ * hardware level is allocated on 4-level systems too.
+ */
+ lvl = "pud";
+ pud = pud_alloc(&init_mm, p4d, addr);
+ if (!pud)
+ goto failed;
+ }
+
+ return;
+
+failed:
+
+ /*
+ * The pages have to be there now or they will be missing in
+ * process page-tables later.
+ */
+ panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
+}
+
void __init mem_init(void)
{
pci_iommu_alloc();
@@ -1253,54 +1305,32 @@
if (get_gate_vma(&init_mm))
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
+ preallocate_vmalloc_pages();
+
mem_init_print_info(NULL);
}
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ /*
+ * More CPUs always led to greater speedups on tested systems, up to
+ * all the nodes' CPUs. Use all since the system is otherwise idle
+ * now.
+ */
+ return max_t(int, cpumask_weight(node_cpumask), 1);
+}
+#endif
+
int kernel_set_to_readonly;
-void set_kernel_text_rw(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long end = PFN_ALIGN(__stop___ex_table);
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read write\n",
- start, end);
-
- /*
- * Make the kernel identity mapping for text RW. Kernel text
- * mapping will always be RO. Refer to the comment in
- * static_protections() in pageattr.c
- */
- set_memory_rw(start, (end - start) >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long end = PFN_ALIGN(__stop___ex_table);
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read only\n",
- start, end);
-
- /*
- * Set the kernel identity mapping for text RO.
- */
- set_memory_ro(start, (end - start) >> PAGE_SHIFT);
-}
-
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long rodata_start = PFN_ALIGN(__start_rodata);
- unsigned long end = (unsigned long) &__end_rodata_hpage_align;
- unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
- unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long end = (unsigned long)__end_rodata_hpage_align;
+ unsigned long text_end = PFN_ALIGN(_etext);
+ unsigned long rodata_end = PFN_ALIGN(__end_rodata);
unsigned long all_end;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
@@ -1324,6 +1354,8 @@
all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+ set_ftrace_ops_ro();
+
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
set_memory_rw(start, (end-start) >> PAGE_SHIFT);
@@ -1332,8 +1364,10 @@
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
- free_kernel_image_pages((void *)text_end, (void *)rodata_start);
- free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
+ free_kernel_image_pages("unused kernel image (text/rodata gap)",
+ (void *)text_end, (void *)rodata_start);
+ free_kernel_image_pages("unused kernel image (rodata/data gap)",
+ (void *)rodata_end, (void *)_sdata);
debug_checkwx();
}
@@ -1418,6 +1452,15 @@
goto done;
}
+ /*
+ * Use max block size to minimize overhead on bare metal, where
+ * alignment for memory hotplug isn't a concern.
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ bz = MAX_BLOCK_SIZE;
+ goto done;
+ }
+
/* Find the largest allowed block size that aligns to memory end */
for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
if (IS_ALIGNED(boot_mem_end, bz))
@@ -1475,10 +1518,7 @@
if (pmd_none(*pmd)) {
void *p;
- if (altmap)
- p = altmap_alloc_block_buf(PMD_SIZE, altmap);
- else
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
pte_t entry;
@@ -1505,7 +1545,7 @@
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- if (vmemmap_populate_basepages(addr, next, node))
+ if (vmemmap_populate_basepages(addr, next, node, NULL))
return -ENOMEM;
}
return 0;
@@ -1517,7 +1557,7 @@
int err;
if (end - start < PAGES_PER_SECTION * sizeof(struct page))
- err = vmemmap_populate_basepages(start, end, node);
+ err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
err = vmemmap_populate_hugepages(start, end, node, altmap);
else if (altmap) {
@@ -1525,7 +1565,7 @@
__func__);
err = -ENOMEM;
} else
- err = vmemmap_populate_basepages(start, end, node);
+ err = vmemmap_populate_basepages(start, end, node, NULL);
if (!err)
sync_global_pgds(start, end - 1);
return err;
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 6748b4c..f60398a 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -4,7 +4,7 @@
*/
#include <asm/iomap.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <linux/export.h>
#include <linux/highmem.h>
@@ -26,7 +26,7 @@
if (!is_io_mapping_possible(base, size))
return -EINVAL;
- ret = io_reserve_memtype(base, base + size, &pcm);
+ ret = memtype_reserve_io(base, base + size, &pcm);
if (ret)
return ret;
@@ -40,7 +40,7 @@
void iomap_free(resource_size_t base, unsigned long size)
{
- io_free_memtype(base, base + size);
+ memtype_free_io(base, base + size);
}
EXPORT_SYMBOL_GPL(iomap_free);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index a353f88..91e61db 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -16,15 +16,15 @@
#include <linux/mmiotrace.h>
#include <linux/mem_encrypt.h>
#include <linux/efi.h>
+#include <linux/pgtable.h>
#include <asm/set_memory.h>
#include <asm/e820/api.h>
#include <asm/efi.h>
#include <asm/fixmap.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/setup.h>
#include "physaddr.h"
@@ -118,7 +118,9 @@
if (!IS_ENABLED(CONFIG_EFI))
return;
- if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA)
+ if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA ||
+ (efi_mem_type(addr) == EFI_BOOT_SERVICES_DATA &&
+ efi_mem_attributes(addr) & EFI_MEMORY_RUNTIME))
desc->flags |= IORES_MAP_ENCRYPTED;
}
@@ -217,10 +219,10 @@
phys_addr &= PHYSICAL_PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
- retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
+ retval = memtype_reserve(phys_addr, (u64)phys_addr + size,
pcm, &new_pcm);
if (retval) {
- printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
+ printk(KERN_ERR "ioremap memtype_reserve failed %d\n", retval);
return NULL;
}
@@ -276,7 +278,7 @@
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (kernel_map_sync_memtype(phys_addr, size, pcm))
+ if (memtype_kernel_map_sync(phys_addr, size, pcm))
goto err_free_area;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
@@ -296,16 +298,16 @@
err_free_area:
free_vm_area(area);
err_free_memtype:
- free_memtype(phys_addr, phys_addr + size);
+ memtype_free(phys_addr, phys_addr + size);
return NULL;
}
/**
- * ioremap_nocache - map bus memory into CPU space
+ * ioremap - map bus memory into CPU space
* @phys_addr: bus address of the memory
* @size: size of the resource to map
*
- * ioremap_nocache performs a platform specific sequence of operations to
+ * ioremap performs a platform specific sequence of operations to
* make bus memory CPU accessible via the readb/readw/readl/writeb/
* writew/writel functions and the other mmio helpers. The returned
* address is not guaranteed to be usable directly as a virtual
@@ -321,7 +323,7 @@
*
* Must be freed with iounmap.
*/
-void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
/*
* Ideally, this should be:
@@ -336,7 +338,7 @@
return __ioremap_caller(phys_addr, size, pcm,
__builtin_return_address(0), false);
}
-EXPORT_SYMBOL(ioremap_nocache);
+EXPORT_SYMBOL(ioremap);
/**
* ioremap_uc - map bus memory into CPU space as strongly uncachable
@@ -472,7 +474,7 @@
return;
}
- free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+ memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
o = remove_vm_area((void __force *)addr);
@@ -574,7 +576,7 @@
/* For SEV, these areas are encrypted */
if (sev_active())
break;
- /* Fallthrough */
+ fallthrough;
case E820_TYPE_PRAM:
return true;
@@ -631,6 +633,7 @@
static bool memremap_is_setup_data(resource_size_t phys_addr,
unsigned long size)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
u64 paddr, paddr_next;
@@ -643,10 +646,36 @@
data = memremap(paddr, sizeof(*data),
MEMREMAP_WB | MEMREMAP_DEC);
+ if (!data) {
+ pr_warn("failed to memremap setup_data entry\n");
+ return false;
+ }
paddr_next = data->next;
len = data->len;
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
+ memunmap(data);
+ return true;
+ }
+
+ if (data->type == SETUP_INDIRECT) {
+ memunmap(data);
+ data = memremap(paddr, sizeof(*data) + len,
+ MEMREMAP_WB | MEMREMAP_DEC);
+ if (!data) {
+ pr_warn("failed to memremap indirect setup_data\n");
+ return false;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ }
+ }
+
memunmap(data);
if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
@@ -665,22 +694,51 @@
static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
unsigned long size)
{
+ struct setup_indirect *indirect;
struct setup_data *data;
u64 paddr, paddr_next;
paddr = boot_params.hdr.setup_data;
while (paddr) {
- unsigned int len;
+ unsigned int len, size;
if (phys_addr == paddr)
return true;
data = early_memremap_decrypted(paddr, sizeof(*data));
+ if (!data) {
+ pr_warn("failed to early memremap setup_data entry\n");
+ return false;
+ }
+
+ size = sizeof(*data);
paddr_next = data->next;
len = data->len;
- early_memunmap(data, sizeof(*data));
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
+ early_memunmap(data, sizeof(*data));
+ return true;
+ }
+
+ if (data->type == SETUP_INDIRECT) {
+ size += len;
+ early_memunmap(data, sizeof(*data));
+ data = early_memremap_decrypted(paddr, size);
+ if (!data) {
+ pr_warn("failed to early memremap indirect setup_data\n");
+ return false;
+ }
+
+ indirect = (struct setup_indirect *)data->data;
+
+ if (indirect->type != SETUP_INDIRECT) {
+ paddr = indirect->addr;
+ len = indirect->len;
+ }
+ }
+
+ early_memunmap(data, size);
if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
return true;
@@ -767,10 +825,8 @@
void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
unsigned long size)
{
- /* Be sure the write-protect PAT entry is set for write-protect */
- if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ if (!x86_has_pat_wp())
return NULL;
-
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
}
@@ -788,10 +844,8 @@
void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
unsigned long size)
{
- /* Be sure the write-protect PAT entry is set for write-protect */
- if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ if (!x86_has_pat_wp())
return NULL;
-
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
#endif /* CONFIG_AMD_MEM_ENCRYPT */
@@ -878,5 +932,5 @@
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
pte_clear(&init_mm, addr, pte);
- __flush_tlb_one_kernel(addr);
+ flush_tlb_one_kernel(addr);
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 296da58..1a50434 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -17,7 +17,6 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
-#include <asm/pgtable.h>
#include <asm/cpu_entry_area.h>
extern struct range pfn_mapped[E820_MAX_ENTRIES];
@@ -245,22 +244,48 @@
} while (pgd++, addr = next, addr != end);
}
-#ifdef CONFIG_KASAN_INLINE
-static int kasan_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
+static void __init kasan_shallow_populate_p4ds(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
{
- if (val == DIE_GPF) {
- pr_emerg("CONFIG_KASAN_INLINE enabled\n");
- pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
- }
- return NOTIFY_OK;
+ p4d_t *p4d;
+ unsigned long next;
+ void *p;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (p4d_none(*p4d)) {
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
+ p4d_populate(&init_mm, p4d, p);
+ }
+ } while (p4d++, addr = next, addr != end);
}
-static struct notifier_block kasan_die_notifier = {
- .notifier_call = kasan_die_handler,
-};
-#endif
+static void __init kasan_shallow_populate_pgds(void *start, void *end)
+{
+ unsigned long addr, next;
+ pgd_t *pgd;
+ void *p;
+
+ addr = (unsigned long)start;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, (unsigned long)end);
+
+ if (pgd_none(*pgd)) {
+ p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true);
+ pgd_populate(&init_mm, pgd, p);
+ }
+
+ /*
+ * we need to populate p4ds to be synced when running in
+ * four level mode - see sync_global_pgds_l4()
+ */
+ kasan_shallow_populate_p4ds(pgd, addr, next);
+ } while (pgd++, addr = next, addr != (unsigned long)end);
+}
void __init kasan_early_init(void)
{
@@ -298,10 +323,6 @@
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
-#ifdef CONFIG_KASAN_INLINE
- register_die_notifier(&kasan_die_notifier);
-#endif
-
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
/*
@@ -354,6 +375,24 @@
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)VMALLOC_START));
+
+ /*
+ * If we're in full vmalloc mode, don't back vmalloc space with early
+ * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to
+ * the global table and we can populate the lower levels on demand.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_shallow_populate_pgds(
+ kasan_mem_to_shadow((void *)VMALLOC_START),
+ kasan_mem_to_shadow((void *)VMALLOC_END));
+ else
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)VMALLOC_START),
+ kasan_mem_to_shadow((void *)VMALLOC_END));
+
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)VMALLOC_END + 1),
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index dc6182e..6e6b397 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -24,9 +24,8 @@
#include <linux/init.h>
#include <linux/random.h>
#include <linux/memblock.h>
+#include <linux/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/kaslr.h>
@@ -61,15 +60,6 @@
return (region->size_tb << TB_SHIFT);
}
-/*
- * Apply no randomization if KASLR was disabled at boot or if KASAN
- * is enabled. KASAN shadow mappings rely on regions being PGD aligned.
- */
-static inline bool kaslr_memory_enabled(void)
-{
- return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
-}
-
/* Initialize base and padding for each memory region randomized with KASLR */
void __init kernel_randomize_memory(void)
{
@@ -148,7 +138,7 @@
}
}
-static void __meminit init_trampoline_pud(void)
+void __meminit init_trampoline_kaslr(void)
{
pud_t *pud_page_tramp, *pud, *pud_tramp;
p4d_t *p4d_page_tramp, *p4d, *p4d_tramp;
@@ -189,25 +179,3 @@
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
}
}
-
-/*
- * The real mode trampoline, which is required for bootstrapping CPUs
- * occupies only a small area under the low 1MB. See reserve_real_mode()
- * for details.
- *
- * If KASLR is disabled the first PGD entry of the direct mapping is copied
- * to map the real mode trampoline.
- *
- * If KASLR is enabled, copy only the PUD which covers the low 1MB
- * area. This limits the randomization granularity to 1GB for both 4-level
- * and 5-level paging.
- */
-void __meminit init_trampoline(void)
-{
- if (!kaslr_memory_enabled()) {
- init_trampoline_default();
- return;
- }
-
- init_trampoline_pud();
-}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 79eb55c..be020a7 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -130,7 +130,7 @@
pmdval_t v = pmd_val(*pmd);
if (clear) {
*old = v;
- new_pmd = pmd_mknotpresent(*pmd);
+ new_pmd = pmd_mkinvalid(*pmd);
} else {
/* Presume this has been called with clear==true previously */
new_pmd = __pmd(*old);
@@ -173,7 +173,7 @@
return -1;
}
- __flush_tlb_one_kernel(f->addr);
+ flush_tlb_one_kernel(f->addr);
return 0;
}
@@ -193,8 +193,8 @@
int ret;
WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
if (f->armed) {
- pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
- f->addr, f->count, !!f->old_presence);
+ pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
+ f->addr, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
@@ -260,7 +260,7 @@
goto no_kmmio;
}
- ctx = &get_cpu_var(kmmio_ctx);
+ ctx = this_cpu_ptr(&kmmio_ctx);
if (ctx->active) {
if (page_base == ctx->addr) {
/*
@@ -285,7 +285,7 @@
pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
disarm_kmmio_fault_page(faultpage);
}
- goto no_kmmio_ctx;
+ goto no_kmmio;
}
ctx->active++;
@@ -314,11 +314,8 @@
* the user should drop to single cpu before tracing.
*/
- put_cpu_var(kmmio_ctx);
return 1; /* fault handled */
-no_kmmio_ctx:
- put_cpu_var(kmmio_ctx);
no_kmmio:
rcu_read_unlock();
preempt_enable_no_resched();
@@ -333,7 +330,7 @@
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
int ret = 0;
- struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+ struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);
if (!ctx->active) {
/*
@@ -341,8 +338,7 @@
* something external causing them (f.e. using a debugger while
* mmio tracing enabled), or erroneous behaviour
*/
- pr_warning("unexpected debug trap on CPU %d.\n",
- smp_processor_id());
+ pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
goto out;
}
@@ -372,7 +368,6 @@
if (!(regs->flags & X86_EFLAGS_TF))
ret = 1;
out:
- put_cpu_var(kmmio_ctx);
return ret;
}
diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c
new file mode 100644
index 0000000..92ec176
--- /dev/null
+++ b/arch/x86/mm/maccess.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+
+#ifdef CONFIG_X86_64
+static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits)
+{
+ return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
+{
+ unsigned long vaddr = (unsigned long)unsafe_src;
+
+ /*
+ * Range covering the highest possible canonical userspace address
+ * as well as non-canonical address range. For the canonical range
+ * we also need to include the userspace guard page.
+ */
+ return vaddr >= TASK_SIZE_MAX + PAGE_SIZE &&
+ canonical_address(vaddr, boot_cpu_data.x86_virt_bits) == vaddr;
+}
+#else
+bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
+{
+ return (unsigned long)unsafe_src >= TASK_SIZE_MAX;
+}
+#endif
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 7b55893..97f7eb5 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -19,6 +19,7 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
+#include <linux/cc_platform.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -37,12 +38,14 @@
* reside in the .data section so as not to be zeroed out when the .bss
* section is later cleared.
*/
-u64 sme_me_mask __section(.data) = 0;
+u64 sme_me_mask __section(".data") = 0;
+u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
-bool sev_enabled __section(.data);
+bool sev_enabled __section(".data");
/* Buffer used for early in-place encryption by BSP, no locking needed */
static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
@@ -134,7 +137,7 @@
size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
} while (size);
- __native_flush_tlb();
+ flush_tlb_local();
}
void __init sme_unmap_bootdata(char *real_mode_data)
@@ -347,7 +350,14 @@
bool sev_active(void)
{
- return sme_me_mask && sev_enabled;
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+}
+EXPORT_SYMBOL_GPL(sev_active);
+
+/* Needs to be called from non-instrumentable code */
+bool noinstr sev_es_active(void)
+{
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
@@ -367,7 +377,7 @@
if (sme_active()) {
u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask));
u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask,
- dev->bus_dma_mask);
+ dev->bus_dma_limit);
if (dma_dev_mask <= dma_enc_mask)
return true;
@@ -375,9 +385,7 @@
return false;
}
-EXPORT_SYMBOL_GPL(sev_active);
-/* Architecture __weak replacement functions */
void __init mem_encrypt_free_decrypted_mem(void)
{
unsigned long vaddr, vaddr_end, npages;
@@ -402,6 +410,32 @@
free_init_pages("unused decrypted", vaddr, vaddr_end);
}
+static void print_mem_encrypt_feature_info(void)
+{
+ pr_info("AMD Memory Encryption Features active:");
+
+ /* Secure Memory Encryption */
+ if (sme_active()) {
+ /*
+ * SME is mutually exclusive with any of the SEV
+ * features below.
+ */
+ pr_cont(" SME\n");
+ return;
+ }
+
+ /* Secure Encrypted Virtualization */
+ if (sev_active())
+ pr_cont(" SEV");
+
+ /* Encrypted Register State */
+ if (sev_es_active())
+ pr_cont(" SEV-ES");
+
+ pr_cont("\n");
+}
+
+/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
@@ -416,8 +450,6 @@
if (sev_active())
static_branch_enable(&sev_enable_key);
- pr_info("AMD %s active\n",
- sev_active() ? "Secure Encrypted Virtualization (SEV)"
- : "Secure Memory Encryption (SME)");
+ print_mem_encrypt_feature_info();
}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 6d71481..7a84fc8 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -8,7 +8,7 @@
*/
#include <linux/linkage.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <asm/page.h>
#include <asm/processor-flags.h>
#include <asm/msr-index.h>
@@ -16,7 +16,7 @@
.text
.code64
-ENTRY(sme_encrypt_execute)
+SYM_FUNC_START(sme_encrypt_execute)
/*
* Entry parameters:
@@ -66,9 +66,9 @@
pop %rbp
ret
-ENDPROC(sme_encrypt_execute)
+SYM_FUNC_END(sme_encrypt_execute)
-ENTRY(__enc_copy)
+SYM_FUNC_START(__enc_copy)
/*
* Routine used to encrypt memory in place.
* This routine must be run outside of the kernel proper since
@@ -153,4 +153,4 @@
ret
.L__enc_copy_end:
-ENDPROC(__enc_copy)
+SYM_FUNC_END(__enc_copy)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 84cda5d..011e042 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -27,6 +27,15 @@
#undef CONFIG_PARAVIRT_XXL
#undef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mem_encrypt.h>
@@ -81,7 +90,7 @@
* section is 2MB aligned to allow for simple pagetable setup using only
* PMD entries (see vmlinux.lds.S).
*/
-static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.init.scratch);
+static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch");
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
@@ -503,14 +512,6 @@
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
* Check for the SME/SEV feature:
@@ -523,23 +524,38 @@
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
- if (!(eax & feature_mask))
+ /* Check whether SEV or SME is supported */
+ if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
return;
me_mask = 1UL << (ebx & 0x3f);
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+ * prevent a guest from trying to enable SME. For running as a
+ * KVM guest the MSR_K8_SYSCFG will be sufficient, but there
+ * might be other hypervisors which emulate that MSR as non-zero
+ * or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
/* For SME, check the SYSCFG MSR */
msr = __rdmsr(MSR_K8_SYSCFG);
if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
return;
} else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index eeae142..3f37b5c 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -12,7 +12,8 @@
unsigned long kernel_physical_mapping_init(unsigned long start,
unsigned long end,
- unsigned long page_size_mask);
+ unsigned long page_size_mask,
+ pgprot_t prot);
unsigned long kernel_physical_mapping_change(unsigned long start,
unsigned long end,
unsigned long page_size_mask);
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index aae9a93..c90c209 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -18,7 +18,9 @@
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/compat.h>
+#include <linux/elf-randomize.h>
#include <asm/elf.h>
+#include <asm/io.h>
#include "physaddr.h"
@@ -163,8 +165,6 @@
const char *arch_vma_name(struct vm_area_struct *vma)
{
- if (vma->vm_flags & VM_MPX)
- return "[mpx]";
return NULL;
}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 2a36902..bd7aff5 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -17,8 +17,8 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/io.h>
-#include <asm/pgtable.h>
#include <linux/mmiotrace.h>
+#include <linux/pgtable.h>
#include <asm/e820/api.h> /* for ISA_START_ADDRESS */
#include <linux/atomic.h>
#include <linux/percpu.h>
@@ -386,7 +386,7 @@
put_online_cpus();
for_each_cpu(cpu, downed_cpus) {
- err = cpu_down(cpu);
+ err = remove_cpu(cpu);
if (!err)
pr_info("CPU%d is down.\n", cpu);
else
@@ -394,7 +394,7 @@
}
out:
if (num_online_cpus() > 1)
- pr_warning("multiple CPUs still online, may miss events.\n");
+ pr_warn("multiple CPUs still online, may miss events.\n");
}
static void leave_uniprocessor(void)
@@ -406,7 +406,7 @@
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
- err = cpu_up(cpu);
+ err = add_cpu(cpu);
if (!err)
pr_info("enabled CPU%d.\n", cpu);
else
@@ -418,8 +418,8 @@
static void enter_uniprocessor(void)
{
if (num_online_cpus() > 1)
- pr_warning("multiple CPUs are online, may miss events. "
- "Suggest booting with maxcpus=1 kernel argument.\n");
+ pr_warn("multiple CPUs are online, may miss events. "
+ "Suggest booting with maxcpus=1 kernel argument.\n");
}
static void leave_uniprocessor(void)
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
deleted file mode 100644
index 895fb7a..0000000
--- a/arch/x86/mm/mpx.c
+++ /dev/null
@@ -1,938 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * mpx.c - Memory Protection eXtensions
- *
- * Copyright (c) 2014, Intel Corporation.
- * Qiaowei Ren <qiaowei.ren@intel.com>
- * Dave Hansen <dave.hansen@intel.com>
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/mm_types.h>
-#include <linux/mman.h>
-#include <linux/syscalls.h>
-#include <linux/sched/sysctl.h>
-
-#include <asm/insn.h>
-#include <asm/insn-eval.h>
-#include <asm/mmu_context.h>
-#include <asm/mpx.h>
-#include <asm/processor.h>
-#include <asm/fpu/internal.h>
-
-#define CREATE_TRACE_POINTS
-#include <asm/trace/mpx.h>
-
-static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
-{
- if (is_64bit_mm(mm))
- return MPX_BD_SIZE_BYTES_64;
- else
- return MPX_BD_SIZE_BYTES_32;
-}
-
-static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
-{
- if (is_64bit_mm(mm))
- return MPX_BT_SIZE_BYTES_64;
- else
- return MPX_BT_SIZE_BYTES_32;
-}
-
-/*
- * This is really a simplified "vm_mmap". it only handles MPX
- * bounds tables (the bounds directory is user-allocated).
- */
-static unsigned long mpx_mmap(unsigned long len)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, populate;
-
- /* Only bounds table can be allocated here */
- if (len != mpx_bt_size_bytes(mm))
- return -EINVAL;
-
- down_write(&mm->mmap_sem);
- addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL);
- up_write(&mm->mmap_sem);
- if (populate)
- mm_populate(addr, populate);
-
- return addr;
-}
-
-static int mpx_insn_decode(struct insn *insn,
- struct pt_regs *regs)
-{
- unsigned char buf[MAX_INSN_SIZE];
- int x86_64 = !test_thread_flag(TIF_IA32);
- int not_copied;
- int nr_copied;
-
- not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
- nr_copied = sizeof(buf) - not_copied;
- /*
- * The decoder _should_ fail nicely if we pass it a short buffer.
- * But, let's not depend on that implementation detail. If we
- * did not get anything, just error out now.
- */
- if (!nr_copied)
- return -EFAULT;
- insn_init(insn, buf, nr_copied, x86_64);
- insn_get_length(insn);
- /*
- * copy_from_user() tries to get as many bytes as we could see in
- * the largest possible instruction. If the instruction we are
- * after is shorter than that _and_ we attempt to copy from
- * something unreadable, we might get a short read. This is OK
- * as long as the read did not stop in the middle of the
- * instruction. Check to see if we got a partial instruction.
- */
- if (nr_copied < insn->length)
- return -EFAULT;
-
- insn_get_opcode(insn);
- /*
- * We only _really_ need to decode bndcl/bndcn/bndcu
- * Error out on anything else.
- */
- if (insn->opcode.bytes[0] != 0x0f)
- goto bad_opcode;
- if ((insn->opcode.bytes[1] != 0x1a) &&
- (insn->opcode.bytes[1] != 0x1b))
- goto bad_opcode;
-
- return 0;
-bad_opcode:
- return -EINVAL;
-}
-
-/*
- * If a bounds overflow occurs then a #BR is generated. This
- * function decodes MPX instructions to get violation address
- * and set this address into extended struct siginfo.
- *
- * Note that this is not a super precise way of doing this.
- * Userspace could have, by the time we get here, written
- * anything it wants in to the instructions. We can not
- * trust anything about it. They might not be valid
- * instructions or might encode invalid registers, etc...
- */
-int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
-{
- const struct mpx_bndreg_state *bndregs;
- const struct mpx_bndreg *bndreg;
- struct insn insn;
- uint8_t bndregno;
- int err;
-
- err = mpx_insn_decode(&insn, regs);
- if (err)
- goto err_out;
-
- /*
- * We know at this point that we are only dealing with
- * MPX instructions.
- */
- insn_get_modrm(&insn);
- bndregno = X86_MODRM_REG(insn.modrm.value);
- if (bndregno > 3) {
- err = -EINVAL;
- goto err_out;
- }
- /* get bndregs field from current task's xsave area */
- bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS);
- if (!bndregs) {
- err = -EINVAL;
- goto err_out;
- }
- /* now go select the individual register in the set of 4 */
- bndreg = &bndregs->bndreg[bndregno];
-
- /*
- * The registers are always 64-bit, but the upper 32
- * bits are ignored in 32-bit mode. Also, note that the
- * upper bounds are architecturally represented in 1's
- * complement form.
- *
- * The 'unsigned long' cast is because the compiler
- * complains when casting from integers to different-size
- * pointers.
- */
- info->lower = (void __user *)(unsigned long)bndreg->lower_bound;
- info->upper = (void __user *)(unsigned long)~bndreg->upper_bound;
- info->addr = insn_get_addr_ref(&insn, regs);
-
- /*
- * We were not able to extract an address from the instruction,
- * probably because there was something invalid in it.
- */
- if (info->addr == (void __user *)-1) {
- err = -EINVAL;
- goto err_out;
- }
- trace_mpx_bounds_register_exception(info->addr, bndreg);
- return 0;
-err_out:
- /* info might be NULL, but kfree() handles that */
- return err;
-}
-
-static __user void *mpx_get_bounds_dir(void)
-{
- const struct mpx_bndcsr *bndcsr;
-
- if (!cpu_feature_enabled(X86_FEATURE_MPX))
- return MPX_INVALID_BOUNDS_DIR;
-
- /*
- * The bounds directory pointer is stored in a register
- * only accessible if we first do an xsave.
- */
- bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
- if (!bndcsr)
- return MPX_INVALID_BOUNDS_DIR;
-
- /*
- * Make sure the register looks valid by checking the
- * enable bit.
- */
- if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
- return MPX_INVALID_BOUNDS_DIR;
-
- /*
- * Lastly, mask off the low bits used for configuration
- * flags, and return the address of the bounds table.
- */
- return (void __user *)(unsigned long)
- (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
-}
-
-int mpx_enable_management(void)
-{
- void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
- struct mm_struct *mm = current->mm;
- int ret = 0;
-
- /*
- * runtime in the userspace will be responsible for allocation of
- * the bounds directory. Then, it will save the base of the bounds
- * directory into XSAVE/XRSTOR Save Area and enable MPX through
- * XRSTOR instruction.
- *
- * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
- * expected to be relatively expensive. Storing the bounds
- * directory here means that we do not have to do xsave in the
- * unmap path; we can just use mm->context.bd_addr instead.
- */
- bd_base = mpx_get_bounds_dir();
- down_write(&mm->mmap_sem);
-
- /* MPX doesn't support addresses above 47 bits yet. */
- if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
- pr_warn_once("%s (%d): MPX cannot handle addresses "
- "above 47-bits. Disabling.",
- current->comm, current->pid);
- ret = -ENXIO;
- goto out;
- }
- mm->context.bd_addr = bd_base;
- if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
- ret = -ENXIO;
-out:
- up_write(&mm->mmap_sem);
- return ret;
-}
-
-int mpx_disable_management(void)
-{
- struct mm_struct *mm = current->mm;
-
- if (!cpu_feature_enabled(X86_FEATURE_MPX))
- return -ENXIO;
-
- down_write(&mm->mmap_sem);
- mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
- up_write(&mm->mmap_sem);
- return 0;
-}
-
-static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
- unsigned long *curval,
- unsigned long __user *addr,
- unsigned long old_val, unsigned long new_val)
-{
- int ret;
- /*
- * user_atomic_cmpxchg_inatomic() actually uses sizeof()
- * the pointer that we pass to it to figure out how much
- * data to cmpxchg. We have to be careful here not to
- * pass a pointer to a 64-bit data type when we only want
- * a 32-bit copy.
- */
- if (is_64bit_mm(mm)) {
- ret = user_atomic_cmpxchg_inatomic(curval,
- addr, old_val, new_val);
- } else {
- u32 uninitialized_var(curval_32);
- u32 old_val_32 = old_val;
- u32 new_val_32 = new_val;
- u32 __user *addr_32 = (u32 __user *)addr;
-
- ret = user_atomic_cmpxchg_inatomic(&curval_32,
- addr_32, old_val_32, new_val_32);
- *curval = curval_32;
- }
- return ret;
-}
-
-/*
- * With 32-bit mode, a bounds directory is 4MB, and the size of each
- * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
- * and the size of each bounds table is 4MB.
- */
-static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
-{
- unsigned long expected_old_val = 0;
- unsigned long actual_old_val = 0;
- unsigned long bt_addr;
- unsigned long bd_new_entry;
- int ret = 0;
-
- /*
- * Carve the virtual space out of userspace for the new
- * bounds table:
- */
- bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
- if (IS_ERR((void *)bt_addr))
- return PTR_ERR((void *)bt_addr);
- /*
- * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
- */
- bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
-
- /*
- * Go poke the address of the new bounds table in to the
- * bounds directory entry out in userspace memory. Note:
- * we may race with another CPU instantiating the same table.
- * In that case the cmpxchg will see an unexpected
- * 'actual_old_val'.
- *
- * This can fault, but that's OK because we do not hold
- * mmap_sem at this point, unlike some of the other part
- * of the MPX code that have to pagefault_disable().
- */
- ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry,
- expected_old_val, bd_new_entry);
- if (ret)
- goto out_unmap;
-
- /*
- * The user_atomic_cmpxchg_inatomic() will only return nonzero
- * for faults, *not* if the cmpxchg itself fails. Now we must
- * verify that the cmpxchg itself completed successfully.
- */
- /*
- * We expected an empty 'expected_old_val', but instead found
- * an apparently valid entry. Assume we raced with another
- * thread to instantiate this table and desclare succecss.
- */
- if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
- ret = 0;
- goto out_unmap;
- }
- /*
- * We found a non-empty bd_entry but it did not have the
- * VALID_FLAG set. Return an error which will result in
- * a SEGV since this probably means that somebody scribbled
- * some invalid data in to a bounds table.
- */
- if (expected_old_val != actual_old_val) {
- ret = -EINVAL;
- goto out_unmap;
- }
- trace_mpx_new_bounds_table(bt_addr);
- return 0;
-out_unmap:
- vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
- return ret;
-}
-
-/*
- * When a BNDSTX instruction attempts to save bounds to a bounds
- * table, it will first attempt to look up the table in the
- * first-level bounds directory. If it does not find a table in
- * the directory, a #BR is generated and we get here in order to
- * allocate a new table.
- *
- * With 32-bit mode, the size of BD is 4MB, and the size of each
- * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
- * and the size of each bound table is 4MB.
- */
-static int do_mpx_bt_fault(void)
-{
- unsigned long bd_entry, bd_base;
- const struct mpx_bndcsr *bndcsr;
- struct mm_struct *mm = current->mm;
-
- bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
- if (!bndcsr)
- return -EINVAL;
- /*
- * Mask off the preserve and enable bits
- */
- bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
- /*
- * The hardware provides the address of the missing or invalid
- * entry via BNDSTATUS, so we don't have to go look it up.
- */
- bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
- /*
- * Make sure the directory entry is within where we think
- * the directory is.
- */
- if ((bd_entry < bd_base) ||
- (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
- return -EINVAL;
-
- return allocate_bt(mm, (long __user *)bd_entry);
-}
-
-int mpx_handle_bd_fault(void)
-{
- /*
- * Userspace never asked us to manage the bounds tables,
- * so refuse to help.
- */
- if (!kernel_managing_mpx_tables(current->mm))
- return -EINVAL;
-
- return do_mpx_bt_fault();
-}
-
-/*
- * A thin wrapper around get_user_pages(). Returns 0 if the
- * fault was resolved or -errno if not.
- */
-static int mpx_resolve_fault(long __user *addr, int write)
-{
- long gup_ret;
- int nr_pages = 1;
-
- gup_ret = get_user_pages((unsigned long)addr, nr_pages,
- write ? FOLL_WRITE : 0, NULL, NULL);
- /*
- * get_user_pages() returns number of pages gotten.
- * 0 means we failed to fault in and get anything,
- * probably because 'addr' is bad.
- */
- if (!gup_ret)
- return -EFAULT;
- /* Other error, return it */
- if (gup_ret < 0)
- return gup_ret;
- /* must have gup'd a page and gup_ret>0, success */
- return 0;
-}
-
-static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
- unsigned long bd_entry)
-{
- unsigned long bt_addr = bd_entry;
- int align_to_bytes;
- /*
- * Bit 0 in a bt_entry is always the valid bit.
- */
- bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
- /*
- * Tables are naturally aligned at 8-byte boundaries
- * on 64-bit and 4-byte boundaries on 32-bit. The
- * documentation makes it appear that the low bits
- * are ignored by the hardware, so we do the same.
- */
- if (is_64bit_mm(mm))
- align_to_bytes = 8;
- else
- align_to_bytes = 4;
- bt_addr &= ~(align_to_bytes-1);
- return bt_addr;
-}
-
-/*
- * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
- * we might run off the end of the bounds table if we are on
- * a 64-bit kernel and try to get 8 bytes.
- */
-static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
- long __user *bd_entry_ptr)
-{
- u32 bd_entry_32;
- int ret;
-
- if (is_64bit_mm(mm))
- return get_user(*bd_entry_ret, bd_entry_ptr);
-
- /*
- * Note that get_user() uses the type of the *pointer* to
- * establish the size of the get, not the destination.
- */
- ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
- *bd_entry_ret = bd_entry_32;
- return ret;
-}
-
-/*
- * Get the base of bounds tables pointed by specific bounds
- * directory entry.
- */
-static int get_bt_addr(struct mm_struct *mm,
- long __user *bd_entry_ptr,
- unsigned long *bt_addr_result)
-{
- int ret;
- int valid_bit;
- unsigned long bd_entry;
- unsigned long bt_addr;
-
- if (!access_ok((bd_entry_ptr), sizeof(*bd_entry_ptr)))
- return -EFAULT;
-
- while (1) {
- int need_write = 0;
-
- pagefault_disable();
- ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
- pagefault_enable();
- if (!ret)
- break;
- if (ret == -EFAULT)
- ret = mpx_resolve_fault(bd_entry_ptr, need_write);
- /*
- * If we could not resolve the fault, consider it
- * userspace's fault and error out.
- */
- if (ret)
- return ret;
- }
-
- valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
- bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);
-
- /*
- * When the kernel is managing bounds tables, a bounds directory
- * entry will either have a valid address (plus the valid bit)
- * *OR* be completely empty. If we see a !valid entry *and* some
- * data in the address field, we know something is wrong. This
- * -EINVAL return will cause a SIGSEGV.
- */
- if (!valid_bit && bt_addr)
- return -EINVAL;
- /*
- * Do we have an completely zeroed bt entry? That is OK. It
- * just means there was no bounds table for this memory. Make
- * sure to distinguish this from -EINVAL, which will cause
- * a SEGV.
- */
- if (!valid_bit)
- return -ENOENT;
-
- *bt_addr_result = bt_addr;
- return 0;
-}
-
-static inline int bt_entry_size_bytes(struct mm_struct *mm)
-{
- if (is_64bit_mm(mm))
- return MPX_BT_ENTRY_BYTES_64;
- else
- return MPX_BT_ENTRY_BYTES_32;
-}
-
-/*
- * Take a virtual address and turns it in to the offset in bytes
- * inside of the bounds table where the bounds table entry
- * controlling 'addr' can be found.
- */
-static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
- unsigned long addr)
-{
- unsigned long bt_table_nr_entries;
- unsigned long offset = addr;
-
- if (is_64bit_mm(mm)) {
- /* Bottom 3 bits are ignored on 64-bit */
- offset >>= 3;
- bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
- } else {
- /* Bottom 2 bits are ignored on 32-bit */
- offset >>= 2;
- bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
- }
- /*
- * We know the size of the table in to which we are
- * indexing, and we have eliminated all the low bits
- * which are ignored for indexing.
- *
- * Mask out all the high bits which we do not need
- * to index in to the table. Note that the tables
- * are always powers of two so this gives us a proper
- * mask.
- */
- offset &= (bt_table_nr_entries-1);
- /*
- * We now have an entry offset in terms of *entries* in
- * the table. We need to scale it back up to bytes.
- */
- offset *= bt_entry_size_bytes(mm);
- return offset;
-}
-
-/*
- * How much virtual address space does a single bounds
- * directory entry cover?
- *
- * Note, we need a long long because 4GB doesn't fit in
- * to a long on 32-bit.
- */
-static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
-{
- unsigned long long virt_space;
- unsigned long long GB = (1ULL << 30);
-
- /*
- * This covers 32-bit emulation as well as 32-bit kernels
- * running on 64-bit hardware.
- */
- if (!is_64bit_mm(mm))
- return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
-
- /*
- * 'x86_virt_bits' returns what the hardware is capable
- * of, and returns the full >32-bit address space when
- * running 32-bit kernels on 64-bit hardware.
- */
- virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
- return virt_space / MPX_BD_NR_ENTRIES_64;
-}
-
-/*
- * Free the backing physical pages of bounds table 'bt_addr'.
- * Assume start...end is within that bounds table.
- */
-static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
- unsigned long bt_addr,
- unsigned long start_mapping, unsigned long end_mapping)
-{
- struct vm_area_struct *vma;
- unsigned long addr, len;
- unsigned long start;
- unsigned long end;
-
- /*
- * if we 'end' on a boundary, the offset will be 0 which
- * is not what we want. Back it up a byte to get the
- * last bt entry. Then once we have the entry itself,
- * move 'end' back up by the table entry size.
- */
- start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
- end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
- /*
- * Move end back up by one entry. Among other things
- * this ensures that it remains page-aligned and does
- * not screw up zap_page_range()
- */
- end += bt_entry_size_bytes(mm);
-
- /*
- * Find the first overlapping vma. If vma->vm_start > start, there
- * will be a hole in the bounds table. This -EINVAL return will
- * cause a SIGSEGV.
- */
- vma = find_vma(mm, start);
- if (!vma || vma->vm_start > start)
- return -EINVAL;
-
- /*
- * A NUMA policy on a VM_MPX VMA could cause this bounds table to
- * be split. So we need to look across the entire 'start -> end'
- * range of this bounds table, find all of the VM_MPX VMAs, and
- * zap only those.
- */
- addr = start;
- while (vma && vma->vm_start < end) {
- /*
- * We followed a bounds directory entry down
- * here. If we find a non-MPX VMA, that's bad,
- * so stop immediately and return an error. This
- * probably results in a SIGSEGV.
- */
- if (!(vma->vm_flags & VM_MPX))
- return -EINVAL;
-
- len = min(vma->vm_end, end) - addr;
- zap_page_range(vma, addr, len);
- trace_mpx_unmap_zap(addr, addr+len);
-
- vma = vma->vm_next;
- addr = vma->vm_start;
- }
- return 0;
-}
-
-static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
- unsigned long addr)
-{
- /*
- * There are several ways to derive the bd offsets. We
- * use the following approach here:
- * 1. We know the size of the virtual address space
- * 2. We know the number of entries in a bounds table
- * 3. We know that each entry covers a fixed amount of
- * virtual address space.
- * So, we can just divide the virtual address by the
- * virtual space used by one entry to determine which
- * entry "controls" the given virtual address.
- */
- if (is_64bit_mm(mm)) {
- int bd_entry_size = 8; /* 64-bit pointer */
- /*
- * Take the 64-bit addressing hole in to account.
- */
- addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
- return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
- } else {
- int bd_entry_size = 4; /* 32-bit pointer */
- /*
- * 32-bit has no hole so this case needs no mask
- */
- return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
- }
- /*
- * The two return calls above are exact copies. If we
- * pull out a single copy and put it in here, gcc won't
- * realize that we're doing a power-of-2 divide and use
- * shifts. It uses a real divide. If we put them up
- * there, it manages to figure it out (gcc 4.8.3).
- */
-}
-
-static int unmap_entire_bt(struct mm_struct *mm,
- long __user *bd_entry, unsigned long bt_addr)
-{
- unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
- unsigned long uninitialized_var(actual_old_val);
- int ret;
-
- while (1) {
- int need_write = 1;
- unsigned long cleared_bd_entry = 0;
-
- pagefault_disable();
- ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
- bd_entry, expected_old_val, cleared_bd_entry);
- pagefault_enable();
- if (!ret)
- break;
- if (ret == -EFAULT)
- ret = mpx_resolve_fault(bd_entry, need_write);
- /*
- * If we could not resolve the fault, consider it
- * userspace's fault and error out.
- */
- if (ret)
- return ret;
- }
- /*
- * The cmpxchg was performed, check the results.
- */
- if (actual_old_val != expected_old_val) {
- /*
- * Someone else raced with us to unmap the table.
- * That is OK, since we were both trying to do
- * the same thing. Declare success.
- */
- if (!actual_old_val)
- return 0;
- /*
- * Something messed with the bounds directory
- * entry. We hold mmap_sem for read or write
- * here, so it could not be a _new_ bounds table
- * that someone just allocated. Something is
- * wrong, so pass up the error and SIGSEGV.
- */
- return -EINVAL;
- }
- /*
- * Note, we are likely being called under do_munmap() already. To
- * avoid recursion, do_munmap() will check whether it comes
- * from one bounds table through VM_MPX flag.
- */
- return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL);
-}
-
-static int try_unmap_single_bt(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct vm_area_struct *next;
- struct vm_area_struct *prev;
- /*
- * "bta" == Bounds Table Area: the area controlled by the
- * bounds table that we are unmapping.
- */
- unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
- unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
- unsigned long uninitialized_var(bt_addr);
- void __user *bde_vaddr;
- int ret;
- /*
- * We already unlinked the VMAs from the mm's rbtree so 'start'
- * is guaranteed to be in a hole. This gets us the first VMA
- * before the hole in to 'prev' and the next VMA after the hole
- * in to 'next'.
- */
- next = find_vma_prev(mm, start, &prev);
- /*
- * Do not count other MPX bounds table VMAs as neighbors.
- * Although theoretically possible, we do not allow bounds
- * tables for bounds tables so our heads do not explode.
- * If we count them as neighbors here, we may end up with
- * lots of tables even though we have no actual table
- * entries in use.
- */
- while (next && (next->vm_flags & VM_MPX))
- next = next->vm_next;
- while (prev && (prev->vm_flags & VM_MPX))
- prev = prev->vm_prev;
- /*
- * We know 'start' and 'end' lie within an area controlled
- * by a single bounds table. See if there are any other
- * VMAs controlled by that bounds table. If there are not
- * then we can "expand" the are we are unmapping to possibly
- * cover the entire table.
- */
- next = find_vma_prev(mm, start, &prev);
- if ((!prev || prev->vm_end <= bta_start_vaddr) &&
- (!next || next->vm_start >= bta_end_vaddr)) {
- /*
- * No neighbor VMAs controlled by same bounds
- * table. Try to unmap the whole thing
- */
- start = bta_start_vaddr;
- end = bta_end_vaddr;
- }
-
- bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start);
- ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
- /*
- * No bounds table there, so nothing to unmap.
- */
- if (ret == -ENOENT) {
- ret = 0;
- return 0;
- }
- if (ret)
- return ret;
- /*
- * We are unmapping an entire table. Either because the
- * unmap that started this whole process was large enough
- * to cover an entire table, or that the unmap was small
- * but was the area covered by a bounds table.
- */
- if ((start == bta_start_vaddr) &&
- (end == bta_end_vaddr))
- return unmap_entire_bt(mm, bde_vaddr, bt_addr);
- return zap_bt_entries_mapping(mm, bt_addr, start, end);
-}
-
-static int mpx_unmap_tables(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- unsigned long one_unmap_start;
- trace_mpx_unmap_search(start, end);
-
- one_unmap_start = start;
- while (one_unmap_start < end) {
- int ret;
- unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
- bd_entry_virt_space(mm));
- unsigned long one_unmap_end = end;
- /*
- * if the end is beyond the current bounds table,
- * move it back so we only deal with a single one
- * at a time
- */
- if (one_unmap_end > next_unmap_start)
- one_unmap_end = next_unmap_start;
- ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
- if (ret)
- return ret;
-
- one_unmap_start = next_unmap_start;
- }
- return 0;
-}
-
-/*
- * Free unused bounds tables covered in a virtual address region being
- * munmap()ed. Assume end > start.
- *
- * This function will be called by do_munmap(), and the VMAs covering
- * the virtual address region start...end have already been split if
- * necessary, and the 'vma' is the first vma in this range (start -> end).
- */
-void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
- struct vm_area_struct *vma;
- int ret;
-
- /*
- * Refuse to do anything unless userspace has asked
- * the kernel to help manage the bounds tables,
- */
- if (!kernel_managing_mpx_tables(current->mm))
- return;
- /*
- * This will look across the entire 'start -> end' range,
- * and find all of the non-VM_MPX VMAs.
- *
- * To avoid recursion, if a VM_MPX vma is found in the range
- * (start->end), we will not continue follow-up work. This
- * recursion represents having bounds tables for bounds tables,
- * which should not occur normally. Being strict about it here
- * helps ensure that we do not have an exploitable stack overflow.
- */
- vma = find_vma(mm, start);
- while (vma && vma->vm_start < end) {
- if (vma->vm_flags & VM_MPX)
- return;
- vma = vma->vm_next;
- }
-
- ret = mpx_unmap_tables(mm, start, end);
- if (ret)
- force_sig(SIGSEGV);
-}
-
-/* MPX cannot handle addresses above 47 bits yet. */
-unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
- unsigned long flags)
-{
- if (!kernel_managing_mpx_tables(current->mm))
- return addr;
- if (addr + len <= DEFAULT_MAP_WINDOW)
- return addr;
- if (flags & MAP_FIXED)
- return -ENOMEM;
-
- /*
- * Requested len is larger than the whole area we're allowed to map in.
- * Resetting hinting address wouldn't do much good -- fail early.
- */
- if (len > DEFAULT_MAP_WINDOW)
- return -ENOMEM;
-
- /* Look for unmap area within DEFAULT_MAP_WINDOW */
- return 0;
-}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4123100..e94da74 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -25,11 +25,8 @@
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
-static struct numa_meminfo numa_meminfo
-#ifndef CONFIG_MEMORY_HOTPLUG
-__initdata
-#endif
-;
+static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
static int numa_distance_cnt;
static u8 *numa_distance;
@@ -40,14 +37,12 @@
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
- numa_emu_cmdline(opt + 5);
-#endif
-#ifdef CONFIG_ACPI_NUMA
+ return numa_emu_cmdline(opt + 5);
if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
+ disable_srat();
+ if (!strncmp(opt, "nohmat", 6))
+ disable_hmat();
return 0;
}
early_param("numa", numa_setup);
@@ -169,6 +164,19 @@
}
/**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+ struct numa_meminfo *src)
+{
+ dst->blk[dst->nr_blks++] = src->blk[idx];
+ numa_remove_memblk_from(idx, src);
+}
+
+/**
* numa_add_memblk - Add one numa_memblk to numa_meminfo
* @nid: NUMA node ID of the new memblk
* @start: Start address of the new memblk
@@ -237,14 +245,25 @@
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
- /* make sure all blocks are inside the limits */
- bi->start = max(bi->start, low);
- bi->end = min(bi->end, high);
+ /* move / save reserved memory ranges */
+ if (!memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start)) {
+ numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+ continue;
+ }
- /* and there's no empty or non-exist block */
- if (bi->start >= bi->end ||
- !memblock_overlaps_region(&memblock.memory,
- bi->start, bi->end - bi->start))
+ /* make sure all non-reserved blocks are inside the limits */
+ bi->start = max(bi->start, low);
+
+ /* preserve info for non-RAM areas above 'max_pfn': */
+ if (bi->end > high) {
+ numa_add_memblk_to(bi->nid, high, bi->end,
+ &numa_reserved_meminfo);
+ bi->end = high;
+ }
+
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
numa_remove_memblk_from(i--, mi);
}
@@ -501,9 +520,11 @@
* memory ranges, because quirks such as trim_snb_memory()
* reserve specific pages for Sandy Bridge graphics. ]
*/
- for_each_memblock(reserved, mb_region) {
- if (mb_region->nid != MAX_NUMNODES)
- node_set(mb_region->nid, reserved_nodemask);
+ for_each_reserved_mem_region(mb_region) {
+ int nid = memblock_get_region_node(mb_region);
+
+ if (nid != MAX_NUMNODES)
+ node_set(nid, reserved_nodemask);
}
/*
@@ -526,7 +547,6 @@
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
- unsigned long uninitialized_var(pfn_align);
int i, nid;
/* Account for nodes with cpus and no memory */
@@ -554,15 +574,16 @@
* If sections array is gonna be used for pfn -> nid mapping, check
* whether its granularity is fine enough.
*/
-#ifdef NODE_NOT_IN_PAGE_FLAGS
- pfn_align = node_map_pfn_alignment();
- if (pfn_align && pfn_align < PAGES_PER_SECTION) {
- printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
- PFN_PHYS(pfn_align) >> 20,
- PFN_PHYS(PAGES_PER_SECTION) >> 20);
- return -EINVAL;
+ if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
+ unsigned long pfn_align = node_map_pfn_alignment();
+
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ PFN_PHYS(pfn_align) >> 20,
+ PFN_PHYS(PAGES_PER_SECTION) >> 20);
+ return -EINVAL;
+ }
}
-#endif
if (!numa_meminfo_cover_memory(mi))
return -EINVAL;
@@ -699,7 +720,7 @@
* x86_numa_init - Initialize NUMA
*
* Try each configured NUMA initialization method until one succeeds. The
- * last fallback is dummy single node config encomapssing whole memory and
+ * last fallback is dummy single node config encompassing whole memory and
* never fails.
*/
void __init x86_numa_init(void)
@@ -720,12 +741,9 @@
static void __init init_memory_less_node(int nid)
{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
/* Allocate and initialize node data. Memory-less node is now online.*/
alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
+ free_area_init_memoryless_node(nid);
/*
* All zonelists will be built later in start_kernel() after per cpu
@@ -734,6 +752,27 @@
}
/*
+ * A node may exist which has one or more Generic Initiators but no CPUs and no
+ * memory.
+ *
+ * This function must be called after init_cpu_to_node(), to ensure that any
+ * memoryless CPU nodes have already been brought online, and before the
+ * node_data[nid] is needed for zone list setup in build_all_zonelists().
+ *
+ * When this function is called, any nodes containing either memory and/or CPUs
+ * will already be online and there is no need to do anything extra, even if
+ * they also contain one or more Generic Initiators.
+ */
+void __init init_gi_nodes(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_GENERIC_INITIATOR)
+ if (!node_online(nid))
+ init_memory_less_node(nid);
+}
+
+/*
* Setup early cpu_to_node.
*
* Populate cpu_to_node[] only if x86_cpu_to_apicid[],
@@ -881,16 +920,38 @@
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 start)
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
{
- struct numa_meminfo *mi = &numa_meminfo;
- int nid = mi->blk[0].nid;
int i;
for (i = 0; i < mi->nr_blks; i++)
if (mi->blk[i].start <= start && mi->blk[i].end > start)
- nid = mi->blk[i].nid;
+ return mi->blk[i].nid;
+ return NUMA_NO_NODE;
+}
+
+int phys_to_target_node(phys_addr_t start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ /*
+ * Prefer online nodes, but if reserved memory might be
+ * hot-added continue the search with reserved ranges.
+ */
+ if (nid != NUMA_NO_NODE)
+ return nid;
+
+ return meminfo_to_nid(&numa_reserved_meminfo, start);
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_meminfo.blk[0].nid;
return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index f2bd3d6..1045443 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -27,40 +27,6 @@
#include "numa_internal.h"
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * 4) physnode_map - the mapping between a pfn and owning node
- * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 64Mb break (each element of the array will
- * represent 64Mb of memory and will be marked by the node id. so,
- * if the first gig is on node 0, and the second gig is on node 1
- * physnode_map will contain:
- *
- * physnode_map[0-15] = 0;
- * physnode_map[16-31] = 1;
- * physnode_map[32- ] = -1;
- */
-s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
-EXPORT_SYMBOL(physnode_map);
-
-void memory_present(int nid, unsigned long start, unsigned long end)
-{
- unsigned long pfn;
-
- printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
- nid, start, end);
- printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
- printk(KERN_DEBUG " ");
- start = round_down(start, PAGES_PER_SECTION);
- end = round_up(end, PAGES_PER_SECTION);
- for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
- physnode_map[pfn / PAGES_PER_SECTION] = nid;
- printk(KERN_CONT "%lx ", pfn);
- }
- printk(KERN_CONT "\n");
-}
-#endif
-
extern unsigned long highend_pfn, highstart_pfn;
void __init initmem_init(void)
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 8728225..87d77cc 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -13,9 +13,10 @@
static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;
-void __init numa_emu_cmdline(char *str)
+int __init numa_emu_cmdline(char *str)
{
emu_cmdline = str;
+ return 0;
}
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
@@ -324,7 +325,7 @@
0, NULL, 0);
}
-int __init setup_emu2phys_nid(int *dfl_phys_nid)
+static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
int i, max_emu_nid = 0;
@@ -438,7 +439,7 @@
goto no_emu;
if (numa_cleanup_meminfo(&ei) < 0) {
- pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+ pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
goto no_emu;
}
@@ -449,7 +450,7 @@
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
phys_size, PAGE_SIZE);
if (!phys) {
- pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+ pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
goto no_emu;
}
memblock_reserve(phys, phys_size);
diff --git a/arch/x86/mm/pat/Makefile b/arch/x86/mm/pat/Makefile
new file mode 100644
index 0000000..ea464c9
--- /dev/null
+++ b/arch/x86/mm/pat/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := set_memory.o memtype.o
+
+obj-$(CONFIG_X86_PAT) += memtype_interval.o
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pat/cpa-test.c
similarity index 98%
rename from arch/x86/mm/pageattr-test.c
rename to arch/x86/mm/pat/cpa-test.c
index facce27..0612a73 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -14,7 +14,6 @@
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
#include <asm/kdebug.h>
/*
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat/memtype.c
similarity index 83%
rename from arch/x86/mm/pat.c
rename to arch/x86/mm/pat/memtype.c
index 35b2e35..232932b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1,11 +1,34 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Handle caching attributes in page tables (PAT)
+ * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
*
* Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* Suresh B Siddha <suresh.b.siddha@intel.com>
*
* Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ *
+ * Basic principles:
+ *
+ * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
+ * the kernel to set one of a handful of 'caching type' attributes for physical
+ * memory ranges: uncached, write-combining, write-through, write-protected,
+ * and the most commonly used and default attribute: write-back caching.
+ *
+ * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is
+ * a hardware interface to enumerate a limited number of physical memory ranges
+ * and set their caching attributes explicitly, programmed into the CPU via MSRs.
+ * Even modern CPUs have MTRRs enabled - but these are typically not touched
+ * by the kernel or by user-space (such as the X server), we rely on PAT for any
+ * additional cache attribute logic.
+ *
+ * PAT doesn't work via explicit memory ranges, but uses page table entries to add
+ * cache attribute information to the mapped memory range: there's 3 bits used,
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
+ * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
+ *
+ * ( There's a metric ton of finer details, such as compatibility with CPU quirks
+ * that only support 4 types of PAT entries, and interaction with MTRRs, see
+ * below for details. )
*/
#include <linux/seq_file.h>
@@ -23,50 +46,53 @@
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
-#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/io.h>
-#include "pat_internal.h"
-#include "mm_internal.h"
+#include "memtype.h"
+#include "../mm_internal.h"
#undef pr_fmt
#define pr_fmt(fmt) "" fmt
-static bool __read_mostly boot_cpu_done;
+static bool __read_mostly pat_bp_initialized;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
-static bool __read_mostly pat_initialized;
-static bool __read_mostly init_cm_done;
+static bool __read_mostly pat_bp_enabled;
+static bool __read_mostly pat_cm_initialized;
-void pat_disable(const char *reason)
+/*
+ * PAT support is enabled by default, but can be disabled for
+ * various user-requested or hardware-forced reasons:
+ */
+void pat_disable(const char *msg_reason)
{
if (pat_disabled)
return;
- if (boot_cpu_done) {
+ if (pat_bp_initialized) {
WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
return;
}
pat_disabled = true;
- pr_info("x86/PAT: %s\n", reason);
+ pr_info("x86/PAT: %s\n", msg_reason);
}
static int __init nopat(char *str)
{
- pat_disable("PAT support disabled.");
+ pat_disable("PAT support disabled via boot option.");
return 0;
}
early_param("nopat", nopat);
bool pat_enabled(void)
{
- return pat_initialized;
+ return pat_bp_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);
@@ -197,6 +223,8 @@
char pat_msg[33];
int i;
+ WARN_ON_ONCE(pat_cm_initialized);
+
pat_msg[32] = 0;
for (i = 7; i >= 0; i--) {
cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
@@ -205,28 +233,28 @@
}
pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
- init_cm_done = true;
+ pat_cm_initialized = true;
}
#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
-static void pat_bsp_init(u64 pat)
+static void pat_bp_init(u64 pat)
{
u64 tmp_pat;
if (!boot_cpu_has(X86_FEATURE_PAT)) {
- pat_disable("PAT not supported by CPU.");
+ pat_disable("PAT not supported by the CPU.");
return;
}
rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
if (!tmp_pat) {
- pat_disable("PAT MSR is 0, disabled.");
+ pat_disable("PAT support disabled by the firmware.");
return;
}
wrmsrl(MSR_IA32_CR_PAT, pat);
- pat_initialized = true;
+ pat_bp_enabled = true;
__init_cache_modes(pat);
}
@@ -248,7 +276,7 @@
{
u64 pat = 0;
- if (init_cm_done)
+ if (pat_cm_initialized)
return;
if (boot_cpu_has(X86_FEATURE_PAT)) {
@@ -291,7 +319,7 @@
}
/**
- * pat_init - Initialize PAT MSR and PAT table
+ * pat_init - Initialize the PAT MSR and PAT table on the current CPU
*
* This function initializes PAT MSR and PAT table with an OS-defined value
* to enable additional cache attributes, WC, WT and WP.
@@ -305,6 +333,10 @@
u64 pat;
struct cpuinfo_x86 *c = &boot_cpu_data;
+#ifndef CONFIG_X86_PAT
+ pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
+#endif
+
if (pat_disabled)
return;
@@ -364,9 +396,9 @@
PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
}
- if (!boot_cpu_done) {
- pat_bsp_init(pat);
- boot_cpu_done = true;
+ if (!pat_bp_initialized) {
+ pat_bp_init(pat);
+ pat_bp_initialized = true;
} else {
pat_ap_init(pat);
}
@@ -542,16 +574,21 @@
* available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
-int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
enum page_cache_mode *new_type)
{
- struct memtype *new;
+ struct memtype *entry_new;
enum page_cache_mode actual_type;
int is_range_ram;
int err = 0;
start = sanitize_phys(start);
- end = sanitize_phys(end);
+
+ /*
+ * The end address passed into this function is exclusive, but
+ * sanitize_phys() expects an inclusive address.
+ */
+ end = sanitize_phys(end - 1) + 1;
if (start >= end) {
WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
start, end - 1, cattr_name(req_type));
@@ -593,22 +630,22 @@
return -EINVAL;
}
- new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!new)
+ entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!entry_new)
return -ENOMEM;
- new->start = start;
- new->end = end;
- new->type = actual_type;
+ entry_new->start = start;
+ entry_new->end = end;
+ entry_new->type = actual_type;
spin_lock(&memtype_lock);
- err = rbt_memtype_check_insert(new, new_type);
+ err = memtype_check_insert(entry_new, new_type);
if (err) {
- pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
start, end - 1,
- cattr_name(new->type), cattr_name(req_type));
- kfree(new);
+ cattr_name(entry_new->type), cattr_name(req_type));
+ kfree(entry_new);
spin_unlock(&memtype_lock);
return err;
@@ -616,18 +653,17 @@
spin_unlock(&memtype_lock);
- dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
- start, end - 1, cattr_name(new->type), cattr_name(req_type),
+ dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+ start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
new_type ? cattr_name(*new_type) : "-");
return err;
}
-int free_memtype(u64 start, u64 end)
+int memtype_free(u64 start, u64 end)
{
- int err = -EINVAL;
int is_range_ram;
- struct memtype *entry;
+ struct memtype *entry_old;
if (!pat_enabled())
return 0;
@@ -640,28 +676,24 @@
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1) {
-
- err = free_ram_pages_type(start, end);
-
- return err;
- } else if (is_range_ram < 0) {
+ if (is_range_ram == 1)
+ return free_ram_pages_type(start, end);
+ if (is_range_ram < 0)
return -EINVAL;
- }
spin_lock(&memtype_lock);
- entry = rbt_memtype_erase(start, end);
+ entry_old = memtype_erase(start, end);
spin_unlock(&memtype_lock);
- if (IS_ERR(entry)) {
+ if (IS_ERR(entry_old)) {
pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid, start, end - 1);
return -EINVAL;
}
- kfree(entry);
+ kfree(entry_old);
- dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
+ dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);
return 0;
}
@@ -693,13 +725,14 @@
spin_lock(&memtype_lock);
- entry = rbt_memtype_lookup(paddr);
+ entry = memtype_lookup(paddr);
if (entry != NULL)
rettype = entry->type;
else
rettype = _PAGE_CACHE_MODE_UC_MINUS;
spin_unlock(&memtype_lock);
+
return rettype;
}
@@ -723,7 +756,7 @@
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
/**
- * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * memtype_reserve_io - Request a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
* @type: A pointer to memtype, with requested type. On success, requested
@@ -732,7 +765,7 @@
* On success, returns 0
* On failure, returns non-zero
*/
-int io_reserve_memtype(resource_size_t start, resource_size_t end,
+int memtype_reserve_io(resource_size_t start, resource_size_t end,
enum page_cache_mode *type)
{
resource_size_t size = end - start;
@@ -742,47 +775,47 @@
WARN_ON_ONCE(iomem_map_sanity_check(start, size));
- ret = reserve_memtype(start, end, req_type, &new_type);
+ ret = memtype_reserve(start, end, req_type, &new_type);
if (ret)
goto out_err;
if (!is_new_memtype_allowed(start, size, req_type, new_type))
goto out_free;
- if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ if (memtype_kernel_map_sync(start, size, new_type) < 0)
goto out_free;
*type = new_type;
return 0;
out_free:
- free_memtype(start, end);
+ memtype_free(start, end);
ret = -EBUSY;
out_err:
return ret;
}
/**
- * io_free_memtype - Release a memory type mapping for a region of memory
+ * memtype_free_io - Release a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
*/
-void io_free_memtype(resource_size_t start, resource_size_t end)
+void memtype_free_io(resource_size_t start, resource_size_t end)
{
- free_memtype(start, end);
+ memtype_free(start, end);
}
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
- return io_reserve_memtype(start, start + size, &type);
+ return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);
void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
- io_free_memtype(start, start + size);
+ memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
@@ -839,10 +872,10 @@
}
/*
- * Change the memory type for the physial address range in kernel identity
+ * Change the memory type for the physical address range in kernel identity
* mapping space if that range is a part of identity map.
*/
-int kernel_map_sync_memtype(u64 base, unsigned long size,
+int memtype_kernel_map_sync(u64 base, unsigned long size,
enum page_cache_mode pcm)
{
unsigned long id_sz;
@@ -851,15 +884,14 @@
return 0;
/*
- * some areas in the middle of the kernel identity range
- * are not mapped, like the PCI space.
+ * Some areas in the middle of the kernel identity range
+ * are not mapped, for example the PCI space.
*/
if (!page_is_ram(base >> PAGE_SHIFT))
return 0;
id_sz = (__pa(high_memory-1) <= base + size) ?
- __pa(high_memory) - base :
- size;
+ __pa(high_memory) - base : size;
if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
@@ -873,7 +905,7 @@
/*
* Internal interface to reserve a range of physical memory with prot.
- * Reserved non RAM regions only and after successful reserve_memtype,
+ * Reserved non RAM regions only and after successful memtype_reserve,
* this func also keeps identity mapping (if any) in sync with this new prot.
*/
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
@@ -910,14 +942,14 @@
return 0;
}
- ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
+ ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
if (ret)
return ret;
if (pcm != want_pcm) {
if (strict_prot ||
!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
- free_memtype(paddr, paddr + size);
+ memtype_free(paddr, paddr + size);
pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_pcm),
@@ -935,8 +967,8 @@
cachemode2protval(pcm));
}
- if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
- free_memtype(paddr, paddr + size);
+ if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
+ memtype_free(paddr, paddr + size);
return -EINVAL;
}
return 0;
@@ -952,7 +984,7 @@
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
if (is_ram == 0)
- free_memtype(paddr, paddr + size);
+ memtype_free(paddr, paddr + size);
}
/*
@@ -1099,25 +1131,30 @@
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
+/*
+ * We are allocating a temporary printout-entry to be passed
+ * between seq_start()/next() and seq_show():
+ */
static struct memtype *memtype_get_idx(loff_t pos)
{
- struct memtype *print_entry;
+ struct memtype *entry_print;
int ret;
- print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!print_entry)
+ entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!entry_print)
return NULL;
spin_lock(&memtype_lock);
- ret = rbt_memtype_copy_nth_element(print_entry, pos);
+ ret = memtype_copy_nth_element(entry_print, pos);
spin_unlock(&memtype_lock);
- if (!ret) {
- return print_entry;
- } else {
- kfree(print_entry);
+ /* Free it on error: */
+ if (ret) {
+ kfree(entry_print);
return NULL;
}
+
+ return entry_print;
}
static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
@@ -1144,10 +1181,12 @@
static int memtype_seq_show(struct seq_file *seq, void *v)
{
- struct memtype *print_entry = (struct memtype *)v;
+ struct memtype *entry_print = (struct memtype *)v;
- seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
- print_entry->start, print_entry->end);
+ seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
+ entry_print->start,
+ entry_print->end,
+ cattr_name(entry_print->type));
return 0;
}
@@ -1179,7 +1218,6 @@
}
return 0;
}
-
late_initcall(pat_memtype_list_init);
#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pat/memtype.h b/arch/x86/mm/pat/memtype.h
new file mode 100644
index 0000000..cacecdb
--- /dev/null
+++ b/arch/x86/mm/pat/memtype.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __MEMTYPE_H_
+#define __MEMTYPE_H_
+
+extern int pat_debug_enable;
+
+#define dprintk(fmt, arg...) \
+ do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
+
+struct memtype {
+ u64 start;
+ u64 end;
+ u64 subtree_max_end;
+ enum page_cache_mode type;
+ struct rb_node rb;
+};
+
+static inline char *cattr_name(enum page_cache_mode pcm)
+{
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC: return "uncached";
+ case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_MODE_WB: return "write-back";
+ case _PAGE_CACHE_MODE_WC: return "write-combining";
+ case _PAGE_CACHE_MODE_WT: return "write-through";
+ case _PAGE_CACHE_MODE_WP: return "write-protected";
+ default: return "broken";
+ }
+}
+
+#ifdef CONFIG_X86_PAT
+extern int memtype_check_insert(struct memtype *entry_new,
+ enum page_cache_mode *new_type);
+extern struct memtype *memtype_erase(u64 start, u64 end);
+extern struct memtype *memtype_lookup(u64 addr);
+extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos);
+#else
+static inline int memtype_check_insert(struct memtype *entry_new,
+ enum page_cache_mode *new_type)
+{ return 0; }
+static inline struct memtype *memtype_erase(u64 start, u64 end)
+{ return NULL; }
+static inline struct memtype *memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
+
+#endif /* __MEMTYPE_H_ */
diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
new file mode 100644
index 0000000..645613d
--- /dev/null
+++ b/arch/x86/mm/pat/memtype_interval.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree used to store the PAT memory type reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/pgtable.h>
+
+#include <asm/memtype.h>
+
+#include "memtype.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) which tree is ordered
+ * by the starting address. The tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course, as enforced by the PAT logic.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static inline u64 interval_start(struct memtype *entry)
+{
+ return entry->start;
+}
+
+static inline u64 interval_end(struct memtype *entry)
+{
+ return entry->end - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
+ interval_start, interval_end,
+ static, interval)
+
+static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
+
+enum {
+ MEMTYPE_EXACT_MATCH = 0,
+ MEMTYPE_END_MATCH = 1
+};
+
+static struct memtype *memtype_match(u64 start, u64 end, int match_type)
+{
+ struct memtype *entry_match;
+
+ entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
+
+ while (entry_match != NULL && entry_match->start < end) {
+ if ((match_type == MEMTYPE_EXACT_MATCH) &&
+ (entry_match->start == start) && (entry_match->end == end))
+ return entry_match;
+
+ if ((match_type == MEMTYPE_END_MATCH) &&
+ (entry_match->start < start) && (entry_match->end == end))
+ return entry_match;
+
+ entry_match = interval_iter_next(entry_match, start, end-1);
+ }
+
+ return NULL; /* Returns NULL if there is no match */
+}
+
+static int memtype_check_conflict(u64 start, u64 end,
+ enum page_cache_mode reqtype,
+ enum page_cache_mode *newtype)
+{
+ struct memtype *entry_match;
+ enum page_cache_mode found_type = reqtype;
+
+ entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
+ if (entry_match == NULL)
+ goto success;
+
+ if (entry_match->type != found_type && newtype == NULL)
+ goto failure;
+
+ dprintk("Overlap at 0x%Lx-0x%Lx\n", entry_match->start, entry_match->end);
+ found_type = entry_match->type;
+
+ entry_match = interval_iter_next(entry_match, start, end-1);
+ while (entry_match) {
+ if (entry_match->type != found_type)
+ goto failure;
+
+ entry_match = interval_iter_next(entry_match, start, end-1);
+ }
+success:
+ if (newtype)
+ *newtype = found_type;
+
+ return 0;
+
+failure:
+ pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid, start, end,
+ cattr_name(found_type), cattr_name(entry_match->type));
+
+ return -EBUSY;
+}
+
+int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_type)
+{
+ int err = 0;
+
+ err = memtype_check_conflict(entry_new->start, entry_new->end, entry_new->type, ret_type);
+ if (err)
+ return err;
+
+ if (ret_type)
+ entry_new->type = *ret_type;
+
+ interval_insert(entry_new, &memtype_rbroot);
+ return 0;
+}
+
+struct memtype *memtype_erase(u64 start, u64 end)
+{
+ struct memtype *entry_old;
+
+ /*
+ * Since the memtype_rbroot tree allows overlapping ranges,
+ * memtype_erase() checks with EXACT_MATCH first, i.e. free
+ * a whole node for the munmap case. If no such entry is found,
+ * it then checks with END_MATCH, i.e. shrink the size of a node
+ * from the end for the mremap case.
+ */
+ entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
+ if (!entry_old) {
+ entry_old = memtype_match(start, end, MEMTYPE_END_MATCH);
+ if (!entry_old)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (entry_old->start == start) {
+ /* munmap: erase this node */
+ interval_remove(entry_old, &memtype_rbroot);
+ } else {
+ /* mremap: update the end value of this node */
+ interval_remove(entry_old, &memtype_rbroot);
+ entry_old->end = start;
+ interval_insert(entry_old, &memtype_rbroot);
+
+ return NULL;
+ }
+
+ return entry_old;
+}
+
+struct memtype *memtype_lookup(u64 addr)
+{
+ return interval_iter_first(&memtype_rbroot, addr, addr + PAGE_SIZE-1);
+}
+
+/*
+ * Debugging helper, copy the Nth entry of the tree into a
+ * a copy for printout. This allows us to print out the tree
+ * via debugfs, without holding the memtype_lock too long:
+ */
+#ifdef CONFIG_DEBUG_FS
+int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos)
+{
+ struct memtype *entry_match;
+ int i = 1;
+
+ entry_match = interval_iter_first(&memtype_rbroot, 0, ULONG_MAX);
+
+ while (entry_match && pos != i) {
+ entry_match = interval_iter_next(entry_match, 0, ULONG_MAX);
+ i++;
+ }
+
+ if (entry_match) { /* pos == i */
+ *entry_out = *entry_match;
+ return 0;
+ } else {
+ return 1;
+ }
+}
+#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pat/set_memory.c
similarity index 96%
rename from arch/x86/mm/pageattr.c
rename to arch/x86/mm/pat/set_memory.c
index 281e584..40baa90 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -15,6 +15,7 @@
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/vmalloc.h>
+#include <linux/libnvdimm.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -24,10 +25,10 @@
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/set_memory.h>
-#include "mm_internal.h"
+#include "../mm_internal.h"
/*
* The current flushing context - we pass it instead of 5 arguments:
@@ -68,6 +69,11 @@
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
+static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
+{
+ return __pgprot(cachemode2protval(pcm));
+}
+
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -129,7 +135,7 @@
static inline void cpa_inc_4k_install(void)
{
- cpa_4k_install++;
+ data_race(cpa_4k_install++);
}
static inline void cpa_inc_lp_sameprot(int level)
@@ -305,11 +311,13 @@
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
+#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_invalidate_pmem(void *addr, size_t size)
{
clflush_cache_range(addr, size);
}
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
+#endif
static void __cpa_flush_all(void *arg)
{
@@ -332,13 +340,13 @@
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-void __cpa_flush_tlb(void *data)
+static void __cpa_flush_tlb(void *data)
{
struct cpa_data *cpa = data;
unsigned int i;
for (i = 0; i < cpa->numpages; i++)
- __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
+ flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
static void cpa_flush(struct cpa_data *data, int cache)
@@ -619,6 +627,17 @@
}
EXPORT_SYMBOL_GPL(lookup_address);
+/*
+ * Lookup the page table entry for a virtual address in a given mm. Return a
+ * pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
+ unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
+}
+EXPORT_SYMBOL_GPL(lookup_address_in_mm);
+
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
@@ -1785,10 +1804,23 @@
CPA_PAGES_ARRAY, pages);
}
+/*
+ * _set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers and a reservation has already been taken.
+ * If you want to set the pgprot to a specific page protocol, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+ return change_page_attr_set_clr(&addr, numpages, prot,
+ __pgprot(~pgprot_val(prot)), 0, 0,
+ NULL);
+}
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
- * for now UC MINUS. see comments in ioremap_nocache()
+ * for now UC MINUS. see comments in ioremap()
* If you really need strong UC use ioremap_uc(), but note
* that you cannot override IO areas with set_memory_*() as
* these helpers cannot work with IO memory.
@@ -1803,9 +1835,9 @@
int ret;
/*
- * for now UC MINUS. see comments in ioremap_nocache()
+ * for now UC MINUS. see comments in ioremap()
*/
- ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_UC_MINUS, NULL);
if (ret)
goto out_err;
@@ -1817,7 +1849,7 @@
return 0;
out_free:
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
return ret;
}
@@ -1843,14 +1875,14 @@
{
int ret;
- ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_WC, NULL);
if (ret)
return ret;
ret = _set_memory_wc(addr, numpages);
if (ret)
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
return ret;
}
@@ -1877,7 +1909,7 @@
if (ret)
return ret;
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
return 0;
}
EXPORT_SYMBOL(set_memory_wb);
@@ -1967,7 +1999,7 @@
/*
* Before changing the encryption attribute, we need to flush caches.
*/
- cpa_flush(&cpa, 1);
+ cpa_flush(&cpa, !this_cpu_has(X86_FEATURE_SME_COHERENT));
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2018,7 +2050,7 @@
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
- if (reserve_memtype(start, end, new_type, NULL))
+ if (memtype_reserve(start, end, new_type, NULL))
goto err_out;
}
@@ -2044,7 +2076,7 @@
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
- free_memtype(start, end);
+ memtype_free(start, end);
}
return -EINVAL;
}
@@ -2093,7 +2125,7 @@
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
- free_memtype(start, end);
+ memtype_free(start, end);
}
return 0;
@@ -2279,5 +2311,5 @@
* be exposed to the rest of the kernel. Include these directly here.
*/
#ifdef CONFIG_CPA_DEBUG
-#include "pageattr-test.c"
+#include "cpa-test.c"
#endif
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
deleted file mode 100644
index eeb5cae..0000000
--- a/arch/x86/mm/pat_internal.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PAT_INTERNAL_H_
-#define __PAT_INTERNAL_H_
-
-extern int pat_debug_enable;
-
-#define dprintk(fmt, arg...) \
- do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
-
-struct memtype {
- u64 start;
- u64 end;
- u64 subtree_max_end;
- enum page_cache_mode type;
- struct rb_node rb;
-};
-
-static inline char *cattr_name(enum page_cache_mode pcm)
-{
- switch (pcm) {
- case _PAGE_CACHE_MODE_UC: return "uncached";
- case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus";
- case _PAGE_CACHE_MODE_WB: return "write-back";
- case _PAGE_CACHE_MODE_WC: return "write-combining";
- case _PAGE_CACHE_MODE_WT: return "write-through";
- case _PAGE_CACHE_MODE_WP: return "write-protected";
- default: return "broken";
- }
-}
-
-#ifdef CONFIG_X86_PAT
-extern int rbt_memtype_check_insert(struct memtype *new,
- enum page_cache_mode *new_type);
-extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
-extern struct memtype *rbt_memtype_lookup(u64 addr);
-extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
-#else
-static inline int rbt_memtype_check_insert(struct memtype *new,
- enum page_cache_mode *new_type)
-{ return 0; }
-static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
-{ return NULL; }
-static inline struct memtype *rbt_memtype_lookup(u64 addr)
-{ return NULL; }
-static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
-{ return 0; }
-#endif
-
-#endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
deleted file mode 100644
index 65ebe4b..0000000
--- a/arch/x86/mm/pat_rbtree.c
+++ /dev/null
@@ -1,268 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Handle caching attributes in page tables (PAT)
- *
- * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- * Suresh B Siddha <suresh.b.siddha@intel.com>
- *
- * Interval tree (augmented rbtree) used to store the PAT memory type
- * reservations.
- */
-
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/kernel.h>
-#include <linux/rbtree_augmented.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-
-#include <asm/pgtable.h>
-#include <asm/pat.h>
-
-#include "pat_internal.h"
-
-/*
- * The memtype tree keeps track of memory type for specific
- * physical memory areas. Without proper tracking, conflicting memory
- * types in different mappings can cause CPU cache corruption.
- *
- * The tree is an interval tree (augmented rbtree) with tree ordered
- * on starting address. Tree can contain multiple entries for
- * different regions which overlap. All the aliases have the same
- * cache attributes of course.
- *
- * memtype_lock protects the rbtree.
- */
-
-static struct rb_root memtype_rbroot = RB_ROOT;
-
-static int is_node_overlap(struct memtype *node, u64 start, u64 end)
-{
- if (node->start >= end || node->end <= start)
- return 0;
-
- return 1;
-}
-
-static u64 get_subtree_max_end(struct rb_node *node)
-{
- u64 ret = 0;
- if (node) {
- struct memtype *data = rb_entry(node, struct memtype, rb);
- ret = data->subtree_max_end;
- }
- return ret;
-}
-
-#define NODE_END(node) ((node)->end)
-
-RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb,
- struct memtype, rb, u64, subtree_max_end, NODE_END)
-
-/* Find the first (lowest start addr) overlapping range from rb tree */
-static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
- u64 start, u64 end)
-{
- struct rb_node *node = root->rb_node;
- struct memtype *last_lower = NULL;
-
- while (node) {
- struct memtype *data = rb_entry(node, struct memtype, rb);
-
- if (get_subtree_max_end(node->rb_left) > start) {
- /* Lowest overlap if any must be on left side */
- node = node->rb_left;
- } else if (is_node_overlap(data, start, end)) {
- last_lower = data;
- break;
- } else if (start >= data->start) {
- /* Lowest overlap if any must be on right side */
- node = node->rb_right;
- } else {
- break;
- }
- }
- return last_lower; /* Returns NULL if there is no overlap */
-}
-
-enum {
- MEMTYPE_EXACT_MATCH = 0,
- MEMTYPE_END_MATCH = 1
-};
-
-static struct memtype *memtype_rb_match(struct rb_root *root,
- u64 start, u64 end, int match_type)
-{
- struct memtype *match;
-
- match = memtype_rb_lowest_match(root, start, end);
- while (match != NULL && match->start < end) {
- struct rb_node *node;
-
- if ((match_type == MEMTYPE_EXACT_MATCH) &&
- (match->start == start) && (match->end == end))
- return match;
-
- if ((match_type == MEMTYPE_END_MATCH) &&
- (match->start < start) && (match->end == end))
- return match;
-
- node = rb_next(&match->rb);
- if (node)
- match = rb_entry(node, struct memtype, rb);
- else
- match = NULL;
- }
-
- return NULL; /* Returns NULL if there is no match */
-}
-
-static int memtype_rb_check_conflict(struct rb_root *root,
- u64 start, u64 end,
- enum page_cache_mode reqtype,
- enum page_cache_mode *newtype)
-{
- struct rb_node *node;
- struct memtype *match;
- enum page_cache_mode found_type = reqtype;
-
- match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
- if (match == NULL)
- goto success;
-
- if (match->type != found_type && newtype == NULL)
- goto failure;
-
- dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
- found_type = match->type;
-
- node = rb_next(&match->rb);
- while (node) {
- match = rb_entry(node, struct memtype, rb);
-
- if (match->start >= end) /* Checked all possible matches */
- goto success;
-
- if (is_node_overlap(match, start, end) &&
- match->type != found_type) {
- goto failure;
- }
-
- node = rb_next(&match->rb);
- }
-success:
- if (newtype)
- *newtype = found_type;
-
- return 0;
-
-failure:
- pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid, start, end,
- cattr_name(found_type), cattr_name(match->type));
- return -EBUSY;
-}
-
-static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
-{
- struct rb_node **node = &(root->rb_node);
- struct rb_node *parent = NULL;
-
- while (*node) {
- struct memtype *data = rb_entry(*node, struct memtype, rb);
-
- parent = *node;
- if (data->subtree_max_end < newdata->end)
- data->subtree_max_end = newdata->end;
- if (newdata->start <= data->start)
- node = &((*node)->rb_left);
- else if (newdata->start > data->start)
- node = &((*node)->rb_right);
- }
-
- newdata->subtree_max_end = newdata->end;
- rb_link_node(&newdata->rb, parent, node);
- rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
-}
-
-int rbt_memtype_check_insert(struct memtype *new,
- enum page_cache_mode *ret_type)
-{
- int err = 0;
-
- err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
- new->type, ret_type);
-
- if (!err) {
- if (ret_type)
- new->type = *ret_type;
-
- new->subtree_max_end = new->end;
- memtype_rb_insert(&memtype_rbroot, new);
- }
- return err;
-}
-
-struct memtype *rbt_memtype_erase(u64 start, u64 end)
-{
- struct memtype *data;
-
- /*
- * Since the memtype_rbroot tree allows overlapping ranges,
- * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free
- * a whole node for the munmap case. If no such entry is found,
- * it then checks with END_MATCH, i.e. shrink the size of a node
- * from the end for the mremap case.
- */
- data = memtype_rb_match(&memtype_rbroot, start, end,
- MEMTYPE_EXACT_MATCH);
- if (!data) {
- data = memtype_rb_match(&memtype_rbroot, start, end,
- MEMTYPE_END_MATCH);
- if (!data)
- return ERR_PTR(-EINVAL);
- }
-
- if (data->start == start) {
- /* munmap: erase this node */
- rb_erase_augmented(&data->rb, &memtype_rbroot,
- &memtype_rb_augment_cb);
- } else {
- /* mremap: update the end value of this node */
- rb_erase_augmented(&data->rb, &memtype_rbroot,
- &memtype_rb_augment_cb);
- data->end = start;
- data->subtree_max_end = data->end;
- memtype_rb_insert(&memtype_rbroot, data);
- return NULL;
- }
-
- return data;
-}
-
-struct memtype *rbt_memtype_lookup(u64 addr)
-{
- return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
-}
-
-#if defined(CONFIG_DEBUG_FS)
-int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
-{
- struct rb_node *node;
- int i = 1;
-
- node = rb_first(&memtype_rbroot);
- while (node && pos != i) {
- node = rb_next(node);
- i++;
- }
-
- if (node) { /* pos == i */
- struct memtype *this = rb_entry(node, struct memtype, rb);
- *out = *this;
- return 0;
- } else {
- return 1;
- }
-}
-#endif
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7982f13..f6a9e2e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -3,7 +3,6 @@
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>
@@ -19,6 +18,14 @@
#define PGTABLE_HIGHMEM 0
#endif
+#ifndef CONFIG_PARAVIRT
+static inline
+void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ tlb_remove_page(tlb, table);
+}
+#endif
+
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -706,11 +713,9 @@
if (pud_present(*pud) && !pud_huge(*pud))
return 0;
- prot = pgprot_4k_2_large(prot);
-
set_pte((pte_t *)pud, pfn_pte(
(u64)addr >> PAGE_SHIFT,
- __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
@@ -738,11 +743,9 @@
if (pmd_present(*pmd) && !pmd_huge(*pmd))
return 0;
- prot = pgprot_4k_2_large(prot);
-
set_pte((pte_t *)pmd, pfn_pte(
(u64)addr >> PAGE_SHIFT,
- __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 9bb7f0a..c234634 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -11,13 +11,12 @@
#include <linux/spinlock.h>
#include <asm/cpu_entry_area.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820/api.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
+#include <linux/vmalloc.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -63,7 +62,7 @@
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
- __flush_tlb_one_kernel(vaddr);
+ flush_tlb_one_kernel(vaddr);
}
unsigned long __FIXADDR_TOP = 0xfffff000;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index bdc9815..fc3f3d3 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <asm/page.h>
+#include <linux/vmalloc.h>
#include "physaddr.h"
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index c6f84c0..8873ed1 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -63,7 +63,7 @@
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
- if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC)
return false;
if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
return false;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 7f21404..1aab929 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -34,11 +34,10 @@
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/sections.h>
+#include <asm/set_memory.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -447,13 +446,7 @@
* the sp1 and sp2 slots.
*
* This is done for all possible CPUs during boot to ensure
- * that it's propagated to all mms. If we were to add one of
- * these mappings during CPU hotplug, we would need to take
- * some measure to make sure that every mm that subsequently
- * ran on that CPU would have the relevant PGD entry in its
- * pagetables. The usual vmalloc_fault() mechanism would not
- * work for page faults taken in entry_SYSCALL_64 before RSP
- * is set up.
+ * that it's propagated to all mms.
*/
unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
@@ -498,12 +491,12 @@
}
/*
- * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ * Clone the populated PMDs of the entry text and force it RO.
*/
static void pti_clone_entry_text(void)
{
pti_clone_pgtable((unsigned long) __entry_text_start,
- (unsigned long) __irqentry_text_end,
+ (unsigned long) __entry_text_end,
PTI_CLONE_PMD);
}
@@ -555,13 +548,6 @@
}
/*
- * This is the only user for these and it is not arch-generic
- * like the other set_memory.h functions. Just extern them.
- */
-extern int set_memory_nonglobal(unsigned long addr, int numpages);
-extern int set_memory_global(unsigned long addr, int numpages);
-
-/*
* For some configurations, map all of kernel text into the user page
* tables. This reduces TLB misses, especially on non-PCID systems.
*/
@@ -574,7 +560,7 @@
*/
unsigned long start = PFN_ALIGN(_text);
unsigned long end_clone = (unsigned long)__end_rodata_aligned;
- unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
+ unsigned long end_global = PFN_ALIGN((unsigned long)_etext);
if (!pti_kernel_image_global_ok())
return;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index adb3c57..ed5667f 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -2,8 +2,8 @@
#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/init.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/cpufeature.h>
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index a8bd952..bda73cb 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -79,7 +79,7 @@
static void do_test(unsigned long size)
{
- void __iomem *p = ioremap_nocache(mmio_address, size);
+ void __iomem *p = ioremap(mmio_address, size);
if (!p) {
pr_err("could not ioremap, aborting.\n");
return;
@@ -104,7 +104,7 @@
int i;
for (i = 0; i < 10; ++i) {
- p = ioremap_nocache(mmio_address, PAGE_SIZE);
+ p = ioremap(mmio_address, PAGE_SIZE);
if (p)
iounmap(p);
}
@@ -127,9 +127,9 @@
return -ENXIO;
}
- pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
- "and writing 16 kB of rubbish in there.\n",
- size >> 10, mmio_address);
+ pr_warn("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+ "and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
do_test(size);
do_test_bulk_ioremapping();
pr_info("All done.\n");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 851359b..569ac1d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,10 +14,19 @@
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
-#include <asm/uv/uv.h>
#include "mm_internal.h"
+#ifdef CONFIG_PARAVIRT
+# define STATIC_NOPV
+#else
+# define STATIC_NOPV static
+# define __flush_tlb_local native_flush_tlb_local
+# define __flush_tlb_global native_flush_tlb_global
+# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
+# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+#endif
+
/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
@@ -39,6 +48,126 @@
#define LAST_USER_MM_IBPB 0x1UL
/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID - [0, TLB_NR_DYN_ASIDS-1]
+ * the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * the value we write into the PCID part of CR3; corresponds to the
+ * ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * for KPTI each mm has two address spaces and thus needs two
+ * PCID values, but we can still do with a single ASID denomination
+ * for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Make sure that the dynamic ASID space does not confict with the
+ * bit we are using to switch between user and kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+ /*
+ * The ASID being passed in here should have respected the
+ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+ /*
+ * The dynamically-assigned ASIDs that get passed in are small
+ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+ * so do not bother to clear it.
+ *
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+ * value (with PCID == 0), and then restores CR3, thus corrupting
+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3 with
+ * CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+ u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+ return ret;
+}
+
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ /*
+ * Use boot_cpu_has() instead of this_cpu_has() as this function
+ * might be called during early boot. This should work even after
+ * boot because all CPU's the have same capabilities:
+ */
+ VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
+
+/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
* necessary invalidation by clearing out the 'ctx_id' which
@@ -110,6 +239,32 @@
*need_flush = true;
}
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+ /* There is no user ASID if address space separation is off */
+ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ return;
+
+ /*
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ __set_bit(kern_pcid(asid),
+ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
@@ -161,34 +316,6 @@
local_irq_restore(flags);
}
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
- unsigned long sp = current_stack_pointer;
- pgd_t *pgd = pgd_offset(mm, sp);
-
- if (pgtable_l5_enabled()) {
- if (unlikely(pgd_none(*pgd))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
-
- set_pgd(pgd, *pgd_ref);
- }
- } else {
- /*
- * "pgd" is faked. The top level entries are "p4d"s, so sync
- * the p4d. This compiles to approximately the same code as
- * the 5-level case.
- */
- p4d_t *p4d = p4d_offset(pgd, sp);
-
- if (unlikely(p4d_none(*p4d))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
- p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
- set_p4d(p4d, *p4d_ref);
- }
- }
-}
-
static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
@@ -272,6 +399,26 @@
}
}
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+ if (static_branch_unlikely(&rdpmc_always_available_key) ||
+ (!static_branch_unlikely(&rdpmc_never_available_key) &&
+ atomic_read(&mm->context.perf_rdpmc_allowed)))
+ cr4_set_bits_irqsoff(X86_CR4_PCE);
+ else
+ cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+
+void cr4_update_pce(void *ignored)
+{
+ cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -383,15 +530,6 @@
*/
cond_ibpb(tsk);
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- sync_current_stack_to_mm(next);
- }
-
/*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
@@ -422,21 +560,12 @@
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
load_new_mm_cr3(next->pgd, new_asid, true);
- /*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
- */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
load_new_mm_cr3(next->pgd, new_asid, false);
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
/* Make sure we write CR3 before loaded_mm. */
@@ -446,7 +575,7 @@
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
- load_mm_cr4_irqsoff(next);
+ cr4_update_pce_mm(next);
switch_ldt(real_prev, next);
}
}
@@ -623,7 +752,7 @@
unsigned long addr = f->start;
while (addr < f->end) {
- __flush_tlb_one_user(addr);
+ flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
@@ -631,7 +760,7 @@
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
- local_flush_tlb();
+ flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
trace_tlb_flush(reason, TLB_FLUSH_ALL);
@@ -666,8 +795,8 @@
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
-void native_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
+STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
@@ -676,29 +805,6 @@
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
- if (is_uv_system()) {
- /*
- * This whole special case is confused. UV has a "Broadcast
- * Assist Unit", which seems to be a fancy way to send IPIs.
- * Back when x86 used an explicit TLB flush IPI, UV was
- * optimized to use its own mechanism. These days, x86 uses
- * smp_call_function_many(), but UV still uses a manual IPI,
- * and that IPI's action is out of date -- it does a manual
- * flush instead of calling flush_tlb_func_remote(). This
- * means that the percpu tlb_gen variables won't be updated
- * and we'll do pointless flushes on future context switches.
- *
- * Rather than hooking native_flush_tlb_others() here, I think
- * that UV should be updated so that smp_call_function_many(),
- * etc, are optimal on UV.
- */
- cpumask = uv_flush_tlb_others(cpumask, info);
- if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- return;
- }
-
/*
* If no page tables were freed, we can skip sending IPIs to
* CPUs in lazy TLB mode. They will flush the CPU themselves
@@ -714,7 +820,13 @@
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
- (void *)info, 1, GFP_ATOMIC, cpumask);
+ (void *)info, 1, cpumask);
+}
+
+void flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ __flush_tlb_others(cpumask, info);
}
/*
@@ -827,7 +939,7 @@
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_one_kernel(addr);
+ flush_tlb_one_kernel(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -850,6 +962,164 @@
}
/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || preemptible());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+
+/*
+ * Flush one page in the kernel mapping
+ */
+void flush_tlb_one_kernel(unsigned long addr)
+{
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+ /*
+ * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+ * paravirt equivalent. Even with PCID, this is sufficient: we only
+ * use PCID if we also use global PTEs for the kernel mapping, and
+ * INVLPG flushes global translations across all address spaces.
+ *
+ * If PTI is on, then the kernel is mapped with non-global PTEs, and
+ * __flush_tlb_one_user() will flush the given address for the current
+ * kernel address space and for its usermode counterpart, but it does
+ * not flush it for other address spaces.
+ */
+ flush_tlb_one_user(addr);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * See above. We need to propagate the flush to all other address
+ * spaces. In principle, we only need to propagate it to kernelmode
+ * address spaces, but the extra bookkeeping we would need is not
+ * worth it.
+ */
+ this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Flush one page in the user mapping
+ */
+STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
+{
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+ * Just use invalidate_user_asid() in case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+ invalidate_user_asid(loaded_mm_asid);
+ else
+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+void flush_tlb_one_user(unsigned long addr)
+{
+ __flush_tlb_one_user(addr);
+}
+
+/*
+ * Flush everything
+ */
+STATIC_NOPV void native_flush_tlb_global(void)
+{
+ unsigned long cr4, flags;
+
+ if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ /* toggle PGE */
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Flush the entire current user mapping
+ */
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+ /*
+ * Preemption or interrupts must be disabled to protect the access
+ * to the per CPU variable and to prevent being preempted between
+ * read_cr3() and write_cr3().
+ */
+ WARN_ON_ONCE(preemptible());
+
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* If current->mm == NULL then the read_cr3() "borrows" an mm */
+ native_write_cr3(__native_read_cr3());
+}
+
+void flush_tlb_local(void)
+{
+ __flush_tlb_local();
+}
+
+/*
+ * Flush everything
+ */
+void __flush_tlb_all(void)
+{
+ /*
+ * This is to catch users with enabled preemption and the PGE feature
+ * and don't trigger the warning in __native_flush_tlb().
+ */
+ VM_WARN_ON_ONCE(preemptible());
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ __flush_tlb_global();
+ } else {
+ /*
+ * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+ */
+ flush_tlb_local();
+ }
+}
+EXPORT_SYMBOL_GPL(__flush_tlb_all);
+
+/*
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
* This means that the 'struct flush_tlb_info' that describes which mappings to
* flush is actually fixed. We therefore set a single fixed struct and use it in
@@ -880,6 +1150,38 @@
put_cpu();
}
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+bool nmi_uaccess_okay(void)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *current_mm = current->mm;
+
+ VM_WARN_ON_ONCE(!loaded_mm);
+
+ /*
+ * The condition we want to check is
+ * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+ * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+ * is supposed to be reasonably fast.
+ *
+ * Instead, we check the almost equivalent but somewhat conservative
+ * condition below, and we rely on the fact that switch_mm_irqs_off()
+ * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+ */
+ if (loaded_mm != current_mm)
+ return false;
+
+ VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+ return true;
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{