Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
deleted file mode 100644
index 5d53684..0000000
--- a/arch/powerpc/mm/8xx_mmu.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * This file contains the routines for initializing the MMU
- * on the 8xx series of chips.
- * -- christophe
- *
- * Derived from arch/powerpc/mm/40x_mmu.c:
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/memblock.h>
-#include <asm/fixmap.h>
-#include <asm/code-patching.h>
-
-#include "mmu_decl.h"
-
-#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT)
-
-extern int __map_without_ltlbs;
-
-static unsigned long block_mapped_ram;
-
-/*
- * Return PA for this VA if it is in an area mapped with LTLBs.
- * Otherwise, returns 0
- */
-phys_addr_t v_block_mapped(unsigned long va)
-{
- unsigned long p = PHYS_IMMR_BASE;
-
- if (__map_without_ltlbs)
- return 0;
- if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
- return p + va - VIRT_IMMR_BASE;
- if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
- return __pa(va);
- return 0;
-}
-
-/*
- * Return VA for a given PA mapped with LTLBs or 0 if not mapped
- */
-unsigned long p_block_mapped(phys_addr_t pa)
-{
- unsigned long p = PHYS_IMMR_BASE;
-
- if (__map_without_ltlbs)
- return 0;
- if (pa >= p && pa < p + IMMR_SIZE)
- return VIRT_IMMR_BASE + pa - p;
- if (pa < block_mapped_ram)
- return (unsigned long)__va(pa);
- return 0;
-}
-
-#define LARGE_PAGE_SIZE_8M (1<<23)
-
-/*
- * MMU_init_hw does the chip-specific initialization of the MMU hardware.
- */
-void __init MMU_init_hw(void)
-{
- /* PIN up to the 3 first 8Mb after IMMR in DTLB table */
-#ifdef CONFIG_PIN_TLB_DATA
- unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
- unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_PRIVILEGED | _PAGE_DIRTY;
-#ifdef CONFIG_PIN_TLB_IMMR
- int i = 29;
-#else
- int i = 28;
-#endif
- unsigned long addr = 0;
- unsigned long mem = total_lowmem;
-
- for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
- mtspr(SPRN_MD_CTR, ctr | (i << 8));
- mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
- mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
- mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
- addr += LARGE_PAGE_SIZE_8M;
- mem -= LARGE_PAGE_SIZE_8M;
- }
-#endif
-}
-
-static void __init mmu_mapin_immr(void)
-{
- unsigned long p = PHYS_IMMR_BASE;
- unsigned long v = VIRT_IMMR_BASE;
- unsigned long f = pgprot_val(PAGE_KERNEL_NCG);
- int offset;
-
- for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE)
- map_kernel_page(v + offset, p + offset, f);
-}
-
-/* Address of instructions to patch */
-#ifndef CONFIG_PIN_TLB_IMMR
-extern unsigned int DTLBMiss_jmp;
-#endif
-extern unsigned int DTLBMiss_cmp, FixupDAR_cmp;
-#ifndef CONFIG_PIN_TLB_TEXT
-extern unsigned int ITLBMiss_cmp;
-#endif
-
-static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long mapped)
-{
- unsigned int instr = *addr;
-
- instr &= 0xffff0000;
- instr |= (unsigned long)__va(mapped) >> 16;
- patch_instruction(addr, instr);
-}
-
-unsigned long __init mmu_mapin_ram(unsigned long top)
-{
- unsigned long mapped;
-
- if (__map_without_ltlbs) {
- mapped = 0;
- mmu_mapin_immr();
-#ifndef CONFIG_PIN_TLB_IMMR
- patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP);
-#endif
-#ifndef CONFIG_PIN_TLB_TEXT
- mmu_patch_cmp_limit(&ITLBMiss_cmp, 0);
-#endif
- } else {
- mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
- }
-
- mmu_patch_cmp_limit(&DTLBMiss_cmp, mapped);
- mmu_patch_cmp_limit(&FixupDAR_cmp, mapped);
-
- /* If the size of RAM is not an exact power of two, we may not
- * have covered RAM in its entirety with 8 MiB
- * pages. Consequently, restrict the top end of RAM currently
- * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
- * coverage with normal-sized pages (or other reasons) do not
- * attempt to allocate outside the allowed range.
- */
- if (mapped)
- memblock_set_current_limit(mapped);
-
- block_mapped_ram = mapped;
-
- return mapped;
-}
-
-void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* 8xx can only access 24MB at the moment */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000));
-}
-
-/*
- * Set up to use a given MMU context.
- * id is context number, pgd is PGD pointer.
- *
- * We place the physical address of the new task page directory loaded
- * into the MMU base register, and set the ASID compare register with
- * the new "context."
- */
-void set_context(unsigned long id, pgd_t *pgd)
-{
- s16 offset = (s16)(__pa(swapper_pg_dir));
-
-#ifdef CONFIG_BDI_SWITCH
- pgd_t **ptr = *(pgd_t ***)(KERNELBASE + 0xf0);
-
- /* Context switch the PTE pointer for the Abatron BDI2000.
- * The PGDIR is passed as second argument.
- */
- *(ptr + 1) = pgd;
-#endif
-
- /* Register M_TW will contain base address of level 1 table minus the
- * lower part of the kernel PGDIR base address, so that all accesses to
- * level 1 table are done relative to lower part of kernel PGDIR base
- * address.
- */
- mtspr(SPRN_M_TW, __pa(pgd) - offset);
-
- /* Update context */
- mtspr(SPRN_M_CASID, id - 1);
- /* sync */
- mb();
-}
-
-void flush_instruction_cache(void)
-{
- isync();
- mtspr(SPRN_IC_CST, IDC_INVALL);
- isync();
-}
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index cdf6a99..5e14798 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -3,45 +3,20 @@
# Makefile for the linux ppc-specific parts of the memory manager.
#
-subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
obj-y := fault.o mem.o pgtable.o mmap.o \
init_$(BITS).o pgtable_$(BITS).o \
+ pgtable-frag.o ioremap.o ioremap_$(BITS).o \
init-common.o mmu_context.o drmem.o
-obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \
- tlb_nohash_low.o
-obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o
-hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o
-obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o
-obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
-obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o
-obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
-obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o
-ifdef CONFIG_PPC_BOOK3S_64
-obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o
-obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o
-endif
-obj-$(CONFIG_40x) += 40x_mmu.o
-obj-$(CONFIG_44x) += 44x_mmu.o
-obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o
-obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o
+obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/
+obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/
obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
-obj-$(CONFIG_PPC_SPLPAR) += vphn.o
obj-$(CONFIG_PPC_MM_SLICES) += slice.o
-obj-y += hugetlbpage.o
-ifdef CONFIG_HUGETLB_PAGE
-obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o
-obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o
-obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o
-endif
-obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
-obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
-obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o
-obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o
-obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o
-obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
+obj-$(CONFIG_PPC_PTDUMP) += ptdump/
+obj-$(CONFIG_KASAN) += kasan/
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
new file mode 100644
index 0000000..1732eaa
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE_mmu.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING
+endif
+
+obj-y += mmu.o hash_low.o mmu_context.o tlb.o
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/book3s32/hash_low.S
similarity index 86%
rename from arch/powerpc/mm/hash_low_32.S
rename to arch/powerpc/mm/book3s32/hash_low.S
index 26acf6c..8bbbd97 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -11,12 +12,6 @@
* This file contains low-level assembler routines for managing
* the PowerPC MMU hash table. (PPC 8xx processors don't use a
* hash table, so this file is not used on them.)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <asm/reg.h>
@@ -28,6 +23,7 @@
#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/feature-fixups.h>
+#include <asm/code-patching-asm.h>
#ifdef CONFIG_SMP
.section .bss
@@ -46,14 +42,13 @@
* Returns to the caller if the access is illegal or there is no
* mapping for the address. Otherwise it places an appropriate PTE
* in the hash table and returns from the exception.
- * Uses r0, r3 - r8, r10, ctr, lr.
+ * Uses r0, r3 - r6, r8, r10, ctr, lr.
*/
.text
_GLOBAL(hash_page)
- tophys(r7,0) /* gets -KERNELBASE into r7 */
#ifdef CONFIG_SMP
- addis r8,r7,mmu_hash_lock@h
- ori r8,r8,mmu_hash_lock@l
+ lis r8, (mmu_hash_lock - PAGE_OFFSET)@h
+ ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l
lis r0,0x0fff
b 10f
11: lwz r6,0(r8)
@@ -69,14 +64,13 @@
/* Get PTE (linux-style) and check access */
lis r0,KERNELBASE@h /* check if kernel address */
cmplw 0,r4,r0
- mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */
ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
- lwz r5,PGDIR(r8) /* virt page-table root */
+ mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */
blt+ 112f /* assume user more likely */
- lis r5,swapper_pg_dir@ha /* if kernel address, use */
- addi r5,r5,swapper_pg_dir@l /* kernel page table */
+ lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */
+ addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */
rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */
-112: add r5,r5,r7 /* convert to phys addr */
+112:
#ifndef CONFIG_PTE_64BIT
rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */
lwz r8,0(r5) /* get pmd entry */
@@ -143,25 +137,24 @@
#ifdef CONFIG_SMP
eieio
- addis r8,r7,mmu_hash_lock@ha
+ lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
li r0,0
- stw r0,mmu_hash_lock@l(r8)
+ stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
#endif
/* Return from the exception */
lwz r5,_CTR(r11)
mtctr r5
lwz r0,GPR0(r11)
- lwz r7,GPR7(r11)
lwz r8,GPR8(r11)
b fast_exception_return
#ifdef CONFIG_SMP
hash_page_out:
eieio
- addis r8,r7,mmu_hash_lock@ha
+ lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
li r0,0
- stw r0,mmu_hash_lock@l(r8)
+ stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
blr
#endif /* CONFIG_SMP */
@@ -185,8 +178,7 @@
add r3,r3,r0 /* note create_hpte trims to 24 bits */
#ifdef CONFIG_SMP
- CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */
- lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */
+ lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */
oris r8,r8,12
#endif /* CONFIG_SMP */
@@ -207,11 +199,9 @@
SYNC_601
isync
- tophys(r7,0)
-
#ifdef CONFIG_SMP
- addis r6,r7,mmu_hash_lock@ha
- addi r6,r6,mmu_hash_lock@l
+ lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
+ addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
10: lwarx r0,0,r6 /* take the mmu_hash_lock */
cmpi 0,r0,0
bne- 11f
@@ -256,8 +246,8 @@
9:
#ifdef CONFIG_SMP
- addis r6,r7,mmu_hash_lock@ha
- addi r6,r6,mmu_hash_lock@l
+ lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha
+ addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l
eieio
li r0,0
stw r0,0(r6) /* clear mmu_hash_lock */
@@ -277,10 +267,8 @@
* It is designed to be called with the MMU either on or off.
* r3 contains the VSID, r4 contains the virtual address,
* r5 contains the linux PTE, r6 contains the old value of the
- * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the
- * offset to be added to addresses (0 if the MMU is on,
- * -KERNELBASE if it is off). r10 contains the upper half of
- * the PTE if CONFIG_PTE_64BIT.
+ * linux PTE (before setting _PAGE_HASHPTE). r10 contains the
+ * upper half of the PTE if CONFIG_PTE_64BIT.
* On SMP, the caller should have the mmu_hash_lock held.
* We assume that the caller has (or will) set the _PAGE_HASHPTE
* bit in the linux PTE in memory. The value passed in r6 should
@@ -316,13 +304,13 @@
_GLOBAL(create_hpte)
/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
- rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */
- rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */
+ rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */
+ rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */
and r8,r8,r0 /* writable if _RW & _DIRTY */
rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */
rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */
ori r8,r8,0xe04 /* clear out reserved bits */
- andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */
+ andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */
BEGIN_FTR_SECTION
rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */
END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
@@ -337,11 +325,13 @@
rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */
SET_V(r5) /* set V (valid) bit */
+ patch_site 0f, patch__hash_page_A0
+ patch_site 1f, patch__hash_page_A1
+ patch_site 2f, patch__hash_page_A2
/* Get the address of the primary PTE group in the hash table (r3) */
-_GLOBAL(hash_page_patch_A)
- addis r0,r7,Hash_base@h /* base address of hash table */
- rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
- rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
+1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
+2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
xor r3,r3,r0 /* make primary hash */
li r0,8 /* PTEs/group */
@@ -353,10 +343,10 @@
beq+ 10f /* no PTE: go look for an empty slot */
tlbie r4
- addis r4,r7,htab_hash_searches@ha
- lwz r6,htab_hash_searches@l(r4)
+ lis r4, (htab_hash_searches - PAGE_OFFSET)@ha
+ lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
addi r6,r6,1 /* count how many searches we do */
- stw r6,htab_hash_searches@l(r4)
+ stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
mtctr r0
@@ -366,10 +356,10 @@
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ found_slot
+ patch_site 0f, patch__hash_page_B
/* Search the secondary PTEG for a matching PTE */
ori r5,r5,PTE_H /* set H (secondary hash) bit */
-_GLOBAL(hash_page_patch_B)
- xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
+0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
addi r4,r4,-HPTE_SIZE
mtctr r0
@@ -388,15 +378,15 @@
beq+ found_empty
/* update counter of times that the primary PTEG is full */
- addis r4,r7,primary_pteg_full@ha
- lwz r6,primary_pteg_full@l(r4)
+ lis r4, (primary_pteg_full - PAGE_OFFSET)@ha
+ lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
addi r6,r6,1
- stw r6,primary_pteg_full@l(r4)
+ stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
+ patch_site 0f, patch__hash_page_C
/* Search the secondary PTEG for an empty slot */
ori r5,r5,PTE_H /* set H (secondary hash) bit */
-_GLOBAL(hash_page_patch_C)
- xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
+0: xoris r4,r3,Hash_msk>>16 /* compute secondary hash */
xori r4,r4,(-PTEG_SIZE & 0xffff)
addi r4,r4,-HPTE_SIZE
mtctr r0
@@ -425,8 +415,8 @@
* lockup here but that shouldn't happen
*/
-1: addis r4,r7,next_slot@ha /* get next evict slot */
- lwz r6,next_slot@l(r4)
+1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */
+ lwz r6, (next_slot - PAGE_OFFSET)@l(r4)
addi r6,r6,HPTE_SIZE /* search for candidate */
andi. r6,r6,7*HPTE_SIZE
stw r6,next_slot@l(r4)
@@ -498,8 +488,6 @@
* We assume that there is a hash table in use (Hash != 0).
*/
_GLOBAL(flush_hash_pages)
- tophys(r7,0)
-
/*
* We disable interrupts here, even on UP, because we want
* the _PAGE_HASHPTE bit to be a reliable indication of
@@ -544,11 +532,10 @@
SET_V(r11) /* set V (valid) bit */
#ifdef CONFIG_SMP
- addis r9,r7,mmu_hash_lock@ha
- addi r9,r9,mmu_hash_lock@l
- CURRENT_THREAD_INFO(r8, r1)
- add r8,r8,r7
- lwz r8,TI_CPU(r8)
+ lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha
+ addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l
+ tophys (r8, r2)
+ lwz r8, TASK_CPU(r8)
oris r8,r8,9
10: lwarx r0,0,r9
cmpi 0,r0,0
@@ -577,11 +564,13 @@
stwcx. r8,0,r5 /* update the pte */
bne- 33b
+ patch_site 0f, patch__flush_hash_A0
+ patch_site 1f, patch__flush_hash_A1
+ patch_site 2f, patch__flush_hash_A2
/* Get the address of the primary PTE group in the hash table (r3) */
-_GLOBAL(flush_hash_patch_A)
- addis r8,r7,Hash_base@h /* base address of hash table */
- rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
- rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
+0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */
+1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */
+2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
xor r8,r0,r8 /* make primary hash */
/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
@@ -593,11 +582,11 @@
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
beq+ 3f
+ patch_site 0f, patch__flush_hash_B
/* Search the secondary PTEG for a matching PTE */
ori r11,r11,PTE_H /* set H (secondary hash) bit */
li r0,8 /* PTEs/group */
-_GLOBAL(flush_hash_patch_B)
- xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
+0: xoris r12,r8,Hash_msk>>16 /* compute secondary hash */
xori r12,r12,(-PTEG_SIZE & 0xffff)
addi r12,r12,-HPTE_SIZE
mtctr r0
@@ -641,8 +630,7 @@
*/
_GLOBAL(_tlbie)
#ifdef CONFIG_SMP
- CURRENT_THREAD_INFO(r8, r1)
- lwz r8,TI_CPU(r8)
+ lwz r8,TASK_CPU(r2)
oris r8,r8,11
mfmsr r10
SYNC
@@ -679,8 +667,7 @@
*/
_GLOBAL(_tlbia)
#if defined(CONFIG_SMP)
- CURRENT_THREAD_INFO(r8, r1)
- lwz r8,TI_CPU(r8)
+ lwz r8,TASK_CPU(r2)
oris r8,r8,10
mfmsr r10
SYNC
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
new file mode 100644
index 0000000..84d5fab
--- /dev/null
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for handling the MMU on those
+ * PowerPC implementations where the MMU substantially follows the
+ * architecture specification. This includes the 6xx, 7xx, 7xxx,
+ * and 8260 implementations but excludes the 8xx and 4xx.
+ * -- paulus
+ *
+ * Derived from arch/ppc/mm/init.c:
+ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
+ * and Cort Dougan (PReP) (cort@cs.nmt.edu)
+ * Copyright (C) 1996 Paul Mackerras
+ *
+ * Derived from "arch/i386/mm/init.c"
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/memblock.h>
+
+#include <asm/prom.h>
+#include <asm/mmu.h>
+#include <asm/machdep.h>
+#include <asm/code-patching.h>
+#include <asm/sections.h>
+
+#include <mm/mmu_decl.h>
+
+struct hash_pte *Hash;
+static unsigned long Hash_size, Hash_mask;
+unsigned long _SDR1;
+static unsigned int hash_mb, hash_mb2;
+
+struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */
+
+struct batrange { /* stores address ranges mapped by BATs */
+ unsigned long start;
+ unsigned long limit;
+ phys_addr_t phys;
+} bat_addrs[8];
+
+/*
+ * Return PA for this VA if it is mapped by a BAT, or 0
+ */
+phys_addr_t v_block_mapped(unsigned long va)
+{
+ int b;
+ for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b)
+ if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
+ return bat_addrs[b].phys + (va - bat_addrs[b].start);
+ return 0;
+}
+
+/*
+ * Return VA for a given PA or 0 if not mapped
+ */
+unsigned long p_block_mapped(phys_addr_t pa)
+{
+ int b;
+ for (b = 0; b < ARRAY_SIZE(bat_addrs); ++b)
+ if (pa >= bat_addrs[b].phys
+ && pa < (bat_addrs[b].limit-bat_addrs[b].start)
+ +bat_addrs[b].phys)
+ return bat_addrs[b].start+(pa-bat_addrs[b].phys);
+ return 0;
+}
+
+static int find_free_bat(void)
+{
+ int b;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
+ for (b = 0; b < 4; b++) {
+ struct ppc_bat *bat = BATS[b];
+
+ if (!(bat[0].batl & 0x40))
+ return b;
+ }
+ } else {
+ int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+
+ for (b = 0; b < n; b++) {
+ struct ppc_bat *bat = BATS[b];
+
+ if (!(bat[1].batu & 3))
+ return b;
+ }
+ }
+ return -1;
+}
+
+/*
+ * This function calculates the size of the largest block usable to map the
+ * beginning of an area based on the start address and size of that area:
+ * - max block size is 8M on 601 and 256M on other 6xx.
+ * - base address must be aligned to the block size. So the maximum block size
+ * is identified by the lowest bit set to 1 in the base address (for instance
+ * if base is 0x16000000, max size is 0x02000000).
+ * - block size has to be a power of two. This is calculated by finding the
+ * highest bit set to 1.
+ */
+static unsigned int block_size(unsigned long base, unsigned long top)
+{
+ unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M;
+ unsigned int base_shift = (ffs(base) - 1) & 31;
+ unsigned int block_shift = (fls(top - base) - 1) & 31;
+
+ return min3(max_size, 1U << base_shift, 1U << block_shift);
+}
+
+/*
+ * Set up one of the IBAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ * Only for 603+ ...
+ */
+static void setibat(int index, unsigned long virt, phys_addr_t phys,
+ unsigned int size, pgprot_t prot)
+{
+ unsigned int bl = (size >> 17) - 1;
+ int wimgxpp;
+ struct ppc_bat *bat = BATS[index];
+ unsigned long flags = pgprot_val(prot);
+
+ if (!cpu_has_feature(CPU_FTR_NEED_COHERENT))
+ flags &= ~_PAGE_COHERENT;
+
+ wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX);
+ bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+ if (flags & _PAGE_USER)
+ bat[0].batu |= 1; /* Vp = 1 */
+}
+
+static void clearibat(int index)
+{
+ struct ppc_bat *bat = BATS[index];
+
+ bat[0].batu = 0;
+ bat[0].batl = 0;
+}
+
+static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+ int idx;
+
+ while ((idx = find_free_bat()) != -1 && base != top) {
+ unsigned int size = block_size(base, top);
+
+ if (size < 128 << 10)
+ break;
+ setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X);
+ base += size;
+ }
+
+ return base;
+}
+
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+ unsigned long done;
+ unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
+
+ if (__map_without_bats) {
+ pr_debug("RAM mapped without BATs\n");
+ return base;
+ }
+
+ if (!strict_kernel_rwx_enabled() || base >= border || top <= border)
+ return __mmu_mapin_ram(base, top);
+
+ done = __mmu_mapin_ram(base, border);
+ if (done != border)
+ return done;
+
+ return __mmu_mapin_ram(border, top);
+}
+
+void mmu_mark_initmem_nx(void)
+{
+ int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+ int i;
+ unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
+ unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
+ unsigned long size;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
+ return;
+
+ for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
+ size = block_size(base, top);
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+ base += size;
+ }
+ if (base < top) {
+ size = block_size(base, top);
+ size = max(size, 128UL << 10);
+ if ((top - base) > size) {
+ if (strict_kernel_rwx_enabled())
+ pr_warn("Kernel _etext not properly aligned\n");
+ size <<= 1;
+ }
+ setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
+ base += size;
+ }
+ for (; i < nb; i++)
+ clearibat(i);
+
+ update_bats();
+
+ for (i = TASK_SIZE >> 28; i < 16; i++) {
+ /* Do not set NX on VM space for modules */
+ if (IS_ENABLED(CONFIG_MODULES) &&
+ (VMALLOC_START & 0xf0000000) == i << 28)
+ break;
+ mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
+ }
+}
+
+void mmu_mark_rodata_ro(void)
+{
+ int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
+ int i;
+
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
+ return;
+
+ for (i = 0; i < nb; i++) {
+ struct ppc_bat *bat = BATS[i];
+
+ if (bat_addrs[i].start < (unsigned long)__init_begin)
+ bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX;
+ }
+
+ update_bats();
+}
+
+/*
+ * Set up one of the I/D BAT (block address translation) register pairs.
+ * The parameters are not checked; in particular size must be a power
+ * of 2 between 128k and 256M.
+ * On 603+, only set IBAT when _PAGE_EXEC is set
+ */
+void __init setbat(int index, unsigned long virt, phys_addr_t phys,
+ unsigned int size, pgprot_t prot)
+{
+ unsigned int bl;
+ int wimgxpp;
+ struct ppc_bat *bat = BATS[index];
+ unsigned long flags = pgprot_val(prot);
+
+ if ((flags & _PAGE_NO_CACHE) ||
+ (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
+ flags &= ~_PAGE_COHERENT;
+
+ bl = (size >> 17) - 1;
+ if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
+ /* 603, 604, etc. */
+ /* Do DBAT first */
+ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+ | _PAGE_COHERENT | _PAGE_GUARDED);
+ wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
+ bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+ if (flags & _PAGE_USER)
+ bat[1].batu |= 1; /* Vp = 1 */
+ if (flags & _PAGE_GUARDED) {
+ /* G bit must be zero in IBATs */
+ flags &= ~_PAGE_EXEC;
+ }
+ if (flags & _PAGE_EXEC)
+ bat[0] = bat[1];
+ else
+ bat[0].batu = bat[0].batl = 0;
+ } else {
+ /* 601 cpu */
+ if (bl > BL_8M)
+ bl = BL_8M;
+ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+ | _PAGE_COHERENT);
+ wimgxpp |= (flags & _PAGE_RW)?
+ ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
+ bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
+ bat->batl = phys | bl | 0x40; /* V=1 */
+ }
+
+ bat_addrs[index].start = virt;
+ bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
+ bat_addrs[index].phys = phys;
+}
+
+/*
+ * Preload a translation in the hash table
+ */
+void hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+ pmd_t *pmd;
+
+ if (!Hash)
+ return;
+ pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
+ if (!pmd_none(*pmd))
+ add_hash_page(mm->context.id, ea, pmd_val(*pmd));
+}
+
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return;
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
+
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(*ptep) || address >= TASK_SIZE)
+ return;
+
+ /* We have to test for regs NULL since init will get here first thing at boot */
+ if (!current->thread.regs)
+ return;
+
+ /* We also avoid filling the hash if not coming from a fault */
+ if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400)
+ return;
+
+ hash_preload(vma->vm_mm, address);
+}
+
+/*
+ * Initialize the hash table and patch the instructions in hashtable.S.
+ */
+void __init MMU_init_hw(void)
+{
+ unsigned int n_hpteg, lg_n_hpteg;
+
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return;
+
+ if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
+
+#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */
+#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10)
+#define MIN_N_HPTEG 1024 /* min 64kB hash table */
+
+ /*
+ * Allow 1 HPTE (1/8 HPTEG) for each page of memory.
+ * This is less than the recommended amount, but then
+ * Linux ain't AIX.
+ */
+ n_hpteg = total_memory / (PAGE_SIZE * 8);
+ if (n_hpteg < MIN_N_HPTEG)
+ n_hpteg = MIN_N_HPTEG;
+ lg_n_hpteg = __ilog2(n_hpteg);
+ if (n_hpteg & (n_hpteg - 1)) {
+ ++lg_n_hpteg; /* round up if not power of 2 */
+ n_hpteg = 1 << lg_n_hpteg;
+ }
+ Hash_size = n_hpteg << LG_HPTEG_SIZE;
+
+ /*
+ * Find some memory for the hash table.
+ */
+ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
+ Hash = memblock_alloc(Hash_size, Hash_size);
+ if (!Hash)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, Hash_size, Hash_size);
+ _SDR1 = __pa(Hash) | SDR1_LOW_BITS;
+
+ pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
+ (unsigned long long)(total_memory >> 20), Hash_size >> 10);
+
+
+ Hash_mask = n_hpteg - 1;
+ hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
+ if (lg_n_hpteg > 16)
+ hash_mb2 = 16 - LG_HPTEG_SIZE;
+
+ /*
+ * When KASAN is selected, there is already an early temporary hash
+ * table and the switch to the final hash table is done later.
+ */
+ if (IS_ENABLED(CONFIG_KASAN))
+ return;
+
+ MMU_init_hw_patch();
+}
+
+void __init MMU_init_hw_patch(void)
+{
+ unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
+
+ if (ppc_md.progress)
+ ppc_md.progress("hash:patch", 0x345);
+ if (ppc_md.progress)
+ ppc_md.progress("hash:done", 0x205);
+
+ /* WARNING: Make sure nothing can trigger a KASAN check past this point */
+
+ /*
+ * Patch up the instructions in hashtable.S:create_hpte
+ */
+ modify_instruction_site(&patch__hash_page_A0, 0xffff,
+ ((unsigned int)Hash - PAGE_OFFSET) >> 16);
+ modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6);
+ modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6);
+ modify_instruction_site(&patch__hash_page_B, 0xffff, hmask);
+ modify_instruction_site(&patch__hash_page_C, 0xffff, hmask);
+
+ /*
+ * Patch up the instructions in hashtable.S:flush_hash_page
+ */
+ modify_instruction_site(&patch__flush_hash_A0, 0xffff,
+ ((unsigned int)Hash - PAGE_OFFSET) >> 16);
+ modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6);
+ modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6);
+ modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask);
+}
+
+void setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* 601 can only access 16MB at the moment */
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
+ else /* Anything else has 256M mapped */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
+}
+
+void __init print_system_hash_info(void)
+{
+ pr_info("Hash_size = 0x%lx\n", Hash_size);
+ if (Hash_mask)
+ pr_info("Hash_mask = 0x%lx\n", Hash_mask);
+}
+
+#ifdef CONFIG_PPC_KUEP
+void __init setup_kuep(bool disabled)
+{
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+
+ if (disabled)
+ pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n");
+}
+#endif
+
+#ifdef CONFIG_PPC_KUAP
+void __init setup_kuap(bool disabled)
+{
+ pr_info("Activating Kernel Userspace Access Protection\n");
+
+ if (disabled)
+ pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n");
+}
+#endif
diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/book3s32/mmu_context.c
similarity index 91%
rename from arch/powerpc/mm/mmu_context_hash32.c
rename to arch/powerpc/mm/book3s32/mmu_context.c
index 921c1e3..218996e 100644
--- a/arch/powerpc/mm/mmu_context_hash32.c
+++ b/arch/powerpc/mm/book3s32/mmu_context.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU substantially follows the
@@ -14,12 +15,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/mm.h>
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/book3s32/tlb.c
similarity index 92%
rename from arch/powerpc/mm/tlb_hash32.c
rename to arch/powerpc/mm/book3s32/tlb.c
index cf8472c..2fcd321 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/book3s32/tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for TLB flushing.
* On machines where the MMU uses a hash table to store virtual to
@@ -14,12 +15,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/kernel.h>
@@ -32,7 +27,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
/*
* Called when unmapping pages to flush entries from the TLB/hash table.
diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile
new file mode 100644
index 0000000..fd393b8
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y := $(NO_MINIMAL_TOC)
+
+CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
+
+obj-y += hash_pgtable.o hash_utils.o slb.o \
+ mmu_context.o pgtable.o hash_tlb.o
+obj-$(CONFIG_PPC_NATIVE) += hash_native.o
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o
+obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
+obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o
+obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o
+ifdef CONFIG_HUGETLB_PAGE
+obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o
+endif
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o
+obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o
+obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
+
+# Instrumenting the SLB fault path can lead to duplicate SLB entries
+KCOV_INSTRUMENT_slb.o := n
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c
similarity index 97%
rename from arch/powerpc/mm/hash64_4k.c
rename to arch/powerpc/mm/book3s64/hash_4k.c
index 6fa6765..22e7871 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -1,6 +1,6 @@
/*
* Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2 of the GNU Lesser General Public License
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c
similarity index 98%
rename from arch/powerpc/mm/hash64_64k.c
rename to arch/powerpc/mm/book3s64/hash_64k.c
index 3afa253..7084ce2 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -1,6 +1,6 @@
/*
* Copyright IBM Corporation, 2015
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2 of the GNU Lesser General Public License
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugepage.c
similarity index 95%
rename from arch/powerpc/mm/hugepage-hash64.c
rename to arch/powerpc/mm/book3s64/hash_hugepage.c
index 01f213d..4408237 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/book3s64/hash_hugepage.c
@@ -1,6 +1,6 @@
/*
* Copyright IBM Corporation, 2013
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
@@ -51,6 +51,12 @@
new_pmd |= _PAGE_DIRTY;
} while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd)));
+ /*
+ * Make sure this is thp or devmap entry
+ */
+ if (!(old_pmd & (H_PAGE_THP_HUGE | _PAGE_DEVMAP)))
+ return 0;
+
rflags = htab_convert_pte_flags(new_pmd);
#if 0
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
similarity index 66%
rename from arch/powerpc/mm/hugetlbpage-hash64.c
rename to arch/powerpc/mm/book3s64/hash_hugetlbpage.c
index b320f50..eefa89c 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
@@ -15,6 +15,9 @@
#include <asm/cacheflush.h>
#include <asm/machdep.h>
+unsigned int hpage_shift;
+EXPORT_SYMBOL(hpage_shift);
+
extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
unsigned long pa, unsigned long rlags,
unsigned long vflags, int psize, int ssize);
@@ -26,7 +29,7 @@
real_pte_t rpte;
unsigned long vpn;
unsigned long old_pte, new_pte;
- unsigned long rflags, pa, sz;
+ unsigned long rflags, pa;
long slot, offset;
BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
@@ -34,7 +37,8 @@
/* Search the Linux page table for a match with va */
vpn = hpt_vpn(ea, vsid, ssize);
- /* At this point, we have a pte (old_pte) which can be used to build
+ /*
+ * At this point, we have a pte (old_pte) which can be used to build
* or update an HPTE. There are 2 cases:
*
* 1. There is a valid (present) pte with no associated HPTE (this is
@@ -55,13 +59,19 @@
if (unlikely(!check_pte_access(access, old_pte)))
return 1;
- /* Try to lock the PTE, add ACCESSED and DIRTY if it was
- * a write access */
+ /*
+ * Try to lock the PTE, add ACCESSED and DIRTY if it was
+ * a write access
+ */
new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
if (access & _PAGE_WRITE)
new_pte |= _PAGE_DIRTY;
} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+ /* Make sure this is a hugetlb entry */
+ if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
+ return 0;
+
rflags = htab_convert_pte_flags(new_pte);
if (unlikely(mmu_psize == MMU_PAGE_16G))
offset = PTRS_PER_PUD;
@@ -69,10 +79,11 @@
offset = PTRS_PER_PMD;
rpte = __real_pte(__pte(old_pte), ptep, offset);
- sz = ((1UL) << shift);
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
- /* No CPU has hugepages but lacks no execute, so we
- * don't need to worry about that case */
+ /*
+ * No CPU has hugepages but lacks no execute, so we
+ * don't need to worry about that case
+ */
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
/* Check if pte already has an hpte (case 2) */
@@ -117,3 +128,41 @@
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long pte_val;
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep,
+ _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+ return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+ if (radix_enabled())
+ return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+ old_pte, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
+void hugetlbpage_init_default(void)
+{
+ /* Set default large page size. Currently, we pick 16M or 1M
+ * depending on what is available
+ */
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift;
+ else if (mmu_psize_defs[MMU_PAGE_2M].shift)
+ hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift;
+}
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/book3s64/hash_native.c
similarity index 95%
rename from arch/powerpc/mm/hash_native_64.c
rename to arch/powerpc/mm/book3s64/hash_native.c
index aaa28fd..523e42e 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* native hashtable management.
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#undef DEBUG_LOW
@@ -45,7 +41,7 @@
#define HPTE_LOCK_BIT (56+3)
#endif
-DEFINE_RAW_SPINLOCK(native_tlbie_lock);
+static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
{
@@ -60,7 +56,7 @@
* tlbiel instruction for hash, set invalidation
* i.e., r=1 and is=01 or is=10 or is=11
*/
-static inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
+static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned int is,
unsigned int pid,
unsigned int ric, unsigned int prs)
{
@@ -116,7 +112,7 @@
asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
}
void hash__tlbiel_all(unsigned int action)
@@ -201,9 +197,32 @@
return va;
}
-static inline void fixup_tlbie(unsigned long vpn, int psize, int apsize, int ssize)
+static inline void fixup_tlbie_vpn(unsigned long vpn, int psize,
+ int apsize, int ssize)
{
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ /* Radix flush for a hash guest */
+
+ unsigned long rb,rs,prs,r,ric;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = 0; /* lpid = 0 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+ ric = 0; /* RIC_FLUSH_TLB */
+
+ /*
+ * Need the extra ptesync to make sure we don't
+ * re-order the tlbie
+ */
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs),
+ "i"(ric), "r"(rs) : "memory");
+ }
+
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
/* Need the extra ptesync to ensure we don't reorder tlbie*/
asm volatile("ptesync": : :"memory");
___tlbie(vpn, psize, apsize, ssize);
@@ -287,7 +306,7 @@
asm volatile("ptesync": : :"memory");
} else {
__tlbie(vpn, psize, apsize, ssize);
- fixup_tlbie(vpn, psize, apsize, ssize);
+ fixup_tlbie_vpn(vpn, psize, apsize, ssize);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
if (lock_tlbie && !use_local)
@@ -860,7 +879,7 @@
/*
* Just do one more with the last used values.
*/
- fixup_tlbie(vpn, psize, psize, ssize);
+ fixup_tlbie_vpn(vpn, psize, psize, ssize);
asm volatile("eieio; tlbsync; ptesync":::"memory");
if (lock_tlbie)
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
similarity index 94%
rename from arch/powerpc/mm/pgtable-hash64.c
rename to arch/powerpc/mm/book3s64/hash_pgtable.c
index 692bfc9..64733b9 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2005, Paul Mackerras, IBM Corporation.
* Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/sched.h>
@@ -19,7 +15,7 @@
#include <asm/mmu.h>
#include <asm/tlb.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
@@ -112,9 +108,16 @@
unsigned long page_size,
unsigned long phys)
{
- int rc = htab_bolt_mapping(start, start + page_size, phys,
- pgprot_val(PAGE_KERNEL),
- mmu_vmemmap_psize, mmu_kernel_ssize);
+ int rc;
+
+ if ((start + page_size) >= H_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ rc = htab_bolt_mapping(start, start + page_size, phys,
+ pgprot_val(PAGE_KERNEL),
+ mmu_vmemmap_psize, mmu_kernel_ssize);
if (rc < 0) {
int rc2 = htab_remove_mapping(start, start + page_size,
mmu_vmemmap_psize,
@@ -142,7 +145,7 @@
* map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
-int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
pud_t *pudp;
@@ -161,8 +164,7 @@
ptep = pte_alloc_kernel(pmdp, ea);
if (!ptep)
return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
} else {
/*
* If the mm subsystem is not fully up, we cannot create a
@@ -170,7 +172,7 @@
* entry in the hardware page table.
*
*/
- if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
+ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
mmu_io_psize, mmu_kernel_ssize)) {
printk(KERN_ERR "Failed to do bolted mapping IO "
"memory at %016lx !\n", pa);
@@ -404,6 +406,8 @@
return 1;
}
+EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage);
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/book3s64/hash_tlb.c
similarity index 91%
rename from arch/powerpc/mm/tlb_hash64.c
rename to arch/powerpc/mm/book3s64/hash_tlb.c
index 87d71dd..4a70d8d 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for flushing entries from the
* TLB and MMU hash table.
@@ -14,11 +15,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -55,7 +51,8 @@
i = batch->index;
- /* Get page size (maybe move back to caller).
+ /*
+ * Get page size (maybe move back to caller).
*
* NOTE: when using special 64K mappings in 4K environment like
* for SPEs, we obtain the page size from the slice, which thus
@@ -77,10 +74,12 @@
#endif
} else {
psize = pte_pagesize_index(mm, addr, pte);
- /* Mask the address for the standard page size. If we
+ /*
+ * Mask the address for the standard page size. If we
* have a 64k page kernel, but the hardware does not
* support 64k pages, this might be different from the
- * hardware page size encoded in the slice table. */
+ * hardware page size encoded in the slice table.
+ */
addr &= PAGE_MASK;
offset = PTRS_PER_PTE;
}
@@ -161,7 +160,8 @@
{
struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch);
- /* If there's a TLB batch pending, then we must flush it because the
+ /*
+ * If there's a TLB batch pending, then we must flush it because the
* pages are going to be freed and we really don't want to have a CPU
* access a freed page because it has a stale TLB
*/
@@ -201,7 +201,8 @@
BUG_ON(!mm->pgd);
- /* Note: Normally, we should only ever use a batch within a
+ /*
+ * Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
* since we don't actually modify the PTEs, we just flush the
* hash while leaving the PTEs intact (including their reference
@@ -238,7 +239,8 @@
unsigned long flags;
addr = _ALIGN_DOWN(addr, PMD_SIZE);
- /* Note: Normally, we should only ever use a batch within a
+ /*
+ * Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
* since we don't actually modify the PTEs, we just flush the
* hash while leaving the PTEs intact (including their reference
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/book3s64/hash_utils.c
similarity index 87%
rename from arch/powerpc/mm/hash_utils_64.c
rename to arch/powerpc/mm/book3s64/hash_utils.c
index f23a89d..6c12376 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC64 port by Mike Corrigan and Dave Engebretsen
* {mikejc|engebret}@us.ibm.com
@@ -11,11 +12,6 @@
*
* Description:
* PowerPC Hashed Page Table functions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#undef DEBUG
@@ -37,6 +33,8 @@
#include <linux/context_tracking.h>
#include <linux/libfdt.h>
#include <linux/pkeys.h>
+#include <linux/hugetlb.h>
+#include <linux/cpu.h>
#include <asm/debugfs.h>
#include <asm/processor.h>
@@ -64,6 +62,9 @@
#include <asm/ps3.h>
#include <asm/pte-walk.h>
#include <asm/asm-prototypes.h>
+#include <asm/ultravisor.h>
+
+#include <mm/mmu_decl.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -128,7 +129,8 @@
struct mmu_hash_ops mmu_hash_ops;
EXPORT_SYMBOL(mmu_hash_ops);
-/* There are definitions of page sizes arrays to be used when none
+/*
+ * These are definitions of page sizes arrays to be used when none
* is provided by the firmware.
*/
@@ -145,7 +147,8 @@
},
};
-/* POWER4, GPUL, POWER5
+/*
+ * POWER4, GPUL, POWER5
*
* Support for 16Mb large pages
*/
@@ -270,10 +273,6 @@
if (overlaps_kernel_text(vaddr, vaddr + step))
tprot &= ~HPTE_R_N;
- /* Make kvm guest trampolines executable */
- if (overlaps_kvm_tmp(vaddr, vaddr + step))
- tprot &= ~HPTE_R_N;
-
/*
* If relocatable, check if it overlaps interrupt vectors that
* are copied down to real 0. For relocatable kernel
@@ -479,7 +478,8 @@
}
#ifdef CONFIG_HUGETLB_PAGE
-/* Scan for 16G memory blocks that have been set aside for huge pages
+/*
+ * Scan for 16G memory blocks that have been set aside for huge pages
* and reserve those blocks for 16G huge pages.
*/
static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
@@ -496,8 +496,10 @@
if (type == NULL || strcmp(type, "memory") != 0)
return 0;
- /* This property is the log base 2 of the number of virtual pages that
- * will represent this memory block. */
+ /*
+ * This property is the log base 2 of the number of virtual pages that
+ * will represent this memory block.
+ */
page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
if (page_count_prop == NULL)
return 0;
@@ -673,16 +675,15 @@
#endif /* CONFIG_PPC_64K_PAGES */
#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* We try to use 16M pages for vmemmap if that is supported
+ /*
+ * We try to use 16M pages for vmemmap if that is supported
* and we have at least 1G of RAM at boot
*/
if (mmu_psize_defs[MMU_PAGE_16M].shift &&
memblock_phys_mem_size() >= 0x40000000)
mmu_vmemmap_psize = MMU_PAGE_16M;
- else if (mmu_psize_defs[MMU_PAGE_64K].shift)
- mmu_vmemmap_psize = MMU_PAGE_64K;
else
- mmu_vmemmap_psize = MMU_PAGE_4K;
+ mmu_vmemmap_psize = mmu_virtual_psize;
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
printk(KERN_DEBUG "Page orders: linear mapping = %d, "
@@ -742,7 +743,8 @@
static unsigned long __init htab_get_table_size(void)
{
- /* If hash size isn't already provided by the platform, we try to
+ /*
+ * If hash size isn't already provided by the platform, we try to
* retrieve it from the device-tree. If it's not there neither, we
* calculate it now based on the total RAM size
*/
@@ -755,12 +757,12 @@
}
#ifdef CONFIG_MEMORY_HOTPLUG
-void resize_hpt_for_hotplug(unsigned long new_mem_size)
+int resize_hpt_for_hotplug(unsigned long new_mem_size)
{
unsigned target_hpt_shift;
if (!mmu_hash_ops.resize_hpt)
- return;
+ return 0;
target_hpt_shift = htab_shift_for_mem_size(new_mem_size);
@@ -772,23 +774,25 @@
* reduce unless the target shift is at least 2 below the
* current shift
*/
- if ((target_hpt_shift > ppc64_pft_size)
- || (target_hpt_shift < (ppc64_pft_size - 1))) {
- int rc;
+ if (target_hpt_shift > ppc64_pft_size ||
+ target_hpt_shift < ppc64_pft_size - 1)
+ return mmu_hash_ops.resize_hpt(target_hpt_shift);
- rc = mmu_hash_ops.resize_hpt(target_hpt_shift);
- if (rc && (rc != -ENODEV))
- printk(KERN_WARNING
- "Unable to resize hash page table to target order %d: %d\n",
- target_hpt_shift, rc);
- }
+ return 0;
}
int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
- int rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
- mmu_kernel_ssize);
+ int rc;
+
+ if (end >= H_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ rc = htab_bolt_mapping(start, end, __pa(start),
+ pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ mmu_kernel_ssize);
if (rc < 0) {
int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
@@ -817,7 +821,7 @@
* For now, UPRT is 0 and we have no segment table.
*/
htab_size = __ilog2(htab_size) - 18;
- mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
+ mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false);
pr_info("Partition table %p\n", partition_tb);
}
@@ -851,12 +855,6 @@
/* Using a hypervisor which owns the htab */
htab_address = NULL;
_SDR1 = 0;
- /*
- * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall
- * to inform the hypervisor that we wish to use the HPT.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- register_process_table(0, 0, 0);
#ifdef CONFIG_FA_DUMP
/*
* If firmware assisted dump is active firmware preserves
@@ -882,8 +880,12 @@
}
#endif /* CONFIG_PPC_CELL */
- table = memblock_alloc_base(htab_size_bytes, htab_size_bytes,
- limit);
+ table = memblock_phys_alloc_range(htab_size_bytes,
+ htab_size_bytes,
+ 0, limit);
+ if (!table)
+ panic("ERROR: Failed to allocate %pa bytes below %pa\n",
+ &htab_size_bytes, &limit);
DBG("Hash table allocated at %lx, size: %lx\n", table,
htab_size_bytes);
@@ -908,9 +910,12 @@
#ifdef CONFIG_DEBUG_PAGEALLOC
if (debug_pagealloc_enabled()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
- linear_map_hash_slots = __va(memblock_alloc_base(
- linear_map_hash_count, 1, ppc64_rma_size));
- memset(linear_map_hash_slots, 0, linear_map_hash_count);
+ linear_map_hash_slots = memblock_alloc_try_nid(
+ linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+ ppc64_rma_size, NUMA_NO_NODE);
+ if (!linear_map_hash_slots)
+ panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
+ __func__, linear_map_hash_count, &ppc64_rma_size);
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
@@ -922,6 +927,11 @@
DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
base, size, prot);
+ if ((base + size) >= H_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ continue;
+ }
+
BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
prot, mmu_linear_psize, mmu_kernel_ssize));
}
@@ -961,6 +971,7 @@
htab_scan_page_sizes();
}
+static struct hash_mm_context init_hash_mm_context;
void __init hash__early_init_mmu(void)
{
#ifndef CONFIG_PPC_64K_PAGES
@@ -1001,16 +1012,16 @@
* 4k use hugepd format, so for hash set then to
* zero
*/
- __pmd_val_bits = 0;
- __pud_val_bits = 0;
- __pgd_val_bits = 0;
+ __pmd_val_bits = HASH_PMD_VAL_BITS;
+ __pud_val_bits = HASH_PUD_VAL_BITS;
+ __pgd_val_bits = HASH_PGD_VAL_BITS;
__kernel_virt_start = H_KERN_VIRT_START;
- __kernel_virt_size = H_KERN_VIRT_SIZE;
__vmalloc_start = H_VMALLOC_START;
__vmalloc_end = H_VMALLOC_END;
__kernel_io_start = H_KERN_IO_START;
- vmemmap = (struct page *)H_VMEMMAP_BASE;
+ __kernel_io_end = H_KERN_IO_END;
+ vmemmap = (struct page *)H_VMEMMAP_START;
ioremap_bot = IOREMAP_BASE;
#ifdef CONFIG_PCI
@@ -1028,12 +1039,16 @@
if (!mmu_hash_ops.hpte_insert)
panic("hash__early_init_mmu: No MMU hash ops defined!\n");
- /* Initialize the MMU Hash table and create the linear mapping
+ /*
+ * Initialize the MMU Hash table and create the linear mapping
* of memory. Has to be done before SLB initialization as this is
* currently where the page size encoding is obtained.
*/
htab_initialize();
+ init_mm.context.hash_context = &init_hash_mm_context;
+ mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT);
+
pr_info("Initializing hash mmu with SLB\n");
/* Initialize SLB management */
slb_initialize();
@@ -1052,8 +1067,8 @@
if (!cpu_has_feature(CPU_FTR_ARCH_300))
mtspr(SPRN_SDR1, _SDR1);
else
- mtspr(SPRN_PTCR,
- __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+ set_ptcr_when_no_uv(__pa(partition_tb) |
+ (PATB_SIZE_SHIFT - 12));
}
/* Initialize SLB */
slb_initialize();
@@ -1125,7 +1140,7 @@
if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
copy_mm_to_paca(mm);
- slb_flush_and_rebolt();
+ slb_flush_and_restore_bolted();
}
}
#endif /* CONFIG_PPC_64K_PAGES */
@@ -1140,10 +1155,13 @@
*/
static int subpage_protection(struct mm_struct *mm, unsigned long ea)
{
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
u32 spp = 0;
u32 **sbpm, *sbpp;
+ if (!spt)
+ return 0;
+
if (ea >= spt->maxaddr)
return 0;
if (ea < 0x100000000UL) {
@@ -1197,7 +1215,7 @@
if (user_region) {
if (psize != get_paca_psize(ea)) {
copy_mm_to_paca(mm);
- slb_flush_and_rebolt();
+ slb_flush_and_restore_bolted();
}
} else if (get_paca()->vmalloc_sllp !=
mmu_psize_defs[mmu_vmalloc_psize].sllp) {
@@ -1207,7 +1225,8 @@
}
}
-/* Result code is:
+/*
+ * Result code is:
* 0 - handled
* 1 - normal page fault
* -1 - critical hash insertion error
@@ -1231,7 +1250,7 @@
trace_hash_fault(ea, access, trap);
/* Get region & vsid */
- switch (REGION_ID(ea)) {
+ switch (get_region_id(ea)) {
case USER_REGION_ID:
user_region = 1;
if (! mm) {
@@ -1245,15 +1264,19 @@
break;
case VMALLOC_REGION_ID:
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
- if (ea < VMALLOC_END)
- psize = mmu_vmalloc_psize;
- else
- psize = mmu_io_psize;
+ psize = mmu_vmalloc_psize;
+ ssize = mmu_kernel_ssize;
+ break;
+
+ case IO_REGION_ID:
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ psize = mmu_io_psize;
ssize = mmu_kernel_ssize;
break;
default:
- /* Not a valid range
- * Send the problem up to do_page_fault
+ /*
+ * Not a valid range
+ * Send the problem up to do_page_fault()
*/
rc = 1;
goto bail;
@@ -1278,7 +1301,8 @@
flags |= HPTE_LOCAL_UPDATE;
#ifndef CONFIG_PPC_64K_PAGES
- /* If we use 4K pages and our psize is not 4K, then we might
+ /*
+ * If we use 4K pages and our psize is not 4K, then we might
* be hitting a special driver mapping, and need to align the
* address before we fetch the PTE.
*
@@ -1300,7 +1324,8 @@
/* Add _PAGE_PRESENT to the required access perm */
access |= _PAGE_PRESENT;
- /* Pre-check access permissions (will be re-checked atomically
+ /*
+ * Pre-check access permissions (will be re-checked atomically
* in __hash_page_XX but this pre-check is a fast path)
*/
if (!check_pte_access(access, pte_val(*ptep))) {
@@ -1347,7 +1372,8 @@
psize = MMU_PAGE_4K;
}
- /* If this PTE is non-cacheable and we have restrictions on
+ /*
+ * If this PTE is non-cacheable and we have restrictions on
* using non cacheable large pages, then we switch to 4k
*/
if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
@@ -1388,7 +1414,8 @@
flags, ssize, spp);
}
- /* Dump some info in case of hash insertion failure, they should
+ /*
+ * Dump some info in case of hash insertion failure; such failures should
* never happen, so it is really useful to know if/when they do
*/
if (rc == -1)
@@ -1414,7 +1441,8 @@
unsigned long flags = 0;
struct mm_struct *mm = current->mm;
- if (REGION_ID(ea) == VMALLOC_REGION_ID)
+ if ((get_region_id(ea) == VMALLOC_REGION_ID) ||
+ (get_region_id(ea) == IO_REGION_ID))
mm = &init_mm;
if (dsisr & DSISR_NOHPTE)
@@ -1424,14 +1452,15 @@
}
EXPORT_SYMBOL_GPL(hash_page);
-int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
- unsigned long dsisr)
+int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr,
+ unsigned long msr)
{
unsigned long access = _PAGE_PRESENT | _PAGE_READ;
unsigned long flags = 0;
struct mm_struct *mm = current->mm;
+ unsigned int region_id = get_region_id(ea);
- if (REGION_ID(ea) == VMALLOC_REGION_ID)
+ if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
mm = &init_mm;
if (dsisr & DSISR_NOHPTE)
@@ -1448,7 +1477,7 @@
* 2) user space access kernel space.
*/
access |= _PAGE_PRIVILEGED;
- if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID))
+ if ((msr & MSR_PR) || (region_id == USER_REGION_ID))
access &= ~_PAGE_PRIVILEGED;
if (trap == 0x400)
@@ -1463,7 +1492,7 @@
int psize = get_slice_psize(mm, ea);
/* We only prefault standard pages for now */
- if (unlikely(psize != mm->context.user_psize))
+ if (unlikely(psize != mm_ctx_user_psize(&mm->context)))
return false;
/*
@@ -1481,8 +1510,8 @@
}
#endif
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap)
+static void hash_preload(struct mm_struct *mm, unsigned long ea,
+ bool is_exec, unsigned long trap)
{
int hugepage_shift;
unsigned long vsid;
@@ -1490,8 +1519,9 @@
pte_t *ptep;
unsigned long flags;
int rc, ssize, update_flags = 0;
+ unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
- BUG_ON(REGION_ID(ea) != USER_REGION_ID);
+ BUG_ON(get_region_id(ea) != USER_REGION_ID);
if (!should_hash_preload(mm, ea))
return;
@@ -1541,7 +1571,7 @@
/* Hash it in */
#ifdef CONFIG_PPC_64K_PAGES
- if (mm->context.user_psize == MMU_PAGE_64K)
+ if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K)
rc = __hash_page_64K(ea, access, vsid, ptep, trap,
update_flags, ssize);
else
@@ -1554,13 +1584,64 @@
*/
if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize,
- mm->context.user_psize,
- mm->context.user_psize,
+ mm_ctx_user_psize(&mm->context),
+ mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
out_exit:
local_irq_restore(flags);
}
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ /*
+ * We don't need to worry about _PAGE_PRESENT here because we are
+ * called with either mm->page_table_lock held or ptl lock held
+ */
+ unsigned long trap;
+ bool is_exec;
+
+ if (radix_enabled()) {
+ prefetch((void *)address);
+ return;
+ }
+
+ /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+ if (!pte_young(*ptep) || address >= TASK_SIZE)
+ return;
+
+ /*
+ * We try to figure out if we are coming from an instruction
+ * access fault and pass that down to __hash_page so we avoid
+ * double-faulting on execution of fresh text. We have to test
+ * for regs NULL since init will get here first thing at boot.
+ *
+ * We also avoid filling the hash if not coming from a fault.
+ */
+
+ trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+ switch (trap) {
+ case 0x300:
+ is_exec = false;
+ break;
+ case 0x400:
+ is_exec = true;
+ break;
+ default:
+ return;
+ }
+
+ hash_preload(vma->vm_mm, address, is_exec, trap);
+}
+
#ifdef CONFIG_PPC_MEM_KEYS
/*
* Return the protection key associated with the given address and the
@@ -1626,7 +1707,8 @@
return gslot;
}
-/* WARNING: This is called from hash_low_64.S, if you change this prototype,
+/*
+ * WARNING: This is called from hash_low_64.S, if you change this prototype,
* do not forget to update the assembly call site !
*/
void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
@@ -1666,7 +1748,7 @@
/*
* If we try to do a HUGE PTE update after a withdraw is done,
* we will find the below NULL. This happens when we do
- * split_huge_page_pmd
+ * split_huge_pmd
*/
if (!hpte_slot_array)
return;
@@ -1847,7 +1929,8 @@
void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
- /* We don't currently support the first MEMBLOCK not mapping 0
+ /*
+ * We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
@@ -1859,11 +1942,20 @@
*
* For guests on platforms before POWER9, we clamp the limit to 1G
* to avoid some funky things such as RTAS bugs etc...
+ *
+ * On POWER9 we limit to 1TB in case the host erroneously told us that
+ * the RMA was >1TB. Effective address bits 0:23 are treated as zero
+ * (meaning the access is aliased to zero i.e. addr = addr % 1TB)
+ * for virtual real mode addressing and so it doesn't make sense to
+ * have an area larger than 1TB as it can't be addressed.
*/
if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
ppc64_rma_size = first_memblock_size;
if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
+ else
+ ppc64_rma_size = min_t(u64, ppc64_rma_size,
+ 1UL << SID_SHIFT_1T);
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
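
A small standalone sketch (not part of the patch) of the aliasing arithmetic described in the comment above. The 1TB constant mirrors 1UL << SID_SHIFT_1T; the sample values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define ONE_TB (1ULL << 40)	/* same size as 1UL << SID_SHIFT_1T */

int main(void)
{
	/* hypothetical value a buggy host might report for the RMA */
	uint64_t reported_rma = 2 * ONE_TB;
	/* an access just past 1TB aliases back to the bottom of the RMA */
	uint64_t addr = ONE_TB + 0x1000;

	printf("aliased addr = 0x%llx\n", (unsigned long long)(addr % ONE_TB));

	/* so the usable RMA is clamped, exactly as the code above does */
	uint64_t rma = reported_rma < ONE_TB ? reported_rma : ONE_TB;
	printf("clamped RMA  = 0x%llx\n", (unsigned long long)rma);
	return 0;
}
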
@@ -1882,18 +1974,24 @@
static int hpt_order_set(void *data, u64 val)
{
+ int ret;
+
if (!mmu_hash_ops.resize_hpt)
return -ENODEV;
- return mmu_hash_ops.resize_hpt(val);
+ cpus_read_lock();
+ ret = mmu_hash_ops.resize_hpt(val);
+ cpus_read_unlock();
+
+ return ret;
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n");
static int __init hash64_debugfs(void)
{
- if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root,
- NULL, &fops_hpt_order)) {
+ if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
+ NULL, &fops_hpt_order)) {
pr_err("lpar: unable to create hpt_order debugsfs file\n");
}
@@ -1901,3 +1999,11 @@
}
machine_device_initcall(pseries, hash64_debugfs);
#endif /* CONFIG_DEBUG_FS */
+
+void __init print_system_hash_info(void)
+{
+ pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
+
+ if (htab_hash_mask)
+ pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
+}
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
new file mode 100644
index 0000000..56cc845
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -0,0 +1,461 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * IOMMU helpers in MMU context.
+ *
+ * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/sizes.h>
+#include <linux/mm.h>
+#include <asm/mmu_context.h>
+#include <asm/pte-walk.h>
+#include <linux/mm_inline.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
+#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
+
+struct mm_iommu_table_group_mem_t {
+ struct list_head next;
+ struct rcu_head rcu;
+ unsigned long used;
+ atomic64_t mapped;
+ unsigned int pageshift;
+ u64 ua; /* userspace address */
+ u64 entries; /* number of entries in hpas/hpages[] */
+ /*
+ * in mm_iommu_get we temporarily use this to store
+ * struct page address.
+ *
+ * We need to convert ua to hpa in real mode. Make it
+ * simpler by storing physical address.
+ */
+ union {
+ struct page **hpages; /* vmalloc'ed */
+ phys_addr_t *hpas;
+ };
+#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
+ u64 dev_hpa; /* Device memory base address */
+};
+
+bool mm_iommu_preregistered(struct mm_struct *mm)
+{
+ return !list_empty(&mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ struct mm_iommu_table_group_mem_t *mem, *mem2;
+ long i, ret, locked_entries = 0, pinned = 0;
+ unsigned int pageshift;
+ unsigned long entry, chunk;
+
+ if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+ ret = account_locked_vm(mm, entries, true);
+ if (ret)
+ return ret;
+
+ locked_entries = entries;
+ }
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem) {
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+
+ if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
+ mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+ mem->dev_hpa = dev_hpa;
+ goto good_exit;
+ }
+ mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
+
+ /*
+ * As a starting point for the maximum page size calculation, we use
+ * the natural alignment of @ua and @entries to allow IOMMU pages
+ * smaller than huge pages but still bigger than PAGE_SIZE.
+ */
+ mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
+ mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
+ if (!mem->hpas) {
+ kfree(mem);
+ ret = -ENOMEM;
+ goto unlock_exit;
+ }
+
+ down_read(&mm->mmap_sem);
+ chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
+ sizeof(struct vm_area_struct *);
+ chunk = min(chunk, entries);
+ for (entry = 0; entry < entries; entry += chunk) {
+ unsigned long n = min(entries - entry, chunk);
+
+ ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+ FOLL_WRITE | FOLL_LONGTERM,
+ mem->hpages + entry, NULL);
+ if (ret == n) {
+ pinned += n;
+ continue;
+ }
+ if (ret > 0)
+ pinned += ret;
+ break;
+ }
+ up_read(&mm->mmap_sem);
+ if (pinned != entries) {
+ if (!ret)
+ ret = -EFAULT;
+ goto free_exit;
+ }
+
+ pageshift = PAGE_SHIFT;
+ for (i = 0; i < entries; ++i) {
+ struct page *page = mem->hpages[i];
+
+ /*
+ * Allow using IOMMU pages larger than 64k. Only do that
+ * if we are backed by hugetlb.
+ */
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
+ pageshift = page_shift(compound_head(page));
+ mem->pageshift = min(mem->pageshift, pageshift);
+ /*
+ * We don't need struct page reference any more, switch
+ * to physical address.
+ */
+ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+ }
+
+good_exit:
+ atomic64_set(&mem->mapped, 1);
+ mem->used = 1;
+ mem->ua = ua;
+ mem->entries = entries;
+
+ mutex_lock(&mem_list_mutex);
+
+ list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) {
+ /* Overlap? */
+ if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) &&
+ (ua < (mem2->ua +
+ (mem2->entries << PAGE_SHIFT)))) {
+ ret = -EINVAL;
+ mutex_unlock(&mem_list_mutex);
+ goto free_exit;
+ }
+ }
+
+ list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
+
+ mutex_unlock(&mem_list_mutex);
+
+ *pmem = mem;
+
+ return 0;
+
+free_exit:
+ /* free the reference taken */
+ for (i = 0; i < pinned; i++)
+ put_page(mem->hpages[i]);
+
+ vfree(mem->hpas);
+ kfree(mem);
+
+unlock_exit:
+ account_locked_vm(mm, locked_entries, false);
+
+ return ret;
+}
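
A rough userspace sketch (not part of the patch) of the pageshift computation used above, mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT)): the lowest set bit of base-or-size gives the largest power-of-two page both can honor. __builtin_ctzll() stands in for the kernel's __ffs(), and the page size and addresses are assumed values.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 16	/* assuming 64K base pages, as on this platform */

/* Largest page shift allowed by the natural alignment of the window. */
static unsigned int max_pageshift(uint64_t ua, uint64_t entries)
{
	return (unsigned int)__builtin_ctzll(ua | (entries << PAGE_SHIFT));
}

int main(void)
{
	/* hypothetical 16MB-aligned window of 256 entries (16MB total) -> 24 */
	printf("%u\n", max_pageshift(0x10000000ULL, 256));
	/* a window that is only 64K aligned can never use pages above 64K -> 16 */
	printf("%u\n", max_pageshift(0x10000ULL, 256));
	return 0;
}
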
+
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
+ pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_new);
+
+long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
+{
+ return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_newdev);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+ long i;
+ struct page *page = NULL;
+
+ if (!mem->hpas)
+ return;
+
+ for (i = 0; i < mem->entries; ++i) {
+ if (!mem->hpas[i])
+ continue;
+
+ page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+ if (!page)
+ continue;
+
+ if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
+ SetPageDirty(page);
+
+ put_page(page);
+ mem->hpas[i] = 0;
+ }
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+ mm_iommu_unpin(mem);
+ vfree(mem->hpas);
+ kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+ struct mm_iommu_table_group_mem_t *mem = container_of(head,
+ struct mm_iommu_table_group_mem_t, rcu);
+
+ mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+ list_del_rcu(&mem->next);
+ call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
+{
+ long ret = 0;
+ unsigned long unlock_entries = 0;
+
+ mutex_lock(&mem_list_mutex);
+
+ if (mem->used == 0) {
+ ret = -ENOENT;
+ goto unlock_exit;
+ }
+
+ --mem->used;
+ /* There are still users, exit */
+ if (mem->used)
+ goto unlock_exit;
+
+ /* Are there still mappings? */
+ if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
+ ++mem->used;
+ ret = -EBUSY;
+ goto unlock_exit;
+ }
+
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+ unlock_entries = mem->entries;
+
+ /* @mapped became 0 so now mappings are disabled, release the region */
+ mm_iommu_release(mem);
+
+unlock_exit:
+ mutex_unlock(&mem_list_mutex);
+
+ account_locked_vm(mm, unlock_entries, false);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+ unsigned long ua, unsigned long size)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if ((mem->ua <= ua) &&
+ (ua + size <= mem->ua +
+ (mem->entries << PAGE_SHIFT))) {
+ ret = mem;
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+ unsigned long ua, unsigned long size)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+ next) {
+ if ((mem->ua <= ua) &&
+ (ua + size <= mem->ua +
+ (mem->entries << PAGE_SHIFT))) {
+ ret = mem;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
+ unsigned long ua, unsigned long entries)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ mutex_lock(&mem_list_mutex);
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if ((mem->ua == ua) && (mem->entries == entries)) {
+ ret = mem;
+ ++mem->used;
+ break;
+ }
+ }
+
+ mutex_unlock(&mem_list_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+ u64 *va;
+
+ if (entry >= mem->entries)
+ return -EFAULT;
+
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
+ if (!mem->hpas) {
+ *hpa = mem->dev_hpa + (ua - mem->ua);
+ return 0;
+ }
+
+ va = &mem->hpas[entry];
+ *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
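
A rough userspace model (not part of the patch) of the translation mm_iommu_ua_to_hpa() performs: index hpas[] by the page number within the region, mask off the flag bits kept in the low bits, and re-apply the page offset from the userspace address. The page size, addresses and flag value below are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	16				/* 64K pages assumed */
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define GROUP_PAGE_MASK	(~(4096UL - 1))			/* mirrors ~(SZ_4K - 1) */
#define PAGE_DIRTY	0x1UL				/* flag kept in the low bits */

int main(void)
{
	/* hypothetical two-entry preregistered region */
	uint64_t region_ua = 0x7f0000000000UL;
	uint64_t hpas[2] = { 0x200010000UL | PAGE_DIRTY, 0x200020000UL };

	uint64_t ua = region_ua + 0x10123;		/* entry 1, offset 0x123 */
	long entry = (ua - region_ua) >> PAGE_SHIFT;
	uint64_t hpa = (hpas[entry] & GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);

	/* prints: entry=1 hpa=0x200020123 */
	printf("entry=%ld hpa=0x%llx\n", entry, (unsigned long long)hpa);
	return 0;
}
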
+
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
+{
+ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+ unsigned long *pa;
+
+ if (entry >= mem->entries)
+ return -EFAULT;
+
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
+ if (!mem->hpas) {
+ *hpa = mem->dev_hpa + (ua - mem->ua);
+ return 0;
+ }
+
+ pa = (void *) vmalloc_to_phys(&mem->hpas[entry]);
+ if (!pa)
+ return -EFAULT;
+
+ *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
+
+ return 0;
+}
+
+extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ long entry;
+ void *va;
+ unsigned long *pa;
+
+ mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
+ if (!mem)
+ return;
+
+ if (mem->dev_hpa != MM_IOMMU_TABLE_INVALID_HPA)
+ return;
+
+ entry = (ua - mem->ua) >> PAGE_SHIFT;
+ va = &mem->hpas[entry];
+
+ pa = (void *) vmalloc_to_phys(va);
+ if (!pa)
+ return;
+
+ *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
+}
+
+bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+ unsigned int pageshift, unsigned long *size)
+{
+ struct mm_iommu_table_group_mem_t *mem;
+ unsigned long end;
+
+ list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+ if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+ continue;
+
+ end = mem->dev_hpa + (mem->entries << PAGE_SHIFT);
+ if ((mem->dev_hpa <= hpa) && (hpa < end)) {
+ /*
+ * Since the IOMMU page size might be bigger than
+ * PAGE_SIZE, the amount of preregistered memory
+ * starting from @hpa might be smaller than 1<<pageshift
+ * and the caller needs to distinguish this situation.
+ */
+ *size = min(1UL << pageshift, end - hpa);
+ return true;
+ }
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_is_devmem);
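
A minimal sketch (not part of the patch) of the size clamp above, *size = min(1UL << pageshift, end - hpa): when the queried IOMMU page extends past the end of the preregistered device memory, only the remaining amount is reported. All values are made up.

#include <stdio.h>

int main(void)
{
	/* hypothetical: 2MB of device memory preregistered (32 entries of 64K) */
	unsigned long dev_hpa = 0x100000000UL;
	unsigned long end = dev_hpa + (32UL << 16);

	unsigned long hpa = end - 0x20000;		/* 128K before the end */
	unsigned int pageshift = 24;			/* caller asks about a 16MB page */

	unsigned long size = (1UL << pageshift) < (end - hpa) ?
				(1UL << pageshift) : (end - hpa);

	/* prints 131072: only 128K of the 16MB page is actually preregistered */
	printf("%lu\n", size);
	return 0;
}
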
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+ if (atomic64_inc_not_zero(&mem->mapped))
+ return 0;
+
+ /* Last mm_iommu_put() has been called, no more mappings allowed */
+ return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+ atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+}
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/book3s64/mmu_context.c
similarity index 63%
rename from arch/powerpc/mm/mmu_context_book3s64.c
rename to arch/powerpc/mm/book3s64/mmu_context.c
index dbd8f76..0ba30b8 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* MMU context allocation for 64-bit kernels.
*
* Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/sched.h>
@@ -53,13 +48,54 @@
}
EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+void slb_setup_new_exec(void);
+
+static int realloc_context_ids(mm_context_t *ctx)
+{
+ int i, id;
+
+ /*
+ * id 0 (aka. ctx->id) is special, we always allocate a new one, even if
+ * there wasn't one allocated previously (which happens in the exec
+ * case where ctx is newly allocated).
+ *
+ * We have to be a bit careful here. We must keep the existing ids in
+ * the array, so that we can test if they're non-zero to decide if we
+ * need to allocate a new one. However in case of error we must free the
+ * ids we've allocated but *not* any of the existing ones (or risk a
+ * UAF). That's why we decrement i at the start of the error handling
+ * loop, to skip the id that we just tested but couldn't reallocate.
+ */
+ for (i = 0; i < ARRAY_SIZE(ctx->extended_id); i++) {
+ if (i == 0 || ctx->extended_id[i]) {
+ id = hash__alloc_context_id();
+ if (id < 0)
+ goto error;
+
+ ctx->extended_id[i] = id;
+ }
+ }
+
+ /* The caller expects us to return id */
+ return ctx->id;
+
+error:
+ for (i--; i >= 0; i--) {
+ if (ctx->extended_id[i])
+ ida_free(&mmu_context_ida, ctx->extended_id[i]);
+ }
+
+ return id;
+}
+
static int hash__init_new_context(struct mm_struct *mm)
{
int index;
- index = hash__alloc_context_id();
- if (index < 0)
- return index;
+ mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context),
+ GFP_KERNEL);
+ if (!mm->context.hash_context)
+ return -ENOMEM;
/*
* The old code would re-promote on fork, we don't do that when using
@@ -75,15 +111,45 @@
* We should not be calling init_new_context() on init_mm. Hence a
* check against 0 is OK.
*/
- if (mm->context.id == 0)
+ if (mm->context.id == 0) {
+ memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context));
slice_init_new_context_exec(mm);
+ } else {
+ /* This is fork. Copy hash_context details from current->mm */
+ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context));
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ /* inherit subpage prot details if we have one. */
+ if (current->mm->context.hash_context->spt) {
+ mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table),
+ GFP_KERNEL);
+ if (!mm->context.hash_context->spt) {
+ kfree(mm->context.hash_context);
+ return -ENOMEM;
+ }
+ }
+#endif
+ }
- subpage_prot_init_new_context(mm);
+ index = realloc_context_ids(&mm->context);
+ if (index < 0) {
+#ifdef CONFIG_PPC_SUBPAGE_PROT
+ kfree(mm->context.hash_context->spt);
+#endif
+ kfree(mm->context.hash_context);
+ return index;
+ }
pkey_mm_init(mm);
return index;
}
+void hash__setup_new_exec(void)
+{
+ slice_setup_new_exec();
+
+ slb_setup_new_exec();
+}
+
static int radix__init_new_context(struct mm_struct *mm)
{
unsigned long rts_field;
@@ -108,7 +174,7 @@
*/
asm volatile("ptesync;isync" : : : "memory");
- mm->context.npu_context = NULL;
+ mm->context.hash_context = NULL;
return index;
}
@@ -153,21 +219,7 @@
if (context_id)
ida_free(&mmu_context_ida, context_id);
}
-}
-
-static void pte_frag_destroy(void *pte_frag)
-{
- int count;
- struct page *page;
-
- page = virt_to_page(pte_frag);
- /* drop all the pending references */
- count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
- /* We allow PTE_FRAG_NR fragments from a PTE page */
- if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
- pgtable_page_dtor(page);
- __free_page(page);
- }
+ kfree(ctx->hash_context);
}
static void pmd_frag_destroy(void *pmd_frag)
@@ -204,8 +256,21 @@
#ifdef CONFIG_SPAPR_TCE_IOMMU
WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
#endif
+ /*
+ * For tasks which were successfully initialized we end up calling
+ * arch_exit_mmap() which clears the process table entry. And
+ * arch_exit_mmap() is called before the required fullmm TLB flush
+ * which does a RIC=2 flush. Hence for an initialized task, we do clear
+ * any cached process table entries.
+ *
+ * The condition below handles the error case during task init. We have
+ * set the process table entry early and if we fail a task
+ * initialization, we need to ensure the process table entry is zeroed.
+ * We need not worry about process table entry caches because the task
+ * never ran with the PID value.
+ */
if (radix_enabled())
- WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+ process_tb[mm->context.id].prtb0 = 0;
else
subpage_prot_free(mm);
destroy_contexts(&mm->context);
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/book3s64/pgtable.c
similarity index 66%
rename from arch/powerpc/mm/pgtable-book3s64.c
rename to arch/powerpc/mm/book3s64/pgtable.c
index 01d7c0f..75483b4 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/sched.h>
@@ -12,12 +8,15 @@
#include <linux/memblock.h>
#include <misc/cxl-base.h>
+#include <asm/debugfs.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/trace.h>
#include <asm/powernv.h>
+#include <asm/firmware.h>
+#include <asm/ultravisor.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#include <trace/events/thp.h>
unsigned long __pmd_frag_nr;
@@ -25,9 +24,6 @@
unsigned long __pmd_frag_size_shift;
EXPORT_SYMBOL(__pmd_frag_size_shift);
-int (*register_process_table)(unsigned long base, unsigned long page_size,
- unsigned long tbl_size);
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* This is called when relaxing access to a hugepage. It's also called in the page
@@ -69,9 +65,14 @@
pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
- WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+
+ WARN_ON(pte_hw_valid(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
assert_spin_locked(pmd_lockptr(mm, pmdp));
- WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd)));
+ WARN_ON(!(pmd_large(pmd)));
#endif
trace_hugepage_set_pmd(addr, pmd_val(pmd));
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
@@ -106,11 +107,14 @@
{
unsigned long old_pmd;
- old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+ old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
/*
* This ensures that generic code that relies on IRQ disabling
* to prevent a parallel THP split works as expected.
+ *
+ * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
+ * a special case check in pmd_access_permitted.
*/
serialize_against_pte_lookup(vma->vm_mm);
return __pmd(old_pmd);
@@ -190,48 +194,72 @@
unsigned long ptcr;
BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
- partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
- MEMBLOCK_ALLOC_ANYWHERE));
-
/* Initialize the Partition Table with no entries */
- memset((void *)partition_tb, 0, patb_size);
+ partition_tb = memblock_alloc(patb_size, patb_size);
+ if (!partition_tb)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, patb_size, patb_size);
/*
* update partition table control register,
* 64 K size.
*/
ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
- mtspr(SPRN_PTCR, ptcr);
+ set_ptcr_when_no_uv(ptcr);
powernv_set_nmmu_ptcr(ptcr);
}
+static void flush_partition(unsigned int lpid, bool radix)
+{
+ if (radix) {
+ radix__flush_all_lpid(lpid);
+ radix__flush_all_lpid_guest(lpid);
+ } else {
+ asm volatile("ptesync" : : : "memory");
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ /* do we need fixup here? */
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+ }
+}
+
void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
- unsigned long dw1)
+ unsigned long dw1, bool flush)
{
unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+ /*
+ * When ultravisor is enabled, the partition table is stored in secure
+ * memory and can only be accessed doing an ultravisor call. However, we
+ * maintain a copy of the partition table in normal memory to allow Nest
+ * MMU translations to occur (for normal VMs).
+ *
+ * Therefore, here we always update partition_tb, regardless of whether
+ * we are running under an ultravisor or not.
+ */
partition_tb[lpid].patb0 = cpu_to_be64(dw0);
partition_tb[lpid].patb1 = cpu_to_be64(dw1);
/*
- * Global flush of TLBs and partition table caches for this lpid.
- * The type of flush (hash or radix) depends on what the previous
- * use of this partition ID was, not the new use.
+ * If ultravisor is enabled, we do an ultravisor call to register the
+ * partition table entry (PATE), which also does a global flush of TLBs
+ * and partition table caches for the lpid. Otherwise, just do the
+ * flush. The type of flush (hash or radix) depends on what the previous
+ * use of the partition ID was, not the new use.
*/
- asm volatile("ptesync" : : : "memory");
- if (old & PATB_HR) {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
- } else {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+ if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) {
+ uv_register_pate(lpid, dw0, dw1);
+ pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n",
+ dw0, dw1);
+ } else if (flush) {
+ /*
+ * Boot does not need to flush, because MMU is off and each
+ * CPU does a tlbiel_all() before switching them on, which
+ * flushes everything.
+ */
+ flush_partition(lpid, (old & PATB_HR));
}
- /* do we need fixup here ?*/
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
}
EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
@@ -239,6 +267,9 @@
{
void *pmd_frag, *ret;
+ if (PMD_FRAG_NR == 1)
+ return NULL;
+
spin_lock(&mm->page_table_lock);
ret = mm->context.pmd_frag;
if (ret) {
@@ -317,91 +348,6 @@
}
}
-static pte_t *get_pte_from_cache(struct mm_struct *mm)
-{
- void *pte_frag, *ret;
-
- spin_lock(&mm->page_table_lock);
- ret = mm->context.pte_frag;
- if (ret) {
- pte_frag = ret + PTE_FRAG_SIZE;
- /*
- * If we have taken up all the fragments mark PTE page NULL
- */
- if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
- pte_frag = NULL;
- mm->context.pte_frag = pte_frag;
- }
- spin_unlock(&mm->page_table_lock);
- return (pte_t *)ret;
-}
-
-static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
-{
- void *ret = NULL;
- struct page *page;
-
- if (!kernel) {
- page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
- if (!page)
- return NULL;
- if (!pgtable_page_ctor(page)) {
- __free_page(page);
- return NULL;
- }
- } else {
- page = alloc_page(PGALLOC_GFP);
- if (!page)
- return NULL;
- }
-
- atomic_set(&page->pt_frag_refcount, 1);
-
- ret = page_address(page);
- /*
- * if we support only one fragment just return the
- * allocated page.
- */
- if (PTE_FRAG_NR == 1)
- return ret;
- spin_lock(&mm->page_table_lock);
- /*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
- * count.
- */
- if (likely(!mm->context.pte_frag)) {
- atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR);
- mm->context.pte_frag = ret + PTE_FRAG_SIZE;
- }
- spin_unlock(&mm->page_table_lock);
-
- return (pte_t *)ret;
-}
-
-pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
-{
- pte_t *pte;
-
- pte = get_pte_from_cache(mm);
- if (pte)
- return pte;
-
- return __alloc_for_ptecache(mm, kernel);
-}
-
-void pte_fragment_free(unsigned long *table, int kernel)
-{
- struct page *page = virt_to_page(table);
-
- BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
- if (atomic_dec_and_test(&page->pt_frag_refcount)) {
- if (!kernel)
- pgtable_page_dtor(page);
- __free_page(page);
- }
-}
-
static inline void pgtable_free(void *table, int index)
{
switch (index) {
@@ -477,3 +423,96 @@
atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20);
}
#endif /* CONFIG_PROC_FS */
+
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
+{
+ unsigned long pte_val;
+
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep pte_present() true so that we don't take a
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+ return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (radix_enabled())
+ return radix__ptep_modify_prot_commit(vma, addr,
+ ptep, old_pte, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
+/*
+ * For hash translation mode, we use the deposited table to store hash slot
+ * information and they are stored at PTRS_PER_PMD offset from related pmd
+ * location. Hence a pmd move requires deposit and withdraw.
+ *
+ * For radix translation with split pmd ptl, we store the deposited table in the
+ * pmd page. Hence if we have different pmd page we need to withdraw during pmd
+ * move.
+ *
+ * With hash we always use the deposited table, irrespective of whether the
+ * mapping is anonymous. With radix we use the deposited table only for
+ * anonymous mappings.
+ */
+int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+ struct spinlock *old_pmd_ptl,
+ struct vm_area_struct *vma)
+{
+ if (radix_enabled())
+ return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+
+ return true;
+}
+
+/*
+ * Does the CPU support tlbie?
+ */
+bool tlbie_capable __read_mostly = true;
+EXPORT_SYMBOL(tlbie_capable);
+
+/*
+ * Should tlbie be used for management of CPU TLBs, for kernel and process
+ * address spaces? tlbie may still be used for nMMU accelerators, and for KVM
+ * guest address spaces.
+ */
+bool tlbie_enabled __read_mostly = true;
+
+static int __init setup_disable_tlbie(char *str)
+{
+ if (!radix_enabled()) {
+ pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n");
+ return 1;
+ }
+
+ tlbie_capable = false;
+ tlbie_enabled = false;
+
+ return 1;
+}
+__setup("disable_tlbie", setup_disable_tlbie);
+
+static int __init pgtable_debugfs_setup(void)
+{
+ if (!tlbie_capable)
+ return 0;
+
+ /*
+ * There is no locking vs tlb flushing when changing this value.
+ * The tlb flushers will see one value or another, and use either
+ * tlbie or tlbiel with IPIs. In both cases the TLBs will be
+ * invalidated as expected.
+ */
+ debugfs_create_bool("tlbie_enabled", 0600,
+ powerpc_debugfs_root,
+ &tlbie_enabled);
+
+ return 0;
+}
+arch_initcall(pgtable_debugfs_setup);
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
similarity index 93%
rename from arch/powerpc/mm/pkeys.c
rename to arch/powerpc/mm/book3s64/pkeys.c
index b271b28..ae7fca4 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -6,20 +6,22 @@
*/
#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mmu.h>
#include <asm/setup.h>
#include <linux/pkeys.h>
#include <linux/of_device.h>
DEFINE_STATIC_KEY_TRUE(pkey_disabled);
-bool pkey_execute_disable_supported;
int pkeys_total; /* Total pkeys as per device tree */
-bool pkeys_devtree_defined; /* pkey property exported by device tree */
u32 initial_allocation_mask; /* Bits set for the initially allocated keys */
u32 reserved_allocation_mask; /* Bits set for reserved keys */
-u64 pkey_amr_mask; /* Bits in AMR not to be touched */
-u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
-u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
-int execute_only_key = 2;
+static bool pkey_execute_disable_supported;
+static bool pkeys_devtree_defined; /* property exported by device tree */
+static u64 pkey_amr_mask; /* Bits in AMR not to be touched */
+static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
+static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
+static int execute_only_key = 2;
#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
@@ -57,7 +59,7 @@
return cpu_has_feature(CPU_FTR_PKEY);
}
-int pkey_initialize(void)
+static int pkey_initialize(void)
{
int os_reserved, i;
@@ -414,3 +416,13 @@
return pkey_access_permitted(vma_pkey(vma), write, execute);
}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
similarity index 78%
rename from arch/powerpc/mm/hugetlbpage-radix.c
rename to arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index 2486bee..cab0633 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#include <linux/security.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
@@ -73,7 +74,7 @@
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (high_limit - len >= addr &&
+ if (high_limit - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -83,10 +84,27 @@
*/
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid an NMMU hang while relaxing access, we need to flush the TLB before
+ * we set the new value.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_hugetlb_page(vma, addr);
+
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
similarity index 77%
rename from arch/powerpc/mm/pgtable-radix.c
rename to arch/powerpc/mm/book3s64/radix_pgtable.c
index c879979..6ee17d0 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1,16 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Page table handling routines for radix page table.
*
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "radix-mmu: " fmt
+#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
@@ -29,48 +26,33 @@
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>
+#include <asm/uaccess.h>
+#include <asm/ultravisor.h>
#include <trace/events/thp.h>
unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;
-static int native_register_process_table(unsigned long base, unsigned long pg_sz,
- unsigned long table_size)
-{
- unsigned long patb0, patb1;
-
- patb0 = be64_to_cpu(partition_tb[0].patb0);
- patb1 = base | table_size | PATB_GR;
-
- mmu_partition_table_set_entry(0, patb0, patb1);
-
- return 0;
-}
-
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
{
- unsigned long pa = 0;
- void *pt;
+ phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
+ phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
+ void *ptr;
- if (region_start || region_end) /* has region hint */
- pa = memblock_alloc_range(size, size, region_start, region_end,
- MEMBLOCK_NONE);
- else if (nid != -1) /* has node hint */
- pa = memblock_alloc_base_nid(size, size,
- MEMBLOCK_ALLOC_ANYWHERE,
- nid, MEMBLOCK_NONE);
+ if (region_start)
+ min_addr = region_start;
+ if (region_end)
+ max_addr = region_end;
- if (!pa)
- pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);
+ ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
- BUG_ON(!pa);
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
+ __func__, size, size, nid, &min_addr, &max_addr);
- pt = __va(pa);
- memset(pt, 0, size);
-
- return pt;
+ return ptr;
}
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
@@ -139,6 +121,10 @@
*/
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
+#ifdef CONFIG_PPC_64K_PAGES
+ BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
+#endif
+
if (unlikely(!slab_is_available()))
return early_map_kernel_page(ea, pa, flags, map_page_size,
nid, region_start, region_end);
@@ -201,14 +187,14 @@
pudp = pud_alloc(&init_mm, pgdp, idx);
if (!pudp)
continue;
- if (pud_huge(*pudp)) {
+ if (pud_is_leaf(*pudp)) {
ptep = (pte_t *)pudp;
goto update_the_pte;
}
pmdp = pmd_alloc(&init_mm, pudp, idx);
if (!pmdp)
continue;
- if (pmd_huge(*pmdp)) {
+ if (pmd_is_leaf(*pmdp)) {
ptep = pmdp_ptep(pmdp);
goto update_the_pte;
}
@@ -241,9 +227,8 @@
}
#endif /* CONFIG_STRICT_KERNEL_RWX */
-static inline void __meminit print_mapping(unsigned long start,
- unsigned long end,
- unsigned long size)
+static inline void __meminit
+print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
char buf[10];
@@ -252,7 +237,17 @@
string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
- pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
+ pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
+ exec ? " (exec)" : "");
+}
+
+static unsigned long next_boundary(unsigned long addr, unsigned long end)
+{
+#ifdef CONFIG_STRICT_KERNEL_RWX
+ if (addr < __pa_symbol(__init_begin))
+ return __pa_symbol(__init_begin);
+#endif
+ return end;
}
static int __meminit create_physical_mapping(unsigned long start,
@@ -260,13 +255,8 @@
int nid)
{
unsigned long vaddr, addr, mapping_size = 0;
+ bool prev_exec, exec = false;
pgprot_t prot;
- unsigned long max_mapping_size;
-#ifdef CONFIG_STRICT_KERNEL_RWX
- int split_text_mapping = 1;
-#else
- int split_text_mapping = 0;
-#endif
int psize;
start = _ALIGN_UP(start, PAGE_SIZE);
@@ -274,14 +264,12 @@
unsigned long gap, previous_size;
int rc;
- gap = end - addr;
+ gap = next_boundary(addr, end) - addr;
previous_size = mapping_size;
- max_mapping_size = PUD_SIZE;
+ prev_exec = exec;
-retry:
if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
- mmu_psize_defs[MMU_PAGE_1G].shift &&
- PUD_SIZE <= max_mapping_size) {
+ mmu_psize_defs[MMU_PAGE_1G].shift) {
mapping_size = PUD_SIZE;
psize = MMU_PAGE_1G;
} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
@@ -293,32 +281,21 @@
psize = mmu_virtual_psize;
}
- if (split_text_mapping && (mapping_size == PUD_SIZE) &&
- (addr <= __pa_symbol(__init_begin)) &&
- (addr + mapping_size) >= __pa_symbol(_stext)) {
- max_mapping_size = PMD_SIZE;
- goto retry;
- }
-
- if (split_text_mapping && (mapping_size == PMD_SIZE) &&
- (addr <= __pa_symbol(__init_begin)) &&
- (addr + mapping_size) >= __pa_symbol(_stext)) {
- mapping_size = PAGE_SIZE;
- psize = mmu_virtual_psize;
- }
-
- if (mapping_size != previous_size) {
- print_mapping(start, addr, previous_size);
- start = addr;
- }
-
vaddr = (unsigned long)__va(addr);
if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
- overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
+ overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
prot = PAGE_KERNEL_X;
- else
+ exec = true;
+ } else {
prot = PAGE_KERNEL;
+ exec = false;
+ }
+
+ if (mapping_size != previous_size || exec != prev_exec) {
+ print_mapping(start, addr, previous_size, prev_exec);
+ start = addr;
+ }
rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
if (rc)
@@ -327,11 +304,11 @@
update_page_count(psize, 1);
}
- print_mapping(start, addr, mapping_size);
+ print_mapping(start, addr, mapping_size, exec);
return 0;
}
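
An illustrative standalone sketch (not part of the patch) of the page-size policy in create_physical_mapping() together with the next_boundary() cut at __init_begin: use a 1G or 2M page only when the address is suitably aligned and the gap to the next boundary is large enough, otherwise fall back to the base page. 1G/2M page support is assumed present and all addresses are hypothetical.

#include <stdio.h>

#define SZ_64K	(1UL << 16)	/* base page size assumed */
#define SZ_2M	(1UL << 21)
#define SZ_1G	(1UL << 30)

/* Largest page that starts aligned at @addr and fits before @boundary. */
static unsigned long pick_mapping_size(unsigned long addr, unsigned long boundary)
{
	unsigned long gap = boundary - addr;

	if (!(addr & (SZ_1G - 1)) && gap >= SZ_1G)
		return SZ_1G;
	if (!(addr & (SZ_2M - 1)) && gap >= SZ_2M)
		return SZ_2M;
	return SZ_64K;
}

int main(void)
{
	/* hypothetical stand-in for __pa_symbol(__init_begin) */
	unsigned long init_begin = 0x40200000UL;

	/* 0 is 1G aligned and a whole 1G fits before the boundary -> 1024 MB */
	printf("%lu MB\n", pick_mapping_size(0x00000000UL, init_begin) >> 20);
	/* only 2M left before the text/RWX boundary -> 2 MB */
	printf("%lu MB\n", pick_mapping_size(0x40000000UL, init_begin) >> 20);
	/* not 2M aligned, fall back to the base page -> 64 KB */
	printf("%lu KB\n", pick_mapping_size(0x40210000UL, 0x80000000UL) >> 10);
	return 0;
}
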
-void __init radix_init_pgtable(void)
+static void __init radix_init_pgtable(void)
{
unsigned long rts_field;
struct memblock_region *reg;
@@ -347,6 +324,12 @@
* page tables will be allocated within the range. No
* need for a node (which we don't have yet).
*/
+
+ if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ continue;
+ }
+
WARN_ON(create_physical_mapping(reg->base,
reg->base + reg->size,
-1));
@@ -385,18 +368,6 @@
*/
rts_field = radix__get_tree_size();
process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
- /*
- * Fill in the partition table. We are suppose to use effective address
- * of process table here. But our linear mapping also enable us to use
- * physical address here.
- */
- register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
- pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
- asm volatile("ptesync" : : : "memory");
- asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
- trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
/*
* The init_mm context is given the first available (non-zero) PID,
@@ -417,20 +388,15 @@
static void __init radix_init_partition_table(void)
{
- unsigned long rts_field, dw0;
+ unsigned long rts_field, dw0, dw1;
mmu_partition_table_init();
rts_field = radix__get_tree_size();
dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
- mmu_partition_table_set_entry(0, dw0, 0);
+ dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
+ mmu_partition_table_set_entry(0, dw0, dw1, false);
pr_info("Initializing Radix MMU\n");
- pr_info("Partition table %p\n", partition_tb);
-}
-
-void __init radix_init_native(void)
-{
- register_process_table = native_register_process_table;
}
static int __init get_idx_from_shift(unsigned int shift)
@@ -521,14 +487,6 @@
mmu_psize_defs[MMU_PAGE_64K].shift = 16;
mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- if (mmu_psize_defs[MMU_PAGE_2M].shift) {
- /*
- * map vmemmap using 2M if available
- */
- mmu_vmemmap_psize = MMU_PAGE_2M;
- }
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
return;
}
@@ -544,8 +502,15 @@
mtspr(SPRN_AMOR, (3ul << 62));
}
-static void radix_init_iamr(void)
+#ifdef CONFIG_PPC_KUEP
+void setup_kuep(bool disabled)
{
+ if (disabled || !early_radix_enabled())
+ return;
+
+ if (smp_processor_id() == boot_cpuid)
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+
/*
* Radix always uses key0 of the IAMR to determine if an access is
* allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
@@ -553,6 +518,25 @@
*/
mtspr(SPRN_IAMR, (1ul << 62));
}
+#endif
+
+#ifdef CONFIG_PPC_KUAP
+void setup_kuap(bool disabled)
+{
+ if (disabled || !early_radix_enabled())
+ return;
+
+ if (smp_processor_id() == boot_cpuid) {
+ pr_info("Activating Kernel Userspace Access Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
+ }
+
+ /* Make sure userspace can't change the AMR */
+ mtspr(SPRN_UAMOR, 0);
+ mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
+ isync();
+}
+#endif
void __init radix__early_init_mmu(void)
{
@@ -567,7 +551,13 @@
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/* vmemmap mapping */
- mmu_vmemmap_psize = mmu_virtual_psize;
+ if (mmu_psize_defs[MMU_PAGE_2M].shift) {
+ /*
+ * map vmemmap using 2M if available
+ */
+ mmu_vmemmap_psize = MMU_PAGE_2M;
+ } else
+ mmu_vmemmap_psize = mmu_virtual_psize;
#endif
/*
* initialize page table size
@@ -587,11 +577,11 @@
__pgd_val_bits = RADIX_PGD_VAL_BITS;
__kernel_virt_start = RADIX_KERN_VIRT_START;
- __kernel_virt_size = RADIX_KERN_VIRT_SIZE;
__vmalloc_start = RADIX_VMALLOC_START;
__vmalloc_end = RADIX_VMALLOC_END;
__kernel_io_start = RADIX_KERN_IO_START;
- vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
+ __kernel_io_end = RADIX_KERN_IO_END;
+ vmemmap = (struct page *)RADIX_VMEMMAP_START;
ioremap_bot = IOREMAP_BASE;
#ifdef CONFIG_PCI
@@ -602,8 +592,9 @@
__pmd_frag_nr = RADIX_PMD_FRAG_NR;
__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
+ radix_init_pgtable();
+
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
- radix_init_native();
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
@@ -614,12 +605,9 @@
memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
- radix_init_iamr();
- radix_init_pgtable();
/* Switch to the guard PID before turning on MMU */
radix__switch_mmu_context(NULL, &init_mm);
- if (cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_all();
+ tlbiel_all();
}
void radix__early_init_mmu_secondary(void)
@@ -632,15 +620,14 @@
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
- mtspr(SPRN_PTCR,
- __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+ set_ptcr_when_no_uv(__pa(partition_tb) |
+ (PATB_SIZE_SHIFT - 12));
+
radix_init_amor();
}
- radix_init_iamr();
radix__switch_mmu_context(NULL, &init_mm);
- if (cpu_has_feature(CPU_FTR_HVMODE))
- tlbiel_all();
+ tlbiel_all();
}
void radix__mmu_cleanup_all(void)
@@ -650,7 +637,7 @@
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
- mtspr(SPRN_PTCR, 0);
+ set_ptcr_when_no_uv(0);
powernv_set_nmmu_ptcr(0);
radix__flush_tlb_all();
}
@@ -659,7 +646,8 @@
void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
{
- /* We don't currently support the first MEMBLOCK not mapping 0
+ /*
+ * We don't currently support the first MEMBLOCK not mapping 0
* physical on those processors
*/
BUG_ON(first_memblock_base != 0);
@@ -719,8 +707,8 @@
spin_unlock(&init_mm.page_table_lock);
pte_clear(&init_mm, params->aligned_start, params->pte);
- create_physical_mapping(params->aligned_start, params->start, -1);
- create_physical_mapping(params->end, params->aligned_end, -1);
+ create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
+ create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
spin_lock(&init_mm.page_table_lock);
return 0;
}
@@ -813,7 +801,7 @@
if (!pmd_present(*pmd))
continue;
- if (pmd_huge(*pmd)) {
+ if (pmd_is_leaf(*pmd)) {
split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
continue;
}
@@ -838,7 +826,7 @@
if (!pud_present(*pud))
continue;
- if (pud_huge(*pud)) {
+ if (pud_is_leaf(*pud)) {
split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
continue;
}
@@ -864,7 +852,7 @@
if (!pgd_present(*pgd))
continue;
- if (pgd_huge(*pgd)) {
+ if (pgd_is_leaf(*pgd)) {
split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
continue;
}
@@ -879,7 +867,12 @@
int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
- return create_physical_mapping(start, end, nid);
+ if (end >= RADIX_VMALLOC_START) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
+ return create_physical_mapping(__pa(start), __pa(end), nid);
}
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
@@ -906,6 +899,11 @@
int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
int ret;
+ if ((start + page_size) >= RADIX_VMEMMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return -1;
+ }
+
ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
BUG_ON(ret);
@@ -971,45 +969,44 @@
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
- struct list_head *lh = (struct list_head *) pgtable;
+ struct list_head *lh = (struct list_head *) pgtable;
- assert_spin_locked(pmd_lockptr(mm, pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
- /* FIFO */
- if (!pmd_huge_pte(mm, pmdp))
- INIT_LIST_HEAD(lh);
- else
- list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
- pmd_huge_pte(mm, pmdp) = pgtable;
+ /* FIFO */
+ if (!pmd_huge_pte(mm, pmdp))
+ INIT_LIST_HEAD(lh);
+ else
+ list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+ pmd_huge_pte(mm, pmdp) = pgtable;
}
pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
- pte_t *ptep;
- pgtable_t pgtable;
- struct list_head *lh;
+ pte_t *ptep;
+ pgtable_t pgtable;
+ struct list_head *lh;
- assert_spin_locked(pmd_lockptr(mm, pmdp));
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
- /* FIFO */
- pgtable = pmd_huge_pte(mm, pmdp);
- lh = (struct list_head *) pgtable;
- if (list_empty(lh))
- pmd_huge_pte(mm, pmdp) = NULL;
- else {
- pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
- list_del(lh);
- }
- ptep = (pte_t *) pgtable;
- *ptep = __pte(0);
- ptep++;
- *ptep = __pte(0);
- return pgtable;
+ /* FIFO */
+ pgtable = pmd_huge_pte(mm, pmdp);
+ lh = (struct list_head *) pgtable;
+ if (list_empty(lh))
+ pmd_huge_pte(mm, pmdp) = NULL;
+ else {
+ pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+ list_del(lh);
+ }
+ ptep = (pte_t *) pgtable;
+ *ptep = __pte(0);
+ ptep++;
+ *ptep = __pte(0);
+ return pgtable;
}
-
pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
+ unsigned long addr, pmd_t *pmdp)
{
pmd_t old_pmd;
unsigned long old;
@@ -1030,13 +1027,6 @@
return old_pmd;
}
-int radix__has_transparent_hugepage(void)
-{
- /* For radix 2M at PMD level means thp */
- if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
- return 1;
- return 0;
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
@@ -1072,3 +1062,126 @@
}
/* See ptesync comment in radix__set_pte_at */
}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid an NMMU hang while relaxing access, we need to flush the TLB before
+ * we set the new value. We need to do this only for radix, because hash
+ * translation does flush when updating the linux pte.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_tlb_page(vma, addr);
+
+ set_pte_at(mm, addr, ptep, pte);
+}
+
+int __init arch_ioremap_pud_supported(void)
+{
+ /* HPT does not cope with large pages in the vmalloc area */
+ return radix_enabled();
+}
+
+int __init arch_ioremap_pmd_supported(void)
+{
+ return radix_enabled();
+}
+
+int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+ return 0;
+}
+
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ pte_t *ptep = (pte_t *)pud;
+ pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
+
+ if (!radix_enabled())
+ return 0;
+
+ set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
+
+ return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_huge(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ pmd_t *pmd;
+ int i;
+
+ pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pud_clear(pud);
+
+ flush_tlb_kernel_range(addr, addr + PUD_SIZE);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd[i])) {
+ pte_t *pte;
+ pte = (pte_t *)pmd_page_vaddr(pmd[i]);
+
+ pte_free_kernel(&init_mm, pte);
+ }
+ }
+
+ pmd_free(&init_mm, pmd);
+
+ return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ pte_t *ptep = (pte_t *)pmd;
+ pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
+
+ if (!radix_enabled())
+ return 0;
+
+ set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
+
+ return 1;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_huge(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ pmd_clear(pmd);
+
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
+
+ pte_free_kernel(&init_mm, pte);
+
+ return 1;
+}
+
+int __init arch_ioremap_p4d_supported(void)
+{
+ return 0;
+}
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/book3s64/radix_tlb.c
similarity index 72%
rename from arch/powerpc/mm/tlb-radix.c
rename to arch/powerpc/mm/book3s64/radix_tlb.c
index fef3e1e..67af871 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* TLB flush routines for radix kernels.
*
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/mm.h>
@@ -29,7 +25,7 @@
* tlbiel instruction for radix, set invalidation
* i.e., r=1 and is=01 or is=10 or is=11
*/
-static inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
+static __always_inline void tlbiel_radix_set_isa300(unsigned int set, unsigned int is,
unsigned int pid,
unsigned int ric, unsigned int prs)
{
@@ -55,11 +51,15 @@
* and partition table entries. Then flush the remaining sets of the
* TLB.
*/
- tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
- for (set = 1; set < num_sets; set++)
- tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
- /* Do the same for process scoped entries. */
+ if (early_cpu_has_feature(CPU_FTR_HVMODE)) {
+ /* MSR[HV] should flush partition scope translations first. */
+ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
+ for (set = 1; set < num_sets; set++)
+ tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
+ }
+
+ /* Flush process scoped entries. */
tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
for (set = 1; set < num_sets; set++)
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
@@ -87,10 +87,10 @@
else
WARN(1, "%s called on pre-POWER9 CPU\n", __func__);
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
}
-static inline void __tlbiel_pid(unsigned long pid, int set,
+static __always_inline void __tlbiel_pid(unsigned long pid, int set,
unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -106,7 +106,7 @@
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
-static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -120,23 +120,7 @@
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static inline void __tlbiel_lpid(unsigned long lpid, int set,
- unsigned long ric)
-{
- unsigned long rb,rs,prs,r;
-
- rb = PPC_BIT(52); /* IS = 2 */
- rb |= set << PPC_BITLSHIFT(51);
- rs = 0; /* LPID comes from LPIDR */
- prs = 0; /* partition scoped */
- r = 1; /* radix format */
-
- asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
- : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
-}
-
-static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -150,25 +134,22 @@
trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
}
-static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
- unsigned long ric)
+static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
rb = PPC_BIT(52); /* IS = 2 */
- rb |= set << PPC_BITLSHIFT(51);
- rs = 0; /* LPID comes from LPIDR */
+ rs = lpid;
prs = 1; /* process scoped */
r = 1; /* radix format */
- asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
}
-
-static inline void __tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -183,8 +164,8 @@
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
-static inline void __tlbie_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -199,8 +180,8 @@
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
-static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
- unsigned long ap, unsigned long ric)
+static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -215,22 +196,83 @@
trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
}
-static inline void fixup_tlbie(void)
+
+static inline void fixup_tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long ap)
{
- unsigned long pid = 0;
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, 0, ap, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_pid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_va(va, pid, ap, RIC_FLUSH_TLB);
+ }
+}
+
+static inline void fixup_tlbie_pid(unsigned long pid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
unsigned long va = ((1UL << 52) - 1);
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_pid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
}
}
+
+static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, 0, ap, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
static inline void fixup_tlbie_lpid(unsigned long lpid)
{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
unsigned long va = ((1UL << 52) - 1);
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid(0, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
asm volatile("ptesync": : :"memory");
__tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
}
@@ -239,7 +281,7 @@
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
-static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
{
int set;
@@ -262,7 +304,7 @@
__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+ asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory");
}
static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
@@ -277,6 +319,7 @@
switch (ric) {
case RIC_FLUSH_TLB:
__tlbie_pid(pid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid(pid);
break;
case RIC_FLUSH_PWC:
__tlbie_pid(pid, RIC_FLUSH_PWC);
@@ -284,37 +327,42 @@
case RIC_FLUSH_ALL:
default:
__tlbie_pid(pid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid(pid);
}
- fixup_tlbie();
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+struct tlbiel_pid {
+ unsigned long pid;
+ unsigned long ric;
+};
+
+static void do_tlbiel_pid(void *info)
{
- int set;
+ struct tlbiel_pid *t = info;
- VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+ if (t->ric == RIC_FLUSH_TLB)
+ _tlbiel_pid(t->pid, RIC_FLUSH_TLB);
+ else if (t->ric == RIC_FLUSH_PWC)
+ _tlbiel_pid(t->pid, RIC_FLUSH_PWC);
+ else
+ _tlbiel_pid(t->pid, RIC_FLUSH_ALL);
+}
- asm volatile("ptesync": : :"memory");
+static inline void _tlbiel_pid_multicast(struct mm_struct *mm,
+ unsigned long pid, unsigned long ric)
+{
+ struct cpumask *cpus = mm_cpumask(mm);
+ struct tlbiel_pid t = { .pid = pid, .ric = ric };
+ on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1);
/*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
+	 * We always want the CPU translations to be invalidated with tlbiel
+	 * in these paths, so while coprocessors must use tlbie, we cannot
+	 * optimise away the tlbiel component.
*/
- __tlbiel_lpid(lpid, 0, ric);
-
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
- return;
- }
-
- /* For the remaining sets, just flush the TLB */
- for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
-
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+ if (atomic_read(&mm->context.copros) > 0)
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
}
static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
@@ -329,6 +377,7 @@
switch (ric) {
case RIC_FLUSH_TLB:
__tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_lpid(lpid);
break;
case RIC_FLUSH_PWC:
__tlbie_lpid(lpid, RIC_FLUSH_PWC);
@@ -336,39 +385,33 @@
case RIC_FLUSH_ALL:
default:
__tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_lpid(lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ /*
+	 * Workaround the fact that the "ric" argument to __tlbie_lpid_guest
+	 * must be a compile-time constraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
}
fixup_tlbie_lpid(lpid);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
-{
- int set;
-
- VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
-
- asm volatile("ptesync": : :"memory");
-
- /*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
- */
- __tlbiel_lpid_guest(lpid, 0, ric);
-
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
- return;
- }
-
- /* For the remaining sets, just flush the TLB */
- for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
- __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
-
- asm volatile("ptesync": : :"memory");
-}
-
-
static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
@@ -380,8 +423,8 @@
__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long psize, unsigned long ric)
+static __always_inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
{
unsigned long ap = mmu_get_ap(psize);
@@ -410,27 +453,76 @@
for (addr = start; addr < end; addr += page_size)
__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range(addr - page_size, pid, ap);
}
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
- unsigned long psize, unsigned long ric)
+static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
{
unsigned long ap = mmu_get_ap(psize);
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, ap, ric);
- fixup_tlbie();
+ fixup_tlbie_va(va, pid, ap);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
-static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+struct tlbiel_va {
+ unsigned long pid;
+ unsigned long va;
+ unsigned long psize;
+ unsigned long ric;
+};
+
+static void do_tlbiel_va(void *info)
+{
+ struct tlbiel_va *t = info;
+
+ if (t->ric == RIC_FLUSH_TLB)
+ _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB);
+ else if (t->ric == RIC_FLUSH_PWC)
+ _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC);
+ else
+ _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_va_multicast(struct mm_struct *mm,
+ unsigned long va, unsigned long pid,
+ unsigned long psize, unsigned long ric)
+{
+ struct cpumask *cpus = mm_cpumask(mm);
+ struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric };
+ on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1);
+ if (atomic_read(&mm->context.copros) > 0)
+ _tlbie_va(va, pid, psize, RIC_FLUSH_TLB);
+}
+
+struct tlbiel_va_range {
+ unsigned long pid;
+ unsigned long start;
+ unsigned long end;
+ unsigned long page_size;
+ unsigned long psize;
+ bool also_pwc;
+};
+
+static void do_tlbiel_va_range(void *info)
+{
+ struct tlbiel_va_range *t = info;
+
+ _tlbiel_va_range(t->start, t->end, t->pid, t->page_size,
+ t->psize, t->also_pwc);
+}
+
+static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long psize, unsigned long ric)
{
unsigned long ap = mmu_get_ap(psize);
asm volatile("ptesync": : :"memory");
__tlbie_lpid_va(va, lpid, ap, ric);
- fixup_tlbie_lpid(lpid);
+ fixup_tlbie_lpid_va(va, lpid, ap);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
@@ -442,10 +534,24 @@
if (also_pwc)
__tlbie_pid(pid, RIC_FLUSH_PWC);
__tlbie_va_range(start, end, pid, page_size, psize);
- fixup_tlbie();
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ struct cpumask *cpus = mm_cpumask(mm);
+ struct tlbiel_va_range t = { .start = start, .end = end,
+ .pid = pid, .page_size = page_size,
+ .psize = psize, .also_pwc = also_pwc };
+
+ on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1);
+ if (atomic_read(&mm->context.copros) > 0)
+ _tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+}
+
/*
* Base TLB flushing operations:
*
@@ -583,10 +689,14 @@
goto local;
}
- if (mm_needs_flush_escalation(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
+ if (cputlb_use_tlbie()) {
+ if (mm_needs_flush_escalation(mm))
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ } else {
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+ }
} else {
local:
_tlbiel_pid(pid, RIC_FLUSH_TLB);
@@ -612,7 +722,10 @@
goto local;
}
}
- _tlbie_pid(pid, RIC_FLUSH_ALL);
+ if (cputlb_use_tlbie())
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
} else {
local:
_tlbiel_pid(pid, RIC_FLUSH_ALL);
@@ -647,7 +760,10 @@
exit_flush_lazy_tlbs(mm);
goto local;
}
- _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ if (cputlb_use_tlbie())
+ _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ else
+ _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
} else {
local:
_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
@@ -669,9 +785,35 @@
#define radix__flush_all_mm radix__local_flush_all_mm
#endif /* CONFIG_SMP */
+static void do_tlbiel_kernel(void *info)
+{
+ _tlbiel_pid(0, RIC_FLUSH_ALL);
+}
+
+static inline void _tlbiel_kernel_broadcast(void)
+{
+ on_each_cpu(do_tlbiel_kernel, NULL, 1);
+ if (tlbie_capable) {
+ /*
+ * Coherent accelerators don't refcount kernel memory mappings,
+ * so have to always issue a tlbie for them. This is quite a
+		 * so we always have to issue a tlbie for them. This is quite a
+ */
+ _tlbie_pid(0, RIC_FLUSH_ALL);
+ }
+}
+
+/*
+ * If kernel TLBIs ever become local rather than global, then
+ * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it
+ * assumes kernel TLBIs are global.
+ */
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- _tlbie_pid(0, RIC_FLUSH_ALL);
+ if (cputlb_use_tlbie())
+ _tlbie_pid(0, RIC_FLUSH_ALL);
+ else
+ _tlbiel_kernel_broadcast();
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
@@ -727,10 +869,14 @@
if (local) {
_tlbiel_pid(pid, RIC_FLUSH_TLB);
} else {
- if (mm_needs_flush_escalation(mm))
- _tlbie_pid(pid, RIC_FLUSH_ALL);
- else
- _tlbie_pid(pid, RIC_FLUSH_TLB);
+ if (cputlb_use_tlbie()) {
+ if (mm_needs_flush_escalation(mm))
+ _tlbie_pid(pid, RIC_FLUSH_ALL);
+ else
+ _tlbie_pid(pid, RIC_FLUSH_TLB);
+ } else {
+ _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB);
+ }
}
} else {
bool hflush = flush_all_sizes;
@@ -755,8 +901,8 @@
gflush = false;
}
- asm volatile("ptesync": : :"memory");
if (local) {
+ asm volatile("ptesync": : :"memory");
__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
@@ -765,7 +911,8 @@
__tlbiel_va_range(gstart, gend, pid,
PUD_SIZE, MMU_PAGE_1G);
asm volatile("ptesync": : :"memory");
- } else {
+ } else if (cputlb_use_tlbie()) {
+ asm volatile("ptesync": : :"memory");
__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbie_va_range(hstart, hend, pid,
@@ -773,8 +920,17 @@
if (gflush)
__tlbie_va_range(gstart, gend, pid,
PUD_SIZE, MMU_PAGE_1G);
- fixup_tlbie();
+
asm volatile("eieio; tlbsync; ptesync": : :"memory");
+ } else {
+ _tlbiel_va_range_multicast(mm,
+ start, end, pid, page_size, mmu_virtual_psize, false);
+ if (hflush)
+ _tlbiel_va_range_multicast(mm,
+ hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
+ if (gflush)
+ _tlbiel_va_range_multicast(mm,
+ gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
}
}
preempt_enable();
@@ -833,23 +989,19 @@
/*
* Flush partition scoped translations from LPID (=LPIDR)
*/
-void radix__local_flush_tlb_lpid(unsigned int lpid)
+void radix__flush_all_lpid(unsigned int lpid)
{
- _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+ _tlbie_lpid(lpid, RIC_FLUSH_ALL);
}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+EXPORT_SYMBOL_GPL(radix__flush_all_lpid);
/*
- * Flush process scoped translations from LPID (=LPIDR).
- * Important difference, the guest normally manages its own translations,
- * but some cases e.g., vCPU CPU migration require KVM to flush.
+ * Flush process scoped translations from LPID (=LPIDR)
*/
-void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+void radix__flush_all_lpid_guest(unsigned int lpid)
{
- _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+ _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL);
}
-EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
-
static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize);
@@ -918,7 +1070,7 @@
tlb->need_flush_all = 0;
}
-static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned long start, unsigned long end,
int psize, bool also_pwc)
{
@@ -955,16 +1107,26 @@
if (local) {
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
} else {
- if (mm_needs_flush_escalation(mm))
- also_pwc = true;
+ if (cputlb_use_tlbie()) {
+ if (mm_needs_flush_escalation(mm))
+ also_pwc = true;
- _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ _tlbie_pid(pid,
+ also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ } else {
+ _tlbiel_pid_multicast(mm, pid,
+ also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ }
+
}
} else {
if (local)
_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
- else
+ else if (cputlb_use_tlbie())
_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
+ else
+ _tlbiel_va_range_multicast(mm,
+ start, end, pid, page_size, psize, also_pwc);
}
preempt_enable();
}
@@ -1006,8 +1168,11 @@
exit_flush_lazy_tlbs(mm);
goto local;
}
- _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
- goto local;
+ if (cputlb_use_tlbie())
+ _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ else
+ _tlbiel_va_range_multicast(mm,
+ addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
} else {
local:
_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
new file mode 100644
index 0000000..716204a
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PowerPC64 SLB support.
+ *
+ * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
+ * Based on earlier code written by:
+ * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
+ * Copyright (c) 2001 Dave Engebretsen
+ * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+
+#include <asm/asm-prototypes.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#include <asm/mmu_context.h>
+#include <asm/paca.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
+#include <linux/compiler.h>
+#include <linux/context_tracking.h>
+#include <linux/mm_types.h>
+
+#include <asm/udbg.h>
+#include <asm/code-patching.h>
+
+enum slb_index {
+ LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
+ KSTACK_INDEX = 1, /* Kernel stack map */
+};
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea);
+
+#define slb_esid_mask(ssize) \
+ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
+
+static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
+ enum slb_index index)
+{
+ return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
+}
+
+static inline unsigned long __mk_vsid_data(unsigned long vsid, int ssize,
+ unsigned long flags)
+{
+ return (vsid << slb_vsid_shift(ssize)) | flags |
+ ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
+}
+
+static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
+ unsigned long flags)
+{
+ return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
+}
+
+static void assert_slb_presence(bool present, unsigned long ea)
+{
+#ifdef CONFIG_DEBUG_VM
+ unsigned long tmp;
+
+ WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
+
+ /*
+ * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
+ * ignores all other bits from 0-27, so just clear them all.
+ */
+ ea &= ~((1UL << 28) - 1);
+ asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
+
+ WARN_ON(present == (tmp == 0));
+#endif
+}
+
+static inline void slb_shadow_update(unsigned long ea, int ssize,
+ unsigned long flags,
+ enum slb_index index)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ /*
+ * Clear the ESID first so the entry is not valid while we are
+ * updating it. No write barriers are needed here, provided
+ * we only update the current CPU's SLB shadow buffer.
+ */
+ WRITE_ONCE(p->save_area[index].esid, 0);
+ WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+ WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
+}
+
+static inline void slb_shadow_clear(enum slb_index index)
+{
+ WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
+}
+
+static inline void create_shadowed_slbe(unsigned long ea, int ssize,
+ unsigned long flags,
+ enum slb_index index)
+{
+ /*
+ * Updating the shadow buffer before writing the SLB ensures
+ * we don't get a stale entry here if we get preempted by PHYP
+ * between these two statements.
+ */
+ slb_shadow_update(ea, ssize, flags, index);
+
+ assert_slb_presence(false, ea);
+ asm volatile("slbmte %0,%1" :
+ : "r" (mk_vsid_data(ea, ssize, flags)),
+ "r" (mk_esid_data(ea, ssize, index))
+ : "memory" );
+}
+
+/*
+ * Insert bolted entries into SLB (which may not be empty, so don't clear
+ * slb_cache_ptr).
+ */
+void __slb_restore_bolted_realmode(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ enum slb_index index;
+
+ /* No isync needed because realmode. */
+ for (index = 0; index < SLB_NUM_BOLTED; index++) {
+ asm volatile("slbmte %0,%1" :
+ : "r" (be64_to_cpu(p->save_area[index].vsid)),
+ "r" (be64_to_cpu(p->save_area[index].esid)));
+ }
+
+ assert_slb_presence(true, local_paca->kstack);
+}
+
+/*
+ * Insert the bolted entries into an empty SLB.
+ */
+void slb_restore_bolted_realmode(void)
+{
+ __slb_restore_bolted_realmode();
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+/*
+ * This flushes all SLB entries including 0, so it must be realmode.
+ */
+void slb_flush_all_realmode(void)
+{
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+}
+
+/*
+ * This flushes non-bolted entries, it can be run in virtual mode. Must
+ * be called with interrupts disabled.
+ */
+void slb_flush_and_restore_bolted(void)
+{
+ struct slb_shadow *p = get_slb_shadow();
+
+ BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
+
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
+ asm volatile("isync\n"
+ "slbia\n"
+ "slbmte %0, %1\n"
+ "isync\n"
+ :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
+ "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
+ : "memory");
+ assert_slb_presence(true, get_paca()->kstack);
+
+ get_paca()->slb_cache_ptr = 0;
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+}
+
+void slb_save_contents(struct slb_entry *slb_ptr)
+{
+ int i;
+ unsigned long e, v;
+
+ /* Save slb_cache_ptr value. */
+ get_paca()->slb_save_cache_ptr = get_paca()->slb_cache_ptr;
+
+ if (!slb_ptr)
+ return;
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ asm volatile("slbmfee %0,%1" : "=r" (e) : "r" (i));
+ asm volatile("slbmfev %0,%1" : "=r" (v) : "r" (i));
+ slb_ptr->esid = e;
+ slb_ptr->vsid = v;
+ slb_ptr++;
+ }
+}
+
+void slb_dump_contents(struct slb_entry *slb_ptr)
+{
+ int i, n;
+ unsigned long e, v;
+ unsigned long llp;
+
+ if (!slb_ptr)
+ return;
+
+ pr_err("SLB contents of cpu 0x%x\n", smp_processor_id());
+ pr_err("Last SLB entry inserted at slot %d\n", get_paca()->stab_rr);
+
+ for (i = 0; i < mmu_slb_size; i++) {
+ e = slb_ptr->esid;
+ v = slb_ptr->vsid;
+ slb_ptr++;
+
+ if (!e && !v)
+ continue;
+
+ pr_err("%02d %016lx %016lx\n", i, e, v);
+
+ if (!(e & SLB_ESID_V)) {
+ pr_err("\n");
+ continue;
+ }
+ llp = v & SLB_VSID_LLP;
+ if (v & SLB_VSID_B_1T) {
+ pr_err(" 1T ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID_1T(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, llp);
+ } else {
+ pr_err(" 256M ESID=%9lx VSID=%13lx LLP:%3lx\n",
+ GET_ESID(e),
+ (v & ~SLB_VSID_B) >> SLB_VSID_SHIFT, llp);
+ }
+ }
+ pr_err("----------------------------------\n");
+
+ /* Dump slb cache entires as well. */
+ pr_err("SLB cache ptr value = %d\n", get_paca()->slb_save_cache_ptr);
+ pr_err("Valid SLB cache entries:\n");
+ n = min_t(int, get_paca()->slb_save_cache_ptr, SLB_CACHE_ENTRIES);
+ for (i = 0; i < n; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+ pr_err("Rest of SLB cache entries:\n");
+ for (i = n; i < SLB_CACHE_ENTRIES; i++)
+ pr_err("%02d EA[0-35]=%9x\n", i, get_paca()->slb_cache[i]);
+}
+
+void slb_vmalloc_update(void)
+{
+ /*
+ * vmalloc is not bolted, so just have to flush non-bolted.
+ */
+ slb_flush_and_restore_bolted();
+}
+
+static bool preload_hit(struct thread_info *ti, unsigned long esid)
+{
+ unsigned char i;
+
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ if (esid == ti->slb_preload_esid[idx])
+ return true;
+ }
+ return false;
+}
+
+static bool preload_add(struct thread_info *ti, unsigned long ea)
+{
+ unsigned char idx;
+ unsigned long esid;
+
+ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
+ /* EAs are stored >> 28 so 256MB segments don't need clearing */
+ if (ea & ESID_MASK_1T)
+ ea &= ESID_MASK_1T;
+ }
+
+ esid = ea >> SID_SHIFT;
+
+ if (preload_hit(ti, esid))
+ return false;
+
+ idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR;
+ ti->slb_preload_esid[idx] = esid;
+ if (ti->slb_preload_nr == SLB_PRELOAD_NR)
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+ else
+ ti->slb_preload_nr++;
+
+ return true;
+}
+
+static void preload_age(struct thread_info *ti)
+{
+ if (!ti->slb_preload_nr)
+ return;
+ ti->slb_preload_nr--;
+ ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR;
+}
+
+void slb_setup_new_exec(void)
+{
+ struct thread_info *ti = current_thread_info();
+ struct mm_struct *mm = current->mm;
+ unsigned long exec = 0x10000000;
+
+ WARN_ON(irqs_disabled());
+
+ /*
+	 * preload cache can only be used to determine whether an SLB
+ * entry exists if it does not start to overflow.
+ */
+ if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR)
+ return;
+
+ hard_irq_disable();
+
+ /*
+ * We have no good place to clear the slb preload cache on exec,
+ * flush_thread is about the earliest arch hook but that happens
+	 * after we switch to the mm and have already preloaded the SLBEs.
+ *
+ * For the most part that's probably okay to use entries from the
+ * previous exec, they will age out if unused. It may turn out to
+ * be an advantage to clear the cache before switching to it,
+ * however.
+ */
+
+ /*
+ * preload some userspace segments into the SLB.
+ * Almost all 32 and 64bit PowerPC executables are linked at
+ * 0x10000000 so it makes sense to preload this segment.
+ */
+ if (!is_kernel_addr(exec)) {
+ if (preload_add(ti, exec))
+ slb_allocate_user(mm, exec);
+ }
+
+ /* Libraries and mmaps. */
+ if (!is_kernel_addr(mm->mmap_base)) {
+ if (preload_add(ti, mm->mmap_base))
+ slb_allocate_user(mm, mm->mmap_base);
+ }
+
+ /* see switch_slb */
+ asm volatile("isync" : : : "memory");
+
+ local_irq_enable();
+}
+
+void preload_new_slb_context(unsigned long start, unsigned long sp)
+{
+ struct thread_info *ti = current_thread_info();
+ struct mm_struct *mm = current->mm;
+ unsigned long heap = mm->start_brk;
+
+ WARN_ON(irqs_disabled());
+
+ /* see above */
+ if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR)
+ return;
+
+ hard_irq_disable();
+
+ /* Userspace entry address. */
+ if (!is_kernel_addr(start)) {
+ if (preload_add(ti, start))
+ slb_allocate_user(mm, start);
+ }
+
+ /* Top of stack, grows down. */
+ if (!is_kernel_addr(sp)) {
+ if (preload_add(ti, sp))
+ slb_allocate_user(mm, sp);
+ }
+
+ /* Bottom of heap, grows up. */
+ if (heap && !is_kernel_addr(heap)) {
+ if (preload_add(ti, heap))
+ slb_allocate_user(mm, heap);
+ }
+
+ /* see switch_slb */
+ asm volatile("isync" : : : "memory");
+
+ local_irq_enable();
+}
+
+
+/* Flush all user entries from the segment table of the current processor. */
+void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct thread_info *ti = task_thread_info(tsk);
+ unsigned char i;
+
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+	 * user memory (to get a stack trace) and possibly cause an SLB miss
+ * which would update the slb_cache/slb_cache_ptr fields in the PACA.
+ */
+ hard_irq_disable();
+ asm volatile("isync" : : : "memory");
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * SLBIA IH=3 invalidates all Class=1 SLBEs and their
+ * associated lookaside structures, which matches what
+ * switch_slb wants. So ARCH_300 does not use the slb
+ * cache.
+ */
+ asm volatile(PPC_SLBIA(3));
+ } else {
+ unsigned long offset = get_paca()->slb_cache_ptr;
+
+ if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
+ offset <= SLB_CACHE_ENTRIES) {
+ unsigned long slbie_data = 0;
+
+ for (i = 0; i < offset; i++) {
+ unsigned long ea;
+
+ ea = (unsigned long)
+ get_paca()->slb_cache[i] << SID_SHIFT;
+ /*
+ * Could assert_slb_presence(true) here, but
+ * hypervisor or machine check could have come
+ * in and removed the entry at this point.
+ */
+
+ slbie_data = ea;
+ slbie_data |= user_segment_size(slbie_data)
+ << SLBIE_SSIZE_SHIFT;
+ slbie_data |= SLBIE_C; /* user slbs have C=1 */
+ asm volatile("slbie %0" : : "r" (slbie_data));
+ }
+
+ /* Workaround POWER5 < DD2.1 issue */
+ if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
+ asm volatile("slbie %0" : : "r" (slbie_data));
+
+ } else {
+ struct slb_shadow *p = get_slb_shadow();
+ unsigned long ksp_esid_data =
+ be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+ unsigned long ksp_vsid_data =
+ be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+ asm volatile(PPC_SLBIA(1) "\n"
+ "slbmte %0,%1\n"
+ "isync"
+ :: "r"(ksp_vsid_data),
+ "r"(ksp_esid_data));
+
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ }
+
+ get_paca()->slb_cache_ptr = 0;
+ }
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+ copy_mm_to_paca(mm);
+
+ /*
+ * We gradually age out SLBs after a number of context switches to
+ * reduce reload overhead of unused entries (like we do with FP/VEC
+ * reload). Each time we wrap 256 switches, take an entry out of the
+ * SLB preload cache.
+ */
+ tsk->thread.load_slb++;
+ if (!tsk->thread.load_slb) {
+ unsigned long pc = KSTK_EIP(tsk);
+
+ preload_age(ti);
+ preload_add(ti, pc);
+ }
+
+ for (i = 0; i < ti->slb_preload_nr; i++) {
+ unsigned char idx;
+ unsigned long ea;
+
+ idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+ ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+ slb_allocate_user(mm, ea);
+ }
+
+ /*
+ * Synchronize slbmte preloads with possible subsequent user memory
+ * address accesses by the kernel (user mode won't happen until
+ * rfid, which is safe).
+ */
+ asm volatile("isync" : : : "memory");
+}
+
+void slb_set_size(u16 size)
+{
+ mmu_slb_size = size;
+}
+
+void slb_initialize(void)
+{
+ unsigned long linear_llp, vmalloc_llp, io_llp;
+ unsigned long lflags;
+ static int slb_encoding_inited;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ unsigned long vmemmap_llp;
+#endif
+
+ /* Prepare our SLB miss handler based on our page size */
+ linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
+ io_llp = mmu_psize_defs[mmu_io_psize].sllp;
+ vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
+ get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ if (!slb_encoding_inited) {
+ slb_encoding_inited = 1;
+ pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
+ pr_devel("SLB: io LLP = %04lx\n", io_llp);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
+#endif
+ }
+
+ get_paca()->stab_rr = SLB_NUM_BOLTED - 1;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+ get_paca()->slb_used_bitmap = get_paca()->slb_kern_bitmap;
+
+ lflags = SLB_VSID_KERNEL | linear_llp;
+
+ /* Invalidate the entire SLB (even entry 0) & all the ERATS */
+ asm volatile("isync":::"memory");
+ asm volatile("slbmte %0,%0"::"r" (0) : "memory");
+ asm volatile("isync; slbia; isync":::"memory");
+ create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
+
+ /*
+ * For the boot cpu, we're running on the stack in init_thread_union,
+ * which is in the first segment of the linear mapping, and also
+ * get_paca()->kstack hasn't been initialized yet.
+ * For secondary cpus, we need to bolt the kernel stack entry now.
+ */
+ slb_shadow_clear(KSTACK_INDEX);
+ if (raw_smp_processor_id() != boot_cpuid &&
+ (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
+ create_shadowed_slbe(get_paca()->kstack,
+ mmu_kernel_ssize, lflags, KSTACK_INDEX);
+
+ asm volatile("isync":::"memory");
+}
+
+static void slb_cache_update(unsigned long esid_data)
+{
+ int slb_cache_index;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return; /* ISAv3.0B and later does not use slb_cache */
+
+ /*
+ * Now update slb cache entries
+ */
+ slb_cache_index = local_paca->slb_cache_ptr;
+ if (slb_cache_index < SLB_CACHE_ENTRIES) {
+ /*
+ * We have space in slb cache for optimized switch_slb().
+ * Top 36 bits from esid_data as per ISA
+ */
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+ local_paca->slb_cache_ptr++;
+ } else {
+ /*
+ * Our cache is full and the current cache content strictly
+		 * doesn't indicate the active SLB contents. Bump the ptr
+ * so that switch_slb() will ignore the cache.
+ */
+ local_paca->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
+ }
+}
+
+static enum slb_index alloc_slb_index(bool kernel)
+{
+ enum slb_index index;
+
+ /*
+	 * The allocation bitmaps can become out of sync with the SLB
+ * when the _switch code does slbie when bolting a new stack
+ * segment and it must not be anywhere else in the SLB. This leaves
+ * a kernel allocated entry that is unused in the SLB. With very
+ * large systems or small segment sizes, the bitmaps could slowly
+ * fill with these entries. They will eventually be cleared out
+ * by the round robin allocator in that case, so it's probably not
+ * worth accounting for.
+ */
+
+ /*
+	 * SLBs beyond 32 entries are allocated with stab_rr only.
+	 * POWER7/8/9 have 32 SLB entries; this could be expanded if a
+ * future CPU has more.
+ */
+ if (local_paca->slb_used_bitmap != U32_MAX) {
+ index = ffz(local_paca->slb_used_bitmap);
+ local_paca->slb_used_bitmap |= 1U << index;
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ } else {
+ /* round-robin replacement of slb starting at SLB_NUM_BOLTED. */
+ index = local_paca->stab_rr;
+ if (index < (mmu_slb_size - 1))
+ index++;
+ else
+ index = SLB_NUM_BOLTED;
+ local_paca->stab_rr = index;
+ if (index < 32) {
+ if (kernel)
+ local_paca->slb_kern_bitmap |= 1U << index;
+ else
+ local_paca->slb_kern_bitmap &= ~(1U << index);
+ }
+ }
+ BUG_ON(index < SLB_NUM_BOLTED);
+
+ return index;
+}
+
+static long slb_insert_entry(unsigned long ea, unsigned long context,
+ unsigned long flags, int ssize, bool kernel)
+{
+ unsigned long vsid;
+ unsigned long vsid_data, esid_data;
+ enum slb_index index;
+
+ vsid = get_vsid(context, ea, ssize);
+ if (!vsid)
+ return -EFAULT;
+
+ /*
+ * There must not be a kernel SLB fault in alloc_slb_index or before
+ * slbmte here or the allocation bitmaps could get out of whack with
+ * the SLB.
+ *
+ * User SLB faults or preloads take this path which might get inlined
+ * into the caller, so add compiler barriers here to ensure unsafe
+ * memory accesses do not come between.
+ */
+ barrier();
+
+ index = alloc_slb_index(kernel);
+
+ vsid_data = __mk_vsid_data(vsid, ssize, flags);
+ esid_data = mk_esid_data(ea, ssize, index);
+
+ /*
+ * No need for an isync before or after this slbmte. The exception
+ * we enter with and the rfid we exit with are context synchronizing.
+ * User preloads should add isync afterwards in case the kernel
+ * accesses user memory before it returns to userspace with rfid.
+ */
+ assert_slb_presence(false, ea);
+ asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
+
+ barrier();
+
+ if (!kernel)
+ slb_cache_update(esid_data);
+
+ return 0;
+}
+
+static long slb_allocate_kernel(unsigned long ea, unsigned long id)
+{
+ unsigned long context;
+ unsigned long flags;
+ int ssize;
+
+ if (id == LINEAR_MAP_REGION_ID) {
+
+		/* We only support up to MAX_PHYSMEM_BITS */
+ if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ } else if (id == VMEMMAP_REGION_ID) {
+
+ if (ea >= H_VMEMMAP_END)
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp;
+#endif
+ } else if (id == VMALLOC_REGION_ID) {
+
+ if (ea >= H_VMALLOC_END)
+ return -EFAULT;
+
+ flags = local_paca->vmalloc_sllp;
+
+ } else if (id == IO_REGION_ID) {
+
+ if (ea >= H_KERN_IO_END)
+ return -EFAULT;
+
+ flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp;
+
+ } else {
+ return -EFAULT;
+ }
+
+ ssize = MMU_SEGSIZE_1T;
+ if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
+ ssize = MMU_SEGSIZE_256M;
+
+ context = get_kernel_context(ea);
+
+ return slb_insert_entry(ea, context, flags, ssize, true);
+}
+
+static long slb_allocate_user(struct mm_struct *mm, unsigned long ea)
+{
+ unsigned long context;
+ unsigned long flags;
+ int bpsize;
+ int ssize;
+
+ /*
+	 * Consider this a bad access if we take an SLB miss
+	 * on an address above the addr limit.
+ */
+ if (ea >= mm_ctx_slb_addr_limit(&mm->context))
+ return -EFAULT;
+
+ context = get_user_context(&mm->context, ea);
+ if (!context)
+ return -EFAULT;
+
+ if (unlikely(ea >= H_PGTABLE_RANGE)) {
+ WARN_ON(1);
+ return -EFAULT;
+ }
+
+ ssize = user_segment_size(ea);
+
+ bpsize = get_slice_psize(mm, ea);
+ flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
+
+ return slb_insert_entry(ea, context, flags, ssize, false);
+}
+
+long do_slb_fault(struct pt_regs *regs, unsigned long ea)
+{
+ unsigned long id = get_region_id(ea);
+
+ /* IRQs are not reconciled here, so can't check irqs_disabled */
+ VM_WARN_ON(mfmsr() & MSR_EE);
+
+ if (unlikely(!(regs->msr & MSR_RI)))
+ return -EINVAL;
+
+ /*
+ * SLB kernel faults must be very careful not to touch anything
+ * that is not bolted. E.g., PACA and global variables are okay,
+ * mm->context stuff is not.
+ *
+ * SLB user faults can access all of kernel memory, but must be
+ * careful not to touch things like IRQ state because it is not
+ * "reconciled" here. The difficulty is that we must use
+ * fast_exception_return to return from kernel SLB faults without
+ * looking at possible non-bolted memory. We could test user vs
+ * kernel faults in the interrupt handler asm and do a full fault,
+ * reconcile, ret_from_except for user faults which would make them
+ * first class kernel code. But for performance it's probably nicer
+ * if they go via fast_exception_return too.
+ */
+ if (id >= LINEAR_MAP_REGION_ID) {
+ long err;
+#ifdef CONFIG_DEBUG_VM
+ /* Catch recursive kernel SLB faults. */
+ BUG_ON(local_paca->in_kernel_slb_handler);
+ local_paca->in_kernel_slb_handler = 1;
+#endif
+ err = slb_allocate_kernel(ea, id);
+#ifdef CONFIG_DEBUG_VM
+ local_paca->in_kernel_slb_handler = 0;
+#endif
+ return err;
+ } else {
+ struct mm_struct *mm = current->mm;
+ long err;
+
+ if (unlikely(!mm))
+ return -EFAULT;
+
+ err = slb_allocate_user(mm, ea);
+ if (!err)
+ preload_add(current_thread_info(), ea);
+
+ return err;
+ }
+}
+
+void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err)
+{
+ if (err == -EFAULT) {
+ if (user_mode(regs))
+ _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
+ else
+ bad_page_fault(regs, ea, SIGSEGV);
+ } else if (err == -EINVAL) {
+ unrecoverable_exception(regs);
+ } else {
+ BUG();
+ }
+}
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
similarity index 87%
rename from arch/powerpc/mm/subpage-prot.c
rename to arch/powerpc/mm/book3s64/subpage_prot.c
index 3327551..2ef24a5 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2007-2008 Paul Mackerras, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/types.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/syscalls.h>
@@ -25,10 +21,13 @@
*/
void subpage_prot_free(struct mm_struct *mm)
{
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
unsigned long i, j, addr;
u32 **p;
+ if (!spt)
+ return;
+
for (i = 0; i < 4; ++i) {
if (spt->low_prot[i]) {
free_page((unsigned long)spt->low_prot[i]);
@@ -48,13 +47,7 @@
free_page((unsigned long)p);
}
spt->maxaddr = 0;
-}
-
-void subpage_prot_init_new_context(struct mm_struct *mm)
-{
- struct subpage_prot_table *spt = &mm->context.spt;
-
- memset(spt, 0, sizeof(*spt));
+ kfree(spt);
}
static void hpte_flush_range(struct mm_struct *mm, unsigned long addr,
@@ -93,13 +86,18 @@
static void subpage_prot_clear(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt;
u32 **spm, *spp;
unsigned long i;
size_t nw;
unsigned long next, limit;
down_write(&mm->mmap_sem);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt)
+ goto err_out;
+
limit = addr + len;
if (limit > spt->maxaddr)
limit = spt->maxaddr;
@@ -127,6 +125,8 @@
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
}
+
+err_out:
up_write(&mm->mmap_sem);
}
@@ -139,14 +139,14 @@
return 0;
}
+static const struct mm_walk_ops subpage_walk_ops = {
+ .pmd_entry = subpage_walk_pmd_entry,
+};
+
static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
unsigned long len)
{
struct vm_area_struct *vma;
- struct mm_walk subpage_proto_walk = {
- .mm = mm,
- .pmd_entry = subpage_walk_pmd_entry,
- };
/*
* We don't try too hard, we just mark all the vma in that range
@@ -163,7 +163,7 @@
if (vma->vm_start >= (addr + len))
break;
vma->vm_flags |= VM_NOHUGEPAGE;
- walk_page_vma(vma, &subpage_proto_walk);
+ walk_page_vma(vma, &subpage_walk_ops, NULL);
vma = vma->vm_next;
}
}
@@ -189,7 +189,7 @@
unsigned long, len, u32 __user *, map)
{
struct mm_struct *mm = current->mm;
- struct subpage_prot_table *spt = &mm->context.spt;
+ struct subpage_prot_table *spt;
u32 **spm, *spp;
unsigned long i;
size_t nw;
@@ -214,10 +214,25 @@
return 0;
}
- if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32)))
+ if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
return -EFAULT;
down_write(&mm->mmap_sem);
+
+ spt = mm_ctx_subpage_prot(&mm->context);
+ if (!spt) {
+ /*
+ * Allocate subpage prot table if not already done.
+ * Do this with mmap_sem held
+		 * Do this with mmap_sem held.
+ spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
+ if (!spt) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mm->context.hash_context->spt = spt;
+ }
+
subpage_mark_vma_nohuge(mm, addr, len);
for (limit = addr + len; addr < limit; addr = next) {
next = pmd_addr_end(addr, limit);
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index c8da352..beb060b 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* CoProcessor (SPU/AFU) mm fault handler
*
@@ -5,20 +6,6 @@
*
* Author: Arnd Bergmann <arndb@de.ibm.com>
* Author: Jeremy Kerr <jk@ozlabs.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/sched.h>
#include <linux/mm.h>
@@ -105,7 +92,7 @@
u64 vsid, vsidkey;
int psize, ssize;
- switch (REGION_ID(ea)) {
+ switch (get_region_id(ea)) {
case USER_REGION_ID:
pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
if (mm == NULL)
@@ -117,16 +104,20 @@
break;
case VMALLOC_REGION_ID:
pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
- if (ea < VMALLOC_END)
- psize = mmu_vmalloc_psize;
- else
- psize = mmu_io_psize;
+ psize = mmu_vmalloc_psize;
ssize = mmu_kernel_ssize;
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
vsidkey = SLB_VSID_KERNEL;
break;
- case KERNEL_REGION_ID:
- pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea);
+ case IO_REGION_ID:
+ pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea);
+ psize = mmu_io_psize;
+ ssize = mmu_kernel_ssize;
+ vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+ vsidkey = SLB_VSID_KERNEL;
+ break;
+ case LINEAR_MAP_REGION_ID:
+ pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea);
psize = mmu_linear_psize;
ssize = mmu_kernel_ssize;
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 3825284..2a82984 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -1,320 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* PowerPC version derived from arch/arm/mm/consistent.c
* Copyright (C) 2001 Dan Malek (dmalek@jlc.net)
*
* Copyright (C) 2000 Russell King
- *
- * Consistent memory allocators. Used for DMA devices that want to
- * share uncached memory with the processor core. The function return
- * is the virtual address and 'dma_handle' is the physical address.
- * Mostly stolen from the ARM port, with some changes for PowerPC.
- * -- Dan
- *
- * Reorganized to get rid of the arch-specific consistent_* functions
- * and provide non-coherent implementations for the DMA API. -Matt
- *
- * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent()
- * implementation. This is pulled straight from ARM and barely
- * modified. -Matt
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-#include <linux/sched.h>
-#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/types.h>
#include <linux/highmem.h>
-#include <linux/dma-mapping.h>
-#include <linux/export.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-noncoherent.h>
#include <asm/tlbflush.h>
#include <asm/dma.h>
-#include "mmu_decl.h"
-
-/*
- * This address range defaults to a value that is safe for all
- * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It
- * can be further configured for specific applications under
- * the "Advanced Setup" menu. -Matt
- */
-#define CONSISTENT_BASE (IOREMAP_TOP)
-#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE)
-#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT)
-
-/*
- * This is the page table (2MB) covering uncached, DMA consistent allocations
- */
-static DEFINE_SPINLOCK(consistent_lock);
-
-/*
- * VM region handling support.
- *
- * This should become something generic, handling VM region allocations for
- * vmalloc and similar (ioremap, module space, etc).
- *
- * I envisage vmalloc()'s supporting vm_struct becoming:
- *
- * struct vm_struct {
- * struct vm_region region;
- * unsigned long flags;
- * struct page **pages;
- * unsigned int nr_pages;
- * unsigned long phys_addr;
- * };
- *
- * get_vm_area() would then call vm_region_alloc with an appropriate
- * struct vm_region head (eg):
- *
- * struct vm_region vmalloc_head = {
- * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list),
- * .vm_start = VMALLOC_START,
- * .vm_end = VMALLOC_END,
- * };
- *
- * However, vmalloc_head.vm_start is variable (typically, it is dependent on
- * the amount of RAM found at boot time.) I would imagine that get_vm_area()
- * would have to initialise this each time prior to calling vm_region_alloc().
- */
-struct ppc_vm_region {
- struct list_head vm_list;
- unsigned long vm_start;
- unsigned long vm_end;
-};
-
-static struct ppc_vm_region consistent_head = {
- .vm_list = LIST_HEAD_INIT(consistent_head.vm_list),
- .vm_start = CONSISTENT_BASE,
- .vm_end = CONSISTENT_END,
-};
-
-static struct ppc_vm_region *
-ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp)
-{
- unsigned long addr = head->vm_start, end = head->vm_end - size;
- unsigned long flags;
- struct ppc_vm_region *c, *new;
-
- new = kmalloc(sizeof(struct ppc_vm_region), gfp);
- if (!new)
- goto out;
-
- spin_lock_irqsave(&consistent_lock, flags);
-
- list_for_each_entry(c, &head->vm_list, vm_list) {
- if ((addr + size) < addr)
- goto nospc;
- if ((addr + size) <= c->vm_start)
- goto found;
- addr = c->vm_end;
- if (addr > end)
- goto nospc;
- }
-
- found:
- /*
- * Insert this entry _before_ the one we found.
- */
- list_add_tail(&new->vm_list, &c->vm_list);
- new->vm_start = addr;
- new->vm_end = addr + size;
-
- spin_unlock_irqrestore(&consistent_lock, flags);
- return new;
-
- nospc:
- spin_unlock_irqrestore(&consistent_lock, flags);
- kfree(new);
- out:
- return NULL;
-}
-
-static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr)
-{
- struct ppc_vm_region *c;
-
- list_for_each_entry(c, &head->vm_list, vm_list) {
- if (c->vm_start == addr)
- goto out;
- }
- c = NULL;
- out:
- return c;
-}
-
-/*
- * Allocate DMA-coherent memory space and return both the kernel remapped
- * virtual and bus address for that space.
- */
-void *
-__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp)
-{
- struct page *page;
- struct ppc_vm_region *c;
- unsigned long order;
- u64 mask = ISA_DMA_THRESHOLD, limit;
-
- if (dev) {
- mask = dev->coherent_dma_mask;
-
- /*
- * Sanity check the DMA mask - it must be non-zero, and
- * must be able to be satisfied by a DMA allocation.
- */
- if (mask == 0) {
- dev_warn(dev, "coherent DMA mask is unset\n");
- goto no_page;
- }
-
- if ((~mask) & ISA_DMA_THRESHOLD) {
- dev_warn(dev, "coherent DMA mask %#llx is smaller "
- "than system GFP_DMA mask %#llx\n",
- mask, (unsigned long long)ISA_DMA_THRESHOLD);
- goto no_page;
- }
- }
-
-
- size = PAGE_ALIGN(size);
- limit = (mask + 1) & ~mask;
- if ((limit && size >= limit) ||
- size >= (CONSISTENT_END - CONSISTENT_BASE)) {
- printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n",
- size, mask);
- return NULL;
- }
-
- order = get_order(size);
-
- /* Might be useful if we ever have a real legacy DMA zone... */
- if (mask != 0xffffffff)
- gfp |= GFP_DMA;
-
- page = alloc_pages(gfp, order);
- if (!page)
- goto no_page;
-
- /*
- * Invalidate any data that might be lurking in the
- * kernel direct-mapped region for device DMA.
- */
- {
- unsigned long kaddr = (unsigned long)page_address(page);
- memset(page_address(page), 0, size);
- flush_dcache_range(kaddr, kaddr + size);
- }
-
- /*
- * Allocate a virtual address in the consistent mapping region.
- */
- c = ppc_vm_region_alloc(&consistent_head, size,
- gfp & ~(__GFP_DMA | __GFP_HIGHMEM));
- if (c) {
- unsigned long vaddr = c->vm_start;
- struct page *end = page + (1 << order);
-
- split_page(page, order);
-
- /*
- * Set the "dma handle"
- */
- *handle = page_to_phys(page);
-
- do {
- SetPageReserved(page);
- map_kernel_page(vaddr, page_to_phys(page),
- pgprot_val(pgprot_noncached(PAGE_KERNEL)));
- page++;
- vaddr += PAGE_SIZE;
- } while (size -= PAGE_SIZE);
-
- /*
- * Free the otherwise unused pages.
- */
- while (page < end) {
- __free_page(page);
- page++;
- }
-
- return (void *)c->vm_start;
- }
-
- if (page)
- __free_pages(page, order);
- no_page:
- return NULL;
-}
-EXPORT_SYMBOL(__dma_alloc_coherent);
-
-/*
- * free a page as defined by the above mapping.
- */
-void __dma_free_coherent(size_t size, void *vaddr)
-{
- struct ppc_vm_region *c;
- unsigned long flags, addr;
-
- size = PAGE_ALIGN(size);
-
- spin_lock_irqsave(&consistent_lock, flags);
-
- c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr);
- if (!c)
- goto no_area;
-
- if ((c->vm_end - c->vm_start) != size) {
- printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n",
- __func__, c->vm_end - c->vm_start, size);
- dump_stack();
- size = c->vm_end - c->vm_start;
- }
-
- addr = c->vm_start;
- do {
- pte_t *ptep;
- unsigned long pfn;
-
- ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr),
- addr),
- addr),
- addr);
- if (!pte_none(*ptep) && pte_present(*ptep)) {
- pfn = pte_pfn(*ptep);
- pte_clear(&init_mm, addr, ptep);
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
- __free_reserved_page(page);
- }
- }
- addr += PAGE_SIZE;
- } while (size -= PAGE_SIZE);
-
- flush_tlb_kernel_range(c->vm_start, c->vm_end);
-
- list_del(&c->vm_list);
-
- spin_unlock_irqrestore(&consistent_lock, flags);
-
- kfree(c);
- return;
-
- no_area:
- spin_unlock_irqrestore(&consistent_lock, flags);
- printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n",
- __func__, vaddr);
- dump_stack();
-}
-EXPORT_SYMBOL(__dma_free_coherent);
-
/*
* make an area consistent.
*/
-void __dma_sync(void *vaddr, size_t size, int direction)
+static void __dma_sync(void *vaddr, size_t size, int direction)
{
unsigned long start = (unsigned long)vaddr;
unsigned long end = start + size;
@@ -340,7 +45,6 @@
break;
}
}
-EXPORT_SYMBOL(__dma_sync);
#ifdef CONFIG_HIGHMEM
/*
@@ -387,34 +91,34 @@
* __dma_sync_page makes memory consistent. identical to __dma_sync, but
* takes a struct page instead of a virtual address
*/
-void __dma_sync_page(struct page *page, unsigned long offset,
- size_t size, int direction)
+static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir)
{
+ struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
+ unsigned offset = paddr & ~PAGE_MASK;
+
#ifdef CONFIG_HIGHMEM
- __dma_sync_page_highmem(page, offset, size, direction);
+ __dma_sync_page_highmem(page, offset, size, dir);
#else
unsigned long start = (unsigned long)page_address(page) + offset;
- __dma_sync((void *)start, size, direction);
+ __dma_sync((void *)start, size, dir);
#endif
}
-EXPORT_SYMBOL(__dma_sync_page);
-/*
- * Return the PFN for a given cpu virtual address returned by
- * __dma_alloc_coherent. This is used by dma_mmap_coherent()
- */
-unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr)
+void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
{
- /* This should always be populated, so we don't test every
- * level. If that fails, we'll have a nice crash which
- * will be as good as a BUG_ON()
- */
- pgd_t *pgd = pgd_offset_k(cpu_addr);
- pud_t *pud = pud_offset(pgd, cpu_addr);
- pmd_t *pmd = pmd_offset(pud, cpu_addr);
- pte_t *ptep = pte_offset_kernel(pmd, cpu_addr);
+ __dma_sync_page(paddr, size, dir);
+}
- if (pte_none(*ptep) || !pte_present(*ptep))
- return 0;
- return pte_pfn(*ptep);
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir)
+{
+ __dma_sync_page(paddr, size, dir);
+}
+
+void arch_dma_prep_coherent(struct page *page, size_t size)
+{
+ unsigned long kaddr = (unsigned long)page_address(page);
+
+ flush_dcache_range(kaddr, kaddr + size);
}
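
The reworked __dma_sync_page() above now takes a physical address and derives the struct page and in-page offset itself. The underlying arithmetic is two mask operations; as a quick standalone illustration (user-space C with an assumed 4 KiB page size, not the kernel's actual definitions):

/* Sketch of the paddr -> (pfn, offset) split used by the new __dma_sync_page(). */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
        uint64_t paddr  = 0x1234567;            /* arbitrary physical address */
        uint64_t pfn    = paddr >> PAGE_SHIFT;  /* page frame number, as in pfn_to_page() */
        uint64_t offset = paddr & ~PAGE_MASK;   /* byte offset within that page */

        printf("paddr=%#llx -> pfn=%#llx offset=%#llx\n",
               (unsigned long long)paddr,
               (unsigned long long)pfn,
               (unsigned long long)offset);
        return 0;
}
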
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..59327ce 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Dynamic reconfiguration memory support
*
* Copyright 2017 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "drmem: " fmt
@@ -366,8 +362,10 @@
if (!drmem_info->lmbs)
return;
- for_each_drmem_lmb(lmb)
+ for_each_drmem_lmb(lmb) {
read_drconf_v1_cell(lmb, &prop);
+ lmb_set_nid(lmb);
+ }
}
static void __init init_drmem_v2_lmbs(const __be32 *prop)
@@ -412,6 +410,8 @@
lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+
+ lmb_set_nid(lmb);
}
}
}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d51cf5f..8432c28 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -8,11 +9,6 @@
* Modified by Cort Dougan and Paul Mackerras.
*
* Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/signal.h>
@@ -44,26 +40,7 @@
#include <asm/mmu_context.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
-
-static inline bool notify_page_fault(struct pt_regs *regs)
-{
- bool ret = false;
-
-#ifdef CONFIG_KPROBES
- /* kprobe_running() needs smp_processor_id() */
- if (!user_mode(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 11))
- ret = true;
- preempt_enable();
- }
-#endif /* CONFIG_KPROBES */
-
- if (unlikely(debugger_fault_handler(regs)))
- ret = true;
-
- return ret;
-}
+#include <asm/kup.h>
/*
* Check whether the instruction inst is a store using
@@ -103,8 +80,7 @@
*/
static int
-__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code,
- int pkey)
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
{
/*
* If we are in kernel mode, bail out with a SEGV, this will
@@ -114,18 +90,17 @@
if (!user_mode(regs))
return SIGSEGV;
- _exception_pkey(SIGSEGV, regs, si_code, address, pkey);
+ _exception(SIGSEGV, regs, si_code, address);
return 0;
}
static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
{
- return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0);
+ return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
}
-static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
- int pkey)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
{
struct mm_struct *mm = current->mm;
@@ -135,54 +110,60 @@
*/
up_read(&mm->mmap_sem);
- return __bad_area_nosemaphore(regs, address, si_code, pkey);
+ return __bad_area_nosemaphore(regs, address, si_code);
}
static noinline int bad_area(struct pt_regs *regs, unsigned long address)
{
- return __bad_area(regs, address, SEGV_MAPERR, 0);
+ return __bad_area(regs, address, SEGV_MAPERR);
}
static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
int pkey)
{
- return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey);
+ /*
+ * If we are in kernel mode, bail out with a SEGV, this will
+ * be caught by the assembly which will restore the non-volatile
+ * registers before calling bad_page_fault()
+ */
+ if (!user_mode(regs))
+ return SIGSEGV;
+
+ _exception_pkey(regs, address, pkey);
+
+ return 0;
}
static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
- return __bad_area(regs, address, SEGV_ACCERR, 0);
+ return __bad_area(regs, address, SEGV_ACCERR);
}
static int do_sigbus(struct pt_regs *regs, unsigned long address,
vm_fault_t fault)
{
- siginfo_t info;
- unsigned int lsb = 0;
-
if (!user_mode(regs))
return SIGBUS;
current->thread.trap_nr = BUS_ADRERR;
- clear_siginfo(&info);
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ unsigned int lsb = 0; /* shut up gcc */
+
pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
current->comm, current->pid, address);
- info.si_code = BUS_MCEERR_AR;
+
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
+ return 0;
}
- if (fault & VM_FAULT_HWPOISON_LARGE)
- lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
- if (fault & VM_FAULT_HWPOISON)
- lsb = PAGE_SHIFT;
#endif
- info.si_addr_lsb = lsb;
- force_sig_info(SIGBUS, &info, current);
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
return 0;
}
@@ -218,17 +199,46 @@
}
/* Is this a bad kernel fault ? */
-static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
- unsigned long address)
+static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, bool is_write)
{
- if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT))) {
- printk_ratelimited(KERN_CRIT "kernel tried to execute"
- " exec-protected page (%lx) -"
- "exploit attempt? (uid: %d)\n",
- address, from_kuid(&init_user_ns,
- current_uid()));
+ int is_exec = TRAP(regs) == 0x400;
+
+ /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */
+ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT |
+ DSISR_PROTFAULT))) {
+ pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n",
+ address >= TASK_SIZE ? "exec-protected" : "user",
+ address,
+ from_kuid(&init_user_ns, current_uid()));
+
+ // Kernel exec fault is always bad
+ return true;
}
- return is_exec || (address >= TASK_SIZE);
+
+ if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) &&
+ !search_exception_tables(regs->nip)) {
+ pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
+ address,
+ from_kuid(&init_user_ns, current_uid()));
+ }
+
+ // Kernel fault on kernel address is bad
+ if (address >= TASK_SIZE)
+ return true;
+
+ // Fault on user outside of certain regions (e.g. copy_tofrom_user()) is bad
+ if (!search_exception_tables(regs->nip))
+ return true;
+
+ // A read/write fault in a valid region (the exception table search above
+ // passed) that is blocked by KUAP is bad: it can never succeed.
+ if (bad_kuap_fault(regs, is_write))
+ return true;
+
+ // What's left? Kernel fault on user in well defined regions (extable
+ // matched), and allowed by KUAP in the faulting context.
+ return false;
}
static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
@@ -267,7 +277,7 @@
return false;
if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
- access_ok(VERIFY_READ, nip, sizeof(*nip))) {
+ access_ok(nip, sizeof(*nip))) {
unsigned int inst;
int res;
@@ -336,10 +346,21 @@
static inline void cmo_account_page_fault(void) { }
#endif /* CONFIG_PPC_SMLPAR */
-#ifdef CONFIG_PPC_STD_MMU
-static void sanity_check_fault(bool is_write, unsigned long error_code)
+#ifdef CONFIG_PPC_BOOK3S
+static void sanity_check_fault(bool is_write, bool is_user,
+ unsigned long error_code, unsigned long address)
{
/*
+ * Userspace trying to access kernel address, we get PROTFAULT for that.
+ */
+ if (is_user && address >= TASK_SIZE) {
+ pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n",
+ current->comm, current->pid, address,
+ from_kuid(&init_user_ns, current_uid()));
+ return;
+ }
+
+ /*
* For hash translation mode, we should never get a
* PROTFAULT. Any update to pte to reduce access will result in us
* removing the hash page table entry, thus resulting in a DSISR_NOHPTE
@@ -368,12 +389,15 @@
* For radix, we can get prot fault for autonuma case, because radix
* page table will have them marked noaccess for user.
*/
- if (!radix_enabled() && !is_write)
- WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
+ if (radix_enabled() || is_write)
+ return;
+
+ WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
}
#else
-static void sanity_check_fault(bool is_write, unsigned long error_code) { }
-#endif /* CONFIG_PPC_STD_MMU */
+static void sanity_check_fault(bool is_write, bool is_user,
+ unsigned long error_code, unsigned long address) { }
+#endif /* CONFIG_PPC_BOOK3S */
/*
* Define the correct "is_write" bit in error_code based
@@ -417,8 +441,9 @@
int is_write = page_fault_is_write(error_code);
vm_fault_t fault, major = 0;
bool must_retry = false;
+ bool kprobe_fault = kprobe_page_fault(regs, 11);
- if (notify_page_fault(regs))
+ if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
return 0;
if (unlikely(page_fault_is_bad(error_code))) {
@@ -430,13 +455,14 @@
}
/* Additional sanity check(s) */
- sanity_check_fault(is_write, error_code);
+ sanity_check_fault(is_write, is_user, error_code, address);
/*
* The kernel should never take an execute fault nor should it
- * take a page fault to a kernel address.
+ * take a page fault to a kernel address or a page fault to a user
+ * address outside of dedicated places
*/
- if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address)))
+ if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write)))
return SIGSEGV;
/*
@@ -631,21 +657,23 @@
switch (TRAP(regs)) {
case 0x300:
case 0x380:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "data at address 0x%08lx\n", regs->dar);
+ case 0xe00:
+ pr_alert("BUG: %s at 0x%08lx\n",
+ regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" :
+ "Unable to handle kernel data access", regs->dar);
break;
case 0x400:
case 0x480:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "instruction fetch\n");
+ pr_alert("BUG: Unable to handle kernel instruction fetch%s",
+ regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n");
break;
case 0x600:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "unaligned access at address 0x%08lx\n", regs->dar);
+ pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n",
+ regs->dar);
break;
default:
- printk(KERN_ALERT "Unable to handle kernel paging request for "
- "unknown fault\n");
+ pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n",
+ regs->dar);
break;
}
printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index 82a0e37..320c167 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -43,9 +43,7 @@
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
- BUG_ON(!pte_none(*(kmap_pte-idx)));
-#endif
+ WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx)));
__set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
local_flush_tlb_page(NULL, vaddr);
@@ -56,7 +54,6 @@
void __kunmap_atomic(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- int type __maybe_unused;
if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
pagefault_enable();
@@ -64,14 +61,12 @@
return;
}
- type = kmap_atomic_idx();
-
-#ifdef CONFIG_DEBUG_HIGHMEM
- {
+ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) {
+ int type = kmap_atomic_idx();
unsigned int idx;
idx = type + KM_TYPE_NR * smp_processor_id();
- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+ WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
/*
* force other mappings to Oops if they'll try to access
@@ -80,7 +75,6 @@
pte_clear(&init_mm, vaddr, kmap_pte-idx);
local_flush_tlb_page(NULL, vaddr);
}
-#endif
kmap_atomic_idx_pop();
pagefault_enable();
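
The highmem.c hunks convert #ifdef CONFIG_DEBUG_HIGHMEM blocks into if (IS_ENABLED(...)) branches, so the debug code is always parsed and type-checked but eliminated as dead code when the option is off. A simplified user-space sketch of the same pattern (a plain 0/1 macro stands in for the kernel's Kconfig-driven IS_ENABLED()):

#include <stdio.h>

#define DEBUG_HIGHMEM 0         /* flip to 1 to enable the extra checks */

static void kunmap_sketch(unsigned long vaddr, unsigned long expected)
{
        if (DEBUG_HIGHMEM) {
                /* always compiled, discarded as dead code when DEBUG_HIGHMEM is 0 */
                if (vaddr != expected)
                        fprintf(stderr, "unexpected vaddr %#lx\n", vaddr);
        }
        /* the unconditional unmap work would go here */
}

int main(void)
{
        kunmap_sketch(0x1000, 0x1000);
        printf("done\n");
        return 0;
}
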
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7296a42..73d4873 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -15,7 +15,6 @@
#include <linux/export.h>
#include <linux/of_fdt.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <linux/moduleparam.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -27,22 +26,12 @@
#include <asm/hugetlb.h>
#include <asm/pte-walk.h>
-
-#ifdef CONFIG_HUGETLB_PAGE
-
-#define PAGE_SHIFT_64K 16
-#define PAGE_SHIFT_512K 19
-#define PAGE_SHIFT_8M 23
-#define PAGE_SHIFT_16M 24
-#define PAGE_SHIFT_16G 34
-
bool hugetlb_disabled = false;
-unsigned int HPAGE_SHIFT;
-EXPORT_SYMBOL(HPAGE_SHIFT);
-
#define hugepd_none(hpd) (hpd_val(hpd) == 0)
+#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))
+
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
/*
@@ -62,19 +51,27 @@
int num_hugepd;
if (pshift >= pdshift) {
- cachep = hugepte_cache;
+ cachep = PGT_CACHE(PTE_T_ORDER);
num_hugepd = 1 << (pshift - pdshift);
+ } else if (IS_ENABLED(CONFIG_PPC_8xx)) {
+ cachep = PGT_CACHE(PTE_INDEX_SIZE);
+ num_hugepd = 1;
} else {
cachep = PGT_CACHE(pdshift - pshift);
num_hugepd = 1;
}
- new = kmem_cache_zalloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
+ if (!cachep) {
+ WARN_ONCE(1, "No page table cache created for hugetlb tables");
+ return -ENOMEM;
+ }
+
+ new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
BUG_ON(pshift > HUGEPD_SHIFT_MASK);
BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
- if (! new)
+ if (!new)
return -ENOMEM;
/*
@@ -94,19 +91,7 @@
for (i = 0; i < num_hugepd; i++, hpdp++) {
if (unlikely(!hugepd_none(*hpdp)))
break;
- else {
-#ifdef CONFIG_PPC_BOOK3S_64
- *hpdp = __hugepd(__pa(new) |
- (shift_to_mmu_psize(pshift) << 2));
-#elif defined(CONFIG_PPC_8xx)
- *hpdp = __hugepd(__pa(new) | _PMD_USER |
- (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M :
- _PMD_PAGE_512K) | _PMD_PRESENT);
-#else
- /* We use the old format for PPC_FSL_BOOK3E */
- *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift);
-#endif
- }
+ hugepd_populate(hpdp, new, pshift);
}
/* If we bailed from the for loop early, an error occurred, clean up */
if (i < num_hugepd) {
@@ -150,6 +135,8 @@
} else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
+ if (!pu)
+ return NULL;
if (pshift == PUD_SHIFT)
return (pte_t *)pu;
else if (pshift > PMD_SHIFT) {
@@ -158,6 +145,8 @@
} else {
pdshift = PMD_SHIFT;
pm = pmd_alloc(mm, pu, addr);
+ if (!pm)
+ return NULL;
if (pshift == PMD_SHIFT)
/* 16MB hugepage */
return (pte_t *)pm;
@@ -174,12 +163,16 @@
} else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
+ if (!pu)
+ return NULL;
if (pshift >= PUD_SHIFT) {
ptl = pud_lockptr(mm, pu);
hpdp = (hugepd_t *)pu;
} else {
pdshift = PMD_SHIFT;
pm = pmd_alloc(mm, pu, addr);
+ if (!pm)
+ return NULL;
ptl = pmd_lockptr(mm, pm);
hpdp = (hugepd_t *)pm;
}
@@ -246,7 +239,7 @@
return __alloc_bootmem_huge_page(h);
}
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
+#ifndef CONFIG_PPC_BOOK3S_64
#define HUGEPD_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
@@ -265,7 +258,7 @@
unsigned int i;
for (i = 0; i < batch->index; i++)
- kmem_cache_free(hugepte_cache, batch->ptes[i]);
+ kmem_cache_free(PGT_CACHE(PTE_T_ORDER), batch->ptes[i]);
free_page((unsigned long)batch);
}
@@ -278,7 +271,7 @@
if (atomic_read(&tlb->mm->mm_users) < 2 ||
mm_is_thread_local(tlb->mm)) {
- kmem_cache_free(hugepte_cache, hugepte);
+ kmem_cache_free(PGT_CACHE(PTE_T_ORDER), hugepte);
put_cpu_var(hugepd_freelist_cur);
return;
}
@@ -290,7 +283,7 @@
(*batchp)->ptes[(*batchp)->index++] = hugepte;
if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
- call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+ call_rcu(&(*batchp)->rcu, hugepd_free_rcu_callback);
*batchp = NULL;
}
put_cpu_var(hugepd_freelist_cur);
@@ -330,6 +323,9 @@
if (shift >= pdshift)
hugepd_free(tlb, hugepte);
+ else if (IS_ENABLED(CONFIG_PPC_8xx))
+ pgtable_free_tlb(tlb, hugepte,
+ get_hugepd_cache_index(PTE_INDEX_SIZE));
else
pgtable_free_tlb(tlb, hugepte,
get_hugepd_cache_index(pdshift - shift));
@@ -528,30 +524,6 @@
return page;
}
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
- unsigned long sz)
-{
- unsigned long __boundary = (addr + sz) & ~(sz-1);
- return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- pte_t *ptep;
- unsigned long sz = 1UL << hugepd_shift(hugepd);
- unsigned long next;
-
- ptep = hugepte_offset(hugepd, addr, pdshift);
- do {
- next = hugepte_addr_end(addr, end, sz);
- if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
- return 0;
- } while (ptep++, addr = next, addr != end);
-
- return 1;
-}
-
#ifdef CONFIG_PPC_MM_SLICES
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
@@ -571,24 +543,15 @@
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
-#ifdef CONFIG_PPC_MM_SLICES
/* With radix we don't use slice, so derive it from vma*/
- if (!radix_enabled()) {
+ if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) {
unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
return 1UL << mmu_psize_to_shift(psize);
}
-#endif
return vma_kernel_pagesize(vma);
}
-static inline bool is_power_of_4(unsigned long x)
-{
- if (is_power_of_2(x))
- return (__ilog2(x) % 2) ? false : true;
- return false;
-}
-
static int __init add_huge_page_size(unsigned long long size)
{
int shift = __ffs(size);
@@ -596,36 +559,12 @@
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable and slice limits. */
- if (size <= PAGE_SIZE)
- return -EINVAL;
-#if defined(CONFIG_PPC_FSL_BOOK3E)
- if (!is_power_of_4(size))
- return -EINVAL;
-#elif !defined(CONFIG_PPC_8xx)
- if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT))
- return -EINVAL;
-#endif
-
- if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+ if (size <= PAGE_SIZE || !is_power_of_2(size))
return -EINVAL;
-#ifdef CONFIG_PPC_BOOK3S_64
- /*
- * We need to make sure that for different page sizes reported by
- * firmware we only add hugetlb support for page sizes that can be
- * supported by linux page table layout.
- * For now we have
- * Radix: 2M and 1G
- * Hash: 16M and 16G
- */
- if (radix_enabled()) {
- if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G)
- return -EINVAL;
- } else {
- if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
- return -EINVAL;
- }
-#endif
+ mmu_psize = check_and_get_huge_psize(shift);
+ if (mmu_psize < 0)
+ return -EINVAL;
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
@@ -653,9 +592,9 @@
}
__setup("hugepagesz=", hugepage_setup_sz);
-struct kmem_cache *hugepte_cache;
static int __init hugetlbpage_init(void)
{
+ bool configured = false;
int psize;
if (hugetlb_disabled) {
@@ -663,10 +602,10 @@
return 0;
}
-#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
- if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() &&
+ !mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;
-#endif
+
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
unsigned shift;
unsigned pdshift;
@@ -700,44 +639,22 @@
* if we have pdshift and shift value same, we don't
* use pgt cache for hugepd.
*/
- if (pdshift > shift)
- pgtable_cache_add(pdshift - shift, NULL);
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
- else if (!hugepte_cache) {
- /*
- * Create a kmem cache for hugeptes. The bottom bits in
- * the pte have size information encoded in them, so
- * align them to allow this
- */
- hugepte_cache = kmem_cache_create("hugepte-cache",
- sizeof(pte_t),
- HUGEPD_SHIFT_MASK + 1,
- 0, NULL);
- if (hugepte_cache == NULL)
- panic("%s: Unable to create kmem cache "
- "for hugeptes\n", __func__);
+ if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx))
+ pgtable_cache_add(PTE_INDEX_SIZE);
+ else if (pdshift > shift)
+ pgtable_cache_add(pdshift - shift);
+ else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
+ pgtable_cache_add(PTE_T_ORDER);
- }
-#endif
+ configured = true;
}
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
- /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */
- if (mmu_psize_defs[MMU_PAGE_4M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
- else if (mmu_psize_defs[MMU_PAGE_512K].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift;
-#else
- /* Set default large page size. Currently, we pick 16M or 1M
- * depending on what is available
- */
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
- else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
- else if (mmu_psize_defs[MMU_PAGE_2M].shift)
- HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
-#endif
+ if (configured) {
+ if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
+ hugetlbpage_init_default();
+ } else
+ pr_info("Failed to initialize. Disabling HugeTLB");
+
return 0;
}
@@ -750,7 +667,7 @@
BUG_ON(!PageCompound(page));
- for (i = 0; i < (1UL << compound_order(page)); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (!PageHighMem(page)) {
__flush_dcache_icache(page_address(page+i));
} else {
@@ -760,152 +677,3 @@
}
}
}
-
-#endif /* CONFIG_HUGETLB_PAGE */
-
-/*
- * We have 4 cases for pgds and pmds:
- * (1) invalid (all zeroes)
- * (2) pointer to next table, as normal; bottom 6 bits == 0
- * (3) leaf pte for huge page _PAGE_PTE set
- * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
- *
- * So long as we atomically load page table pointers we are safe against teardown,
- * we can follow the address down to the the page and take a ref on it.
- * This function need to be called with interrupts disabled. We use this variant
- * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED
- */
-pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
- bool *is_thp, unsigned *hpage_shift)
-{
- pgd_t pgd, *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t *ret_pte;
- hugepd_t *hpdp = NULL;
- unsigned pdshift = PGDIR_SHIFT;
-
- if (hpage_shift)
- *hpage_shift = 0;
-
- if (is_thp)
- *is_thp = false;
-
- pgdp = pgdir + pgd_index(ea);
- pgd = READ_ONCE(*pgdp);
- /*
- * Always operate on the local stack value. This make sure the
- * value don't get updated by a parallel THP split/collapse,
- * page fault or a page unmap. The return pte_t * is still not
- * stable. So should be checked there for above conditions.
- */
- if (pgd_none(pgd))
- return NULL;
- else if (pgd_huge(pgd)) {
- ret_pte = (pte_t *) pgdp;
- goto out;
- } else if (is_hugepd(__hugepd(pgd_val(pgd))))
- hpdp = (hugepd_t *)&pgd;
- else {
- /*
- * Even if we end up with an unmap, the pgtable will not
- * be freed, because we do an rcu free and here we are
- * irq disabled
- */
- pdshift = PUD_SHIFT;
- pudp = pud_offset(&pgd, ea);
- pud = READ_ONCE(*pudp);
-
- if (pud_none(pud))
- return NULL;
- else if (pud_huge(pud)) {
- ret_pte = (pte_t *) pudp;
- goto out;
- } else if (is_hugepd(__hugepd(pud_val(pud))))
- hpdp = (hugepd_t *)&pud;
- else {
- pdshift = PMD_SHIFT;
- pmdp = pmd_offset(&pud, ea);
- pmd = READ_ONCE(*pmdp);
- /*
- * A hugepage collapse is captured by pmd_none, because
- * it mark the pmd none and do a hpte invalidate.
- */
- if (pmd_none(pmd))
- return NULL;
-
- if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
- if (is_thp)
- *is_thp = true;
- ret_pte = (pte_t *) pmdp;
- goto out;
- }
-
- if (pmd_huge(pmd)) {
- ret_pte = (pte_t *) pmdp;
- goto out;
- } else if (is_hugepd(__hugepd(pmd_val(pmd))))
- hpdp = (hugepd_t *)&pmd;
- else
- return pte_offset_kernel(&pmd, ea);
- }
- }
- if (!hpdp)
- return NULL;
-
- ret_pte = hugepte_offset(*hpdp, ea, pdshift);
- pdshift = hugepd_shift(*hpdp);
-out:
- if (hpage_shift)
- *hpage_shift = pdshift;
- return ret_pte;
-}
-EXPORT_SYMBOL_GPL(__find_linux_pte);
-
-int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- unsigned long pte_end;
- struct page *head, *page;
- pte_t pte;
- int refs;
-
- pte_end = (addr + sz) & ~(sz-1);
- if (pte_end < end)
- end = pte_end;
-
- pte = READ_ONCE(*ptep);
-
- if (!pte_access_permitted(pte, write))
- return 0;
-
- /* hugepages are never "special" */
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- refs = 0;
- head = pte_page(pte);
-
- page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
-
- if (!page_cache_add_speculative(head, refs)) {
- *nr -= refs;
- return 0;
- }
-
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
- return 0;
- }
-
- return 1;
-}
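
The new PTE_T_ORDER macro in hugetlbpage.c expresses the size of one pte_t as a power-of-two number of pointer-sized slots, so the generic PGT_CACHE() index can replace the old dedicated hugepte_cache. A standalone sketch of that computation (GCC/Clang builtin; the pte_t typedef is an assumption, and the printed order depends on the host's pointer size):

#include <stdio.h>

typedef unsigned long long pte_t;       /* assumed 8-byte PTE */

#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))

int main(void)
{
        printf("sizeof(pte_t)=%zu sizeof(void *)=%zu PTE_T_ORDER=%d\n",
               sizeof(pte_t), sizeof(void *), PTE_T_ORDER);
        return 0;
}
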
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 2b656e6..a84da92 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -11,12 +12,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#undef DEBUG
@@ -24,23 +19,67 @@
#include <linux/string.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
+#include <asm/kup.h>
-static void pgd_ctor(void *addr)
+static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
+static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
+
+static int __init parse_nosmep(char *p)
{
- memset(addr, 0, PGD_TABLE_SIZE);
+ disable_kuep = true;
+ pr_warn("Disabling Kernel Userspace Execution Prevention\n");
+ return 0;
+}
+early_param("nosmep", parse_nosmep);
+
+static int __init parse_nosmap(char *p)
+{
+ disable_kuap = true;
+ pr_warn("Disabling Kernel Userspace Access Protection\n");
+ return 0;
+}
+early_param("nosmap", parse_nosmap);
+
+void __ref setup_kup(void)
+{
+ setup_kuep(disable_kuep);
+ setup_kuap(disable_kuap);
}
-static void pud_ctor(void *addr)
-{
- memset(addr, 0, PUD_TABLE_SIZE);
+#define CTOR(shift) static void ctor_##shift(void *addr) \
+{ \
+ memset(addr, 0, sizeof(void *) << (shift)); \
}
-static void pmd_ctor(void *addr)
+CTOR(0); CTOR(1); CTOR(2); CTOR(3); CTOR(4); CTOR(5); CTOR(6); CTOR(7);
+CTOR(8); CTOR(9); CTOR(10); CTOR(11); CTOR(12); CTOR(13); CTOR(14); CTOR(15);
+
+static inline void (*ctor(int shift))(void *)
{
- memset(addr, 0, PMD_TABLE_SIZE);
+ BUILD_BUG_ON(MAX_PGTABLE_INDEX_SIZE != 15);
+
+ switch (shift) {
+ case 0: return ctor_0;
+ case 1: return ctor_1;
+ case 2: return ctor_2;
+ case 3: return ctor_3;
+ case 4: return ctor_4;
+ case 5: return ctor_5;
+ case 6: return ctor_6;
+ case 7: return ctor_7;
+ case 8: return ctor_8;
+ case 9: return ctor_9;
+ case 10: return ctor_10;
+ case 11: return ctor_11;
+ case 12: return ctor_12;
+ case 13: return ctor_13;
+ case 14: return ctor_14;
+ case 15: return ctor_15;
+ }
+ return NULL;
}
-struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE + 1];
EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */
/*
@@ -50,7 +89,7 @@
* everything else. Caches created by this function are used for all
* the higher level pagetables, and for hugepage pagetables.
*/
-void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+void pgtable_cache_add(unsigned int shift)
{
char *name;
unsigned long table_size = sizeof(void *) << shift;
@@ -71,19 +110,19 @@
* moment, gcc doesn't seem to recognize is_power_of_2 as a
* constant expression, so so much for that. */
BUG_ON(!is_power_of_2(minalign));
- BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+ BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
if (PGT_CACHE(shift))
return; /* Already have a cache of this size */
align = max_t(unsigned long, align, minalign);
name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
- new = kmem_cache_create(name, table_size, align, 0, ctor);
+ new = kmem_cache_create(name, table_size, align, 0, ctor(shift));
if (!new)
panic("Could not allocate pgtable cache for order %d", shift);
kfree(name);
- pgtable_cache[shift - 1] = new;
+ pgtable_cache[shift] = new;
pr_debug("Allocated pgtable cache for order %d\n", shift);
}
@@ -91,15 +130,15 @@
void pgtable_cache_init(void)
{
- pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+ pgtable_cache_add(PGD_INDEX_SIZE);
- if (PMD_CACHE_INDEX && !PGT_CACHE(PMD_CACHE_INDEX))
- pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+ if (PMD_CACHE_INDEX)
+ pgtable_cache_add(PMD_CACHE_INDEX);
/*
* In all current configs, when the PUD index exists it's the
* same size as either the pgd or pmd index except with THP enabled
* on book3s 64
*/
- if (PUD_CACHE_INDEX && !PGT_CACHE(PUD_CACHE_INDEX))
- pgtable_cache_add(PUD_CACHE_INDEX, pud_ctor);
+ if (PUD_CACHE_INDEX)
+ pgtable_cache_add(PUD_CACHE_INDEX);
}
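
The init-common.c changes replace the pgd/pud/pmd-specific constructors with a CTOR(shift) macro that stamps out one zeroing constructor per table order, plus a small dispatcher. A standalone, shortened sketch of the same pattern (only four orders generated here; the kernel version goes up to MAX_PGTABLE_INDEX_SIZE):

#include <stdio.h>
#include <string.h>

#define CTOR(shift) static void ctor_##shift(void *addr) \
{ \
        memset(addr, 0, sizeof(void *) << (shift)); \
}

CTOR(0) CTOR(1) CTOR(2) CTOR(3)

static void (*ctor(int shift))(void *)
{
        switch (shift) {
        case 0: return ctor_0;
        case 1: return ctor_1;
        case 2: return ctor_2;
        case 3: return ctor_3;
        }
        return NULL;
}

int main(void)
{
        unsigned char table[sizeof(void *) << 3];

        memset(table, 0xff, sizeof(table));
        ctor(3)(table);                         /* zero a "shift 3" table */
        printf("first byte after ctor(3): %d\n", table[0]);
        return 0;
}
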
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 3e59e5d..b04896a 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -9,12 +10,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/module.h>
@@ -45,8 +40,10 @@
#include <asm/tlb.h>
#include <asm/sections.h>
#include <asm/hugetlb.h>
+#include <asm/kup.h>
+#include <asm/kasan.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL)
/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */
@@ -108,12 +105,8 @@
__map_without_bats = 1;
__map_without_ltlbs = 1;
}
-#ifdef CONFIG_STRICT_KERNEL_RWX
- if (rodata_enabled) {
- __map_without_bats = 1;
+ if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx))
__map_without_ltlbs = 1;
- }
-#endif
}
/*
@@ -182,6 +175,10 @@
btext_unmap();
#endif
+ kasan_mmu_init();
+
+ setup_kup();
+
/* Shortly after that, the entire linear mapping will be available */
memblock_set_current_limit(lowmem_end_addr);
}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 7a9886f..4e08246 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -11,12 +12,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#undef DEBUG
@@ -66,7 +61,7 @@
#include <asm/iommu.h>
#include <asm/vdso.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
phys_addr_t memstart_addr = ~0;
EXPORT_SYMBOL_GPL(memstart_addr);
@@ -177,6 +172,21 @@
vmemmap_list = vmem_back;
}
+static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+ unsigned long page_size)
+{
+ unsigned long nr_pfn = page_size / sizeof(struct page);
+ unsigned long start_pfn = page_to_pfn((struct page *)start);
+
+ if ((start_pfn + nr_pfn) > altmap->end_pfn)
+ return true;
+
+ if (start_pfn < altmap->base_pfn)
+ return true;
+
+ return false;
+}
+
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
@@ -188,15 +198,23 @@
pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
for (; start < end; start += page_size) {
- void *p;
+ void *p = NULL;
int rc;
if (vmemmap_populated(start, page_size))
continue;
- if (altmap)
+ /*
+ * Allocate from the altmap first if we have one. This may
+ * fail due to alignment issues when using 16MB hugepages, so
+ * fall back to system memory if the altmap allocation fails.
+ */
+ if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
p = altmap_alloc_block_buf(page_size, altmap);
- else
+ if (!p)
+ pr_debug("altmap block allocation failed, falling back to system memory");
+ }
+ if (!p)
p = vmemmap_alloc_block_buf(page_size, node);
if (!p)
return -ENOMEM;
@@ -255,14 +273,20 @@
{
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long page_order = get_order(page_size);
+ unsigned long alt_start = ~0, alt_end = ~0;
+ unsigned long base_pfn;
start = _ALIGN_DOWN(start, page_size);
+ if (altmap) {
+ alt_start = altmap->base_pfn;
+ alt_end = altmap->base_pfn + altmap->reserve +
+ altmap->free + altmap->alloc + altmap->align;
+ }
pr_debug("vmemmap_free %lx...%lx\n", start, end);
for (; start < end; start += page_size) {
unsigned long nr_pages, addr;
- struct page *section_base;
struct page *page;
/*
@@ -278,10 +302,10 @@
continue;
page = pfn_to_page(addr >> PAGE_SHIFT);
- section_base = pfn_to_page(vmemmap_section_start(start));
nr_pages = 1 << page_order;
+ base_pfn = PHYS_PFN(addr);
- if (altmap) {
+ if (base_pfn >= alt_start && base_pfn < alt_end) {
vmem_altmap_free(altmap, nr_pages);
} else if (PageReserved(page)) {
/* allocated from bootmem */
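
The new altmap_cross_boundary() helper above only lets vmemmap_populate() use the altmap when the whole pfn range described by one vmemmap block fits inside the altmap window. A standalone sketch of that check (the struct page size and pfn values are made-up):

#include <stdbool.h>
#include <stdio.h>

struct altmap_window { unsigned long base_pfn, end_pfn; };

static bool cross_boundary(const struct altmap_window *a, unsigned long start_pfn,
                           unsigned long block_size, unsigned long page_struct_size)
{
        unsigned long nr_pfn = block_size / page_struct_size;  /* pfns described by one block */

        return start_pfn < a->base_pfn || start_pfn + nr_pfn > a->end_pfn;
}

int main(void)
{
        struct altmap_window a = { .base_pfn = 0x1000, .end_pfn = 0x2000 };

        /* 16 MB vmemmap block, 64-byte struct page: 0x40000 pfns described */
        printf("crosses window: %d\n", cross_boundary(&a, 0x1000, 16UL << 20, 64));
        return 0;
}
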
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
new file mode 100644
index 0000000..fc66964
--- /dev/null
+++ b/arch/powerpc/mm/ioremap.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/io-workarounds.h>
+
+unsigned long ioremap_bot;
+EXPORT_SYMBOL(ioremap_bot);
+
+void __iomem *ioremap(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_noncached(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ if (iowa_is_active())
+ return iowa_ioremap(addr, size, prot, caller);
+ return __ioremap_caller(addr, size, prot, caller);
+}
+EXPORT_SYMBOL(ioremap);
+
+void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ if (iowa_is_active())
+ return iowa_ioremap(addr, size, prot, caller);
+ return __ioremap_caller(addr, size, prot, caller);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached(PAGE_KERNEL);
+ void *caller = __builtin_return_address(0);
+
+ if (iowa_is_active())
+ return iowa_ioremap(addr, size, prot, caller);
+ return __ioremap_caller(addr, size, prot, caller);
+}
+
+void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
+{
+ pte_t pte = __pte(flags);
+ void *caller = __builtin_return_address(0);
+
+ /* writeable implies dirty for kernel addresses */
+ if (pte_write(pte))
+ pte = pte_mkdirty(pte);
+
+ /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+ pte = pte_exprotect(pte);
+ pte = pte_mkprivileged(pte);
+
+ if (iowa_is_active())
+ return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
+ return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+int early_ioremap_range(unsigned long ea, phys_addr_t pa,
+ unsigned long size, pgprot_t prot)
+{
+ unsigned long i;
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ int err = map_kernel_page(ea + i, pa + i, prot);
+
+ if (WARN_ON_ONCE(err)) /* Should clean up */
+ return err;
+ }
+
+ return 0;
+}
+
+void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size,
+ pgprot_t prot, void *caller)
+{
+ struct vm_struct *area;
+ int ret;
+ unsigned long va;
+
+ area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller);
+ if (area == NULL)
+ return NULL;
+
+ area->phys_addr = pa;
+ va = (unsigned long)area->addr;
+
+ ret = ioremap_page_range(va, va + size, pa, prot);
+ if (!ret)
+ return (void __iomem *)area->addr + offset;
+
+ unmap_kernel_range(va, size);
+ free_vm_area(area);
+
+ return NULL;
+}
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
new file mode 100644
index 0000000..f36121f
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <mm/mmu_decl.h>
+
+void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size)
+{
+ pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL);
+
+ return __ioremap_caller(addr, size, prot, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *
+__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller)
+{
+ unsigned long v;
+ phys_addr_t p, offset;
+ int err;
+
+ /*
+ * Choose an address to map it to.
+ * Once the vmalloc system is running, we use it.
+ * Before then, we use space going down from IOREMAP_TOP
+ * (ioremap_bot records where we're up to).
+ */
+ p = addr & PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - p;
+
+ /*
+ * If the address lies within the first 16 MB, assume it's in ISA
+ * memory space
+ */
+ if (p < 16 * 1024 * 1024)
+ p += _ISA_MEM_BASE;
+
+#ifndef CONFIG_CRASH_DUMP
+ /*
+ * Don't allow anybody to remap normal RAM that we're using.
+ * mem_init() sets high_memory so only do the check after that.
+ */
+ if (slab_is_available() && p <= virt_to_phys(high_memory - 1) &&
+ page_is_ram(__phys_to_pfn(p))) {
+ pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__,
+ (unsigned long long)p, __builtin_return_address(0));
+ return NULL;
+ }
+#endif
+
+ if (size == 0)
+ return NULL;
+
+ /*
+ * Is it already mapped? Perhaps overlapped by a previous
+ * mapping.
+ */
+ v = p_block_mapped(p);
+ if (v)
+ return (void __iomem *)v + offset;
+
+ if (slab_is_available())
+ return do_ioremap(p, offset, size, prot, caller);
+
+ /*
+ * Should check if it is a candidate for a BAT mapping
+ */
+
+ err = early_ioremap_range(ioremap_bot - size, p, size, prot);
+ if (err)
+ return NULL;
+ ioremap_bot -= size;
+
+ return (void __iomem *)ioremap_bot + offset;
+}
+
+void iounmap(volatile void __iomem *addr)
+{
+ /*
+ * If mapped by BATs then there is nothing to do.
+ * Calling vfree() generates a benign warning.
+ */
+ if (v_block_mapped((unsigned long)addr))
+ return;
+
+ if (addr > high_memory && (unsigned long)addr < ioremap_bot)
+ vunmap((void *)(PAGE_MASK & (unsigned long)addr));
+}
+EXPORT_SYMBOL(iounmap);
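
Both new __ioremap_caller() variants start by splitting the requested range the same way: align the physical address down, remember the sub-page offset, and round the size up so whole pages cover the original range. A standalone sketch of that math (4 KiB pages assumed):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        uint64_t addr = 0xf0000e10;     /* arbitrary unaligned MMIO address */
        uint64_t size = 0x300;

        uint64_t p      = addr & PAGE_MASK;             /* aligned base that gets mapped */
        uint64_t offset = addr & ~PAGE_MASK;            /* added back to the returned va */
        uint64_t len    = PAGE_ALIGN(addr + size) - p;  /* whole pages covering the range */

        printf("p=%#llx offset=%#llx len=%#llx\n",
               (unsigned long long)p, (unsigned long long)offset,
               (unsigned long long)len);
        return 0;
}
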
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
new file mode 100644
index 0000000..fd29e51
--- /dev/null
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/**
+ * Low level function to establish the page tables for an IO mapping
+ */
+void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
+{
+ int ret;
+ unsigned long va = (unsigned long)ea;
+
+ /* We don't support the 4K PFN hack with ioremap */
+ if (pgprot_val(prot) & H_PAGE_4K_PFN)
+ return NULL;
+
+ if ((ea + size) >= (void *)IOREMAP_END) {
+ pr_warn("Outside the supported range\n");
+ return NULL;
+ }
+
+ WARN_ON(pa & ~PAGE_MASK);
+ WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
+ WARN_ON(size & ~PAGE_MASK);
+
+ if (slab_is_available()) {
+ ret = ioremap_page_range(va, va + size, pa, prot);
+ if (ret)
+ unmap_kernel_range(va, size);
+ } else {
+ ret = early_ioremap_range(va, pa, size, prot);
+ }
+
+ if (ret)
+ return NULL;
+
+ return (void __iomem *)ea;
+}
+EXPORT_SYMBOL(__ioremap_at);
+
+/**
+ * Low level function to tear down the page tables for an IO mapping. This is
+ * used for mappings that are manipulated manually, like partial unmapping of
+ * PCI IOs or ISA space.
+ */
+void __iounmap_at(void *ea, unsigned long size)
+{
+ WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
+ WARN_ON(size & ~PAGE_MASK);
+
+ unmap_kernel_range((unsigned long)ea, size);
+}
+EXPORT_SYMBOL(__iounmap_at);
+
+void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
+ pgprot_t prot, void *caller)
+{
+ phys_addr_t paligned, offset;
+ void __iomem *ret;
+ int err;
+
+ /* We don't support the 4K PFN hack with ioremap */
+ if (pgprot_val(prot) & H_PAGE_4K_PFN)
+ return NULL;
+
+ /*
+ * Choose an address to map it to. Once the vmalloc system is running,
+ * we use it. Before that, we map using addresses going up from
+ * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE
+ * through ioremap_bot.
+ */
+ paligned = addr & PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ size = PAGE_ALIGN(addr + size) - paligned;
+
+ if (size == 0 || paligned == 0)
+ return NULL;
+
+ if (slab_is_available())
+ return do_ioremap(paligned, offset, size, prot, caller);
+
+ err = early_ioremap_range(ioremap_bot, paligned, size, prot);
+ if (err)
+ return NULL;
+
+ ret = (void __iomem *)ioremap_bot + offset;
+ ioremap_bot += size;
+
+ return ret;
+}
+
+/*
+ * Unmap an IO region and remove it from vmalloc'd list.
+ * Access to IO memory should be serialized by driver.
+ */
+void iounmap(volatile void __iomem *token)
+{
+ void *addr;
+
+ if (!slab_is_available())
+ return;
+
+ addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK);
+
+ if ((unsigned long)addr < ioremap_bot) {
+ pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr);
+ return;
+ }
+ vunmap(addr);
+}
+EXPORT_SYMBOL(iounmap);
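
Before the vmalloc allocator is available, the two __ioremap_caller() variants above bump ioremap_bot in opposite directions: the 32-bit code hands out space going down from IOREMAP_TOP, while the 64-bit code grows upward from ioremap_bot. A standalone sketch of the two bump allocators (the start addresses are made-up):

#include <stdio.h>

static unsigned long top32 = 0xff000000UL;      /* hypothetical IOREMAP_TOP */
static unsigned long bot64 = 0xd0000000UL;      /* hypothetical ioremap_bot */

static unsigned long early_ioremap32(unsigned long size)
{
        top32 -= size;                  /* grow downward, as in ioremap_32.c */
        return top32;
}

static unsigned long early_ioremap64(unsigned long size)
{
        unsigned long ret = bot64;

        bot64 += size;                  /* grow upward, as in ioremap_64.c */
        return ret;
}

int main(void)
{
        unsigned long a = early_ioremap32(0x1000);
        unsigned long b = early_ioremap32(0x1000);
        unsigned long c = early_ioremap64(0x1000);
        unsigned long d = early_ioremap64(0x1000);

        printf("32-bit hands out %#lx then %#lx\n", a, b);
        printf("64-bit hands out %#lx then %#lx\n", c, d);
        return 0;
}
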
diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
new file mode 100644
index 0000000..6577897
--- /dev/null
+++ b/arch/powerpc/mm/kasan/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE := n
+
+obj-$(CONFIG_PPC32) += kasan_init_32.o
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
new file mode 100644
index 0000000..0e6ed44
--- /dev/null
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/moduleloader.h>
+#include <linux/sched/task.h>
+#include <linux/vmalloc.h>
+#include <asm/pgalloc.h>
+#include <asm/code-patching.h>
+#include <mm/mmu_decl.h>
+
+static pgprot_t kasan_prot_ro(void)
+{
+ if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return PAGE_READONLY;
+
+ return PAGE_KERNEL_RO;
+}
+
+static void kasan_populate_pte(pte_t *ptep, pgprot_t prot)
+{
+ unsigned long va = (unsigned long)kasan_early_shadow_page;
+ phys_addr_t pa = __pa(kasan_early_shadow_page);
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++, ptep++)
+ __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+}
+
+static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
+{
+ pmd_t *pmd;
+ unsigned long k_cur, k_next;
+ pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL;
+
+ pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
+
+ for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
+ pte_t *new;
+
+ k_next = pgd_addr_end(k_cur, k_end);
+ if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
+ continue;
+
+ if (slab_is_available())
+ new = pte_alloc_one_kernel(&init_mm);
+ else
+ new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+
+ if (!new)
+ return -ENOMEM;
+ kasan_populate_pte(new, prot);
+
+ smp_wmb(); /* See comment in __pte_alloc */
+
+ spin_lock(&init_mm.page_table_lock);
+ /* Has another populated it ? */
+ if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) {
+ pmd_populate_kernel(&init_mm, pmd, new);
+ new = NULL;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ if (new && slab_is_available())
+ pte_free_kernel(&init_mm, new);
+ }
+ return 0;
+}
+
+static void __ref *kasan_get_one_page(void)
+{
+ if (slab_is_available())
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static int __ref kasan_init_region(void *start, size_t size)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+ unsigned long k_cur;
+ int ret;
+ void *block = NULL;
+
+ ret = kasan_init_shadow_page_tables(k_start, k_end);
+ if (ret)
+ return ret;
+
+ if (!slab_is_available())
+ block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+
+ for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+ void *va = block ? block + k_cur - k_start : kasan_get_one_page();
+ pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+
+ if (!va)
+ return -ENOMEM;
+
+ __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+ }
+ flush_tlb_kernel_range(k_start, k_end);
+ return 0;
+}
+
+static void __init kasan_remap_early_shadow_ro(void)
+{
+ pgprot_t prot = kasan_prot_ro();
+ unsigned long k_start = KASAN_SHADOW_START;
+ unsigned long k_end = KASAN_SHADOW_END;
+ unsigned long k_cur;
+ phys_addr_t pa = __pa(kasan_early_shadow_page);
+
+ kasan_populate_pte(kasan_early_shadow_pte, prot);
+
+ for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+ pte_t *ptep = pte_offset_kernel(pmd, k_cur);
+
+ if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
+ continue;
+
+ __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+ }
+ flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+}
+
+void __init kasan_mmu_init(void)
+{
+ int ret;
+ struct memblock_region *reg;
+
+ if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
+ ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ if (ret)
+ panic("kasan: kasan_init_shadow_page_tables() failed");
+ }
+
+ for_each_memblock(memory, reg) {
+ phys_addr_t base = reg->base;
+ phys_addr_t top = min(base + reg->size, total_lowmem);
+
+ if (base >= top)
+ continue;
+
+ ret = kasan_init_region(__va(base), top - base);
+ if (ret)
+ panic("kasan: kasan_init_region() failed");
+ }
+}
+
+void __init kasan_init(void)
+{
+ kasan_remap_early_shadow_ro();
+
+ clear_page(kasan_early_shadow_page);
+
+ /* At this point kasan is fully initialized. Enable error messages */
+ init_task.kasan_depth = 0;
+ pr_info("KASAN init done\n");
+}
+
+#ifdef CONFIG_MODULES
+void *module_alloc(unsigned long size)
+{
+ void *base;
+
+ base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
+ if (!base)
+ return NULL;
+
+ if (!kasan_init_region(base, size))
+ return base;
+
+ vfree(base);
+
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_32
+u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0};
+
+static void __init kasan_early_hash_table(void)
+{
+ modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16);
+ modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16);
+
+ Hash = (struct hash_pte *)early_hash;
+}
+#else
+static void __init kasan_early_hash_table(void) {}
+#endif
+
+void __init kasan_early_init(void)
+{
+ unsigned long addr = KASAN_SHADOW_START;
+ unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
+ pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
+
+ BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
+
+ kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
+ } while (pmd++, addr = next, addr != end);
+
+ if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ kasan_early_hash_table();
+}
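
kasan_init_region() above sizes and maps the shadow for each memblock region via kasan_mem_to_shadow(). With generic KASAN, that translation is one shadow byte per 8 bytes of memory plus a constant offset; a standalone sketch (the offset and sample addresses are made-up, not the real PPC32 layout):

#include <stdio.h>
#include <stdint.h>

#define KASAN_SHADOW_SCALE_SHIFT 3
#define KASAN_SHADOW_OFFSET      0xe0000000UL   /* hypothetical */

static uintptr_t mem_to_shadow(uintptr_t addr)
{
        return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
        uintptr_t start = 0xc0000000UL;         /* hypothetical lowmem start */
        uintptr_t end   = start + (256UL << 20);/* 256 MB region */

        printf("shadow for region: %#lx..%#lx (%lu bytes)\n",
               (unsigned long)mem_to_shadow(start),
               (unsigned long)mem_to_shadow(end),
               (unsigned long)(mem_to_shadow(end) - mem_to_shadow(start)));
        return 0;
}
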
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 04ccb27..be941d3 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerPC version
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -9,12 +10,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/export.h>
@@ -27,12 +22,11 @@
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/suspend.h>
-#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -55,7 +49,7 @@
#include <asm/swiotlb.h>
#include <asm/rtas.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#ifndef CPU_FTR_COHERENT_ICACHE
#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */
@@ -70,22 +64,14 @@
EXPORT_SYMBOL(kmap_pte);
pgprot_t kmap_prot;
EXPORT_SYMBOL(kmap_prot);
-#define TOP_ZONE ZONE_HIGHMEM
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
}
-#else
-#define TOP_ZONE ZONE_NORMAL
#endif
-int page_is_ram(unsigned long pfn)
-{
- return memblock_is_memory(__pfn_to_phys(pfn));
-}
-
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
@@ -118,8 +104,8 @@
return -ENODEV;
}
-int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -134,75 +120,36 @@
start, start + size, rc);
return -EFAULT;
}
- flush_inval_dcache_range(start, start + size);
+ flush_dcache_range(start, start + size);
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
-int __meminit arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct page *page;
+ struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
int ret;
- /*
- * If we have an altmap then we need to skip over any reserved PFNs
- * when querying the zone.
- */
- page = pfn_to_page(start_pfn);
- if (altmap)
- page += vmem_altmap_offset(altmap);
-
- ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
- if (ret)
- return ret;
+ __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
- flush_inval_dcache_range(start, start + size);
+ flush_dcache_range(start, start + size);
ret = remove_section_mapping(start, start + size);
+ WARN_ON_ONCE(ret);
/* Ensure all vmalloc mappings are flushed in case they also
* hit that section of memory
*/
vm_unmap_aliases();
- resize_hpt_for_hotplug(memblock_phys_mem_size());
-
- return ret;
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
}
#endif
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-/*
- * walk_memory_resource() needs to make sure there is no holes in a given
- * memory range. PPC64 does not maintain the memory layout in /proc/iomem.
- * Instead it maintains it in memblock.memory structures. Walk through the
- * memory regions, find holes and callback for contiguous regions.
- */
-int
-walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg, int (*func)(unsigned long, unsigned long, void *))
-{
- struct memblock_region *reg;
- unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long tstart, tend;
- int ret = -1;
-
- for_each_memblock(memory, reg) {
- tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
- tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
- if (tstart >= tend)
- continue;
- ret = (*func)(tstart, tend - tstart, arg);
- if (ret)
- break;
- }
- return ret;
-}
-EXPORT_SYMBOL_GPL(walk_system_ram_range);
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init mem_topology_setup(void)
@@ -247,54 +194,19 @@
}
#endif
-static bool zone_limits_final;
-
/*
- * The memory zones past TOP_ZONE are managed by generic mm code.
- * These should be set to zero since that's what every other
- * architecture does.
- */
-static unsigned long max_zone_pfns[MAX_NR_ZONES] = {
- [0 ... TOP_ZONE ] = ~0UL,
- [TOP_ZONE + 1 ... MAX_NR_ZONES - 1] = 0
-};
-
-/*
- * Restrict the specified zone and all more restrictive zones
- * to be below the specified pfn. May not be called after
- * paging_init().
- */
-void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit)
-{
- int i;
-
- if (WARN_ON(zone_limits_final))
- return;
-
- for (i = zone; i >= 0; i--) {
- if (max_zone_pfns[i] > pfn_limit)
- max_zone_pfns[i] = pfn_limit;
- }
-}
-
-/*
- * Find the least restrictive zone that is entirely below the
- * specified pfn limit. Returns < 0 if no suitable zone is found.
+ * Zones usage:
*
- * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit
- * systems -- the DMA limit can be higher than any possible real pfn.
+ * We set up ZONE_DMA to be 31-bit on all platforms and ZONE_NORMAL to be
+ * everything else. GFP_DMA32 page allocations automatically fall back to
+ * ZONE_DMA.
+ *
+ * By using 31-bit unconditionally, we can exploit ARCH_ZONE_DMA_BITS to
+ * inform the generic DMA mapping code. 32-bit only devices (if not handled
+ * by an IOMMU anyway) will take a first dip into ZONE_NORMAL and get
+ * otherwise served by ZONE_DMA.
*/
-int dma_pfn_limit_to_zone(u64 pfn_limit)
-{
- int i;
-
- for (i = TOP_ZONE; i >= 0; i--) {
- if (max_zone_pfns[i] <= pfn_limit)
- return i;
- }
-
- return -EPERM;
-}
+static unsigned long max_zone_pfns[MAX_NR_ZONES];
/*
* paging_init() sets up the page tables - in fact we've already done this.
@@ -309,11 +221,11 @@
unsigned long end = __fix_to_virt(FIX_HOLE);
for (; v < end; v += PAGE_SIZE)
- map_kernel_page(v, 0, 0); /* XXX gross */
+ map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */
#endif
#ifdef CONFIG_HIGHMEM
- map_kernel_page(PKMAP_BASE, 0, 0); /* XXX gross */
+ map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
@@ -325,11 +237,15 @@
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(long int)((top_of_ram - total_ram) >> 20));
-#ifdef CONFIG_HIGHMEM
- limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT);
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
+ 1UL << (ARCH_ZONE_DMA_BITS - PAGE_SHIFT));
#endif
- limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT);
- zone_limits_final = true;
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
free_area_init_nodes(max_zone_pfns);
mark_nonram_nosave();
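
As an aside, the ZONE_DMA ceiling set above is simply 2^ARCH_ZONE_DMA_BITS bytes expressed as a page frame number. A minimal userspace sketch of that arithmetic, not part of this patch and assuming ARCH_ZONE_DMA_BITS = 31 with a 4 KiB page size (PAGE_SHIFT = 12), would be:

#include <stdio.h>

#define PAGE_SHIFT          12          /* 4 KiB pages (assumption) */
#define ARCH_ZONE_DMA_BITS  31          /* 31-bit DMA zone, per the comment above */

int main(void)
{
        unsigned long max_low_pfn = 0x100000;   /* pretend lowmem limit */
        unsigned long dma_limit = 1UL << (ARCH_ZONE_DMA_BITS - PAGE_SHIFT);
        unsigned long zone_dma_top = dma_limit < max_low_pfn ? dma_limit : max_low_pfn;

        /* 1 << (31 - 12) = 0x80000 pfns, i.e. the first 2 GiB of RAM */
        printf("ZONE_DMA ends at pfn 0x%lx (%lu MiB)\n",
               zone_dma_top, (zone_dma_top << PAGE_SHIFT) >> 20);
        return 0;
}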
@@ -349,7 +265,7 @@
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
set_max_mapnr(max_pfn);
- free_all_bootmem();
+ memblock_free_all();
#ifdef CONFIG_HIGHMEM
{
@@ -377,17 +293,18 @@
mem_init_print_info(NULL);
#ifdef CONFIG_PPC32
pr_info("Kernel virtual memory layout:\n");
+#ifdef CONFIG_KASAN
+ pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n",
+ KASAN_SHADOW_START, KASAN_SHADOW_END);
+#endif
pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
#ifdef CONFIG_HIGHMEM
pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP));
#endif /* CONFIG_HIGHMEM */
-#ifdef CONFIG_NOT_COHERENT_CACHE
- pr_info(" * 0x%08lx..0x%08lx : consistent mem\n",
- IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE);
-#endif /* CONFIG_NOT_COHERENT_CACHE */
- pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
- ioremap_bot, IOREMAP_TOP);
+ if (ioremap_bot != IOREMAP_TOP)
+ pr_info(" * 0x%08lx..0x%08lx : early ioremap\n",
+ ioremap_bot, IOREMAP_TOP);
pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n",
VMALLOC_START, VMALLOC_END);
#endif /* CONFIG_PPC32 */
@@ -401,13 +318,6 @@
free_initmem_default(POISON_FREE_INITMEM);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
/*
* This is called when a page has been modified by the kernel.
* It just marks the page as not i-cache clean. We do the i-cache
@@ -494,62 +404,6 @@
EXPORT_SYMBOL(flush_icache_user_range);
/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a PTE in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux PTE.
- *
- * This must always be called with the pte lock held.
- */
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep)
-{
-#ifdef CONFIG_PPC_STD_MMU
- /*
- * We don't need to worry about _PAGE_PRESENT here because we are
- * called with either mm->page_table_lock held or ptl lock held
- */
- unsigned long access, trap;
-
- if (radix_enabled()) {
- prefetch((void *)address);
- return;
- }
-
- /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
- if (!pte_young(*ptep) || address >= TASK_SIZE)
- return;
-
- /* We try to figure out if we are coming from an instruction
- * access fault and pass that down to __hash_page so we avoid
- * double-faulting on execution of fresh text. We have to test
- * for regs NULL since init will get here first thing at boot
- *
- * We also avoid filling the hash if not coming from a fault
- */
-
- trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
- switch (trap) {
- case 0x300:
- access = 0UL;
- break;
- case 0x400:
- access = _PAGE_EXEC;
- break;
- default:
- return;
- }
-
- hash_preload(vma->vm_mm, address, access, trap);
-#endif /* CONFIG_PPC_STD_MMU */
-#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
- && defined(CONFIG_HUGETLB_PAGE)
- if (is_vm_hugetlb_page(vma))
- book3e_hugetlb_preload(vma, address, *ptep);
-#endif
-}
-
-/*
* System memory should not be in /proc/iomem but various tools expect it
* (eg kdump).
*/
@@ -597,3 +451,9 @@
return 0;
}
#endif /* CONFIG_STRICT_DEVMEM */
+
+/*
+ * This is defined in kernel/resource.c but only powerpc needs to export it, for
+ * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed.
+ */
+EXPORT_SYMBOL_GPL(walk_system_ram_range);
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index b24ce40..ae683fd 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -1,24 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* flexible mmap layout support
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
* Started by Ingo Molnar <mingo@elte.hu>
*/
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index f84e14f..18f20da 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Common implementation of switch_mm_irqs_off
*
* Copyright IBM Corp. 2017
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/mm.h>
@@ -15,6 +10,7 @@
#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
#if defined(CONFIG_PPC32)
static inline void switch_mm_pgdir(struct task_struct *tsk,
@@ -97,3 +93,12 @@
switch_mmu_context(prev, next, tsk);
}
+#ifndef CONFIG_PPC_BOOK3S_64
+void arch_exit_mmap(struct mm_struct *mm)
+{
+ void *frag = pte_frag_get(&mm->context);
+
+ if (frag)
+ pte_frag_destroy(frag);
+}
+#endif
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
deleted file mode 100644
index 56c2234..0000000
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * IOMMU helpers in MMU context.
- *
- * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/sched/signal.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/vmalloc.h>
-#include <linux/mutex.h>
-#include <linux/migrate.h>
-#include <linux/hugetlb.h>
-#include <linux/swap.h>
-#include <linux/sizes.h>
-#include <asm/mmu_context.h>
-#include <asm/pte-walk.h>
-
-static DEFINE_MUTEX(mem_list_mutex);
-
-#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1
-#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1)
-
-struct mm_iommu_table_group_mem_t {
- struct list_head next;
- struct rcu_head rcu;
- unsigned long used;
- atomic64_t mapped;
- unsigned int pageshift;
- u64 ua; /* userspace address */
- u64 entries; /* number of entries in hpas[] */
- u64 *hpas; /* vmalloc'ed */
-};
-
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
- unsigned long npages, bool incr)
-{
- long ret = 0, locked, lock_limit;
-
- if (!npages)
- return 0;
-
- down_write(&mm->mmap_sem);
-
- if (incr) {
- locked = mm->locked_vm + npages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- ret = -ENOMEM;
- else
- mm->locked_vm += npages;
- } else {
- if (WARN_ON_ONCE(npages > mm->locked_vm))
- npages = mm->locked_vm;
- mm->locked_vm -= npages;
- }
-
- pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
- current ? current->pid : 0,
- incr ? '+' : '-',
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK));
- up_write(&mm->mmap_sem);
-
- return ret;
-}
-
-bool mm_iommu_preregistered(struct mm_struct *mm)
-{
- return !list_empty(&mm->context.iommu_group_mem_list);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
-
-/*
- * Taken from alloc_migrate_target with changes to remove CMA allocations
- */
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
-{
- gfp_t gfp_mask = GFP_USER;
- struct page *new_page;
-
- if (PageCompound(page))
- return NULL;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- /*
- * We don't want the allocation to force an OOM if possible
- */
- new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
- return new_page;
-}
-
-static int mm_iommu_move_page_from_cma(struct page *page)
-{
- int ret = 0;
- LIST_HEAD(cma_migrate_pages);
-
- /* Ignore huge pages for now */
- if (PageCompound(page))
- return -EBUSY;
-
- lru_add_drain();
- ret = isolate_lru_page(page);
- if (ret)
- return ret;
-
- list_add(&page->lru, &cma_migrate_pages);
- put_page(page); /* Drop the gup reference */
-
- ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
- if (ret) {
- if (!list_empty(&cma_migrate_pages))
- putback_movable_pages(&cma_migrate_pages);
- }
-
- return 0;
-}
-
-long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
- struct mm_iommu_table_group_mem_t **pmem)
-{
- struct mm_iommu_table_group_mem_t *mem;
- long i, j, ret = 0, locked_entries = 0;
- unsigned int pageshift;
- unsigned long flags;
- unsigned long cur_ua;
- struct page *page = NULL;
-
- mutex_lock(&mem_list_mutex);
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
- next) {
- if ((mem->ua == ua) && (mem->entries == entries)) {
- ++mem->used;
- *pmem = mem;
- goto unlock_exit;
- }
-
- /* Overlap? */
- if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
- (ua < (mem->ua +
- (mem->entries << PAGE_SHIFT)))) {
- ret = -EINVAL;
- goto unlock_exit;
- }
-
- }
-
- ret = mm_iommu_adjust_locked_vm(mm, entries, true);
- if (ret)
- goto unlock_exit;
-
- locked_entries = entries;
-
- mem = kzalloc(sizeof(*mem), GFP_KERNEL);
- if (!mem) {
- ret = -ENOMEM;
- goto unlock_exit;
- }
-
- /*
- * For a starting point for a maximum page size calculation
- * we use @ua and @entries natural alignment to allow IOMMU pages
- * smaller than huge pages but still bigger than PAGE_SIZE.
- */
- mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
- mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
- if (!mem->hpas) {
- kfree(mem);
- ret = -ENOMEM;
- goto unlock_exit;
- }
-
- for (i = 0; i < entries; ++i) {
- cur_ua = ua + (i << PAGE_SHIFT);
- if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */, &page)) {
- ret = -EFAULT;
- for (j = 0; j < i; ++j)
- put_page(pfn_to_page(mem->hpas[j] >>
- PAGE_SHIFT));
- vfree(mem->hpas);
- kfree(mem);
- goto unlock_exit;
- }
- /*
- * If we get a page from the CMA zone, since we are going to
- * be pinning these entries, we might as well move them out
- * of the CMA zone if possible. NOTE: faulting in + migration
- * can be expensive. Batching can be considered later
- */
- if (is_migrate_cma_page(page)) {
- if (mm_iommu_move_page_from_cma(page))
- goto populate;
- if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */,
- &page)) {
- ret = -EFAULT;
- for (j = 0; j < i; ++j)
- put_page(pfn_to_page(mem->hpas[j] >>
- PAGE_SHIFT));
- vfree(mem->hpas);
- kfree(mem);
- goto unlock_exit;
- }
- }
-populate:
- pageshift = PAGE_SHIFT;
- if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
- pte_t *pte;
- struct page *head = compound_head(page);
- unsigned int compshift = compound_order(head);
- unsigned int pteshift;
-
- local_irq_save(flags); /* disables as well */
- pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
-
- /* Double check it is still the same pinned page */
- if (pte && pte_page(*pte) == head &&
- pteshift == compshift + PAGE_SHIFT)
- pageshift = max_t(unsigned int, pteshift,
- PAGE_SHIFT);
- local_irq_restore(flags);
- }
- mem->pageshift = min(mem->pageshift, pageshift);
- mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
- }
-
- atomic64_set(&mem->mapped, 1);
- mem->used = 1;
- mem->ua = ua;
- mem->entries = entries;
- *pmem = mem;
-
- list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
-
-unlock_exit:
- if (locked_entries && ret)
- mm_iommu_adjust_locked_vm(mm, locked_entries, false);
-
- mutex_unlock(&mem_list_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_get);
-
-static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
-{
- long i;
- struct page *page = NULL;
-
- for (i = 0; i < mem->entries; ++i) {
- if (!mem->hpas[i])
- continue;
-
- page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
- if (!page)
- continue;
-
- if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
- SetPageDirty(page);
-
- put_page(page);
- mem->hpas[i] = 0;
- }
-}
-
-static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
-{
-
- mm_iommu_unpin(mem);
- vfree(mem->hpas);
- kfree(mem);
-}
-
-static void mm_iommu_free(struct rcu_head *head)
-{
- struct mm_iommu_table_group_mem_t *mem = container_of(head,
- struct mm_iommu_table_group_mem_t, rcu);
-
- mm_iommu_do_free(mem);
-}
-
-static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
-{
- list_del_rcu(&mem->next);
- call_rcu(&mem->rcu, mm_iommu_free);
-}
-
-long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
-{
- long ret = 0;
-
- mutex_lock(&mem_list_mutex);
-
- if (mem->used == 0) {
- ret = -ENOENT;
- goto unlock_exit;
- }
-
- --mem->used;
- /* There are still users, exit */
- if (mem->used)
- goto unlock_exit;
-
- /* Are there still mappings? */
- if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
- ++mem->used;
- ret = -EBUSY;
- goto unlock_exit;
- }
-
- /* @mapped became 0 so now mappings are disabled, release the region */
- mm_iommu_release(mem);
-
- mm_iommu_adjust_locked_vm(mm, mem->entries, false);
-
-unlock_exit:
- mutex_unlock(&mem_list_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_put);
-
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
- unsigned long ua, unsigned long size)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
- if ((mem->ua <= ua) &&
- (ua + size <= mem->ua +
- (mem->entries << PAGE_SHIFT))) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_lookup);
-
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
- unsigned long ua, unsigned long size)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
- next) {
- if ((mem->ua <= ua) &&
- (ua + size <= mem->ua +
- (mem->entries << PAGE_SHIFT))) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-
-struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
- unsigned long ua, unsigned long entries)
-{
- struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
-
- list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
- if ((mem->ua == ua) && (mem->entries == entries)) {
- ret = mem;
- break;
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_find);
-
-long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
- const long entry = (ua - mem->ua) >> PAGE_SHIFT;
- u64 *va = &mem->hpas[entry];
-
- if (entry >= mem->entries)
- return -EFAULT;
-
- if (pageshift > mem->pageshift)
- return -EFAULT;
-
- *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
-
-long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned int pageshift, unsigned long *hpa)
-{
- const long entry = (ua - mem->ua) >> PAGE_SHIFT;
- void *va = &mem->hpas[entry];
- unsigned long *pa;
-
- if (entry >= mem->entries)
- return -EFAULT;
-
- if (pageshift > mem->pageshift)
- return -EFAULT;
-
- pa = (void *) vmalloc_to_phys(va);
- if (!pa)
- return -EFAULT;
-
- *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
-
- return 0;
-}
-
-extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
-{
- struct mm_iommu_table_group_mem_t *mem;
- long entry;
- void *va;
- unsigned long *pa;
-
- mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
- if (!mem)
- return;
-
- entry = (ua - mem->ua) >> PAGE_SHIFT;
- va = &mem->hpas[entry];
-
- pa = (void *) vmalloc_to_phys(va);
- if (!pa)
- return;
-
- *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
-}
-
-long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
-{
- if (atomic64_inc_not_zero(&mem->mapped))
- return 0;
-
- /* Last mm_iommu_put() has been called, no more mappings allowed() */
- return -ENXIO;
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
-
-void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
-{
- atomic64_add_unless(&mem->mapped, -1, 1);
-}
-EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
-
-void mm_iommu_init(struct mm_struct *mm)
-{
- INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
-}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index e5d779e..c750ac9 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Declarations of procedures and variables shared between files
* in arch/ppc/mm/.
@@ -11,17 +12,12 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/mm.h>
#include <asm/mmu.h>
#ifdef CONFIG_PPC_MMU_NOHASH
+#include <asm/trace.h>
/*
* On 40x and 8xx, we directly inline tlbia and tlbivax
@@ -30,10 +26,12 @@
static inline void _tlbil_all(void)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
+ trace_tlbia(MMU_NO_CONTEXT);
}
static inline void _tlbil_pid(unsigned int pid)
{
asm volatile ("sync; tlbia; isync" : : : "memory");
+ trace_tlbia(pid);
}
#define _tlbil_pid_noind(pid) _tlbil_pid(pid)
@@ -55,6 +53,7 @@
unsigned int tsize, unsigned int ind)
{
asm volatile ("tlbie %0; sync" : : "r" (address) : "memory");
+ trace_tlbie(0, 0, address, pid, 0, 0, 0);
}
#elif defined(CONFIG_PPC_BOOK3E)
extern void _tlbil_va(unsigned long address, unsigned int pid,
@@ -79,19 +78,21 @@
}
#endif
+static inline void print_system_hash_info(void) {}
+
#else /* CONFIG_PPC_MMU_NOHASH */
-extern void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap);
-
-
extern void _tlbie(unsigned long address);
extern void _tlbia(void);
+void print_system_hash_info(void);
+
#endif /* CONFIG_PPC_MMU_NOHASH */
#ifdef CONFIG_PPC32
+void hash_preload(struct mm_struct *mm, unsigned long ea);
+
extern void mapin_ram(void);
extern void setbat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, pgprot_t prot);
@@ -100,12 +101,11 @@
extern unsigned int rtas_data, rtas_size;
struct hash_pte;
-extern struct hash_pte *Hash, *Hash_end;
-extern unsigned long Hash_size, Hash_mask;
+extern struct hash_pte *Hash;
+extern u8 early_hash[];
#endif /* CONFIG_PPC32 */
-extern unsigned long ioremap_bot;
extern unsigned long __max_low_memory;
extern phys_addr_t __initial_memory_limit_addr;
extern phys_addr_t total_memory;
@@ -126,7 +126,8 @@
*/
#ifdef CONFIG_PPC32
extern void MMU_init_hw(void);
-extern unsigned long mmu_mapin_ram(unsigned long top);
+void MMU_init_hw_patch(void);
+unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
#endif
#ifdef CONFIG_PPC_FSL_BOOK3E
@@ -151,7 +152,7 @@
};
#endif
-#if defined(CONFIG_6xx) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx)
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_FSL_BOOKE) || defined(CONFIG_PPC_8xx)
/* 6xx have BATS */
/* FSL_BOOKE have TLBCAM */
/* 8xx have LTLB */
@@ -161,3 +162,11 @@
static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; }
static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
#endif
+
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
+void mmu_mark_initmem_nx(void);
+void mmu_mark_rodata_ro(void);
+#else
+static inline void mmu_mark_initmem_nx(void) { }
+static inline void mmu_mark_rodata_ro(void) { }
+#endif
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/nohash/40x.c
similarity index 91%
rename from arch/powerpc/mm/40x_mmu.c
rename to arch/powerpc/mm/nohash/40x.c
index 61ac468..f348104 100644
--- a/arch/powerpc/mm/40x_mmu.c
+++ b/arch/powerpc/mm/nohash/40x.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for initializing the MMU
* on the 4xx series of chips.
@@ -12,12 +13,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/signal.h>
@@ -49,7 +44,7 @@
#include <asm/machdep.h>
#include <asm/setup.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
extern int __map_without_ltlbs;
/*
@@ -93,7 +88,7 @@
#define LARGE_PAGE_SIZE_16M (1<<24)
#define LARGE_PAGE_SIZE_4M (1<<22)
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
unsigned long v, s, mapped;
phys_addr_t p;
diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/nohash/44x.c
similarity index 87%
rename from arch/powerpc/mm/44x_mmu.c
rename to arch/powerpc/mm/nohash/44x.c
index 12d9251..3d6ae7c 100644
--- a/arch/powerpc/mm/44x_mmu.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Modifications by Matt Porter (mporter@mvista.com) to support
* PPC44x Book E processors.
@@ -15,12 +16,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/init.h>
@@ -29,8 +24,9 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
+#include <asm/code-patching.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
/* Used by the 44x TLB replacement exception handler.
* Just needed it declared someplace.
@@ -43,22 +39,13 @@
static void ppc44x_update_tlb_hwater(void)
{
- extern unsigned int tlb_44x_patch_hwater_D[];
- extern unsigned int tlb_44x_patch_hwater_I[];
-
 /* The TLB miss handlers hard-code the watermark in a cmpli
 * instruction to improve performance rather than loading it
* from the global variable. Thus, we patch the instructions
* in the 2 TLB miss handlers when updating the value
*/
- tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) |
- tlb_44x_hwater;
- flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0],
- (unsigned long)&tlb_44x_patch_hwater_D[1]);
- tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) |
- tlb_44x_hwater;
- flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0],
- (unsigned long)&tlb_44x_patch_hwater_I[1]);
+ modify_instruction_site(&patch__tlb_44x_hwater_D, 0xffff, tlb_44x_hwater);
+ modify_instruction_site(&patch__tlb_44x_hwater_I, 0xffff, tlb_44x_hwater);
}
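
The modify_instruction_site() calls above splice the current tlb_44x_hwater value into the low 16-bit immediate of a cmpli instruction inside the TLB miss handlers. A hedged, userspace-only sketch of that masking (the instruction encoding below is hypothetical; only the bit manipulation is illustrated):

#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for modify_instruction_site(site, 0xffff, val):
 * keep the opcode and register fields in the high bits, splice in a new
 * 16-bit immediate.
 */
static uint32_t patch_low16(uint32_t insn, uint16_t imm)
{
        return (insn & 0xffff0000u) | imm;
}

int main(void)
{
        uint32_t cmpli = 0x28090000;                    /* hypothetical cmplwi r9,0 */
        uint32_t patched = patch_low16(cmpli, 0x01ff);  /* new TLB watermark */

        printf("before: %08x  after: %08x\n", cmpli, patched);
        return 0;
}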
/*
@@ -178,7 +165,7 @@
flush_instruction_cache();
}
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
unsigned long addr;
unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1);
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
new file mode 100644
index 0000000..4a06cb3
--- /dev/null
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains the routines for initializing the MMU
+ * on the 8xx series of chips.
+ * -- christophe
+ *
+ * Derived from arch/powerpc/mm/40x_mmu.c:
+ */
+
+#include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <asm/fixmap.h>
+#include <asm/code-patching.h>
+
+#include <mm/mmu_decl.h>
+
+#define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT)
+
+extern int __map_without_ltlbs;
+
+static unsigned long block_mapped_ram;
+
+/*
+ * Return PA for this VA if it is in an area mapped with LTLBs.
+ * Otherwise, returns 0
+ */
+phys_addr_t v_block_mapped(unsigned long va)
+{
+ unsigned long p = PHYS_IMMR_BASE;
+
+ if (__map_without_ltlbs)
+ return 0;
+ if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
+ return p + va - VIRT_IMMR_BASE;
+ if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
+ return __pa(va);
+ return 0;
+}
+
+/*
+ * Return VA for a given PA mapped with LTLBs or 0 if not mapped
+ */
+unsigned long p_block_mapped(phys_addr_t pa)
+{
+ unsigned long p = PHYS_IMMR_BASE;
+
+ if (__map_without_ltlbs)
+ return 0;
+ if (pa >= p && pa < p + IMMR_SIZE)
+ return VIRT_IMMR_BASE + pa - p;
+ if (pa < block_mapped_ram)
+ return (unsigned long)__va(pa);
+ return 0;
+}
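
For readers unfamiliar with the 8xx block mappings, the two lookups above are interval checks plus a constant offset. The following standalone sketch mimics the v_block_mapped() logic with made-up base addresses (the real values come from the platform headers, so treat these constants as assumptions):

#include <stdio.h>

/* Hypothetical stand-ins for the platform constants used above */
#define PHYS_IMMR_BASE   0xff000000UL
#define VIRT_IMMR_BASE   0xfef00000UL
#define IMMR_SIZE        0x00200000UL
#define PAGE_OFFSET      0xc0000000UL

static unsigned long block_mapped_ram = 24UL << 20;  /* 24 MB pinned with LTLBs */

/* Same interval-check-plus-offset logic as v_block_mapped() above */
static unsigned long va_to_pa(unsigned long va)
{
        if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
                return PHYS_IMMR_BASE + (va - VIRT_IMMR_BASE);
        if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
                return va - PAGE_OFFSET;        /* __pa() on a linear map */
        return 0;
}

int main(void)
{
        printf("0x%lx -> 0x%lx\n", PAGE_OFFSET + 0x100000, va_to_pa(PAGE_OFFSET + 0x100000));
        printf("0x%lx -> 0x%lx\n", VIRT_IMMR_BASE + 0x40, va_to_pa(VIRT_IMMR_BASE + 0x40));
        return 0;
}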
+
+#define LARGE_PAGE_SIZE_8M (1<<23)
+
+/*
+ * MMU_init_hw does the chip-specific initialization of the MMU hardware.
+ */
+void __init MMU_init_hw(void)
+{
+ /* Pin up to the first three 8 MB pages after IMMR in the DTLB table */
+ if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) {
+ unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
+ unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
+ int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28;
+ unsigned long addr = 0;
+ unsigned long mem = total_lowmem;
+
+ for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
+ mtspr(SPRN_MD_CTR, ctr | (i << 8));
+ mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
+ mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
+ mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
+ addr += LARGE_PAGE_SIZE_8M;
+ mem -= LARGE_PAGE_SIZE_8M;
+ }
+ }
+}
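
MMU_init_hw() above pins one 8 MB DTLB entry per loop iteration, starting at slot 29 when CONFIG_PIN_TLB_IMMR is set (slot 28 otherwise) and stopping after slot 31. A quick, illustrative sanity check of how much lowmem that can cover, using the same constants but pretend values:

#include <stdio.h>

#define LARGE_PAGE_SIZE_8M  (1UL << 23)

int main(void)
{
        unsigned long total_lowmem = 32UL << 20;    /* pretend 32 MB of lowmem */
        int first_slot = 29;                        /* 29 when IMMR is pinned, else 28 */
        int slot, pinned = 0;
        unsigned long mem = total_lowmem;

        for (slot = first_slot; slot < 32 && mem >= LARGE_PAGE_SIZE_8M; slot++) {
                pinned++;
                mem -= LARGE_PAGE_SIZE_8M;
        }
        /* With slots 29..31 available, at most 3 x 8 MB = 24 MB gets pinned */
        printf("pinned %d entries, %lu MB covered, %lu MB left unpinned\n",
               pinned, (total_lowmem - mem) >> 20, mem >> 20);
        return 0;
}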
+
+static void __init mmu_mapin_immr(void)
+{
+ unsigned long p = PHYS_IMMR_BASE;
+ unsigned long v = VIRT_IMMR_BASE;
+ int offset;
+
+ for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE)
+ map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG);
+}
+
+static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
+{
+ modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16);
+}
+
+static void mmu_patch_addis(s32 *site, long simm)
+{
+ unsigned int instr = *(unsigned int *)patch_site_addr(site);
+
+ instr &= 0xffff0000;
+ instr |= ((unsigned long)simm) >> 16;
+ patch_instruction_site(site, instr);
+}
+
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
+{
+ unsigned long mapped;
+
+ if (__map_without_ltlbs) {
+ mapped = 0;
+ mmu_mapin_immr();
+ if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR))
+ patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
+ if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+ mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
+ } else {
+ mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
+ if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+ mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top,
+ _ALIGN(__pa(_einittext), 8 << 20));
+ }
+
+ mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped);
+ mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped);
+
+ /* If the size of RAM is not an exact power of two, we may not
+ * have covered RAM in its entirety with 8 MiB
+ * pages. Consequently, restrict the top end of RAM currently
+ * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
+ * coverage with normal-sized pages (or other reasons) do not
+ * attempt to allocate outside the allowed range.
+ */
+ if (mapped)
+ memblock_set_current_limit(mapped);
+
+ block_mapped_ram = mapped;
+
+ return mapped;
+}
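
The `top & ~(LARGE_PAGE_SIZE_8M - 1)` expression above is an align-down to the 8 MB block size; anything beyond that boundary is not yet covered by block mappings, which is why memblock is capped there. A small illustration of the rounding, using an assumed 30 MB RAM size:

#include <stdio.h>

#define LARGE_PAGE_SIZE_8M  (1UL << 23)

int main(void)
{
        unsigned long top = 30UL << 20;                          /* 30 MB of RAM */
        unsigned long mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);  /* align down: 24 MB */

        /* The 6 MB tail is not covered by 8 MB pages yet, so early memblock
         * allocations must stay below 'mapped' until page tables exist for it.
         */
        printf("top=%lu MB, block-mapped=%lu MB, unmapped tail=%lu MB\n",
               top >> 20, mapped >> 20, (top - mapped) >> 20);
        return 0;
}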
+
+void mmu_mark_initmem_nx(void)
+{
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23)
+ mmu_patch_addis(&patch__itlbmiss_linmem_top8,
+ -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1)));
+ if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+ mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext));
+}
+
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mmu_mark_rodata_ro(void)
+{
+ if (CONFIG_DATA_SHIFT < 23)
+ mmu_patch_addis(&patch__dtlbmiss_romem_top8,
+ -__pa(((unsigned long)_sinittext) &
+ ~(LARGE_PAGE_SIZE_8M - 1)));
+ mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext));
+}
+#endif
+
+void __init setup_initial_memory_limit(phys_addr_t first_memblock_base,
+ phys_addr_t first_memblock_size)
+{
+ /* We don't currently support the first MEMBLOCK not mapping 0
+ * physical on those processors
+ */
+ BUG_ON(first_memblock_base != 0);
+
+ /* 8xx can only access 32MB at the moment */
+ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000));
+}
+
+/*
+ * Set up to use a given MMU context.
+ * id is context number, pgd is PGD pointer.
+ *
+ * We place the physical address of the new task page directory loaded
+ * into the MMU base register, and set the ASID compare register with
+ * the new "context."
+ */
+void set_context(unsigned long id, pgd_t *pgd)
+{
+ s16 offset = (s16)(__pa(swapper_pg_dir));
+
+ /* Context switch the PTE pointer for the Abatron BDI2000.
+ * The PGDIR is passed as second argument.
+ */
+ if (IS_ENABLED(CONFIG_BDI_SWITCH))
+ abatron_pteptrs[1] = pgd;
+
+ /* Register M_TWB will contain base address of level 1 table minus the
+ * lower part of the kernel PGDIR base address, so that all accesses to
+ * level 1 table are done relative to lower part of kernel PGDIR base
+ * address.
+ */
+ mtspr(SPRN_M_TWB, __pa(pgd) - offset);
+
+ /* Update context */
+ mtspr(SPRN_M_CASID, id - 1);
+ /* sync */
+ mb();
+}
+
+void flush_instruction_cache(void)
+{
+ isync();
+ mtspr(SPRN_IC_CST, IDC_INVALL);
+ isync();
+}
+
+#ifdef CONFIG_PPC_KUEP
+void __init setup_kuep(bool disabled)
+{
+ if (disabled)
+ return;
+
+ pr_info("Activating Kernel Userspace Execution Prevention\n");
+
+ mtspr(SPRN_MI_AP, MI_APG_KUEP);
+}
+#endif
+
+#ifdef CONFIG_PPC_KUAP
+void __init setup_kuap(bool disabled)
+{
+ pr_info("Activating Kernel Userspace Access Protection\n");
+
+ if (disabled)
+ pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n");
+
+ mtspr(SPRN_MD_AP, MD_APG_KUAP);
+}
+#endif
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
new file mode 100644
index 0000000..33b6f6f
--- /dev/null
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
+
+obj-y += mmu_context.o tlb.o tlb_low.o
+obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o
+obj-$(CONFIG_40x) += 40x.o
+obj-$(CONFIG_44x) += 44x.o
+obj-$(CONFIG_PPC_8xx) += 8xx.o
+obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o
+ifdef CONFIG_HUGETLB_PAGE
+obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o
+endif
+
+# Disable kcov instrumentation on sensitive code
+# This is necessary for booting with kcov enabled on book3e machines
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_fsl_booke.o := n
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
similarity index 88%
rename from arch/powerpc/mm/hugetlbpage-book3e.c
rename to arch/powerpc/mm/nohash/book3e_hugetlbpage.c
index f84ec46..8b88be9 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
@@ -11,8 +11,9 @@
#include <asm/mmu.h>
-#ifdef CONFIG_PPC_FSL_BOOK3E
#ifdef CONFIG_PPC64
+#include <asm/paca.h>
+
static inline int tlb1_next(void)
{
struct paca_struct *paca = get_paca();
@@ -29,33 +30,6 @@
tcd->esel_next = next;
return this;
}
-#else
-static inline int tlb1_next(void)
-{
- int index, ncams;
-
- ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
-
- index = this_cpu_read(next_tlbcam_idx);
-
- /* Just round-robin the entries and wrap when we hit the end */
- if (unlikely(index == ncams - 1))
- __this_cpu_write(next_tlbcam_idx, tlbcam_index);
- else
- __this_cpu_inc(next_tlbcam_idx);
-
- return index;
-}
-#endif /* !PPC64 */
-#endif /* FSL */
-
-static inline int mmu_get_tsize(int psize)
-{
- return mmu_psize_defs[psize].enc;
-}
-
-#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64)
-#include <asm/paca.h>
static inline void book3e_tlb_lock(void)
{
@@ -98,6 +72,23 @@
paca->tcd_ptr->lock = 0;
}
#else
+static inline int tlb1_next(void)
+{
+ int index, ncams;
+
+ ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+ index = this_cpu_read(next_tlbcam_idx);
+
+ /* Just round-robin the entries and wrap when we hit the end */
+ if (unlikely(index == ncams - 1))
+ __this_cpu_write(next_tlbcam_idx, tlbcam_index);
+ else
+ __this_cpu_inc(next_tlbcam_idx);
+
+ return index;
+}
+
static inline void book3e_tlb_lock(void)
{
}
@@ -131,18 +122,15 @@
return found;
}
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
- pte_t pte)
+static void
+book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
{
unsigned long mas1, mas2;
u64 mas7_3;
unsigned long psize, tsize, shift;
unsigned long flags;
struct mm_struct *mm;
-
-#ifdef CONFIG_PPC_FSL_BOOK3E
int index;
-#endif
if (unlikely(is_kernel_addr(ea)))
return;
@@ -166,11 +154,9 @@
return;
}
-#ifdef CONFIG_PPC_FSL_BOOK3E
/* We have to use the CAM(TLB1) on FSL parts for hugepages */
index = tlb1_next();
mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1));
-#endif
mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize);
mas2 = ea & ~((1UL << shift) - 1);
@@ -197,6 +183,18 @@
local_irq_restore(flags);
}
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+{
+ if (is_vm_hugetlb_page(vma))
+ book3e_hugetlb_preload(vma, address, *ptep);
+}
+
void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
{
struct hstate *hstate = hstate_file(vma->vm_file);
diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
similarity index 74%
rename from arch/powerpc/mm/pgtable-book3e.c
rename to arch/powerpc/mm/nohash/book3e_pgtable.c
index a229893..4637fdd 100644
--- a/arch/powerpc/mm/pgtable-book3e.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2005, Paul Mackerras, IBM Corporation.
* Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/sched.h>
@@ -15,7 +11,7 @@
#include <asm/tlb.h>
#include <asm/dma.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
@@ -42,7 +38,7 @@
* thus must have the low bits clear
*/
for (i = 0; i < page_size; i += PAGE_SIZE)
- BUG_ON(map_kernel_page(start + i, phys, flags));
+ BUG_ON(map_kernel_page(start + i, phys, __pgprot(flags)));
return 0;
}
@@ -55,14 +51,18 @@
#endif
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-static __ref void *early_alloc_pgtable(unsigned long size)
+static void __init *early_alloc_pgtable(unsigned long size)
{
- void *pt;
+ void *ptr;
- pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
- memset(pt, 0, size);
+ ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
+ __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
- return pt;
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n",
+ __func__, size, size, __pa(MAX_DMA_ADDRESS));
+
+ return ptr;
}
/*
@@ -70,7 +70,7 @@
* map_kernel_page adds an entry to the ioremap page table
* and adds an entry to the HPT, possibly bolting it
*/
-int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
+int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
pud_t *pudp;
@@ -89,33 +89,27 @@
ptep = pte_alloc_kernel(pmdp, ea);
if (!ptep)
return -ENOMEM;
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
} else {
pgdp = pgd_offset_k(ea);
#ifndef __PAGETABLE_PUD_FOLDED
if (pgd_none(*pgdp)) {
pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
- BUG_ON(pudp == NULL);
pgd_populate(&init_mm, pgdp, pudp);
}
#endif /* !__PAGETABLE_PUD_FOLDED */
pudp = pud_offset(pgdp, ea);
if (pud_none(*pudp)) {
pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
- BUG_ON(pmdp == NULL);
pud_populate(&init_mm, pudp, pmdp);
}
pmdp = pmd_offset(pudp, ea);
if (!pmd_present(*pmdp)) {
ptep = early_alloc_pgtable(PAGE_SIZE);
- BUG_ON(ptep == NULL);
pmd_populate_kernel(&init_mm, pmdp, ptep);
}
ptep = pte_offset_kernel(pmdp, ea);
- set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
}
+ set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
smp_wmb();
return 0;
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/nohash/fsl_booke.c
similarity index 95%
rename from arch/powerpc/mm/fsl_booke_mmu.c
rename to arch/powerpc/mm/nohash/fsl_booke.c
index 080d49b..556e3cd 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/nohash/fsl_booke.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Modifications by Kumar Gala (galak@kernel.crashing.org) to support
* E500 Book E processors.
@@ -17,12 +18,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/signal.h>
@@ -54,7 +49,7 @@
#include <asm/setup.h>
#include <asm/paca.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
unsigned int tlbcam_index;
@@ -221,7 +216,7 @@
#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS"
#endif
-unsigned long __init mmu_mapin_ram(unsigned long top)
+unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
}
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/nohash/mmu_context.c
similarity index 94%
rename from arch/powerpc/mm/mmu_context_nohash.c
rename to arch/powerpc/mm/nohash/mmu_context.c
index 4d80239..aac81c9 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/nohash/mmu_context.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for handling the MMU on those
* PowerPC implementations where the MMU is not using the hash
@@ -9,11 +10,6 @@
* Derived from previous arch/powerpc/mm/mmu_context.c
* and arch/powerpc/include/asm/mmu_context.h
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* TODO:
*
* - The global context lock will not scale very well
@@ -44,7 +40,7 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/spinlock.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/slab.h>
@@ -52,7 +48,7 @@
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
/*
* The MPC8xx has only 16 contexts. We rotate through them on each task switch.
@@ -372,7 +368,6 @@
{
pr_hard("initing context for mm @%p\n", mm);
-#ifdef CONFIG_PPC_MM_SLICES
/*
* We have MMU_NO_CONTEXT set to be ~0. Hence check
* explicitly against context.id == 0. This ensures that we properly
@@ -382,9 +377,9 @@
*/
if (mm->context.id == 0)
slice_init_new_context_exec(mm);
-#endif
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
+ pte_frag_set(&mm->context, NULL);
return 0;
}
@@ -461,10 +456,20 @@
/*
* Allocate the maps used by context management
*/
- context_map = memblock_virt_alloc(CTX_MAP_SIZE, 0);
- context_mm = memblock_virt_alloc(sizeof(void *) * (LAST_CONTEXT + 1), 0);
+ context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ if (!context_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ CTX_MAP_SIZE);
+ context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1),
+ SMP_CACHE_BYTES);
+ if (!context_mm)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(void *) * (LAST_CONTEXT + 1));
#ifdef CONFIG_SMP
- stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
+ stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES);
+ if (!stale_map[boot_cpuid])
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ CTX_MAP_SIZE);
cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
"powerpc/mmu/ctx:prepare",
@@ -486,4 +491,3 @@
next_context = FIRST_CONTEXT;
nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
}
-
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/nohash/tlb.c
similarity index 96%
rename from arch/powerpc/mm/tlb_nohash.c
rename to arch/powerpc/mm/nohash/tlb.c
index ae5d568..696f568 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines for TLB flushing.
* On machines where the MMU does not use a hash table to store virtual to
@@ -19,12 +20,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/kernel.h>
@@ -46,7 +41,7 @@
#include <asm/hugetlb.h>
#include <asm/paca.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
/*
 * This struct lists the sw-supported page sizes. The hardware MMU may support
@@ -302,7 +297,7 @@
* This function as well as __local_flush_tlb_page() must only be called
* for user contexts.
*/
- if (unlikely(WARN_ON(!mm)))
+ if (WARN_ON(!mm))
return;
preempt_disable();
@@ -433,11 +428,7 @@
unsigned long rid = (address & rmask) | 0x1000000000000000ul;
unsigned long vpte = address & ~rmask;
-#ifdef CONFIG_PPC_64K_PAGES
- vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful;
-#else
vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful;
-#endif
vpte |= rid;
__flush_tlb_page(tlb->mm, vpte, tsize, 0);
}
@@ -625,21 +616,12 @@
case PPC_HTW_IBM:
mas4 |= MAS4_INDD;
-#ifdef CONFIG_PPC_64K_PAGES
- mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
- mmu_pte_psize = MMU_PAGE_256M;
-#else
mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
mmu_pte_psize = MMU_PAGE_1M;
-#endif
break;
case PPC_HTW_NONE:
-#ifdef CONFIG_PPC_64K_PAGES
- mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
-#else
mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
-#endif
mmu_pte_psize = mmu_virtual_psize;
break;
}
@@ -648,7 +630,6 @@
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
unsigned int num_cams;
- int __maybe_unused cpu = smp_processor_id();
bool map = true;
/* use a quarter of the TLBCAM for bolted linear map */
@@ -722,6 +703,8 @@
* for use by the TLB miss code
*/
linear_map_top = memblock_end_of_DRAM();
+
+ ioremap_bot = IOREMAP_BASE;
}
static void __init early_mmu_set_memory_limit(void)
@@ -800,5 +783,9 @@
#ifdef CONFIG_PPC_47x
early_init_mmu_47x();
#endif
+
+#ifdef CONFIG_PPC_MM_SLICES
+ mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT);
+#endif
}
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/nohash/tlb_low.S
similarity index 96%
rename from arch/powerpc/mm/tlb_nohash_low.S
rename to arch/powerpc/mm/nohash/tlb_low.S
index e066a65..2ca407c 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* This file contains low-level functions for performing various
* types of TLB invalidations on various processors with no hash
@@ -18,12 +19,6 @@
*
* Partially rewritten by Cort Dougan (cort@cs.nmt.edu)
* Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <asm/reg.h>
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
similarity index 96%
rename from arch/powerpc/mm/tlb_low_64e.S
rename to arch/powerpc/mm/nohash/tlb_low_64e.S
index 7fd20c5..1f110c3 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -1,13 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Low level TLB miss handlers for Book3E
*
* Copyright (C) 2008-2009
* Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <asm/processor.h>
@@ -24,11 +20,7 @@
#include <asm/kvm_booke_hv_asm.h>
#include <asm/feature-fixups.h>
-#ifdef CONFIG_PPC_64K_PAGES
-#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1)
-#else
#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE)
-#endif
#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE)
#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE)
#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE)
@@ -70,6 +62,13 @@
std r15,EX_TLB_R15(r12)
std r10,EX_TLB_CR(r12)
#ifdef CONFIG_PPC_FSL_BOOK3E
+START_BTB_FLUSH_SECTION
+ mfspr r11, SPRN_SRR1
+ andi. r10,r11,MSR_PR
+ beq 1f
+ BTB_FLUSH(r10)
+1:
+END_BTB_FLUSH_SECTION
std r7,EX_TLB_R7(r12)
#endif
TLB_MISS_PROLOG_STATS
@@ -160,13 +159,11 @@
ldx r14,r14,r15 /* grab pgd entry */
ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV)
-#ifndef CONFIG_PPC_64K_PAGES
rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
clrrdi r15,r15,3
cmpdi cr0,r14,0
bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */
ldx r14,r14,r15 /* grab pud entry */
-#endif /* CONFIG_PPC_64K_PAGES */
rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
clrrdi r15,r15,3
@@ -675,18 +672,7 @@
* order to handle the weird page table format used by linux
*/
ori r10,r15,0x1
-#ifdef CONFIG_PPC_64K_PAGES
- /* For the top bits, 16 bytes per PTE */
- rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4
- /* Now create the bottom bits as 0 in position 0x8000 and
- * the rest calculated for 8 bytes per PTE
- */
- rldicl r15,r16,64-(PAGE_SHIFT-3),64-15
- /* Insert the bottom bits in */
- rlwimi r14,r15,0,16,31
-#else
rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4
-#endif
sldi r15,r10,60
clrrdi r14,r14,3
or r10,r15,r14
@@ -725,11 +711,7 @@
/* Check page size, if not standard, update MAS1 */
rldicl r11,r14,64-8,64-8
-#ifdef CONFIG_PPC_64K_PAGES
- cmpldi cr0,r11,BOOK3E_PAGESZ_64K
-#else
cmpldi cr0,r11,BOOK3E_PAGESZ_4K
-#endif
beq- 1f
mfspr r11,SPRN_MAS1
rlwimi r11,r14,31,21,24
@@ -850,14 +832,12 @@
cmpdi cr0,r15,0
bge virt_page_table_tlb_miss_fault
-#ifndef CONFIG_PPC_64K_PAGES
/* Get to PUD entry */
rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
cmpdi cr0,r15,0
bge virt_page_table_tlb_miss_fault
-#endif /* CONFIG_PPC_64K_PAGES */
/* Get to PMD entry */
rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3
@@ -1099,14 +1079,12 @@
cmpdi cr0,r15,0
bge htw_tlb_miss_fault
-#ifndef CONFIG_PPC_64K_PAGES
/* Get to PUD entry */
rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3
clrrdi r10,r11,3
ldx r15,r10,r15
cmpdi cr0,r15,0
bge htw_tlb_miss_fault
-#endif /* CONFIG_PPC_64K_PAGES */
/* Get to PMD entry */
rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3
@@ -1125,9 +1103,7 @@
* 4K page we need to extract a bit from the virtual address and
* insert it into the "PA52" bit of the RPN.
*/
-#ifndef CONFIG_PPC_64K_PAGES
rlwimi r15,r16,32-9,20,20
-#endif
/* Now we build the MAS:
*
* MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG
@@ -1137,11 +1113,7 @@
* MAS 2 : Use defaults
* MAS 3+7 : Needs to be done
*/
-#ifdef CONFIG_PPC_64K_PAGES
- ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT)
-#else
ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
-#endif
BEGIN_MMU_FTR_SECTION
srdi r16,r10,32
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5500e4e..50d68d2 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* pSeries NUMA support
*
* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "numa: " fmt
#include <linux/threads.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
@@ -19,7 +15,6 @@
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
-#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
@@ -33,7 +28,6 @@
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/smp.h>
-#include <asm/cputhreads.h>
#include <asm/topology.h>
#include <asm/firmware.h>
#include <asm/paca.h>
@@ -85,7 +79,7 @@
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+ dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -169,6 +163,22 @@
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
+int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
+{
+ int dist = 0;
+
+ int i, index;
+
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ index = be32_to_cpu(distance_ref_points[i]);
+ if (cpu1_assoc[index] == cpu2_assoc[index])
+ break;
+ dist++;
+ }
+
+ return dist;
+}
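
cpu_distance() above counts how many of the firmware's distance reference points two CPUs fail to share before hitting a common associativity domain. A standalone sketch with made-up associativity arrays (two reference points at indices 1 and 2; the real data comes from the device tree) shows the idea:

#include <stdio.h>

/* Hypothetical firmware data: reference points at associativity indices 1 and 2 */
static int distance_ref_points_depth = 2;
static int distance_ref_points[] = { 1, 2 };

static int cpu_distance(const int *cpu1_assoc, const int *cpu2_assoc)
{
        int dist = 0;
        int i, index;

        for (i = 0; i < distance_ref_points_depth; i++) {
                index = distance_ref_points[i];
                if (cpu1_assoc[index] == cpu2_assoc[index])
                        break;                  /* first shared domain found */
                dist++;
        }
        return dist;
}

int main(void)
{
        int cpu0[] = { 4, 0, 0, 0 };
        int cpu1[] = { 4, 0, 1, 1 };    /* shares the index-1 domain with cpu0 */
        int cpu2[] = { 4, 1, 2, 2 };    /* shares nothing with cpu0 */

        printf("cpu0<->cpu1: %d\n", cpu_distance(cpu0, cpu1));  /* prints 0 */
        printf("cpu0<->cpu2: %d\n", cpu_distance(cpu0, cpu2));  /* prints 2 */
        return 0;
}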
+
/* must hold reference to node during call */
static const __be32 *of_get_associativity(struct device_node *dev)
{
@@ -216,9 +226,9 @@
*/
static int associativity_to_nid(const __be32 *associativity)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
- if (min_common_depth == -1)
+ if (!numa_enabled)
goto out;
if (of_read_number(associativity, 1) >= min_common_depth)
@@ -226,7 +236,7 @@
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = -1;
+ nid = NUMA_NO_NODE;
if (nid > 0 &&
of_read_number(associativity, 1) >= distance_ref_points_depth) {
@@ -245,7 +255,7 @@
*/
static int of_node_to_nid_single(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
const __be32 *tmp;
tmp = of_get_associativity(device);
@@ -257,7 +267,7 @@
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
of_node_get(device);
while (device) {
@@ -422,17 +432,19 @@
static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
struct assoc_arrays aa = { .arrays = NULL };
- int default_nid = 0;
+ int default_nid = NUMA_NO_NODE;
int nid = default_nid;
int rc, index;
+ if ((min_common_depth < 0) || !numa_enabled)
+ return default_nid;
+
rc = of_get_assoc_arrays(&aa);
if (rc)
return default_nid;
- if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
- !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
- lmb->aa_index < aa.n_arrays) {
+ if (min_common_depth <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
nid = of_read_number(&aa.arrays[index], 1);
@@ -455,7 +467,7 @@
*/
static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
struct device_node *cpu;
/*
@@ -632,8 +644,14 @@
min_common_depth = find_min_common_depth();
- if (min_common_depth < 0)
+ if (min_common_depth < 0) {
+ /*
+ * If we fail to parse min_common_depth from the device tree,
+ * mark NUMA disabled and boot with NUMA disabled.
+ */
+ numa_enabled = false;
return min_common_depth;
+ }
dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
@@ -749,7 +767,7 @@
unsigned int node;
unsigned int cpu, count;
- if (min_common_depth == -1 || !numa_enabled)
+ if (!numa_enabled)
return;
for_each_online_node(node) {
@@ -788,7 +806,11 @@
void *nd;
int tnid;
- nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa)
+ panic("Cannot allocate %zu bytes for node %d data\n",
+ nd_size, nid);
+
nd = __va(nd_pa);
/* report and initialize */
@@ -810,7 +832,7 @@
struct device_node *rtas;
u32 numnodes, i;
- if (min_common_depth <= 0)
+ if (!numa_enabled)
return;
rtas = of_find_node_by_path("/rtas");
@@ -905,16 +927,22 @@
}
early_param("numa", early_numa);
-static bool topology_updates_enabled = true;
+/*
+ * The platform can inform us through one of several mechanisms
+ * (post-migration device tree updates, PRRN or VPHN) that the NUMA
+ * assignment of a resource has changed. This controls whether we act
+ * on that. Disabled by default.
+ */
+static bool topology_updates_enabled;
static int __init early_topology_updates(char *p)
{
if (!p)
return 0;
- if (!strcmp(p, "off")) {
- pr_info("Disabling topology updates\n");
- topology_updates_enabled = false;
+ if (!strcmp(p, "on")) {
+ pr_warn("Caution: enabling topology updates\n");
+ topology_updates_enabled = true;
}
return 0;
@@ -931,7 +959,7 @@
{
struct drmem_lmb *lmb;
unsigned long lmb_size;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
lmb_size = drmem_lmb_size();
@@ -961,7 +989,7 @@
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
@@ -1006,7 +1034,7 @@
struct device_node *memory = NULL;
int nid;
- if (!numa_enabled || (min_common_depth < 0))
+ if (!numa_enabled)
return first_online_node;
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
@@ -1059,9 +1087,6 @@
/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
-
-#include "vphn.h"
-
struct topology_update_data {
struct topology_update_data *next;
unsigned int cpu;
@@ -1157,25 +1182,13 @@
* Retrieve the new associativity information for a virtual processor's
* home node.
*/
-static long hcall_vphn(unsigned long cpu, __be32 *associativity)
-{
- long rc;
- long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
- u64 flags = 1;
- int hwcpu = get_hard_smp_processor_id(cpu);
-
- rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
- vphn_unpack_associativity(retbuf, associativity);
-
- return rc;
-}
-
static long vphn_get_associativity(unsigned long cpu,
__be32 *associativity)
{
long rc;
- rc = hcall_vphn(cpu, associativity);
+ rc = hcall_vphn(get_hard_smp_processor_id(cpu),
+ VPHN_FLAG_VCPU, associativity);
switch (rc) {
case H_FUNCTION:
@@ -1461,13 +1474,6 @@
#ifdef CONFIG_SMP
-static void stage_topology_update(int core_id)
-{
- cpumask_or(&cpu_associativity_changes_mask,
- &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
- reset_topology_timer();
-}
-
static int dt_update_callback(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -1476,11 +1482,11 @@
switch (action) {
case OF_RECONFIG_UPDATE_PROPERTY:
- if (!of_prop_cmp(update->dn->type, "cpu") &&
+ if (of_node_is_type(update->dn, "cpu") &&
!of_prop_cmp(update->prop->name, "ibm,associativity")) {
u32 core_id;
of_property_read_u32(update->dn, "reg", &core_id);
- stage_topology_update(core_id);
+ rc = dlpar_cpu_readd(core_id);
rc = NOTIFY_OK;
}
break;
@@ -1502,6 +1508,9 @@
{
int rc = 0;
+ if (!topology_updates_enabled)
+ return 0;
+
if (firmware_has_feature(FW_FEATURE_PRRN)) {
if (!prrn_enabled) {
prrn_enabled = 1;
@@ -1521,6 +1530,10 @@
}
}
+ pr_info("Starting topology update%s%s\n",
+ (prrn_enabled ? " prrn_enabled" : ""),
+ (vphn_enabled ? " vphn_enabled" : ""));
+
return rc;
}
@@ -1531,6 +1544,9 @@
{
int rc = 0;
+ if (!topology_updates_enabled)
+ return 0;
+
if (prrn_enabled) {
prrn_enabled = 0;
#ifdef CONFIG_SMP
@@ -1542,6 +1558,8 @@
rc = del_timer_sync(&topology_timer);
}
+ pr_info("Stopping topology update\n");
+
return rc;
}
@@ -1586,11 +1604,13 @@
kbuf[read_len] = '\0';
- if (!strncmp(kbuf, "on", 2))
+ if (!strncmp(kbuf, "on", 2)) {
+ topology_updates_enabled = true;
start_topology_update();
- else if (!strncmp(kbuf, "off", 3))
+ } else if (!strncmp(kbuf, "off", 3)) {
stop_topology_update();
- else
+ topology_updates_enabled = false;
+ } else
return -EINVAL;
return count;
@@ -1605,9 +1625,7 @@
static int topology_update_init(void)
{
- /* Do not poll for changes if disabled at boot */
- if (topology_updates_enabled)
- start_topology_update();
+ start_topology_update();
if (vphn_enabled)
topology_schedule_update();
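
These hunks consistently replace the open-coded -1 sentinel with NUMA_NO_NODE and key the
fallback paths off numa_enabled alone. Below is a minimal sketch of the caller-side pattern
this supports; the helper name pick_node_or_default() is hypothetical, while NUMA_NO_NODE and
first_online_node are the real symbols used in the hunks above:

/* Hedged sketch: normalising a possibly-invalid node id. */
#include <linux/numa.h>		/* NUMA_NO_NODE */
#include <linux/nodemask.h>	/* first_online_node, node_online() */

static int pick_node_or_default(int nid)
{
	/* An unknown placement is always NUMA_NO_NODE now, never a bare -1. */
	if (nid == NUMA_NO_NODE || !node_online(nid))
		return first_online_node;
	return nid;
}
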
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
new file mode 100644
index 0000000..ee4bd6d
--- /dev/null
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Handling Page Tables through page fragments
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+void pte_frag_destroy(void *pte_frag)
+{
+ int count;
+ struct page *page;
+
+ page = virt_to_page(pte_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
+ /* We allow PTE_FRAG_NR fragments from a PTE page */
+ if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
+ }
+}
+
+static pte_t *get_pte_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ if (PTE_FRAG_NR == 1)
+ return NULL;
+
+ spin_lock(&mm->page_table_lock);
+ ret = pte_frag_get(&mm->context);
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments, mark the PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ pte_frag_set(&mm->context, pte_frag);
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page;
+
+ if (!kernel) {
+ page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
+ if (!page)
+ return NULL;
+ if (!pgtable_pte_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+ } else {
+ page = alloc_page(PGALLOC_GFP);
+ if (!page)
+ return NULL;
+ }
+
+ atomic_set(&page->pt_frag_refcount, 1);
+
+ ret = page_address(page);
+ /*
+ * If we support only one fragment, just return the
+ * allocated page.
+ */
+ if (PTE_FRAG_NR == 1)
+ return ret;
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If the per-mm fragment cursor is already set, we return
+ * the allocated page with a single fragment
+ * count.
+ */
+ if (likely(!pte_frag_get(&mm->context))) {
+ atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR);
+ pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE);
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_pte_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_ptecache(mm, kernel);
+}
+
+void pte_fragment_free(unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+
+ BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
+ if (atomic_dec_and_test(&page->pt_frag_refcount)) {
+ if (!kernel)
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
+ }
+}
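
The fragment scheme above carves one page into PTE_FRAG_NR pieces and tracks outstanding
fragments with pt_frag_refcount; pte_frag_destroy() recovers how many fragments were never
handed out from the cursor's offset inside the page. Below is a minimal userspace sketch of
that arithmetic, assuming 64K pages and 4K fragments (the numeric values are illustrative,
not taken from this patch):

/* Illustrative only: PAGE_SHIFT / PTE_FRAG_SIZE_SHIFT values are assumed. */
#include <stdio.h>

#define PAGE_SHIFT          16                                   /* assumed: 64K pages */
#define PAGE_SIZE           (1UL << PAGE_SHIFT)
#define PAGE_MASK           (~(PAGE_SIZE - 1))
#define PTE_FRAG_SIZE_SHIFT 12                                   /* assumed: 4K fragments */
#define PTE_FRAG_SIZE       (1UL << PTE_FRAG_SIZE_SHIFT)
#define PTE_FRAG_NR         (PAGE_SIZE >> PTE_FRAG_SIZE_SHIFT)   /* 16 fragments */

int main(void)
{
	/* A cursor pointing at the 6th fragment of some page. */
	unsigned long pte_frag = 0x100000UL + 5 * PTE_FRAG_SIZE;

	/* Same arithmetic as pte_frag_destroy(): fragments already handed out. */
	unsigned long count = (pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;

	/* References still pending for the fragments that were never handed out. */
	printf("handed out=%lu, pending refs to drop=%lu\n",
	       count, PTE_FRAG_NR - count);
	return 0;
}
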
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index d71c777..e3759b6 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains common routines for dealing with free of page tables
* Along with common page table handling code
@@ -14,11 +15,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -30,6 +26,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
+#include <asm/hugetlb.h>
static inline int is_exec_fault(void)
{
@@ -44,20 +41,13 @@
static inline int pte_looks_normal(pte_t pte)
{
-#if defined(CONFIG_PPC_BOOK3S_64)
- if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) {
+ if (pte_present(pte) && !pte_special(pte)) {
if (pte_ci(pte))
return 0;
if (pte_user(pte))
return 1;
}
return 0;
-#else
- return (pte_val(pte) &
- (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER |
- _PAGE_PRIVILEGED)) ==
- (_PAGE_PRESENT | _PAGE_USER);
-#endif
}
static struct page *maybe_pte_to_page(pte_t pte)
@@ -73,7 +63,7 @@
return page;
}
-#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0
+#ifdef CONFIG_PPC_BOOK3S
/* Server-style MMU handles coherency when hashing if HW exec permission
* is supposed per page (currently 64-bit only). If not, then, we always
@@ -81,7 +71,7 @@
* support falls into the same category.
*/
-static pte_t set_pte_filter(pte_t pte)
+static pte_t set_pte_filter_hash(pte_t pte)
{
if (radix_enabled())
return pte;
@@ -100,13 +90,11 @@
return pte;
}
-static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
- int dirty)
-{
- return pte;
-}
+#else /* CONFIG_PPC_BOOK3S */
-#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */
+static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
+
+#endif /* CONFIG_PPC_BOOK3S */
/* Embedded type MMU with HW exec support. This is a bit more complicated
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
@@ -116,8 +104,11 @@
{
struct page *pg;
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return set_pte_filter_hash(pte);
+
/* No exec permission in the first place, move on */
- if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte))
+ if (!pte_exec(pte) || !pte_looks_normal(pte))
return pte;
/* If you set _PAGE_EXEC on weird pages you're on your own */
@@ -137,7 +128,7 @@
}
/* Else, we filter out _PAGE_EXEC */
- return __pte(pte_val(pte) & ~_PAGE_EXEC);
+ return pte_exprotect(pte);
}
static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
@@ -145,12 +136,15 @@
{
struct page *pg;
+ if (mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return pte;
+
/* So here, we only care about exec faults, as we use them
* to recover lost _PAGE_EXEC and perform I$/D$ coherency
* if necessary. Also if _PAGE_EXEC is already set, same deal,
* we just bail out
*/
- if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault())
+ if (dirty || pte_exec(pte) || !is_exec_fault())
return pte;
#ifdef CONFIG_DEBUG_VM
@@ -176,11 +170,9 @@
set_bit(PG_arch_1, &pg->flags);
bail:
- return __pte(pte_val(pte) | _PAGE_EXEC);
+ return pte_mkexec(pte);
}
-#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */
-
/*
* set_pte stores a linux PTE into the linux page table.
*/
@@ -188,14 +180,13 @@
pte_t pte)
{
/*
- * When handling numa faults, we already have the pte marked
- * _PAGE_PRESENT, but we can be sure that it is not in hpte.
- * Hence we can use set_pte_at for them.
+ * Make sure the hardware valid bit is not set. We don't do
+ * a TLB flush for this update.
*/
- VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep));
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
/* Add the pte bit when trying to set a pte */
- pte = __pte(pte_val(pte) | _PAGE_PTE);
+ pte = pte_mkpte(pte);
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
@@ -229,9 +220,9 @@
}
#ifdef CONFIG_HUGETLB_PAGE
-extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
{
#ifdef HUGETLB_NEED_PRELOAD
/*
@@ -305,3 +296,128 @@
return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
}
EXPORT_SYMBOL_GPL(vmalloc_to_phys);
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page _PAGE_PTE set
+ * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
+ *
+ * So long as we atomically load page table pointers we are safe against teardown,
+ * and we can follow the address down to the page and take a ref on it.
+ * This function needs to be called with interrupts disabled. We use this variant
+ * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED.
+ */
+pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+ bool *is_thp, unsigned *hpage_shift)
+{
+ pgd_t pgd, *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t *ret_pte;
+ hugepd_t *hpdp = NULL;
+ unsigned pdshift = PGDIR_SHIFT;
+
+ if (hpage_shift)
+ *hpage_shift = 0;
+
+ if (is_thp)
+ *is_thp = false;
+
+ pgdp = pgdir + pgd_index(ea);
+ pgd = READ_ONCE(*pgdp);
+ /*
+ * Always operate on the local stack value. This makes sure the
+ * value doesn't get updated by a parallel THP split/collapse,
+ * page fault or a page unmap. The returned pte_t * is still not
+ * stable, so the caller should re-check it for the above conditions.
+ */
+ if (pgd_none(pgd))
+ return NULL;
+
+ if (pgd_is_leaf(pgd)) {
+ ret_pte = (pte_t *)pgdp;
+ goto out;
+ }
+
+ if (is_hugepd(__hugepd(pgd_val(pgd)))) {
+ hpdp = (hugepd_t *)&pgd;
+ goto out_huge;
+ }
+
+ /*
+ * Even if we end up with an unmap, the pgtable will not
+ * be freed, because we do an RCU free and interrupts are
+ * disabled here.
+ */
+ pdshift = PUD_SHIFT;
+ pudp = pud_offset(&pgd, ea);
+ pud = READ_ONCE(*pudp);
+
+ if (pud_none(pud))
+ return NULL;
+
+ if (pud_is_leaf(pud)) {
+ ret_pte = (pte_t *)pudp;
+ goto out;
+ }
+
+ if (is_hugepd(__hugepd(pud_val(pud)))) {
+ hpdp = (hugepd_t *)&pud;
+ goto out_huge;
+ }
+
+ pdshift = PMD_SHIFT;
+ pmdp = pmd_offset(&pud, ea);
+ pmd = READ_ONCE(*pmdp);
+
+ /*
+ * A hugepage collapse is captured by this condition, see
+ * pmdp_collapse_flush.
+ */
+ if (pmd_none(pmd))
+ return NULL;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ /*
+ * A hugepage split is captured by this condition, see
+ * pmdp_invalidate.
+ *
+ * Huge page modification can be caught here too.
+ */
+ if (pmd_is_serializing(pmd))
+ return NULL;
+#endif
+
+ if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
+ if (is_thp)
+ *is_thp = true;
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+
+ if (pmd_is_leaf(pmd)) {
+ ret_pte = (pte_t *)pmdp;
+ goto out;
+ }
+
+ if (is_hugepd(__hugepd(pmd_val(pmd)))) {
+ hpdp = (hugepd_t *)&pmd;
+ goto out_huge;
+ }
+
+ return pte_offset_kernel(&pmd, ea);
+
+out_huge:
+ if (!hpdp)
+ return NULL;
+
+ ret_pte = hugepte_offset(*hpdp, ea, pdshift);
+ pdshift = hugepd_shift(*hpdp);
+out:
+ if (hpage_shift)
+ *hpage_shift = pdshift;
+ return ret_pte;
+}
+EXPORT_SYMBOL_GPL(__find_linux_pte);
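
Per the comment above, __find_linux_pte() must run with interrupts disabled so that RCU-freed
page tables cannot disappear under the walk. A hedged sketch of the expected calling pattern
follows; the wrapper function is hypothetical, and only __find_linux_pte() and its
out-parameters come from the code above:

/* Hedged usage sketch; lookup_pfn() is a made-up caller for illustration. */
static unsigned long lookup_pfn(struct mm_struct *mm, unsigned long ea)
{
	unsigned long flags, pfn = 0;
	unsigned int shift;
	pte_t *ptep;

	/* Interrupts stay disabled across the walk and the dereference. */
	local_irq_save(flags);
	ptep = __find_linux_pte(mm->pgd, ea, NULL, &shift);
	if (ptep && pte_present(*ptep))
		pfn = pte_pfn(*ptep);	/* shift != 0 would indicate a huge mapping */
	local_irq_restore(flags);

	return pfn;
}
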
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 120a49b..8ec5dfb 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file contains the routines setting up the linux page tables.
* -- paulus
@@ -11,12 +12,6 @@
*
* Derived from "arch/i386/mm/init.c"
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/kernel.h>
@@ -32,184 +27,36 @@
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
-#include <asm/io.h>
#include <asm/setup.h>
#include <asm/sections.h>
-#include "mmu_decl.h"
-
-unsigned long ioremap_bot;
-EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */
+#include <mm/mmu_decl.h>
extern char etext[], _stext[], _sinittext[], _einittext[];
-__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+static void __init *early_alloc_pgtable(unsigned long size)
{
- pte_t *pte;
+ void *ptr = memblock_alloc(size, size);
- if (slab_is_available()) {
- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
- } else {
- pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
- if (pte)
- clear_page(pte);
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, size, size);
+
+ return ptr;
+}
+
+static pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
+{
+ if (pmd_none(*pmdp)) {
+ pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
+
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
}
- return pte;
+ return pte_offset_kernel(pmdp, va);
}
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- struct page *ptepage;
- gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT;
-
- ptepage = alloc_pages(flags, 0);
- if (!ptepage)
- return NULL;
- if (!pgtable_page_ctor(ptepage)) {
- __free_page(ptepage);
- return NULL;
- }
- return ptepage;
-}
-
-void __iomem *
-ioremap(phys_addr_t addr, unsigned long size)
-{
- return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap);
-
-void __iomem *
-ioremap_wc(phys_addr_t addr, unsigned long size)
-{
- return __ioremap_caller(addr, size, _PAGE_NO_CACHE,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_wc);
-
-void __iomem *
-ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags)
-{
- /* writeable implies dirty for kernel addresses */
- if ((flags & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO)
- flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
-
- /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
- flags &= ~(_PAGE_USER | _PAGE_EXEC);
- flags |= _PAGE_PRIVILEGED;
-
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(ioremap_prot);
-
-void __iomem *
-__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
-{
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
-}
-
-void __iomem *
-__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags,
- void *caller)
-{
- unsigned long v, i;
- phys_addr_t p;
- int err;
-
- /* Make sure we have the base flags */
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= pgprot_val(PAGE_KERNEL);
-
- /* Non-cacheable page cannot be coherent */
- if (flags & _PAGE_NO_CACHE)
- flags &= ~_PAGE_COHERENT;
-
- /*
- * Choose an address to map it to.
- * Once the vmalloc system is running, we use it.
- * Before then, we use space going down from IOREMAP_TOP
- * (ioremap_bot records where we're up to).
- */
- p = addr & PAGE_MASK;
- size = PAGE_ALIGN(addr + size) - p;
-
- /*
- * If the address lies within the first 16 MB, assume it's in ISA
- * memory space
- */
- if (p < 16*1024*1024)
- p += _ISA_MEM_BASE;
-
-#ifndef CONFIG_CRASH_DUMP
- /*
- * Don't allow anybody to remap normal RAM that we're using.
- * mem_init() sets high_memory so only do the check after that.
- */
- if (slab_is_available() && (p < virt_to_phys(high_memory)) &&
- page_is_ram(__phys_to_pfn(p))) {
- printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n",
- (unsigned long long)p, __builtin_return_address(0));
- return NULL;
- }
-#endif
-
- if (size == 0)
- return NULL;
-
- /*
- * Is it already mapped? Perhaps overlapped by a previous
- * mapping.
- */
- v = p_block_mapped(p);
- if (v)
- goto out;
-
- if (slab_is_available()) {
- struct vm_struct *area;
- area = get_vm_area_caller(size, VM_IOREMAP, caller);
- if (area == 0)
- return NULL;
- area->phys_addr = p;
- v = (unsigned long) area->addr;
- } else {
- v = (ioremap_bot -= size);
- }
-
- /*
- * Should check if it is a candidate for a BAT mapping
- */
-
- err = 0;
- for (i = 0; i < size && err == 0; i += PAGE_SIZE)
- err = map_kernel_page(v+i, p+i, flags);
- if (err) {
- if (slab_is_available())
- vunmap((void *)v);
- return NULL;
- }
-
-out:
- return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK));
-}
-EXPORT_SYMBOL(__ioremap);
-
-void iounmap(volatile void __iomem *addr)
-{
- /*
- * If mapped by BATs then there is nothing to do.
- * Calling vfree() generates a benign warning.
- */
- if (v_block_mapped((unsigned long)addr))
- return;
-
- if (addr > high_memory && (unsigned long) addr < ioremap_bot)
- vunmap((void *) (PAGE_MASK & (unsigned long)addr));
-}
-EXPORT_SYMBOL(iounmap);
-
-int map_kernel_page(unsigned long va, phys_addr_t pa, int flags)
+int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
{
pmd_t *pd;
pte_t *pg;
@@ -218,16 +65,17 @@
/* Use upper 10 bits of VA to index the first level map */
pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
/* Use middle 10 bits of VA to index the second-level map */
- pg = pte_alloc_kernel(pd, va);
+ if (likely(slab_is_available()))
+ pg = pte_alloc_kernel(pd, va);
+ else
+ pg = early_pte_alloc_kernel(pd, va);
if (pg != 0) {
err = 0;
/* The PTE should never be already set nor present in the
* hash table
*/
- BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) &&
- flags);
- set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
- __pgprot(flags)));
+ BUG_ON((pte_present(*pg) | pte_hashpte(*pg)) && pgprot_val(prot));
+ set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, prot));
}
smp_wmb();
return err;
@@ -238,7 +86,7 @@
*/
static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
{
- unsigned long v, s, f;
+ unsigned long v, s;
phys_addr_t p;
int ktext;
@@ -248,11 +96,10 @@
for (; s < top; s += PAGE_SIZE) {
ktext = ((char *)v >= _stext && (char *)v < etext) ||
((char *)v >= _sinittext && (char *)v < _einittext);
- f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL);
- map_kernel_page(v, p, f);
-#ifdef CONFIG_PPC_STD_MMU_32
+ map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
+#ifdef CONFIG_PPC_BOOK3S_32
if (ktext)
- hash_preload(&init_mm, v, 0, 0x300);
+ hash_preload(&init_mm, v);
#endif
v += PAGE_SIZE;
p += PAGE_SIZE;
@@ -261,26 +108,20 @@
void __init mapin_ram(void)
{
- unsigned long s, top;
+ struct memblock_region *reg;
-#ifndef CONFIG_WII
- top = total_lowmem;
- s = mmu_mapin_ram(top);
- __mapin_ram_chunk(s, top);
-#else
- if (!wii_hole_size) {
- s = mmu_mapin_ram(total_lowmem);
- __mapin_ram_chunk(s, total_lowmem);
- } else {
- top = wii_hole_start;
- s = mmu_mapin_ram(top);
- __mapin_ram_chunk(s, top);
+ for_each_memblock(memory, reg) {
+ phys_addr_t base = reg->base;
+ phys_addr_t top = min(base + reg->size, total_lowmem);
- top = memblock_end_of_DRAM();
- s = wii_mmu_mapin_mem2(top);
- __mapin_ram_chunk(s, top);
+ if (base >= top)
+ continue;
+ base = mmu_mapin_ram(base, top);
+ if (IS_ENABLED(CONFIG_BDI_SWITCH))
+ __mapin_ram_chunk(reg->base, top);
+ else
+ __mapin_ram_chunk(base, top);
}
-#endif
}
/* Scan the real Linux page tables and return a PTE pointer for
@@ -366,7 +207,10 @@
unsigned long numpages = PFN_UP((unsigned long)_einittext) -
PFN_DOWN((unsigned long)_sinittext);
- change_page_attr(page, numpages, PAGE_KERNEL);
+ if (v_block_mapped((unsigned long)_stext + 1))
+ mmu_mark_initmem_nx();
+ else
+ change_page_attr(page, numpages, PAGE_KERNEL);
}
#ifdef CONFIG_STRICT_KERNEL_RWX
@@ -375,6 +219,11 @@
struct page *page;
unsigned long numpages;
+ if (v_block_mapped((unsigned long)_sinittext)) {
+ mmu_mark_rodata_ro();
+ return;
+ }
+
page = virt_to_page(_stext);
numpages = PFN_UP((unsigned long)_etext) -
PFN_DOWN((unsigned long)_stext);
@@ -389,6 +238,9 @@
PFN_DOWN((unsigned long)__start_rodata);
change_page_attr(page, numpages, PAGE_KERNEL_RO);
+
+ // mark_initmem_nx() should have already run by now
+ ptdump_check_wx();
}
#endif
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 53e9eee..e78832d 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This file contains ioremap and related functions for 64-bit machines.
+ * This file contains pgtable related functions for 64-bit machines.
*
* Derived from arch/ppc64/mm/init.c
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
@@ -13,12 +14,6 @@
*
* Dave Engebretsen <engebret@us.ibm.com>
* Rework for PPC64 port.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/signal.h>
@@ -39,7 +34,6 @@
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
-#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -52,7 +46,7 @@
#include <asm/firmware.h>
#include <asm/dma.h>
-#include "mmu_decl.h"
+#include <mm/mmu_decl.h>
#ifdef CONFIG_PPC_BOOK3S_64
@@ -90,213 +84,39 @@
EXPORT_SYMBOL(__pgd_val_bits);
unsigned long __kernel_virt_start;
EXPORT_SYMBOL(__kernel_virt_start);
-unsigned long __kernel_virt_size;
-EXPORT_SYMBOL(__kernel_virt_size);
unsigned long __vmalloc_start;
EXPORT_SYMBOL(__vmalloc_start);
unsigned long __vmalloc_end;
EXPORT_SYMBOL(__vmalloc_end);
unsigned long __kernel_io_start;
EXPORT_SYMBOL(__kernel_io_start);
+unsigned long __kernel_io_end;
struct page *vmemmap;
EXPORT_SYMBOL(vmemmap);
unsigned long __pte_frag_nr;
EXPORT_SYMBOL(__pte_frag_nr);
unsigned long __pte_frag_size_shift;
EXPORT_SYMBOL(__pte_frag_size_shift);
-unsigned long ioremap_bot;
-#else /* !CONFIG_PPC_BOOK3S_64 */
-unsigned long ioremap_bot = IOREMAP_BASE;
#endif
-/**
- * __ioremap_at - Low level function to establish the page tables
- * for an IO mapping
- */
-void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
- unsigned long flags)
-{
- unsigned long i;
-
- /* Make sure we have the base flags */
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= pgprot_val(PAGE_KERNEL);
-
- /* We don't support the 4K PFN hack with ioremap */
- if (flags & H_PAGE_4K_PFN)
- return NULL;
-
- WARN_ON(pa & ~PAGE_MASK);
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- for (i = 0; i < size; i += PAGE_SIZE)
- if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
- return NULL;
-
- return (void __iomem *)ea;
-}
-
-/**
- * __iounmap_from - Low level function to tear down the page tables
- * for an IO mapping. This is used for mappings that
- * are manipulated manually, like partial unmapping of
- * PCI IOs or ISA space.
- */
-void __iounmap_at(void *ea, unsigned long size)
-{
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- unmap_kernel_range((unsigned long)ea, size);
-}
-
-void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
- unsigned long flags, void *caller)
-{
- phys_addr_t paligned;
- void __iomem *ret;
-
- /*
- * Choose an address to map it to.
- * Once the imalloc system is running, we use it.
- * Before that, we map using addresses going
- * up from ioremap_bot. imalloc will use
- * the addresses from ioremap_bot through
- * IMALLOC_END
- *
- */
- paligned = addr & PAGE_MASK;
- size = PAGE_ALIGN(addr + size) - paligned;
-
- if ((size == 0) || (paligned == 0))
- return NULL;
-
- if (slab_is_available()) {
- struct vm_struct *area;
-
- area = __get_vm_area_caller(size, VM_IOREMAP,
- ioremap_bot, IOREMAP_END,
- caller);
- if (area == NULL)
- return NULL;
-
- area->phys_addr = paligned;
- ret = __ioremap_at(paligned, area->addr, size, flags);
- if (!ret)
- vunmap(area->addr);
- } else {
- ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
- if (ret)
- ioremap_bot += size;
- }
-
- if (ret)
- ret += addr & ~PAGE_MASK;
- return ret;
-}
-
-void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
- unsigned long flags)
-{
- return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
-}
-
-void __iomem * ioremap(phys_addr_t addr, unsigned long size)
-{
- unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
- void *caller = __builtin_return_address(0);
-
- if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
-}
-
-void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
-{
- unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
- void *caller = __builtin_return_address(0);
-
- if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
-}
-
-void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
- unsigned long flags)
-{
- void *caller = __builtin_return_address(0);
-
- /* writeable implies dirty for kernel addresses */
- if (flags & _PAGE_WRITE)
- flags |= _PAGE_DIRTY;
-
- /* we don't want to let _PAGE_EXEC leak out */
- flags &= ~_PAGE_EXEC;
- /*
- * Force kernel mapping.
- */
- flags &= ~_PAGE_USER;
- flags |= _PAGE_PRIVILEGED;
-
- if (ppc_md.ioremap)
- return ppc_md.ioremap(addr, size, flags, caller);
- return __ioremap_caller(addr, size, flags, caller);
-}
-
-
-/*
- * Unmap an IO region and remove it from imalloc'd list.
- * Access to IO memory should be serialized by driver.
- */
-void __iounmap(volatile void __iomem *token)
-{
- void *addr;
-
- if (!slab_is_available())
- return;
-
- addr = (void *) ((unsigned long __force)
- PCI_FIX_ADDR(token) & PAGE_MASK);
- if ((unsigned long)addr < ioremap_bot) {
- printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
- " at 0x%p\n", addr);
- return;
- }
- vunmap(addr);
-}
-
-void iounmap(volatile void __iomem *token)
-{
- if (ppc_md.iounmap)
- ppc_md.iounmap(token);
- else
- __iounmap(token);
-}
-
-EXPORT_SYMBOL(ioremap);
-EXPORT_SYMBOL(ioremap_wc);
-EXPORT_SYMBOL(ioremap_prot);
-EXPORT_SYMBOL(__ioremap);
-EXPORT_SYMBOL(__ioremap_at);
-EXPORT_SYMBOL(iounmap);
-EXPORT_SYMBOL(__iounmap);
-EXPORT_SYMBOL(__iounmap_at);
-
#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
struct page *pgd_page(pgd_t pgd)
{
- if (pgd_huge(pgd))
+ if (pgd_is_leaf(pgd)) {
+ VM_WARN_ON(!pgd_huge(pgd));
return pte_page(pgd_pte(pgd));
+ }
return virt_to_page(pgd_page_vaddr(pgd));
}
#endif
struct page *pud_page(pud_t pud)
{
- if (pud_huge(pud))
+ if (pud_is_leaf(pud)) {
+ VM_WARN_ON(!pud_huge(pud));
return pte_page(pud_pte(pud));
+ }
return virt_to_page(pud_page_vaddr(pud));
}
@@ -306,8 +126,10 @@
*/
struct page *pmd_page(pmd_t pmd)
{
- if (pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))
+ if (pmd_is_leaf(pmd)) {
+ VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
return pte_page(pmd_pte(pmd));
+ }
return virt_to_page(pmd_page_vaddr(pmd));
}
@@ -323,6 +145,9 @@
radix__mark_rodata_ro();
else
hash__mark_rodata_ro();
+
+ // mark_initmem_nx() should have already run by now
+ ptdump_check_wx();
}
void mark_initmem_nx(void)
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
deleted file mode 100644
index bea6c54..0000000
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * This file contains the routines for handling the MMU on those
- * PowerPC implementations where the MMU substantially follows the
- * architecture specification. This includes the 6xx, 7xx, 7xxx,
- * and 8260 implementations but excludes the 8xx and 4xx.
- * -- paulus
- *
- * Derived from arch/ppc/mm/init.c:
- * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
- *
- * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
- * and Cort Dougan (PReP) (cort@cs.nmt.edu)
- * Copyright (C) 1996 Paul Mackerras
- *
- * Derived from "arch/i386/mm/init.c"
- * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/highmem.h>
-#include <linux/memblock.h>
-
-#include <asm/prom.h>
-#include <asm/mmu.h>
-#include <asm/machdep.h>
-
-#include "mmu_decl.h"
-
-struct hash_pte *Hash, *Hash_end;
-unsigned long Hash_size, Hash_mask;
-unsigned long _SDR1;
-
-struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */
-
-struct batrange { /* stores address ranges mapped by BATs */
- unsigned long start;
- unsigned long limit;
- phys_addr_t phys;
-} bat_addrs[8];
-
-/*
- * Return PA for this VA if it is mapped by a BAT, or 0
- */
-phys_addr_t v_block_mapped(unsigned long va)
-{
- int b;
- for (b = 0; b < 4; ++b)
- if (va >= bat_addrs[b].start && va < bat_addrs[b].limit)
- return bat_addrs[b].phys + (va - bat_addrs[b].start);
- return 0;
-}
-
-/*
- * Return VA for a given PA or 0 if not mapped
- */
-unsigned long p_block_mapped(phys_addr_t pa)
-{
- int b;
- for (b = 0; b < 4; ++b)
- if (pa >= bat_addrs[b].phys
- && pa < (bat_addrs[b].limit-bat_addrs[b].start)
- +bat_addrs[b].phys)
- return bat_addrs[b].start+(pa-bat_addrs[b].phys);
- return 0;
-}
-
-unsigned long __init mmu_mapin_ram(unsigned long top)
-{
- unsigned long tot, bl, done;
- unsigned long max_size = (256<<20);
-
- if (__map_without_bats) {
- printk(KERN_DEBUG "RAM mapped without BATs\n");
- return 0;
- }
-
- /* Set up BAT2 and if necessary BAT3 to cover RAM. */
-
- /* Make sure we don't map a block larger than the
- smallest alignment of the physical address. */
- tot = top;
- for (bl = 128<<10; bl < max_size; bl <<= 1) {
- if (bl * 2 > tot)
- break;
- }
-
- setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X);
- done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1;
- if ((done < tot) && !bat_addrs[3].limit) {
- /* use BAT3 to cover a bit more */
- tot -= done;
- for (bl = 128<<10; bl < max_size; bl <<= 1)
- if (bl * 2 > tot)
- break;
- setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X);
- done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1;
- }
-
- return done;
-}
-
-/*
- * Set up one of the I/D BAT (block address translation) register pairs.
- * The parameters are not checked; in particular size must be a power
- * of 2 between 128k and 256M.
- */
-void __init setbat(int index, unsigned long virt, phys_addr_t phys,
- unsigned int size, pgprot_t prot)
-{
- unsigned int bl;
- int wimgxpp;
- struct ppc_bat *bat = BATS[index];
- unsigned long flags = pgprot_val(prot);
-
- if ((flags & _PAGE_NO_CACHE) ||
- (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
- flags &= ~_PAGE_COHERENT;
-
- bl = (size >> 17) - 1;
- if (PVR_VER(mfspr(SPRN_PVR)) != 1) {
- /* 603, 604, etc. */
- /* Do DBAT first */
- wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
- | _PAGE_COHERENT | _PAGE_GUARDED);
- wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
- bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
- bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
- if (flags & _PAGE_USER)
- bat[1].batu |= 1; /* Vp = 1 */
- if (flags & _PAGE_GUARDED) {
- /* G bit must be zero in IBATs */
- bat[0].batu = bat[0].batl = 0;
- } else {
- /* make IBAT same as DBAT */
- bat[0] = bat[1];
- }
- } else {
- /* 601 cpu */
- if (bl > BL_8M)
- bl = BL_8M;
- wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
- | _PAGE_COHERENT);
- wimgxpp |= (flags & _PAGE_RW)?
- ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
- bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
- bat->batl = phys | bl | 0x40; /* V=1 */
- }
-
- bat_addrs[index].start = virt;
- bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
- bat_addrs[index].phys = phys;
-}
-
-/*
- * Preload a translation in the hash table
- */
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- unsigned long access, unsigned long trap)
-{
- pmd_t *pmd;
-
- if (!Hash)
- return;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
- if (!pmd_none(*pmd))
- add_hash_page(mm->context.id, ea, pmd_val(*pmd));
-}
-
-/*
- * Initialize the hash table and patch the instructions in hashtable.S.
- */
-void __init MMU_init_hw(void)
-{
- unsigned int hmask, mb, mb2;
- unsigned int n_hpteg, lg_n_hpteg;
-
- extern unsigned int hash_page_patch_A[];
- extern unsigned int hash_page_patch_B[], hash_page_patch_C[];
- extern unsigned int hash_page[];
- extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[];
-
- if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
- /*
- * Put a blr (procedure return) instruction at the
- * start of hash_page, since we can still get DSI
- * exceptions on a 603.
- */
- hash_page[0] = 0x4e800020;
- flush_icache_range((unsigned long) &hash_page[0],
- (unsigned long) &hash_page[1]);
- return;
- }
-
- if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105);
-
-#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */
-#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10)
-#define MIN_N_HPTEG 1024 /* min 64kB hash table */
-
- /*
- * Allow 1 HPTE (1/8 HPTEG) for each page of memory.
- * This is less than the recommended amount, but then
- * Linux ain't AIX.
- */
- n_hpteg = total_memory / (PAGE_SIZE * 8);
- if (n_hpteg < MIN_N_HPTEG)
- n_hpteg = MIN_N_HPTEG;
- lg_n_hpteg = __ilog2(n_hpteg);
- if (n_hpteg & (n_hpteg - 1)) {
- ++lg_n_hpteg; /* round up if not power of 2 */
- n_hpteg = 1 << lg_n_hpteg;
- }
- Hash_size = n_hpteg << LG_HPTEG_SIZE;
-
- /*
- * Find some memory for the hash table.
- */
- if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322);
- Hash = __va(memblock_alloc(Hash_size, Hash_size));
- memset(Hash, 0, Hash_size);
- _SDR1 = __pa(Hash) | SDR1_LOW_BITS;
-
- Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
-
- printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
- (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
-
-
- /*
- * Patch up the instructions in hashtable.S:create_hpte
- */
- if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345);
- Hash_mask = n_hpteg - 1;
- hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
- mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
- if (lg_n_hpteg > 16)
- mb2 = 16 - LG_HPTEG_SIZE;
-
- hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff)
- | ((unsigned int)(Hash) >> 16);
- hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6);
- hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6);
- hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask;
- hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask;
-
- /*
- * Ensure that the locations we've patched have been written
- * out from the data cache and invalidated in the instruction
- * cache, on those machines with split caches.
- */
- flush_icache_range((unsigned long) &hash_page_patch_A[0],
- (unsigned long) &hash_page_patch_C[1]);
-
- /*
- * Patch up the instructions in hashtable.S:flush_hash_page
- */
- flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff)
- | ((unsigned int)(Hash) >> 16);
- flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6);
- flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6);
- flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask;
- flush_icache_range((unsigned long) &flush_hash_patch_A[0],
- (unsigned long) &flush_hash_patch_B[1]);
-
- if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205);
-}
-
-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
- phys_addr_t first_memblock_size)
-{
- /* We don't currently support the first MEMBLOCK not mapping 0
- * physical on those processors
- */
- BUG_ON(first_memblock_base != 0);
-
- /* 601 can only access 16MB at the moment */
- if (PVR_VER(mfspr(SPRN_PVR)) == 1)
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
- else /* Anything else has 256M mapped */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
-}
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
new file mode 100644
index 0000000..9e2d8e8
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <asm/pgtable.h>
+
+#include "ptdump.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_SH,
+ .val = 0,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = 0,
+ .set = "rw",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = _PAGE_RO,
+ .set = "r ",
+ }, {
+ .mask = _PAGE_RO | _PAGE_NA,
+ .val = _PAGE_NA,
+ .set = " ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "present",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "guarded",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "no cache",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct pgtable_level pg_level[5] = {
+ {
+ }, { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
new file mode 100644
index 0000000..712762b
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += ptdump.o
+
+obj-$(CONFIG_4xx) += shared.o
+obj-$(CONFIG_PPC_8xx) += 8xx.o
+obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o
+obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o
+obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
new file mode 100644
index 0000000..4154fea
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018, Christophe Leroy CS S.I.
+ * <christophe.leroy@c-s.fr>
+ *
+ * This dumps the contents of the BATs
+ */
+
+#include <asm/debugfs.h>
+#include <asm/pgtable.h>
+#include <asm/cpu_has_feature.h>
+
+static char *pp_601(int k, int pp)
+{
+ if (pp == 0)
+ return k ? "NA" : "RWX";
+ if (pp == 1)
+ return k ? "ROX" : "RWX";
+ if (pp == 2)
+ return k ? "RWX" : "RWX";
+ return k ? "ROX" : "ROX";
+}
+
+static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper)
+{
+ u32 blpi = upper & 0xfffe0000;
+ u32 k = (upper >> 2) & 3;
+ u32 pp = upper & 3;
+ phys_addr_t pbn = PHYS_BAT_ADDR(lower);
+ u32 bsm = lower & 0x3ff;
+ u32 size = (bsm + 1) << 17;
+
+ seq_printf(m, "%d: ", idx);
+ if (!(lower & 0x40)) {
+ seq_puts(m, " -\n");
+ return;
+ }
+
+ seq_printf(m, "0x%08x-0x%08x ", blpi, blpi + size - 1);
+#ifdef CONFIG_PHYS_64BIT
+ seq_printf(m, "0x%016llx ", pbn);
+#else
+ seq_printf(m, "0x%08x ", pbn);
+#endif
+
+ seq_printf(m, "Kernel %s User %s", pp_601(k & 2, pp), pp_601(k & 1, pp));
+
+ if (lower & _PAGE_WRITETHRU)
+ seq_puts(m, "write through ");
+ if (lower & _PAGE_NO_CACHE)
+ seq_puts(m, "no cache ");
+ if (lower & _PAGE_COHERENT)
+ seq_puts(m, "coherent ");
+ seq_puts(m, "\n");
+}
+
+#define BAT_SHOW_601(_m, _n, _l, _u) bat_show_601(_m, _n, mfspr(_l), mfspr(_u))
+
+static int bats_show_601(struct seq_file *m, void *v)
+{
+ seq_puts(m, "---[ Block Address Translation ]---\n");
+
+ BAT_SHOW_601(m, 0, SPRN_IBAT0L, SPRN_IBAT0U);
+ BAT_SHOW_601(m, 1, SPRN_IBAT1L, SPRN_IBAT1U);
+ BAT_SHOW_601(m, 2, SPRN_IBAT2L, SPRN_IBAT2U);
+ BAT_SHOW_601(m, 3, SPRN_IBAT3L, SPRN_IBAT3U);
+
+ return 0;
+}
+
+static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool is_d)
+{
+ u32 bepi = upper & 0xfffe0000;
+ u32 bl = (upper >> 2) & 0x7ff;
+ u32 k = upper & 3;
+ phys_addr_t brpn = PHYS_BAT_ADDR(lower);
+ u32 size = (bl + 1) << 17;
+
+ seq_printf(m, "%d: ", idx);
+ if (k == 0) {
+ seq_puts(m, " -\n");
+ return;
+ }
+
+ seq_printf(m, "0x%08x-0x%08x ", bepi, bepi + size - 1);
+#ifdef CONFIG_PHYS_64BIT
+ seq_printf(m, "0x%016llx ", brpn);
+#else
+ seq_printf(m, "0x%08x ", brpn);
+#endif
+
+ if (k == 1)
+ seq_puts(m, "User ");
+ else if (k == 2)
+ seq_puts(m, "Kernel ");
+ else
+ seq_puts(m, "Kernel/User ");
+
+ if (lower & BPP_RX)
+ seq_puts(m, is_d ? "RO " : "EXEC ");
+ else if (lower & BPP_RW)
+ seq_puts(m, is_d ? "RW " : "EXEC ");
+ else
+ seq_puts(m, is_d ? "NA " : "NX ");
+
+ if (lower & _PAGE_WRITETHRU)
+ seq_puts(m, "write through ");
+ if (lower & _PAGE_NO_CACHE)
+ seq_puts(m, "no cache ");
+ if (lower & _PAGE_COHERENT)
+ seq_puts(m, "coherent ");
+ if (lower & _PAGE_GUARDED)
+ seq_puts(m, "guarded ");
+ seq_puts(m, "\n");
+}
+
+#define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d)
+
+static int bats_show_603(struct seq_file *m, void *v)
+{
+ seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
+
+ BAT_SHOW_603(m, 0, SPRN_IBAT0L, SPRN_IBAT0U, false);
+ BAT_SHOW_603(m, 1, SPRN_IBAT1L, SPRN_IBAT1U, false);
+ BAT_SHOW_603(m, 2, SPRN_IBAT2L, SPRN_IBAT2U, false);
+ BAT_SHOW_603(m, 3, SPRN_IBAT3L, SPRN_IBAT3U, false);
+ if (mmu_has_feature(MMU_FTR_USE_HIGH_BATS)) {
+ BAT_SHOW_603(m, 4, SPRN_IBAT4L, SPRN_IBAT4U, false);
+ BAT_SHOW_603(m, 5, SPRN_IBAT5L, SPRN_IBAT5U, false);
+ BAT_SHOW_603(m, 6, SPRN_IBAT6L, SPRN_IBAT6U, false);
+ BAT_SHOW_603(m, 7, SPRN_IBAT7L, SPRN_IBAT7U, false);
+ }
+
+ seq_puts(m, "\n---[ Data Block Address Translation ]---\n");
+
+ BAT_SHOW_603(m, 0, SPRN_DBAT0L, SPRN_DBAT0U, true);
+ BAT_SHOW_603(m, 1, SPRN_DBAT1L, SPRN_DBAT1U, true);
+ BAT_SHOW_603(m, 2, SPRN_DBAT2L, SPRN_DBAT2U, true);
+ BAT_SHOW_603(m, 3, SPRN_DBAT3L, SPRN_DBAT3U, true);
+ if (mmu_has_feature(MMU_FTR_USE_HIGH_BATS)) {
+ BAT_SHOW_603(m, 4, SPRN_DBAT4L, SPRN_DBAT4U, true);
+ BAT_SHOW_603(m, 5, SPRN_DBAT5L, SPRN_DBAT5U, true);
+ BAT_SHOW_603(m, 6, SPRN_DBAT6L, SPRN_DBAT6U, true);
+ BAT_SHOW_603(m, 7, SPRN_DBAT7L, SPRN_DBAT7U, true);
+ }
+
+ return 0;
+}
+
+static int bats_open(struct inode *inode, struct file *file)
+{
+ if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
+ return single_open(file, bats_show_601, NULL);
+
+ return single_open(file, bats_show_603, NULL);
+}
+
+static const struct file_operations bats_fops = {
+ .open = bats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init bats_init(void)
+{
+ struct dentry *debugfs_file;
+
+ debugfs_file = debugfs_create_file("block_address_translation", 0400,
+ powerpc_debugfs_root, NULL, &bats_fops);
+ return debugfs_file ? 0 : -ENOMEM;
+}
+device_initcall(bats_init);
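
The BAT decoders above derive each block's size from the BL field as (bl + 1) << 17 bytes.
A quick standalone check of the two extremes of the 603-style decode, for illustration only:

/* Worked example of the BAT size decode used in bat_show_603(). */
#include <stdio.h>

int main(void)
{
	unsigned int bl_min = 0x000;	/* smallest block */
	unsigned int bl_max = 0x7ff;	/* largest block  */

	printf("BL=0x%03x -> %u KB\n", bl_min, ((bl_min + 1) << 17) >> 10);
	printf("BL=0x%03x -> %u MB\n", bl_max, ((bl_max + 1) << 17) >> 20);
	return 0;	/* prints 128 KB and 256 MB */
}
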
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
new file mode 100644
index 0000000..0dfca72
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <asm/pgtable.h>
+
+#include "ptdump.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_PRIVILEGED,
+ .val = 0,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_READ,
+ .val = _PAGE_READ,
+ .set = "r",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_WRITE,
+ .val = _PAGE_WRITE,
+ .set = "w",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PTE,
+ .val = _PAGE_PTE,
+ .set = "pte",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "valid",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT | _PAGE_INVALID,
+ .val = 0,
+ .set = " ",
+ .clear = "present",
+ }, {
+ .mask = H_PAGE_HASHPTE,
+ .val = H_PAGE_HASHPTE,
+ .set = "hpte",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NON_IDEMPOTENT,
+ .val = _PAGE_NON_IDEMPOTENT,
+ .set = "non-idempotent",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_TOLERANT,
+ .val = _PAGE_TOLERANT,
+ .set = "tolerant",
+ .clear = " ",
+ }, {
+ .mask = H_PAGE_BUSY,
+ .val = H_PAGE_BUSY,
+ .set = "busy",
+ }, {
+#ifdef CONFIG_PPC_64K_PAGES
+ .mask = H_PAGE_COMBO,
+ .val = H_PAGE_COMBO,
+ .set = "combo",
+ }, {
+ .mask = H_PAGE_4K_PFN,
+ .val = H_PAGE_4K_PFN,
+ .set = "4K_pfn",
+ }, {
+#else /* CONFIG_PPC_64K_PAGES */
+ .mask = H_PAGE_F_GIX,
+ .val = H_PAGE_F_GIX,
+ .set = "f_gix",
+ .is_val = true,
+ .shift = H_PAGE_F_GIX_SHIFT,
+ }, {
+ .mask = H_PAGE_F_SECOND,
+ .val = H_PAGE_F_SECOND,
+ .set = "f_second",
+ }, {
+#endif /* CONFIG_PPC_64K_PAGES */
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct pgtable_level pg_level[5] = {
+ {
+ }, { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
similarity index 94%
rename from arch/powerpc/mm/dump_hashpagetable.c
rename to arch/powerpc/mm/ptdump/hashpagetable.c
index 8692946..a072780 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2016, Rashmica Gupta, IBM Corp.
*
@@ -7,11 +8,6 @@
*
* If radix is enabled then there is no hash page table and so no debugfs file
* is generated.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/debugfs.h>
#include <linux/fs.h>
@@ -241,7 +237,6 @@
return -1;
}
-#ifdef CONFIG_PPC_PSERIES
static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r)
{
struct hash_pte ptes[4];
@@ -278,7 +273,6 @@
}
return -1;
}
-#endif
static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps,
unsigned long *lp_bits)
@@ -320,10 +314,9 @@
static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v,
u64 *r)
{
-#ifdef CONFIG_PPC_PSERIES
- if (firmware_has_feature(FW_FEATURE_LPAR))
+ if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR))
return pseries_find(ea, psize, primary, v, r);
-#endif
+
return native_find(ea, psize, primary, v, r);
}
@@ -342,7 +335,7 @@
/* Look in secondary table */
if (slot == -1)
- slot = base_hpte_find(ea, psize, true, &v, &r);
+ slot = base_hpte_find(ea, psize, false, &v, &r);
/* No entry found */
if (slot == -1)
@@ -390,12 +383,13 @@
psize = mmu_vmalloc_psize;
else
psize = mmu_io_psize;
-#ifdef CONFIG_PPC_64K_PAGES
+
/* check for secret 4K mappings */
- if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) ||
- ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) &&
+ ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO ||
+ (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN))
psize = mmu_io_psize;
-#endif
+
/* check for hashpte */
status = hpte_find(st, addr, psize);
@@ -473,9 +467,10 @@
static void walk_vmemmap(struct pg_state *st)
{
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
struct vmemmap_backing *ptr = vmemmap_list;
+ if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ return;
/*
* Traverse the vmemmaped memory and dump pages that are in the hash
* pagetable.
@@ -485,7 +480,6 @@
ptr = ptr->list;
}
seq_puts(st->seq, "---[ vmemmap end ]---\n");
-#endif
}
static void populate_markers(void)
@@ -499,11 +493,7 @@
address_markers[6].start_address = PHB_IO_END;
address_markers[7].start_address = IOREMAP_BASE;
address_markers[8].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_BOOK3S_64
- address_markers[9].start_address = H_VMEMMAP_BASE;
-#else
- address_markers[9].start_address = VMEMMAP_BASE;
-#endif
+ address_markers[9].start_address = H_VMEMMAP_START;
}
static int ptdump_show(struct seq_file *m, void *v)
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/ptdump/ptdump.c
similarity index 64%
rename from arch/powerpc/mm/dump_linuxpagetables.c
rename to arch/powerpc/mm/ptdump/ptdump.c
index bdf33b9..2f9ddc2 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2016, Rashmica Gupta, IBM Corp.
*
@@ -8,17 +9,13 @@
* Derived from the arm64 implementation:
* Copyright (c) 2014, The Linux Foundation, Laura Abbott.
* (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/hugetlb.h>
#include <linux/io.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
@@ -27,9 +24,7 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
-#ifdef CONFIG_PPC32
-#define KERN_VIRT_START 0
-#endif
+#include "ptdump.h"
/*
* To visualise what is happening,
@@ -65,6 +60,8 @@
unsigned long last_pa;
unsigned int level;
u64 current_flags;
+ bool check_wx;
+ unsigned long wx_pages;
};
struct addr_marker {
@@ -87,10 +84,6 @@
#else
{ 0, "Early I/O remap start" },
{ 0, "Early I/O remap end" },
-#ifdef CONFIG_NOT_COHERENT_CACHE
- { 0, "Consistent mem start" },
- { 0, "Consistent mem end" },
-#endif
#ifdef CONFIG_HIGHMEM
{ 0, "Highmem PTEs start" },
{ 0, "Highmem PTEs end" },
@@ -98,161 +91,24 @@
{ 0, "Fixmap start" },
{ 0, "Fixmap end" },
#endif
+#ifdef CONFIG_KASAN
+ { 0, "kasan shadow mem start" },
+ { 0, "kasan shadow mem end" },
+#endif
{ -1, NULL },
};
-struct flag_info {
- u64 mask;
- u64 val;
- const char *set;
- const char *clear;
- bool is_val;
- int shift;
-};
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
-static const struct flag_info flag_array[] = {
- {
- .mask = _PAGE_USER | _PAGE_PRIVILEGED,
- .val = _PAGE_USER,
- .set = "user",
- .clear = " ",
- }, {
- .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
- .val = _PAGE_RW,
- .set = "rw",
- }, {
- .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
- .val = _PAGE_RO,
- .set = "ro",
- }, {
-#if _PAGE_NA != 0
- .mask = _PAGE_RW | _PAGE_RO | _PAGE_NA,
- .val = _PAGE_RO,
- .set = "na",
- }, {
-#endif
- .mask = _PAGE_EXEC,
- .val = _PAGE_EXEC,
- .set = " X ",
- .clear = " ",
- }, {
- .mask = _PAGE_PTE,
- .val = _PAGE_PTE,
- .set = "pte",
- .clear = " ",
- }, {
- .mask = _PAGE_PRESENT,
- .val = _PAGE_PRESENT,
- .set = "present",
- .clear = " ",
- }, {
-#ifdef CONFIG_PPC_BOOK3S_64
- .mask = H_PAGE_HASHPTE,
- .val = H_PAGE_HASHPTE,
-#else
- .mask = _PAGE_HASHPTE,
- .val = _PAGE_HASHPTE,
-#endif
- .set = "hpte",
- .clear = " ",
- }, {
-#ifndef CONFIG_PPC_BOOK3S_64
- .mask = _PAGE_GUARDED,
- .val = _PAGE_GUARDED,
- .set = "guarded",
- .clear = " ",
- }, {
-#endif
- .mask = _PAGE_DIRTY,
- .val = _PAGE_DIRTY,
- .set = "dirty",
- .clear = " ",
- }, {
- .mask = _PAGE_ACCESSED,
- .val = _PAGE_ACCESSED,
- .set = "accessed",
- .clear = " ",
- }, {
-#ifndef CONFIG_PPC_BOOK3S_64
- .mask = _PAGE_WRITETHRU,
- .val = _PAGE_WRITETHRU,
- .set = "write through",
- .clear = " ",
- }, {
-#endif
-#ifndef CONFIG_PPC_BOOK3S_64
- .mask = _PAGE_NO_CACHE,
- .val = _PAGE_NO_CACHE,
- .set = "no cache",
- .clear = " ",
- }, {
-#else
- .mask = _PAGE_NON_IDEMPOTENT,
- .val = _PAGE_NON_IDEMPOTENT,
- .set = "non-idempotent",
- .clear = " ",
- }, {
- .mask = _PAGE_TOLERANT,
- .val = _PAGE_TOLERANT,
- .set = "tolerant",
- .clear = " ",
- }, {
-#endif
-#ifdef CONFIG_PPC_BOOK3S_64
- .mask = H_PAGE_BUSY,
- .val = H_PAGE_BUSY,
- .set = "busy",
- }, {
-#ifdef CONFIG_PPC_64K_PAGES
- .mask = H_PAGE_COMBO,
- .val = H_PAGE_COMBO,
- .set = "combo",
- }, {
- .mask = H_PAGE_4K_PFN,
- .val = H_PAGE_4K_PFN,
- .set = "4K_pfn",
- }, {
-#else /* CONFIG_PPC_64K_PAGES */
- .mask = H_PAGE_F_GIX,
- .val = H_PAGE_F_GIX,
- .set = "f_gix",
- .is_val = true,
- .shift = H_PAGE_F_GIX_SHIFT,
- }, {
- .mask = H_PAGE_F_SECOND,
- .val = H_PAGE_F_SECOND,
- .set = "f_second",
- }, {
-#endif /* CONFIG_PPC_64K_PAGES */
-#endif
- .mask = _PAGE_SPECIAL,
- .val = _PAGE_SPECIAL,
- .set = "special",
- }
-};
-
-struct pgtable_level {
- const struct flag_info *flag;
- size_t num;
- u64 mask;
-};
-
-static struct pgtable_level pg_level[] = {
- {
- }, { /* pgd */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- }, { /* pud */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- }, { /* pmd */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- }, { /* pte */
- .flag = flag_array,
- .num = ARRAY_SIZE(flag_array),
- },
-};
+#define pt_dump_seq_putc(m, c) \
+({ \
+ if (m) \
+ seq_putc(m, c); \
+})
static void dump_flag_info(struct pg_state *st, const struct flag_info
*flag, u64 pte, int num)
@@ -271,19 +127,19 @@
val = pte & flag->val;
if (flag->shift)
val = val >> flag->shift;
- seq_printf(st->seq, " %s:%llx", flag->set, val);
+ pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val);
} else {
if ((pte & flag->mask) == flag->val)
s = flag->set;
else
s = flag->clear;
if (s)
- seq_printf(st->seq, " %s", s);
+ pt_dump_seq_printf(st->seq, " %s", s);
}
st->current_flags &= ~flag->mask;
}
if (st->current_flags != 0)
- seq_printf(st->seq, " unknown flags:%llx", st->current_flags);
+ pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags);
}
static void dump_addr(struct pg_state *st, unsigned long addr)
@@ -293,23 +149,42 @@
unsigned long delta;
#ifdef CONFIG_PPC64
- seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
- seq_printf(st->seq, "0x%016lx ", st->start_pa);
+#define REG "0x%016lx"
#else
- seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
- seq_printf(st->seq, "0x%08lx ", st->start_pa);
+#define REG "0x%08lx"
#endif
- delta = (addr - st->start_address) >> 10;
+ pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1);
+ if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) {
+ pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa);
+ delta = PAGE_SIZE >> 10;
+ } else {
+ pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
+ delta = (addr - st->start_address) >> 10;
+ }
/* Work out what appropriate unit to use */
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(st->seq, "%9lu%c", delta, *unit);
+ pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit);
}
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+ if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx)
+ return;
+
+ if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X)))
+ return;
+
+ WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n",
+ (void *)st->start_address, (void *)st->start_address);
+
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+}
+
static void note_page(struct pg_state *st, unsigned long addr,
unsigned int level, u64 val)
{
@@ -323,7 +198,7 @@
st->start_address = addr;
st->start_pa = pa;
st->last_pa = pa;
- seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
/*
* Dump the section of virtual memory when:
* - the PTE flags from one entry to the next differs.
@@ -334,10 +209,12 @@
*/
} else if (flag != st->current_flags || level != st->level ||
addr >= st->marker[1].start_address ||
- pa != st->last_pa + PAGE_SIZE) {
+ (pa != st->last_pa + PAGE_SIZE &&
+ (pa != st->start_pa || st->start_pa != st->last_pa))) {
/* Check the PTE flags */
if (st->current_flags) {
+ note_prot_wx(st, addr);
dump_addr(st, addr);
/* Dump all the flags */
@@ -346,7 +223,7 @@
st->current_flags,
pg_level[st->level].num);
- seq_putc(st->seq, '\n');
+ pt_dump_seq_putc(st->seq, '\n');
}
/*
@@ -355,7 +232,7 @@
*/
while (addr >= st->marker[1].start_address) {
st->marker++;
- seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
}
st->start_address = addr;
st->start_pa = pa;
@@ -388,7 +265,7 @@
for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
addr = start + i * PMD_SIZE;
- if (!pmd_none(*pmd) && !pmd_huge(*pmd))
+ if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd))
/* pmd exists */
walk_pte(st, pmd, addr);
else
@@ -404,7 +281,7 @@
for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
addr = start + i * PUD_SIZE;
- if (!pud_none(*pud) && !pud_huge(*pud))
+ if (!pud_none(*pud) && !pud_is_leaf(*pud))
/* pud exists */
walk_pmd(st, pud, addr);
else
@@ -414,18 +291,16 @@
static void walk_pagetables(struct pg_state *st)
{
- pgd_t *pgd = pgd_offset_k(0UL);
unsigned int i;
- unsigned long addr;
-
- addr = st->start_address;
+ unsigned long addr = st->start_address & PGDIR_MASK;
+ pgd_t *pgd = pgd_offset_k(addr);
/*
* Traverse the linux pagetable structure and dump pages that are in
* the hash pagetable.
*/
- for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
- if (!pgd_none(*pgd) && !pgd_huge(*pgd))
+ for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
+ if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
/* pgd exists */
walk_pud(st, pgd, addr);
else
@@ -447,25 +322,25 @@
address_markers[i++].start_address = PHB_IO_END;
address_markers[i++].start_address = IOREMAP_BASE;
address_markers[i++].start_address = IOREMAP_END;
+ /* What is the ifdef about? */
#ifdef CONFIG_PPC_BOOK3S_64
- address_markers[i++].start_address = H_VMEMMAP_BASE;
+ address_markers[i++].start_address = H_VMEMMAP_START;
#else
address_markers[i++].start_address = VMEMMAP_BASE;
#endif
#else /* !CONFIG_PPC64 */
address_markers[i++].start_address = ioremap_bot;
address_markers[i++].start_address = IOREMAP_TOP;
-#ifdef CONFIG_NOT_COHERENT_CACHE
- address_markers[i++].start_address = IOREMAP_TOP;
- address_markers[i++].start_address = IOREMAP_TOP +
- CONFIG_CONSISTENT_SIZE;
-#endif
#ifdef CONFIG_HIGHMEM
address_markers[i++].start_address = PKMAP_BASE;
address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
#endif
address_markers[i++].start_address = FIXADDR_START;
address_markers[i++].start_address = FIXADDR_TOP;
+#ifdef CONFIG_KASAN
+ address_markers[i++].start_address = KASAN_SHADOW_START;
+ address_markers[i++].start_address = KASAN_SHADOW_END;
+#endif
#endif /* CONFIG_PPC64 */
}
@@ -474,12 +349,13 @@
struct pg_state st = {
.seq = m,
.marker = address_markers,
+ .start_address = PAGE_OFFSET,
};
- if (radix_enabled())
- st.start_address = PAGE_OFFSET;
- else
+#ifdef CONFIG_PPC64
+ if (!radix_enabled())
st.start_address = KERN_VIRT_START;
+#endif
/* Traverse kernel page tables */
walk_pagetables(&st);
@@ -510,6 +386,31 @@
pg_level[i].mask |= pg_level[i].flag[j].mask;
}
+#ifdef CONFIG_PPC_DEBUG_WX
+void ptdump_check_wx(void)
+{
+ struct pg_state st = {
+ .seq = NULL,
+ .marker = address_markers,
+ .check_wx = true,
+ .start_address = PAGE_OFFSET,
+ };
+
+#ifdef CONFIG_PPC64
+ if (!radix_enabled())
+ st.start_address = KERN_VIRT_START;
+#endif
+
+ walk_pagetables(&st);
+
+ if (st.wx_pages)
+ pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n",
+ st.wx_pages);
+ else
+ pr_info("Checked W+X mappings: passed, no W+X pages found\n");
+}
+#endif
+
static int ptdump_init(void)
{
struct dentry *debugfs_file;
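Note on the ptdump_check_wx() addition above: it reuses the same page-table walker with a NULL seq_file, so the pt_dump_seq_* wrappers suppress per-range output and only the pr_warn()/pr_info() summary is emitted. As a rough userspace cross-check (illustrative only, assuming debugfs is mounted at /sys/kernel/debug and the dump file keeps the usual kernel_page_tables name), one can scan the dump for ranges carrying both the "rw" and " X " flag strings defined by the flag tables in this series:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        FILE *f = fopen("/sys/kernel/debug/kernel_page_tables", "r");
        char line[512];
        int wx = 0;

        if (!f) {
            perror("kernel_page_tables");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            /* " rw" and " X " are the .set strings from the flag tables */
            if (strstr(line, " rw") && strstr(line, " X ")) {
                wx++;
                printf("W+X? %s", line);
            }
        }
        fclose(f);
        printf("%d writable+executable ranges flagged\n", wx);
        return 0;
    }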
diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h
new file mode 100644
index 0000000..5d51363
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/ptdump.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+
+struct flag_info {
+ u64 mask;
+ u64 val;
+ const char *set;
+ const char *clear;
+ bool is_val;
+ int shift;
+};
+
+struct pgtable_level {
+ const struct flag_info *flag;
+ size_t num;
+ u64 mask;
+};
+
+extern struct pgtable_level pg_level[5];
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c
new file mode 100644
index 0000000..5018436
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018, Christophe Leroy CS S.I.
+ * <christophe.leroy@c-s.fr>
+ *
+ * This dumps the content of Segment Registers
+ */
+
+#include <asm/debugfs.h>
+
+static void seg_show(struct seq_file *m, int i)
+{
+ u32 val = mfsrin(i << 28);
+
+ seq_printf(m, "0x%01x0000000-0x%01xfffffff ", i, i);
+ seq_printf(m, "Kern key %d ", (val >> 30) & 1);
+ seq_printf(m, "User key %d ", (val >> 29) & 1);
+ if (val & 0x80000000) {
+ seq_printf(m, "Device 0x%03x", (val >> 20) & 0x1ff);
+ seq_printf(m, "-0x%05x", val & 0xfffff);
+ } else {
+ if (val & 0x10000000)
+ seq_puts(m, "No Exec ");
+ seq_printf(m, "VSID 0x%06x", val & 0xffffff);
+ }
+ seq_puts(m, "\n");
+}
+
+static int sr_show(struct seq_file *m, void *v)
+{
+ int i;
+
+ seq_puts(m, "---[ User Segments ]---\n");
+ for (i = 0; i < TASK_SIZE >> 28; i++)
+ seg_show(m, i);
+
+ seq_puts(m, "\n---[ Kernel Segments ]---\n");
+ for (; i < 16; i++)
+ seg_show(m, i);
+
+ return 0;
+}
+
+static int sr_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sr_show, NULL);
+}
+
+static const struct file_operations sr_fops = {
+ .open = sr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init sr_init(void)
+{
+ struct dentry *debugfs_file;
+
+ debugfs_file = debugfs_create_file("segment_registers", 0400,
+ powerpc_debugfs_root, NULL, &sr_fops);
+ return debugfs_file ? 0 : -ENOMEM;
+}
+device_initcall(sr_init);
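For reference, seg_show() above is a straight decode of the 32-bit segment register layout: the top bit (T) selects a direct-store/device segment, the next two bits are the supervisor and user protection keys, 0x10000000 is the no-execute bit, and the low 24 bits hold the VSID. A standalone sketch of the same decode, with a made-up sample value:

    #include <stdio.h>

    /* Decode one 32-bit segment register value the way seg_show() does. */
    static void decode_sr(int i, unsigned int val)
    {
        printf("0x%01x0000000-0x%01xfffffff ", i, i);
        printf("Kern key %u ", (val >> 30) & 1);     /* Ks */
        printf("User key %u ", (val >> 29) & 1);     /* Kp */
        if (val & 0x80000000) {                      /* T: direct-store segment */
            printf("Device 0x%03x", (val >> 20) & 0x1ff);
            printf("-0x%05x\n", val & 0xfffff);
        } else {
            if (val & 0x10000000)                    /* N: no-execute */
                printf("No Exec ");
            printf("VSID 0x%06x\n", val & 0xffffff);
        }
    }

    int main(void)
    {
        decode_sr(0xc, 0x20000123);   /* made-up value: Kp set, VSID 0x123 */
        return 0;
    }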
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
new file mode 100644
index 0000000..f7ed2f1
--- /dev/null
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * From split of dump_linuxpagetables.c
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ */
+#include <linux/kernel.h>
+#include <asm/pgtable.h>
+
+#include "ptdump.h"
+
+static const struct flag_info flag_array[] = {
+ {
+ .mask = _PAGE_USER,
+ .val = _PAGE_USER,
+ .set = "user",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_RW,
+ .val = _PAGE_RW,
+ .set = "rw",
+ .clear = "r ",
+ }, {
+ .mask = _PAGE_EXEC,
+ .val = _PAGE_EXEC,
+ .set = " X ",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_PRESENT,
+ .val = _PAGE_PRESENT,
+ .set = "present",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_GUARDED,
+ .val = _PAGE_GUARDED,
+ .set = "guarded",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_DIRTY,
+ .val = _PAGE_DIRTY,
+ .set = "dirty",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_ACCESSED,
+ .val = _PAGE_ACCESSED,
+ .set = "accessed",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_WRITETHRU,
+ .val = _PAGE_WRITETHRU,
+ .set = "write through",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_NO_CACHE,
+ .val = _PAGE_NO_CACHE,
+ .set = "no cache",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_SPECIAL,
+ .val = _PAGE_SPECIAL,
+ .set = "special",
+ }
+};
+
+struct pgtable_level pg_level[5] = {
+ {
+ }, { /* pgd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pud */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pmd */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ }, { /* pte */
+ .flag = flag_array,
+ .num = ARRAY_SIZE(flag_array),
+ },
+};
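shared.c only provides data; the dump loop in ptdump.c masks the PTE with each entry's .mask and prints .set when the result equals .val, otherwise .clear (or the shifted raw field when .is_val is set). A toy standalone version of that loop, using made-up bit positions in place of the _PAGE_* constants:

    #include <stdint.h>
    #include <stdio.h>

    struct flag { uint64_t mask, val; const char *set, *clear; };

    /* Made-up bits standing in for _PAGE_PRESENT/_PAGE_RW/_PAGE_EXEC. */
    static const struct flag flags[] = {
        { 0x1, 0x1, "present", "       " },
        { 0x2, 0x2, "rw",      "r "     },
        { 0x4, 0x4, " X ",     "   "    },
    };

    int main(void)
    {
        uint64_t pte = 0x3;   /* present + rw, not executable */
        size_t i;

        for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
            printf(" %s", (pte & flags[i].mask) == flags[i].val ?
                          flags[i].set : flags[i].clear);
        printf("\n");
        return 0;
    }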
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
deleted file mode 100644
index 9f574e5..0000000
--- a/arch/powerpc/mm/slb.c
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * PowerPC64 SLB support.
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- * Based on earlier code written by:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-#include <asm/mmu_context.h>
-#include <asm/paca.h>
-#include <asm/cputable.h>
-#include <asm/cacheflush.h>
-#include <asm/smp.h>
-#include <linux/compiler.h>
-#include <linux/context_tracking.h>
-#include <linux/mm_types.h>
-
-#include <asm/udbg.h>
-#include <asm/code-patching.h>
-
-enum slb_index {
- LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
- VMALLOC_INDEX = 1, /* Kernel virtual map (0xd000000000000000) */
- KSTACK_INDEX = 2, /* Kernel stack map */
-};
-
-extern void slb_allocate(unsigned long ea);
-
-#define slb_esid_mask(ssize) \
- (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T)
-
-static inline unsigned long mk_esid_data(unsigned long ea, int ssize,
- enum slb_index index)
-{
- return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | index;
-}
-
-static inline unsigned long mk_vsid_data(unsigned long ea, int ssize,
- unsigned long flags)
-{
- return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags |
- ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
-}
-
-static inline void slb_shadow_update(unsigned long ea, int ssize,
- unsigned long flags,
- enum slb_index index)
-{
- struct slb_shadow *p = get_slb_shadow();
-
- /*
- * Clear the ESID first so the entry is not valid while we are
- * updating it. No write barriers are needed here, provided
- * we only update the current CPU's SLB shadow buffer.
- */
- WRITE_ONCE(p->save_area[index].esid, 0);
- WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
- WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
-}
-
-static inline void slb_shadow_clear(enum slb_index index)
-{
- WRITE_ONCE(get_slb_shadow()->save_area[index].esid, cpu_to_be64(index));
-}
-
-static inline void create_shadowed_slbe(unsigned long ea, int ssize,
- unsigned long flags,
- enum slb_index index)
-{
- /*
- * Updating the shadow buffer before writing the SLB ensures
- * we don't get a stale entry here if we get preempted by PHYP
- * between these two statements.
- */
- slb_shadow_update(ea, ssize, flags, index);
-
- asm volatile("slbmte %0,%1" :
- : "r" (mk_vsid_data(ea, ssize, flags)),
- "r" (mk_esid_data(ea, ssize, index))
- : "memory" );
-}
-
-/*
- * Insert bolted entries into SLB (which may not be empty, so don't clear
- * slb_cache_ptr).
- */
-void __slb_restore_bolted_realmode(void)
-{
- struct slb_shadow *p = get_slb_shadow();
- enum slb_index index;
-
- /* No isync needed because realmode. */
- for (index = 0; index < SLB_NUM_BOLTED; index++) {
- asm volatile("slbmte %0,%1" :
- : "r" (be64_to_cpu(p->save_area[index].vsid)),
- "r" (be64_to_cpu(p->save_area[index].esid)));
- }
-}
-
-/*
- * Insert the bolted entries into an empty SLB.
- * This is not the same as rebolt because the bolted segments are not
- * changed, just loaded from the shadow area.
- */
-void slb_restore_bolted_realmode(void)
-{
- __slb_restore_bolted_realmode();
- get_paca()->slb_cache_ptr = 0;
-}
-
-/*
- * This flushes all SLB entries including 0, so it must be realmode.
- */
-void slb_flush_all_realmode(void)
-{
- /*
- * This flushes all SLB entries including 0, so it must be realmode.
- */
- asm volatile("slbmte %0,%0; slbia" : : "r" (0));
-}
-
-static void __slb_flush_and_rebolt(void)
-{
- /* If you change this make sure you change SLB_NUM_BOLTED
- * and PR KVM appropriately too. */
- unsigned long linear_llp, vmalloc_llp, lflags, vflags;
- unsigned long ksp_esid_data, ksp_vsid_data;
-
- linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
- vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
- lflags = SLB_VSID_KERNEL | linear_llp;
- vflags = SLB_VSID_KERNEL | vmalloc_llp;
-
- ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, KSTACK_INDEX);
- if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) {
- ksp_esid_data &= ~SLB_ESID_V;
- ksp_vsid_data = 0;
- slb_shadow_clear(KSTACK_INDEX);
- } else {
- /* Update stack entry; others don't change */
- slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, KSTACK_INDEX);
- ksp_vsid_data =
- be64_to_cpu(get_slb_shadow()->save_area[KSTACK_INDEX].vsid);
- }
-
- /* We need to do this all in asm, so we're sure we don't touch
- * the stack between the slbia and rebolting it. */
- asm volatile("isync\n"
- "slbia\n"
- /* Slot 1 - first VMALLOC segment */
- "slbmte %0,%1\n"
- /* Slot 2 - kernel stack */
- "slbmte %2,%3\n"
- "isync"
- :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
- "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)),
- "r"(ksp_vsid_data),
- "r"(ksp_esid_data)
- : "memory");
-}
-
-void slb_flush_and_rebolt(void)
-{
-
- WARN_ON(!irqs_disabled());
-
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
- __slb_flush_and_rebolt();
- get_paca()->slb_cache_ptr = 0;
-}
-
-void slb_vmalloc_update(void)
-{
- unsigned long vflags;
-
- vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp;
- slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX);
- slb_flush_and_rebolt();
-}
-
-/* Helper function to compare esids. There are four cases to handle.
- * 1. The system is not 1T segment size capable. Use the GET_ESID compare.
- * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare.
- * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match.
- * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare.
- */
-static inline int esids_match(unsigned long addr1, unsigned long addr2)
-{
- int esid_1t_count;
-
- /* System is not 1T segment size capable. */
- if (!mmu_has_feature(MMU_FTR_1T_SEGMENT))
- return (GET_ESID(addr1) == GET_ESID(addr2));
-
- esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) +
- ((addr2 >> SID_SHIFT_1T) != 0));
-
- /* both addresses are < 1T */
- if (esid_1t_count == 0)
- return (GET_ESID(addr1) == GET_ESID(addr2));
-
- /* One address < 1T, the other > 1T. Not a match */
- if (esid_1t_count == 1)
- return 0;
-
- /* Both addresses are > 1T. */
- return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2));
-}
-
-/* Flush all user entries from the segment table of the current processor. */
-void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
-{
- unsigned long offset;
- unsigned long slbie_data = 0;
- unsigned long pc = KSTK_EIP(tsk);
- unsigned long stack = KSTK_ESP(tsk);
- unsigned long exec_base;
-
- /*
- * We need interrupts hard-disabled here, not just soft-disabled,
- * so that a PMU interrupt can't occur, which might try to access
- * user memory (to get a stack trace) and possibly cause an SLB miss
- * which would update the slb_cache/slb_cache_ptr fields in the PACA.
- */
- hard_irq_disable();
- offset = get_paca()->slb_cache_ptr;
- if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
- offset <= SLB_CACHE_ENTRIES) {
- int i;
- asm volatile("isync" : : : "memory");
- for (i = 0; i < offset; i++) {
- slbie_data = (unsigned long)get_paca()->slb_cache[i]
- << SID_SHIFT; /* EA */
- slbie_data |= user_segment_size(slbie_data)
- << SLBIE_SSIZE_SHIFT;
- slbie_data |= SLBIE_C; /* C set for user addresses */
- asm volatile("slbie %0" : : "r" (slbie_data));
- }
- asm volatile("isync" : : : "memory");
- } else {
- __slb_flush_and_rebolt();
- }
-
- /* Workaround POWER5 < DD2.1 issue */
- if (offset == 1 || offset > SLB_CACHE_ENTRIES)
- asm volatile("slbie %0" : : "r" (slbie_data));
-
- get_paca()->slb_cache_ptr = 0;
- copy_mm_to_paca(mm);
-
- /*
- * preload some userspace segments into the SLB.
- * Almost all 32 and 64bit PowerPC executables are linked at
- * 0x10000000 so it makes sense to preload this segment.
- */
- exec_base = 0x10000000;
-
- if (is_kernel_addr(pc) || is_kernel_addr(stack) ||
- is_kernel_addr(exec_base))
- return;
-
- slb_allocate(pc);
-
- if (!esids_match(pc, stack))
- slb_allocate(stack);
-
- if (!esids_match(pc, exec_base) &&
- !esids_match(stack, exec_base))
- slb_allocate(exec_base);
-}
-
-static inline void patch_slb_encoding(unsigned int *insn_addr,
- unsigned int immed)
-{
-
- /*
- * This function patches either an li or a cmpldi instruction with
- * a new immediate value. This relies on the fact that both li
- * (which is actually addi) and cmpldi both take a 16-bit immediate
- * value, and it is situated in the same location in the instruction,
- * ie. bits 16-31 (Big endian bit order) or the lower 16 bits.
- * The signedness of the immediate operand differs between the two
- * instructions however this code is only ever patching a small value,
- * much less than 1 << 15, so we can get away with it.
- * To patch the value we read the existing instruction, clear the
- * immediate value, and or in our new value, then write the instruction
- * back.
- */
- unsigned int insn = (*insn_addr & 0xffff0000) | immed;
- patch_instruction(insn_addr, insn);
-}
-
-extern u32 slb_miss_kernel_load_linear[];
-extern u32 slb_miss_kernel_load_io[];
-extern u32 slb_compare_rr_to_size[];
-extern u32 slb_miss_kernel_load_vmemmap[];
-
-void slb_set_size(u16 size)
-{
- if (mmu_slb_size == size)
- return;
-
- mmu_slb_size = size;
- patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size);
-}
-
-void slb_initialize(void)
-{
- unsigned long linear_llp, vmalloc_llp, io_llp;
- unsigned long lflags, vflags;
- static int slb_encoding_inited;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- unsigned long vmemmap_llp;
-#endif
-
- /* Prepare our SLB miss handler based on our page size */
- linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
- io_llp = mmu_psize_defs[mmu_io_psize].sllp;
- vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
- get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp;
-#endif
- if (!slb_encoding_inited) {
- slb_encoding_inited = 1;
- patch_slb_encoding(slb_miss_kernel_load_linear,
- SLB_VSID_KERNEL | linear_llp);
- patch_slb_encoding(slb_miss_kernel_load_io,
- SLB_VSID_KERNEL | io_llp);
- patch_slb_encoding(slb_compare_rr_to_size,
- mmu_slb_size);
-
- pr_devel("SLB: linear LLP = %04lx\n", linear_llp);
- pr_devel("SLB: io LLP = %04lx\n", io_llp);
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- patch_slb_encoding(slb_miss_kernel_load_vmemmap,
- SLB_VSID_KERNEL | vmemmap_llp);
- pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp);
-#endif
- }
-
- get_paca()->stab_rr = SLB_NUM_BOLTED;
-
- lflags = SLB_VSID_KERNEL | linear_llp;
- vflags = SLB_VSID_KERNEL | vmalloc_llp;
-
- /* Invalidate the entire SLB (even entry 0) & all the ERATS */
- asm volatile("isync":::"memory");
- asm volatile("slbmte %0,%0"::"r" (0) : "memory");
- asm volatile("isync; slbia; isync":::"memory");
- create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX);
- create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, VMALLOC_INDEX);
-
- /* For the boot cpu, we're running on the stack in init_thread_union,
- * which is in the first segment of the linear mapping, and also
- * get_paca()->kstack hasn't been initialized yet.
- * For secondary cpus, we need to bolt the kernel stack entry now.
- */
- slb_shadow_clear(KSTACK_INDEX);
- if (raw_smp_processor_id() != boot_cpuid &&
- (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET)
- create_shadowed_slbe(get_paca()->kstack,
- mmu_kernel_ssize, lflags, KSTACK_INDEX);
-
- asm volatile("isync":::"memory");
-}
-
-static void insert_slb_entry(unsigned long vsid, unsigned long ea,
- int bpsize, int ssize)
-{
- unsigned long flags, vsid_data, esid_data;
- enum slb_index index;
- int slb_cache_index;
-
- /*
- * We are irq disabled, hence should be safe to access PACA.
- */
- VM_WARN_ON(!irqs_disabled());
-
- /*
- * We can't take a PMU exception in the following code, so hard
- * disable interrupts.
- */
- hard_irq_disable();
-
- index = get_paca()->stab_rr;
-
- /*
- * simple round-robin replacement of slb starting at SLB_NUM_BOLTED.
- */
- if (index < (mmu_slb_size - 1))
- index++;
- else
- index = SLB_NUM_BOLTED;
-
- get_paca()->stab_rr = index;
-
- flags = SLB_VSID_USER | mmu_psize_defs[bpsize].sllp;
- vsid_data = (vsid << slb_vsid_shift(ssize)) | flags |
- ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
- esid_data = mk_esid_data(ea, ssize, index);
-
- /*
- * No need for an isync before or after this slbmte. The exception
- * we enter with and the rfid we exit with are context synchronizing.
- * Also we only handle user segments here.
- */
- asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)
- : "memory");
-
- /*
- * Now update slb cache entries
- */
- slb_cache_index = get_paca()->slb_cache_ptr;
- if (slb_cache_index < SLB_CACHE_ENTRIES) {
- /*
- * We have space in slb cache for optimized switch_slb().
- * Top 36 bits from esid_data as per ISA
- */
- get_paca()->slb_cache[slb_cache_index++] = esid_data >> 28;
- get_paca()->slb_cache_ptr++;
- } else {
- /*
- * Our cache is full and the current cache content strictly
- * doesn't indicate the active SLB contents. Bump the ptr
- * so that switch_slb() will ignore the cache.
- */
- get_paca()->slb_cache_ptr = SLB_CACHE_ENTRIES + 1;
- }
-}
-
-static void handle_multi_context_slb_miss(int context_id, unsigned long ea)
-{
- struct mm_struct *mm = current->mm;
- unsigned long vsid;
- int bpsize;
-
- /*
- * We are always above 1TB, hence use high user segment size.
- */
- vsid = get_vsid(context_id, ea, mmu_highuser_ssize);
- bpsize = get_slice_psize(mm, ea);
- insert_slb_entry(vsid, ea, bpsize, mmu_highuser_ssize);
-}
-
-void slb_miss_large_addr(struct pt_regs *regs)
-{
- enum ctx_state prev_state = exception_enter();
- unsigned long ea = regs->dar;
- int context;
-
- if (REGION_ID(ea) != USER_REGION_ID)
- goto slb_bad_addr;
-
- /*
- * Are we beyond what the page table layout supports?
- */
- if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
- goto slb_bad_addr;
-
- /* Lower address should have been handled by asm code */
- if (ea < (1UL << MAX_EA_BITS_PER_CONTEXT))
- goto slb_bad_addr;
-
- /*
- * Consider this a bad access if we take an SLB miss
- * on an address above the addr limit.
- */
- if (ea >= current->mm->context.slb_addr_limit)
- goto slb_bad_addr;
-
- context = get_ea_context(&current->mm->context, ea);
- if (!context)
- goto slb_bad_addr;
-
- handle_multi_context_slb_miss(context, ea);
- exception_exit(prev_state);
- return;
-
-slb_bad_addr:
- if (user_mode(regs))
- _exception(SIGSEGV, regs, SEGV_BNDERR, ea);
- else
- bad_page_fault(regs, ea, SIGSEGV);
- exception_exit(prev_state);
-}
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
deleted file mode 100644
index 4ac5057..0000000
--- a/arch/powerpc/mm/slb_low.S
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Low-level SLB routines
- *
- * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM
- *
- * Based on earlier C version:
- * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com
- * Copyright (c) 2001 Dave Engebretsen
- * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/processor.h>
-#include <asm/ppc_asm.h>
-#include <asm/asm-offsets.h>
-#include <asm/cputable.h>
-#include <asm/page.h>
-#include <asm/mmu.h>
-#include <asm/pgtable.h>
-#include <asm/firmware.h>
-#include <asm/feature-fixups.h>
-
-/*
- * This macro generates asm code to compute the VSID scramble
- * function. Used in slb_allocate() and do_stab_bolted. The function
- * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
- *
- * rt = register containing the proto-VSID and into which the
- * VSID will be stored
- * rx = scratch register (clobbered)
- * rf = flags
- *
- * - rt and rx must be different registers
- * - The answer will end up in the low VSID_BITS bits of rt. The higher
- * bits may contain other garbage, so you may need to mask the
- * result.
- */
-#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \
- lis rx,VSID_MULTIPLIER_##size@h; \
- ori rx,rx,VSID_MULTIPLIER_##size@l; \
- mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \
-/* \
- * powermac get slb fault before feature fixup, so make 65 bit part \
- * the default part of feature fixup \
- */ \
-BEGIN_MMU_FTR_SECTION \
- srdi rx,rt,VSID_BITS_65_##size; \
- clrldi rt,rt,(64-VSID_BITS_65_##size); \
- add rt,rt,rx; \
- addi rx,rt,1; \
- srdi rx,rx,VSID_BITS_65_##size; \
- add rt,rt,rx; \
- rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
-MMU_FTR_SECTION_ELSE \
- srdi rx,rt,VSID_BITS_##size; \
- clrldi rt,rt,(64-VSID_BITS_##size); \
- add rt,rt,rx; /* add high and low bits */ \
- addi rx,rt,1; \
- srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \
- add rt,rt,rx; \
- rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
-
-
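Aside on ASM_VSID_SCRAMBLE above: the multiply-then-fold sequence is the usual way of reducing protovsid * VSID_MULTIPLIER modulo VSID_MODULUS (2^VSID_BITS - 1) without a divide. A minimal C model of the same steps (a sketch with illustrative numbers, not the kernel's own helper):

    #include <stdio.h>

    /* Fold-based reduction of (protovsid * multiplier) mod (2^vsid_bits - 1). */
    static unsigned long vsid_scramble(unsigned long protovsid,
                                       unsigned long multiplier,
                                       unsigned int vsid_bits)
    {
        unsigned long modulus = (1UL << vsid_bits) - 1;
        unsigned long vsid = protovsid * multiplier;

        /* add the high and low parts of the product */
        vsid = (vsid >> vsid_bits) + (vsid & modulus);
        /* one more fold in case the sum itself overflowed vsid_bits */
        return (vsid + ((vsid + 1) >> vsid_bits)) & modulus;
    }

    int main(void)
    {
        /* illustrative numbers only */
        printf("0x%lx\n", vsid_scramble(0x12345, 12538073UL, 37));
        return 0;
    }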
-/* void slb_allocate(unsigned long ea);
- *
- * Create an SLB entry for the given EA (user or kernel).
- * r3 = faulting address, r13 = PACA
- * r9, r10, r11 are clobbered by this function
- * r3 is preserved.
- * No other registers are examined or changed.
- */
-_GLOBAL(slb_allocate)
- /*
- * Check if the address falls within the range of the first context, or
- * if we may need to handle multi context. For the first context we
- * allocate the slb entry via the fast path below. For large address we
- * branch out to C-code and see if additional contexts have been
- * allocated.
- * The test here is:
- * (ea & ~REGION_MASK) >= (1ull << MAX_EA_BITS_PER_CONTEXT)
- */
- rldicr. r9,r3,4,(63 - MAX_EA_BITS_PER_CONTEXT - 4)
- bne- 8f
-
- srdi r9,r3,60 /* get region */
- srdi r10,r3,SID_SHIFT /* get esid */
- cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */
-
- /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
- blt cr7,0f /* user or kernel? */
-
- /* Check if hitting the linear mapping or some other kernel space
- */
- bne cr7,1f
-
- /* Linear mapping encoding bits, the "li" instruction below will
- * be patched by the kernel at boot
- */
-.globl slb_miss_kernel_load_linear
-slb_miss_kernel_load_linear:
- li r11,0
- /*
- * context = (ea >> 60) - (0xc - 1)
- * r9 = region id.
- */
- subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
-
-BEGIN_FTR_SECTION
- b .Lslb_finish_load
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
- b .Lslb_finish_load_1T
-
-1:
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
- cmpldi cr0,r9,0xf
- bne 1f
-/* Check virtual memmap region. To be patched at kernel boot */
-.globl slb_miss_kernel_load_vmemmap
-slb_miss_kernel_load_vmemmap:
- li r11,0
- b 6f
-1:
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-
- /*
- * r10 contains the ESID, which is the original faulting EA shifted
- * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28)
- * which is 0xd00038000. That can't be used as an immediate, even if we
- * ignored the 0xd, so we have to load it into a register, and we only
- * have one register free. So we must load all of (H_VMALLOC_END >> 28)
- * into a register and compare ESID against that.
- */
- lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000
- ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800
- // Rotate left 4, then mask with 0xffffffff0
- rldic r11,r11,4,28 // r11 = 0xd00038000
- cmpld r10,r11 // if r10 >= r11
- bge 5f // goto io_mapping
-
- /*
- * vmalloc mapping gets the encoding from the PACA as the mapping
- * can be demoted from 64K -> 4K dynamically on some machines.
- */
- lhz r11,PACAVMALLOCSLLP(r13)
- b 6f
-5:
- /* IO mapping */
-.globl slb_miss_kernel_load_io
-slb_miss_kernel_load_io:
- li r11,0
-6:
- /*
- * context = (ea >> 60) - (0xc - 1)
- * r9 = region id.
- */
- subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
-
-BEGIN_FTR_SECTION
- b .Lslb_finish_load
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
- b .Lslb_finish_load_1T
-
-0: /*
- * For userspace addresses, make sure this is region 0.
- */
- cmpdi r9, 0
- bne- 8f
- /*
- * user space make sure we are within the allowed limit
- */
- ld r11,PACA_SLB_ADDR_LIMIT(r13)
- cmpld r3,r11
- bge- 8f
-
- /* when using slices, we extract the psize off the slice bitmaps
- * and then we need to get the sllp encoding off the mmu_psize_defs
- * array.
- *
- * XXX This is a bit inefficient especially for the normal case,
- * so we should try to implement a fast path for the standard page
- * size using the old sllp value so we avoid the array. We cannot
- * really do dynamic patching unfortunately as processes might flip
- * between 4k and 64k standard page size
- */
-#ifdef CONFIG_PPC_MM_SLICES
- /* r10 have esid */
- cmpldi r10,16
- /* below SLICE_LOW_TOP */
- blt 5f
- /*
- * Handle hpsizes,
- * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index
- */
- srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */
- addi r9,r11,PACAHIGHSLICEPSIZE
- lbzx r9,r13,r9 /* r9 is hpsizes[r11] */
- /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */
- rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63
- b 6f
-
-5:
- /*
- * Handle lpsizes
- * r9 is get_paca()->context.low_slices_psize[index], r11 is mask_index
- */
- srdi r11,r10,1 /* index */
- addi r9,r11,PACALOWSLICESPSIZE
- lbzx r9,r13,r9 /* r9 is lpsizes[r11] */
- rldicl r11,r10,0,63 /* r11 = r10 & 0x1 */
-6:
- sldi r11,r11,2 /* index * 4 */
- /* Extract the psize and multiply to get an array offset */
- srd r9,r9,r11
- andi. r9,r9,0xf
- mulli r9,r9,MMUPSIZEDEFSIZE
-
- /* Now get to the array and obtain the sllp
- */
- ld r11,PACATOC(r13)
- ld r11,mmu_psize_defs@got(r11)
- add r11,r11,r9
- ld r11,MMUPSIZESLLP(r11)
- ori r11,r11,SLB_VSID_USER
-#else
- /* paca context sllp already contains the SLB_VSID_USER bits */
- lhz r11,PACACONTEXTSLLP(r13)
-#endif /* CONFIG_PPC_MM_SLICES */
-
- ld r9,PACACONTEXTID(r13)
-BEGIN_FTR_SECTION
- cmpldi r10,0x1000
- bge .Lslb_finish_load_1T
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
- b .Lslb_finish_load
-
-8: /* invalid EA - return an error indication */
- crset 4*cr0+eq /* indicate failure */
- blr
-
-/*
- * Finish loading of an SLB entry and return
- *
- * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET
- */
-.Lslb_finish_load:
- rldimi r10,r9,ESID_BITS,0
- ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
- /* r3 = EA, r11 = VSID data */
- /*
- * Find a slot, round robin. Previously we tried to find a
- * free slot first but that took too long. Unfortunately we
- * don't have any LRU information to help us choose a slot.
- */
-
- mr r9,r3
-
- /* slb_finish_load_1T continues here. r9=EA with non-ESID bits clear */
-7: ld r10,PACASTABRR(r13)
- addi r10,r10,1
- /* This gets soft patched on boot. */
-.globl slb_compare_rr_to_size
-slb_compare_rr_to_size:
- cmpldi r10,0
-
- blt+ 4f
- li r10,SLB_NUM_BOLTED
-
-4:
- std r10,PACASTABRR(r13)
-
-3:
- rldimi r9,r10,0,36 /* r9 = EA[0:35] | entry */
- oris r10,r9,SLB_ESID_V@h /* r10 = r9 | SLB_ESID_V */
-
- /* r9 = ESID data, r11 = VSID data */
-
- /*
- * No need for an isync before or after this slbmte. The exception
- * we enter with and the rfid we exit with are context synchronizing.
- */
- slbmte r11,r10
-
- /* we're done for kernel addresses */
- crclr 4*cr0+eq /* set result to "success" */
- bgelr cr7
-
- /* Update the slb cache */
- lhz r9,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */
- cmpldi r9,SLB_CACHE_ENTRIES
- bge 1f
-
- /* still room in the slb cache */
- sldi r11,r9,2 /* r11 = offset * sizeof(u32) */
- srdi r10,r10,28 /* get the 36 bits of the ESID */
- add r11,r11,r13 /* r11 = (u32 *)paca + offset */
- stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */
- addi r9,r9,1 /* offset++ */
- b 2f
-1: /* offset >= SLB_CACHE_ENTRIES */
- li r9,SLB_CACHE_ENTRIES+1
-2:
- sth r9,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */
- crclr 4*cr0+eq /* set result to "success" */
- blr
-
-/*
- * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return.
- *
- * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9
- */
-.Lslb_finish_load_1T:
- srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
- rldimi r10,r9,ESID_BITS_1T,0
- ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
-
- li r10,MMU_SEGSIZE_1T
- rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
-
- /* r3 = EA, r11 = VSID data */
- clrrdi r9,r3,SID_SHIFT_1T /* clear out non-ESID bits */
- b 7b
-
-
-_ASM_NOKPROBE_SYMBOL(slb_allocate)
-_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear)
-_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io)
-_ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size)
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-_ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_vmemmap)
-#endif
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 4f213ba..42bbcd4 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* address space "slices" (meta-segments) support
*
@@ -6,20 +7,6 @@
* Based on hugetlb implementation
*
* Copyright (C) 2003 David Gibson, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#undef DEBUG
@@ -31,6 +18,8 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
+#include <linux/security.h>
#include <asm/mman.h>
#include <asm/mmu.h>
#include <asm/copro.h>
@@ -99,7 +88,7 @@
{
struct vm_area_struct *vma;
- if ((mm->context.slb_addr_limit - len) < addr)
+ if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr)
return 0;
vma = find_vma(mm, addr);
return (!vma || (addr + len) <= vm_start_gap(vma));
@@ -116,13 +105,11 @@
unsigned long start = slice << SLICE_HIGH_SHIFT;
unsigned long end = start + (1ul << SLICE_HIGH_SHIFT);
-#ifdef CONFIG_PPC64
 /* Hack, so that each address is controlled by exactly one
* of the high or low area bitmaps, the first high area starts
* at 4GB, not 0 */
if (start == 0)
- start = SLICE_LOW_TOP;
-#endif
+ start = (unsigned long)SLICE_LOW_TOP;
return !slice_area_is_free(mm, start, end - start);
}
@@ -148,40 +135,6 @@
__set_bit(i, ret->high_slices);
}
-#ifdef CONFIG_PPC_BOOK3S_64
-static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
-{
-#ifdef CONFIG_PPC_64K_PAGES
- if (psize == MMU_PAGE_64K)
- return &mm->context.mask_64k;
-#endif
- if (psize == MMU_PAGE_4K)
- return &mm->context.mask_4k;
-#ifdef CONFIG_HUGETLB_PAGE
- if (psize == MMU_PAGE_16M)
- return &mm->context.mask_16m;
- if (psize == MMU_PAGE_16G)
- return &mm->context.mask_16g;
-#endif
- BUG();
-}
-#elif defined(CONFIG_PPC_8xx)
-static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize)
-{
- if (psize == mmu_virtual_psize)
- return &mm->context.mask_base_psize;
-#ifdef CONFIG_HUGETLB_PAGE
- if (psize == MMU_PAGE_512K)
- return &mm->context.mask_512k;
- if (psize == MMU_PAGE_8M)
- return &mm->context.mask_8m;
-#endif
- BUG();
-}
-#else
-#error "Must define the slice masks for page sizes supported by the platform"
-#endif
-
static bool slice_check_range_fits(struct mm_struct *mm,
const struct slice_mask *available,
unsigned long start, unsigned long len)
@@ -226,7 +179,7 @@
copy_mm_to_paca(current->active_mm);
local_irq_save(flags);
- slb_flush_and_rebolt();
+ slb_flush_and_restore_bolted();
local_irq_restore(flags);
#endif
}
@@ -244,14 +197,14 @@
slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize);
slice_print_mask(" mask", mask);
- psize_mask = slice_mask_for_size(mm, psize);
+ psize_mask = slice_mask_for_size(&mm->context, psize);
/* We need to use a spinlock here to protect against
* concurrent 64k -> 4k demotion ...
*/
spin_lock_irqsave(&slice_convert_lock, flags);
- lpsizes = mm->context.low_slices_psize;
+ lpsizes = mm_ctx_low_slices(&mm->context);
for (i = 0; i < SLICE_NUM_LOW; i++) {
if (!(mask->low_slices & (1u << i)))
continue;
@@ -261,7 +214,7 @@
/* Update the slice_mask */
old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf;
- old_mask = slice_mask_for_size(mm, old_psize);
+ old_mask = slice_mask_for_size(&mm->context, old_psize);
old_mask->low_slices &= ~(1u << i);
psize_mask->low_slices |= 1u << i;
@@ -270,8 +223,8 @@
(((unsigned long)psize) << (mask_index * 4));
}
- hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
+ hpsizes = mm_ctx_high_slices(&mm->context);
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) {
if (!test_bit(i, mask->high_slices))
continue;
@@ -280,7 +233,7 @@
/* Update the slice_mask */
old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf;
- old_mask = slice_mask_for_size(mm, old_psize);
+ old_mask = slice_mask_for_size(&mm->context, old_psize);
__clear_bit(i, old_mask->high_slices);
__set_bit(i, psize_mask->high_slices);
@@ -290,8 +243,8 @@
}
slice_dbg(" lsps=%lx, hsps=%lx\n",
- (unsigned long)mm->context.low_slices_psize,
- (unsigned long)mm->context.high_slices_psize);
+ (unsigned long)mm_ctx_low_slices(&mm->context),
+ (unsigned long)mm_ctx_high_slices(&mm->context));
spin_unlock_irqrestore(&slice_convert_lock, flags);
@@ -376,6 +329,7 @@
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long addr, found, prev;
struct vm_unmapped_area_info info;
+ unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr);
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
@@ -390,9 +344,9 @@
* DEFAULT_MAP_WINDOW we should apply this.
*/
if (high_limit > DEFAULT_MAP_WINDOW)
- addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW;
+ addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW;
- while (addr > PAGE_SIZE) {
+ while (addr > min_addr) {
info.high_limit = addr;
if (!slice_scan_available(addr - 1, available, 0, &addr))
continue;
@@ -404,8 +358,8 @@
* Check if we need to reduce the range, or if we can
* extend it to cover the previous available slice.
*/
- if (addr < PAGE_SIZE)
- addr = PAGE_SIZE;
+ if (addr < min_addr)
+ addr = min_addr;
else if (slice_scan_available(addr - 1, available, 0, &prev)) {
addr = prev;
goto prev_slice;
@@ -502,20 +456,20 @@
return -ENOMEM;
}
- if (high_limit > mm->context.slb_addr_limit) {
+ if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) {
/*
* Increasing the slb_addr_limit does not require
* slice mask cache to be recalculated because it should
* be already initialised beyond the old address limit.
*/
- mm->context.slb_addr_limit = high_limit;
+ mm_ctx_set_slb_addr_limit(&mm->context, high_limit);
on_each_cpu(slice_flush_segments, mm, 1);
}
/* Sanity checks */
BUG_ON(mm->task_size == 0);
- BUG_ON(mm->context.slb_addr_limit == 0);
+ BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0);
VM_BUG_ON(radix_enabled());
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
@@ -527,7 +481,7 @@
addr = _ALIGN_UP(addr, page_size);
slice_dbg(" aligned addr=%lx\n", addr);
/* Ignore hint if it's too large or overlaps a VMA */
- if (addr > high_limit - len ||
+ if (addr > high_limit - len || addr < mmap_min_addr ||
!slice_area_is_free(mm, addr, len))
addr = 0;
}
@@ -535,7 +489,7 @@
/* First make up a "good" mask of slices that have the right size
* already
*/
- maskp = slice_mask_for_size(mm, psize);
+ maskp = slice_mask_for_size(&mm->context, psize);
/*
* Here "good" means slices that are already the right page size,
@@ -562,7 +516,7 @@
* a pointer to good mask for the next code to use.
*/
if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
- compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
+ compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);
if (fixed)
slice_or_mask(&good_mask, maskp, compat_maskp);
else
@@ -639,14 +593,13 @@
newaddr = slice_find_area(mm, len, &potential_mask,
psize, topdown, high_limit);
-#ifdef CONFIG_PPC_64K_PAGES
- if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) {
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM &&
+ psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
slice_or_mask(&potential_mask, &potential_mask, compat_maskp);
newaddr = slice_find_area(mm, len, &potential_mask,
psize, topdown, high_limit);
}
-#endif
if (newaddr == -ENOMEM)
return -ENOMEM;
@@ -693,7 +646,7 @@
unsigned long flags)
{
return slice_get_unmapped_area(addr, len, flags,
- current->mm->context.user_psize, 0);
+ mm_ctx_user_psize(&current->mm->context), 0);
}
unsigned long arch_get_unmapped_area_topdown(struct file *filp,
@@ -703,7 +656,7 @@
const unsigned long flags)
{
return slice_get_unmapped_area(addr0, len, flags,
- current->mm->context.user_psize, 1);
+ mm_ctx_user_psize(&current->mm->context), 1);
}
unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr)
@@ -714,10 +667,10 @@
VM_BUG_ON(radix_enabled());
if (slice_addr_is_low(addr)) {
- psizes = mm->context.low_slices_psize;
+ psizes = mm_ctx_low_slices(&mm->context);
index = GET_LOW_SLICE_INDEX(addr);
} else {
- psizes = mm->context.high_slices_psize;
+ psizes = mm_ctx_high_slices(&mm->context);
index = GET_HIGH_SLICE_INDEX(addr);
}
mask_index = index & 0x1;
@@ -738,32 +691,41 @@
* case of fork it is just inherited from the mm being
* duplicated.
*/
-#ifdef CONFIG_PPC64
- mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
-#else
- mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW;
-#endif
-
- mm->context.user_psize = psize;
+ mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT);
+ mm_ctx_set_user_psize(&mm->context, psize);
/*
* Set all slice psizes to the default.
*/
- lpsizes = mm->context.low_slices_psize;
+ lpsizes = mm_ctx_low_slices(&mm->context);
memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1);
- hpsizes = mm->context.high_slices_psize;
+ hpsizes = mm_ctx_high_slices(&mm->context);
memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1);
/*
* Slice mask cache starts zeroed, fill the default size cache.
*/
- mask = slice_mask_for_size(mm, psize);
+ mask = slice_mask_for_size(&mm->context, psize);
mask->low_slices = ~0UL;
if (SLICE_NUM_HIGH)
bitmap_fill(mask->high_slices, SLICE_NUM_HIGH);
}
+#ifdef CONFIG_PPC_BOOK3S_64
+void slice_setup_new_exec(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ slice_dbg("slice_setup_new_exec(mm=%p)\n", mm);
+
+ if (!is_32bit_task())
+ return;
+
+ mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW);
+}
+#endif
+
void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize)
{
@@ -799,22 +761,21 @@
unsigned long len)
{
const struct slice_mask *maskp;
- unsigned int psize = mm->context.user_psize;
+ unsigned int psize = mm_ctx_user_psize(&mm->context);
VM_BUG_ON(radix_enabled());
- maskp = slice_mask_for_size(mm, psize);
-#ifdef CONFIG_PPC_64K_PAGES
+ maskp = slice_mask_for_size(&mm->context, psize);
+
/* We need to account for 4k slices too */
- if (psize == MMU_PAGE_64K) {
+ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) {
const struct slice_mask *compat_maskp;
struct slice_mask available;
- compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K);
+ compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K);
slice_or_mask(&available, maskp, compat_maskp);
return !slice_check_range_fits(mm, &available, addr, len);
}
-#endif
return !slice_check_range_fits(mm, maskp, addr, len);
}
diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c
deleted file mode 100644
index f83044f..0000000
--- a/arch/powerpc/mm/vphn.c
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <asm/byteorder.h>
-#include "vphn.h"
-
-/*
- * The associativity domain numbers are returned from the hypervisor as a
- * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the
- * special value of "all ones" (aka. 0xffff) and its size may not exceed 48
- * bytes.
- *
- * --- 16-bit fields -->
- * _________________________
- * | 0 | 1 | 2 | 3 | be_packed[0]
- * ------+-----+-----+------
- * _________________________
- * | 4 | 5 | 6 | 7 | be_packed[1]
- * -------------------------
- * ...
- * _________________________
- * | 20 | 21 | 22 | 23 | be_packed[5]
- * -------------------------
- *
- * Convert to the sequence they would appear in the ibm,associativity property.
- */
-int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
-{
- __be64 be_packed[VPHN_REGISTER_COUNT];
- int i, nr_assoc_doms = 0;
- const __be16 *field = (const __be16 *) be_packed;
- u16 last = 0;
- bool is_32bit = false;
-
-#define VPHN_FIELD_UNUSED (0xffff)
-#define VPHN_FIELD_MSB (0x8000)
-#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
-
- /* Let's fix the values returned by plpar_hcall9() */
- for (i = 0; i < VPHN_REGISTER_COUNT; i++)
- be_packed[i] = cpu_to_be64(packed[i]);
-
- for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
- u16 new = be16_to_cpup(field++);
-
- if (is_32bit) {
- /* Let's concatenate the 16 bits of this field to the
- * 15 lower bits of the previous field
- */
- unpacked[++nr_assoc_doms] =
- cpu_to_be32(last << 16 | new);
- is_32bit = false;
- } else if (new == VPHN_FIELD_UNUSED)
- /* This is the list terminator */
- break;
- else if (new & VPHN_FIELD_MSB) {
- /* Data is in the lower 15 bits of this field */
- unpacked[++nr_assoc_doms] =
- cpu_to_be32(new & VPHN_FIELD_MASK);
- } else {
- /* Data is in the lower 15 bits of this field
- * concatenated with the next 16 bit field
- */
- last = new;
- is_32bit = true;
- }
- }
-
- /* The first cell contains the length of the property */
- unpacked[0] = cpu_to_be32(nr_assoc_doms);
-
- return nr_assoc_doms;
-}
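The block comment above fully specifies the packed VPHN format, which makes the removed helper easy to model outside the kernel. An illustrative standalone decoder of the same rules (sample stream made up; the kernel version additionally byte-swaps the hcall registers and stores the element count in unpacked[0]):

    #include <stdio.h>

    #define VPHN_FIELD_UNUSED 0xffff
    #define VPHN_FIELD_MSB    0x8000
    #define VPHN_FIELD_MASK   0x7fff

    static int unpack(const unsigned short *field, int nfields, unsigned int *out)
    {
        unsigned short last = 0;
        int i, is_32bit = 0, ndoms = 0;

        for (i = 0; i < nfields; i++) {
            unsigned short new = field[i];

            if (is_32bit) {
                /* low half of a 32-bit domain number */
                out[ndoms++] = (unsigned int)last << 16 | new;
                is_32bit = 0;
            } else if (new == VPHN_FIELD_UNUSED) {
                break;                          /* list terminator */
            } else if (new & VPHN_FIELD_MSB) {
                /* 16-bit field: value in the low 15 bits */
                out[ndoms++] = new & VPHN_FIELD_MASK;
            } else {
                /* high half of a 32-bit field, next field completes it */
                last = new;
                is_32bit = 1;
            }
        }
        return ndoms;
    }

    int main(void)
    {
        /* two 16-bit domains, one 32-bit domain, then the terminator */
        const unsigned short sample[] = { 0x8002, 0x8005, 0x0001, 0x2345, 0xffff };
        unsigned int doms[8];
        int i, n = unpack(sample, sizeof(sample) / sizeof(sample[0]), doms);

        for (i = 0; i < n; i++)
            printf("domain %d: 0x%x\n", i, doms[i]);
        return 0;
    }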
diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h
deleted file mode 100644
index f9ffdb3..0000000
--- a/arch/powerpc/mm/vphn.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ARCH_POWERPC_MM_VPHN_H_
-#define _ARCH_POWERPC_MM_VPHN_H_
-
-/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers.
- */
-#define VPHN_REGISTER_COUNT 6
-
-/*
- * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
- * form the complete property we have to add the length in the first cell.
- */
-#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
-
-extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked);
-
-#endif