Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 5e14798..55b4a8b 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,7 +5,7 @@
ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
-obj-y := fault.o mem.o pgtable.o mmap.o \
+obj-y := fault.o mem.o pgtable.o mmap.o maccess.o \
init_$(BITS).o pgtable_$(BITS).o \
pgtable-frag.o ioremap.o ioremap_$(BITS).o \
init-common.o mmu_context.o drmem.o
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index 8bbbd97..b2c912e 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -14,9 +14,10 @@
* hash table, so this file is not used on them.)
*/
+#include <linux/pgtable.h>
+#include <linux/init.h>
#include <asm/reg.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/cputable.h>
#include <asm/ppc_asm.h>
#include <asm/thread_info.h>
@@ -62,7 +63,7 @@
isync
#endif
/* Get PTE (linux-style) and check access */
- lis r0,KERNELBASE@h /* check if kernel address */
+ lis r0, TASK_SIZE@h /* check if kernel address */
cmplw 0,r4,r0
ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */
@@ -81,7 +82,7 @@
rlwinm. r8,r8,0,0,20 /* extract pt base address */
#endif
#ifdef CONFIG_SMP
- beq- hash_page_out /* return if no mapping */
+ beq- .Lhash_page_out /* return if no mapping */
#else
/* XXX it seems like the 601 will give a machine fault on the
rfi if its alignment is wrong (bottom 4 bits of address are
@@ -109,11 +110,11 @@
#if (PTE_FLAGS_OFFSET != 0)
addi r8,r8,PTE_FLAGS_OFFSET
#endif
-retry:
+.Lretry:
lwarx r6,0,r8 /* get linux-style pte, flag word */
andc. r5,r3,r6 /* check access & ~permission */
#ifdef CONFIG_SMP
- bne- hash_page_out /* return if access not permitted */
+ bne- .Lhash_page_out /* return if access not permitted */
#else
bnelr-
#endif
@@ -128,11 +129,13 @@
#endif /* CONFIG_SMP */
#endif /* CONFIG_PTE_64BIT */
stwcx. r5,0,r8 /* attempt to update PTE */
- bne- retry /* retry if someone got there first */
+ bne- .Lretry /* retry if someone got there first */
mfsrin r3,r4 /* get segment reg for segment */
+#ifndef CONFIG_VMAP_STACK
mfctr r0
stw r0,_CTR(r11)
+#endif
bl create_hpte /* add the hash table entry */
#ifdef CONFIG_SMP
@@ -142,21 +145,26 @@
stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
#endif
+#ifdef CONFIG_VMAP_STACK
+ b fast_hash_page_return
+#else
/* Return from the exception */
lwz r5,_CTR(r11)
mtctr r5
lwz r0,GPR0(r11)
lwz r8,GPR8(r11)
b fast_exception_return
+#endif
#ifdef CONFIG_SMP
-hash_page_out:
+.Lhash_page_out:
eieio
lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha
li r0,0
stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
blr
#endif /* CONFIG_SMP */
+_ASM_NOKPROBE_SYMBOL(hash_page)
/*
* Add an entry for a particular page to the hash table.
@@ -192,11 +200,9 @@
* covered by a BAT). -- paulus
*/
mfmsr r9
- SYNC
rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
- SYNC_601
isync
#ifdef CONFIG_SMP
@@ -255,12 +261,12 @@
/* reenable interrupts and DR */
mtmsr r9
- SYNC_601
isync
lwz r0,4(r1)
mtlr r0
blr
+_ASM_NOKPROBE_SYMBOL(add_hash_page)
/*
* This routine adds a hardware PTE to the hash table.
@@ -279,9 +285,9 @@
*
* For speed, 4 of the instructions get patched once the size and
* physical address of the hash table are known. These definitions
- * of Hash_base and Hash_bits below are just an example.
+ * of Hash_base and Hash_bits below are for the early hash table.
*/
-Hash_base = 0xc0180000
+Hash_base = early_hash
Hash_bits = 12 /* e.g. 256kB hash table */
Hash_msk = (((1 << Hash_bits) - 1) * 64)
@@ -302,6 +308,7 @@
#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1)
#define HASH_RIGHT 31-LG_PTEG_SIZE
+__REF
_GLOBAL(create_hpte)
/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */
@@ -354,7 +361,7 @@
1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
CMPPTE 0,r6,r5
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
- beq+ found_slot
+ beq+ .Lfound_slot
patch_site 0f, patch__hash_page_B
/* Search the secondary PTEG for a matching PTE */
@@ -366,7 +373,7 @@
2: LDPTEu r6,HPTE_SIZE(r4)
CMPPTE 0,r6,r5
bdnzf 2,2b
- beq+ found_slot
+ beq+ .Lfound_slot
xori r5,r5,PTE_H /* clear H bit again */
/* Search the primary PTEG for an empty slot */
@@ -375,7 +382,7 @@
1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */
TST_V(r6) /* test valid bit */
bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */
- beq+ found_empty
+ beq+ .Lfound_empty
/* update counter of times that the primary PTEG is full */
lis r4, (primary_pteg_full - PAGE_OFFSET)@ha
@@ -393,7 +400,7 @@
2: LDPTEu r6,HPTE_SIZE(r4)
TST_V(r6)
bdnzf 2,2b
- beq+ found_empty
+ beq+ .Lfound_empty
xori r5,r5,PTE_H /* clear H bit again */
/*
@@ -431,9 +438,9 @@
#ifndef CONFIG_SMP
/* Store PTE in PTEG */
-found_empty:
+.Lfound_empty:
STPTE r5,0(r4)
-found_slot:
+.Lfound_slot:
STPTE r8,HPTE_SIZE/2(r4)
#else /* CONFIG_SMP */
@@ -454,8 +461,8 @@
* We do however have to make sure that the PTE is never in an invalid
* state with the V bit set.
*/
-found_empty:
-found_slot:
+.Lfound_empty:
+.Lfound_slot:
CLR_V(r5,r0) /* clear V (valid) bit in PTE */
STPTE r5,0(r4)
sync
@@ -468,6 +475,8 @@
sync /* make sure pte updates get to memory */
blr
+ .previous
+_ASM_NOKPROBE_SYMBOL(create_hpte)
.section .bss
.align 2
@@ -487,6 +496,7 @@
*
* We assume that there is a hash table in use (Hash != 0).
*/
+__REF
_GLOBAL(flush_hash_pages)
/*
* We disable interrupts here, even on UP, because we want
@@ -497,11 +507,9 @@
* covered by a BAT). -- paulus
*/
mfmsr r10
- SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear MSR_DR */
mtmsr r0
- SYNC_601
isync
/* First find a PTE in the range that has _PAGE_HASHPTE set */
@@ -620,10 +628,11 @@
#endif
19: mtmsr r10
- SYNC_601
isync
blr
+ .previous
EXPORT_SYMBOL(flush_hash_pages)
+_ASM_NOKPROBE_SYMBOL(flush_hash_pages)
/*
* Flush an entry from the TLB
@@ -633,11 +642,9 @@
lwz r8,TASK_CPU(r2)
oris r8,r8,11
mfmsr r10
- SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear DR */
mtmsr r0
- SYNC_601
isync
lis r9,mmu_hash_lock@h
ori r9,r9,mmu_hash_lock@l
@@ -654,13 +661,13 @@
li r0,0
stw r0,0(r9) /* clear mmu_hash_lock */
mtmsr r10
- SYNC_601
isync
#else /* CONFIG_SMP */
tlbie r3
sync
#endif /* CONFIG_SMP */
blr
+_ASM_NOKPROBE_SYMBOL(_tlbie)
/*
* Flush the entire TLB. 603/603e only
@@ -670,11 +677,9 @@
lwz r8,TASK_CPU(r2)
oris r8,r8,10
mfmsr r10
- SYNC
rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */
rlwinm r0,r0,0,28,26 /* clear DR */
mtmsr r0
- SYNC_601
isync
lis r9,mmu_hash_lock@h
ori r9,r9,mmu_hash_lock@l
@@ -684,18 +689,21 @@
bne- 10b
stwcx. r8,0,r9
bne- 10b
+#endif /* CONFIG_SMP */
+ li r5, 32
+ lis r4, KERNELBASE@h
+ mtctr r5
sync
- tlbia
+0: tlbie r4
+ addi r4, r4, 0x1000
+ bdnz 0b
sync
+#ifdef CONFIG_SMP
TLBSYNC
li r0,0
stw r0,0(r9) /* clear mmu_hash_lock */
mtmsr r10
- SYNC_601
isync
-#else /* CONFIG_SMP */
- sync
- tlbia
- sync
#endif /* CONFIG_SMP */
blr
+_ASM_NOKPROBE_SYMBOL(_tlbia)
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 1424a12..602ab13 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -31,6 +31,8 @@
#include <mm/mmu_decl.h>
+u8 __initdata early_hash[SZ_256K] __aligned(SZ_256K) = {0};
+
struct hash_pte *Hash;
static unsigned long Hash_size, Hash_mask;
unsigned long _SDR1;
@@ -70,26 +72,16 @@
return 0;
}
-static int find_free_bat(void)
+int __init find_free_bat(void)
{
int b;
+ int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
- if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
- for (b = 0; b < 4; b++) {
- struct ppc_bat *bat = BATS[b];
+ for (b = 0; b < n; b++) {
+ struct ppc_bat *bat = BATS[b];
- if (!(bat[0].batl & 0x40))
- return b;
- }
- } else {
- int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
-
- for (b = 0; b < n; b++) {
- struct ppc_bat *bat = BATS[b];
-
- if (!(bat[1].batu & 3))
- return b;
- }
+ if (!(bat[1].batu & 3))
+ return b;
}
return -1;
}
@@ -97,16 +89,16 @@
/*
* This function calculates the size of the larger block usable to map the
* beginning of an area based on the start address and size of that area:
- * - max block size is 8M on 601 and 256 on other 6xx.
+ * - max block size is 256 on 6xx.
* - base address must be aligned to the block size. So the maximum block size
* is identified by the lowest bit set to 1 in the base address (for instance
* if base is 0x16000000, max size is 0x02000000).
* - block size has to be a power of two. This is calculated by finding the
* highest bit set to 1.
*/
-static unsigned int block_size(unsigned long base, unsigned long top)
+unsigned int bat_block_size(unsigned long base, unsigned long top)
{
- unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M;
+ unsigned int max_size = SZ_256M;
unsigned int base_shift = (ffs(base) - 1) & 31;
unsigned int block_shift = (fls(top - base) - 1) & 31;
@@ -117,7 +109,6 @@
* Set up one of the IBAT (block address translation) register pairs.
* The parameters are not checked; in particular size must be a power
* of 2 between 128k and 256M.
- * Only for 603+ ...
*/
static void setibat(int index, unsigned long virt, phys_addr_t phys,
unsigned int size, pgprot_t prot)
@@ -150,7 +141,7 @@
int idx;
while ((idx = find_free_bat()) != -1 && base != top) {
- unsigned int size = block_size(base, top);
+ unsigned int size = bat_block_size(base, top);
if (size < 128 << 10)
break;
@@ -170,6 +161,12 @@
pr_debug("RAM mapped without BATs\n");
return base;
}
+ if (debug_pagealloc_enabled()) {
+ if (base >= border)
+ return base;
+ if (top >= border)
+ top = border;
+ }
if (!strict_kernel_rwx_enabled() || base >= border || top <= border)
return __mmu_mapin_ram(base, top);
@@ -181,26 +178,40 @@
return __mmu_mapin_ram(border, top);
}
+static bool is_module_segment(unsigned long addr)
+{
+ if (!IS_ENABLED(CONFIG_MODULES))
+ return false;
+#ifdef MODULES_VADDR
+ if (addr < ALIGN_DOWN(MODULES_VADDR, SZ_256M))
+ return false;
+ if (addr > ALIGN(MODULES_END, SZ_256M) - 1)
+ return false;
+#else
+ if (addr < ALIGN_DOWN(VMALLOC_START, SZ_256M))
+ return false;
+ if (addr > ALIGN(VMALLOC_END, SZ_256M) - 1)
+ return false;
+#endif
+ return true;
+}
+
void mmu_mark_initmem_nx(void)
{
int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
int i;
unsigned long base = (unsigned long)_stext - PAGE_OFFSET;
- unsigned long top = (unsigned long)_etext - PAGE_OFFSET;
+ unsigned long top = ALIGN((unsigned long)_etext - PAGE_OFFSET, SZ_128K);
unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET;
unsigned long size;
- if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
- return;
-
- for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) {
- size = block_size(base, top);
+ for (i = 0; i < nb - 1 && base < top;) {
+ size = bat_block_size(base, top);
setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
base += size;
}
if (base < top) {
- size = block_size(base, top);
- size = max(size, 128UL << 10);
+ size = bat_block_size(base, top);
if ((top - base) > size) {
size <<= 1;
if (strict_kernel_rwx_enabled() && base + size > border)
@@ -217,9 +228,9 @@
for (i = TASK_SIZE >> 28; i < 16; i++) {
/* Do not set NX on VM space for modules */
- if (IS_ENABLED(CONFIG_MODULES) &&
- (VMALLOC_START & 0xf0000000) == i << 28)
- break;
+ if (is_module_segment(i << 28))
+ continue;
+
mtsrin(mfsrin(i << 28) | 0x10000000, i << 28);
}
}
@@ -229,9 +240,6 @@
int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
int i;
- if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
- return;
-
for (i = 0; i < nb; i++) {
struct ppc_bat *bat = BATS[i];
@@ -253,43 +261,39 @@
{
unsigned int bl;
int wimgxpp;
- struct ppc_bat *bat = BATS[index];
+ struct ppc_bat *bat;
unsigned long flags = pgprot_val(prot);
+ if (index == -1)
+ index = find_free_bat();
+ if (index == -1) {
+ pr_err("%s: no BAT available for mapping 0x%llx\n", __func__,
+ (unsigned long long)phys);
+ return;
+ }
+ bat = BATS[index];
+
if ((flags & _PAGE_NO_CACHE) ||
(cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0))
flags &= ~_PAGE_COHERENT;
bl = (size >> 17) - 1;
- if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) {
- /* 603, 604, etc. */
- /* Do DBAT first */
- wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
- | _PAGE_COHERENT | _PAGE_GUARDED);
- wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
- bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
- bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
- if (flags & _PAGE_USER)
- bat[1].batu |= 1; /* Vp = 1 */
- if (flags & _PAGE_GUARDED) {
- /* G bit must be zero in IBATs */
- flags &= ~_PAGE_EXEC;
- }
- if (flags & _PAGE_EXEC)
- bat[0] = bat[1];
- else
- bat[0].batu = bat[0].batl = 0;
- } else {
- /* 601 cpu */
- if (bl > BL_8M)
- bl = BL_8M;
- wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
- | _PAGE_COHERENT);
- wimgxpp |= (flags & _PAGE_RW)?
- ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX;
- bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */
- bat->batl = phys | bl | 0x40; /* V=1 */
+ /* Do DBAT first */
+ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE
+ | _PAGE_COHERENT | _PAGE_GUARDED);
+ wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX;
+ bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */
+ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp;
+ if (flags & _PAGE_USER)
+ bat[1].batu |= 1; /* Vp = 1 */
+ if (flags & _PAGE_GUARDED) {
+ /* G bit must be zero in IBATs */
+ flags &= ~_PAGE_EXEC;
}
+ if (flags & _PAGE_EXEC)
+ bat[0] = bat[1];
+ else
+ bat[0].batu = bat[0].batl = 0;
bat_addrs[index].start = virt;
bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1;
@@ -305,7 +309,7 @@
if (!Hash)
return;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
+ pmd = pmd_off(mm, ea);
if (!pmd_none(*pmd))
add_hash_page(mm->context.id, ea, pmd_val(*pmd));
}
@@ -392,20 +396,15 @@
hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg;
if (lg_n_hpteg > 16)
hash_mb2 = 16 - LG_HPTEG_SIZE;
-
- /*
- * When KASAN is selected, there is already an early temporary hash
- * table and the switch to the final hash table is done later.
- */
- if (IS_ENABLED(CONFIG_KASAN))
- return;
-
- MMU_init_hw_patch();
}
void __init MMU_init_hw_patch(void)
{
unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
+ unsigned int hash = (unsigned int)Hash - PAGE_OFFSET;
+
+ if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+ return;
if (ppc_md.progress)
ppc_md.progress("hash:patch", 0x345);
@@ -417,8 +416,7 @@
/*
* Patch up the instructions in hashtable.S:create_hpte
*/
- modify_instruction_site(&patch__hash_page_A0, 0xffff,
- ((unsigned int)Hash - PAGE_OFFSET) >> 16);
+ modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16);
modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6);
modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6);
modify_instruction_site(&patch__hash_page_B, 0xffff, hmask);
@@ -427,8 +425,7 @@
/*
* Patch up the instructions in hashtable.S:flush_hash_page
*/
- modify_instruction_site(&patch__flush_hash_A0, 0xffff,
- ((unsigned int)Hash - PAGE_OFFSET) >> 16);
+ modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16);
modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6);
modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6);
modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask);
@@ -442,11 +439,7 @@
*/
BUG_ON(first_memblock_base != 0);
- /* 601 can only access 16MB at the moment */
- if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000));
- else /* Anything else has 256M mapped */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
+ memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_256M));
}
void __init print_system_hash_info(void)
diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c
index 2fcd321..b6c7427 100644
--- a/arch/powerpc/mm/book3s32/tlb.c
+++ b/arch/powerpc/mm/book3s32/tlb.c
@@ -79,15 +79,18 @@
int count;
unsigned int ctx = mm->context.id;
+ start &= PAGE_MASK;
if (!Hash) {
- _tlbia();
+ if (end - start <= PAGE_SIZE)
+ _tlbie(start);
+ else
+ _tlbia();
return;
}
- start &= PAGE_MASK;
if (start >= end)
return;
end = (end - 1) | ~PAGE_MASK;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start);
+ pmd = pmd_off(mm, start);
for (;;) {
pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1;
if (pmd_end > end)
@@ -126,7 +129,7 @@
/*
* It is safe to go down the mm's list of vmas when called
- * from dup_mmap, holding mmap_sem. It would also be safe from
+ * from dup_mmap, holding mmap_lock. It would also be safe from
* unmap_region or exit_mmap, but not from vmtruncate on SMP -
* but it seems dup_mmap is the only SMP case which gets here.
*/
@@ -145,7 +148,7 @@
return;
}
mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm;
- pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
+ pmd = pmd_off(mm, vmaddr);
if (!pmd_none(*pmd))
flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
}
diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
index eefa89c..964467b 100644
--- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c
@@ -10,8 +10,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c
index 523e42e..52e170b 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -14,11 +14,11 @@
#include <linux/processor.h>
#include <linux/threads.h>
#include <linux/smp.h>
+#include <linux/pgtable.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/trace.h>
#include <asm/tlb.h>
#include <asm/cputable.h>
@@ -68,7 +68,7 @@
rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
- : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r)
+ : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
: "memory");
}
@@ -82,7 +82,7 @@
for (set = 0; set < num_sets; set++)
tlbiel_hash_set_isa206(set, is);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
}
static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
@@ -92,16 +92,15 @@
asm volatile("ptesync": : :"memory");
/*
- * Flush the first set of the TLB, and any caching of partition table
- * entries. Then flush the remaining sets of the TLB. Hash mode uses
- * partition scoped TLB translations.
+ * Flush the partition table cache if this is HV mode.
*/
- tlbiel_hash_set_isa300(0, is, 0, 2, 0);
- for (set = 1; set < num_sets; set++)
- tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+ if (early_cpu_has_feature(CPU_FTR_HVMODE))
+ tlbiel_hash_set_isa300(0, is, 0, 2, 0);
/*
- * Now invalidate the process table cache.
+ * Now invalidate the process table cache. UPRT=0 HPT modes (what
+ * current hardware implements) do not use the process table, but
+ * add the flushes anyway.
*
* From ISA v3.0B p. 1078:
* The following forms are invalid.
@@ -110,7 +109,15 @@
*/
tlbiel_hash_set_isa300(0, is, 0, 2, 1);
- asm volatile("ptesync": : :"memory");
+ /*
+ * Then flush the sets of the TLB proper. Hash mode uses
+ * partition scoped TLB translations, which may be flushed
+ * in !HV mode.
+ */
+ for (set = 0; set < num_sets; set++)
+ tlbiel_hash_set_isa300(set, is, 0, 0, 0);
+
+ ppc_after_tlbiel_barrier();
asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
}
@@ -303,7 +310,7 @@
asm volatile("ptesync": : :"memory");
if (use_local) {
__tlbiel(vpn, psize, apsize, ssize);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
} else {
__tlbie(vpn, psize, apsize, ssize);
fixup_tlbie_vpn(vpn, psize, apsize, ssize);
@@ -482,19 +489,12 @@
return ret;
}
-static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+static long __native_hpte_find(unsigned long want_v, unsigned long slot)
{
struct hash_pte *hptep;
- unsigned long hash;
+ unsigned long hpte_v;
unsigned long i;
- long slot;
- unsigned long want_v, hpte_v;
- hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
- want_v = hpte_encode_avpn(vpn, psize, ssize);
-
- /* Bolted mappings are only ever in the primary group */
- slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
for (i = 0; i < HPTES_PER_GROUP; i++) {
hptep = htab_address + slot;
@@ -508,6 +508,33 @@
return -1;
}
+static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+{
+ unsigned long hpte_group;
+ unsigned long want_v;
+ unsigned long hash;
+ long slot;
+
+ hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(vpn, psize, ssize);
+
+ /*
+ * We try to keep bolted entries always in primary hash
+ * But in some case we can find them in secondary too.
+ */
+ hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = __native_hpte_find(want_v, hpte_group);
+ if (slot < 0) {
+ /* Try in secondary */
+ hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot = __native_hpte_find(want_v, hpte_group);
+ if (slot < 0)
+ return -1;
+ }
+
+ return slot;
+}
+
/*
* Update the page protection bits. Intended to be used to create
* guard pages for kernel data structures on pages which are bolted
@@ -859,7 +886,7 @@
__tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
- asm volatile("ptesync":::"memory");
+ ppc_after_tlbiel_barrier();
} else {
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 64733b9..fd9c7f9 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -9,8 +9,6 @@
#include <linux/mm_types.h>
#include <linux/mm.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/sections.h>
#include <asm/mmu.h>
#include <asm/tlb.h>
@@ -148,6 +146,7 @@
int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -155,7 +154,8 @@
BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea);
@@ -236,7 +236,7 @@
* to hugepage, we first clear the pmd, then invalidate all
* the PTE entries. The assumption here is that any low level
* page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
+ * will wait on mmap_lock. But we could very well be in a
* hash_page with local ptep pointer value. Such a hash page
* can result in adding new HPTE entries for normal subpages.
* That means we could be modifying the page content as we
@@ -250,7 +250,7 @@
* Now invalidate the hpte entries in the range
* covered by pmd. This make sure we take a
* fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
+ * result in a major fault which takes mmap_lock and
* hence wait for collapse to complete. Without this
* the __collapse_huge_page_copy can result in copying
* the old content.
@@ -363,17 +363,6 @@
* hash fault look at them.
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
- /*
- * Serialize against find_current_mm_pte variants which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_curren_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
return old_pmd;
}
diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c
index 4a70d8d..eb0bcca 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -21,7 +21,6 @@
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/bug.h>
@@ -176,7 +175,6 @@
* from the hash table (and the TLB). But keeps
* the linux PTEs intact.
*
- * @mm : mm_struct of the target address space (generally init_mm)
* @start : starting address
* @end : ending address (not included in the flush)
*
@@ -189,17 +187,14 @@
* Because of that usage pattern, it is implemented for small size rather
* than speed.
*/
-void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
- unsigned long end)
+void __flush_hash_table_range(unsigned long start, unsigned long end)
{
- bool is_thp;
int hugepage_shift;
unsigned long flags;
- start = _ALIGN_DOWN(start, PAGE_SIZE);
- end = _ALIGN_UP(end, PAGE_SIZE);
+ start = ALIGN_DOWN(start, PAGE_SIZE);
+ end = ALIGN(end, PAGE_SIZE);
- BUG_ON(!mm->pgd);
/*
* Note: Normally, we should only ever use a batch within a
@@ -212,21 +207,15 @@
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
- pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
- &hugepage_shift);
+ pte_t *ptep = find_init_mm_pte(start, &hugepage_shift);
unsigned long pte;
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
- if (is_thp)
- trace_hugepage_invalidate(start, pte);
if (!(pte & H_PAGE_HASHPTE))
continue;
- if (unlikely(is_thp))
- hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
- else
- hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
+ hpte_need_flush(&init_mm, start, ptep, pte, hugepage_shift);
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
@@ -238,7 +227,7 @@
pte_t *start_pte;
unsigned long flags;
- addr = _ALIGN_DOWN(addr, PMD_SIZE);
+ addr = ALIGN_DOWN(addr, PMD_SIZE);
/*
* Note: Normally, we should only ever use a batch within a
* PTE locked section. This violates the rule, but will work
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 83c51a7..0141d57 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -7,7 +7,7 @@
*
* SMP scalability work:
* Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
+ *
* Module name: htab.c
*
* Description:
@@ -35,10 +35,10 @@
#include <linux/pkeys.h>
#include <linux/hugetlb.h>
#include <linux/cpu.h>
+#include <linux/pgtable.h>
#include <asm/debugfs.h>
#include <asm/processor.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/page.h>
@@ -66,6 +66,9 @@
#include <mm/mmu_decl.h>
+#include "internal.h"
+
+
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
@@ -257,12 +260,17 @@
DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
vstart, vend, pstart, prot, psize, ssize);
- for (vaddr = vstart, paddr = pstart; vaddr < vend;
- vaddr += step, paddr += step) {
+ /* Carefully map only the possible range */
+ vaddr = ALIGN(vstart, step);
+ paddr = ALIGN(pstart, step);
+ vend = ALIGN_DOWN(vend, step);
+
+ for (; vaddr < vend; vaddr += step, paddr += step) {
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, ssize);
unsigned long tprot = prot;
+ bool secondary_hash = false;
/*
* If we hit a bad address return error.
@@ -291,17 +299,27 @@
hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
BUG_ON(!mmu_hash_ops.hpte_insert);
+repeat:
ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
HPTE_V_BOLTED, psize, psize,
ssize);
if (ret == -1) {
- /* Try to remove a non bolted entry */
+ /*
+ * Try to to keep bolted entries in primary.
+ * Remove non bolted entries and try insert again
+ */
ret = mmu_hash_ops.hpte_remove(hpteg);
if (ret != -1)
ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
HPTE_V_BOLTED, psize, psize,
ssize);
+ if (ret == -1 && !secondary_hash) {
+ secondary_hash = true;
+ hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP);
+ goto repeat;
+ }
}
+
if (ret < 0)
break;
@@ -318,7 +336,7 @@
int htab_remove_mapping(unsigned long vstart, unsigned long vend,
int psize, int ssize)
{
- unsigned long vaddr;
+ unsigned long vaddr, time_limit;
unsigned int step, shift;
int rc;
int ret = 0;
@@ -329,8 +347,21 @@
if (!mmu_hash_ops.hpte_removebolted)
return -ENODEV;
- for (vaddr = vstart; vaddr < vend; vaddr += step) {
+ /* Unmap the full range specificied */
+ vaddr = ALIGN_DOWN(vstart, step);
+ time_limit = jiffies + HZ;
+
+ for (;vaddr < vend; vaddr += step) {
rc = mmu_hash_ops.hpte_removebolted(vaddr, psize, ssize);
+
+ /*
+ * For large number of mappings introduce a cond_resched()
+ * to prevent softlockup warnings.
+ */
+ if (time_after(jiffies, time_limit)) {
+ cond_resched();
+ time_limit = jiffies + HZ;
+ }
if (rc == -ENOENT) {
ret = -ENOENT;
continue;
@@ -582,7 +613,7 @@
}
#ifdef CONFIG_HUGETLB_PAGE
- if (!hugetlb_disabled) {
+ if (!hugetlb_disabled && !early_radix_enabled() ) {
/* Reserve 16G huge page memory sections for huge pages */
of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
}
@@ -641,6 +672,7 @@
static void __init htab_init_page_sizes(void)
{
+ bool aligned = true;
init_hpte_page_sizes();
if (!debug_pagealloc_enabled()) {
@@ -648,7 +680,14 @@
* Pick a size for the linear mapping. Currently, we only
* support 16M, 1M and 4K which is the default
*/
- if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) &&
+ (unsigned long)_stext % 0x1000000) {
+ if (mmu_psize_defs[MMU_PAGE_16M].shift)
+ pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n");
+ aligned = false;
+ }
+
+ if (mmu_psize_defs[MMU_PAGE_16M].shift && aligned)
mmu_linear_psize = MMU_PAGE_16M;
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
mmu_linear_psize = MMU_PAGE_1M;
@@ -765,7 +804,7 @@
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size)
{
unsigned target_hpt_shift;
@@ -789,7 +828,8 @@
return 0;
}
-int hash__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
int rc;
@@ -798,8 +838,10 @@
return -1;
}
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
rc = htab_bolt_mapping(start, end, __pa(start),
- pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+ pgprot_val(prot), mmu_linear_psize,
mmu_kernel_ssize);
if (rc < 0) {
@@ -815,6 +857,10 @@
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
mmu_kernel_ssize);
WARN_ON(rc < 0);
+
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
+
return rc;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -838,8 +884,8 @@
unsigned long table;
unsigned long pteg_count;
unsigned long prot;
- unsigned long base = 0, size = 0;
- struct memblock_region *reg;
+ phys_addr_t base = 0, size = 0, end;
+ u64 i;
DBG(" -> htab_initialize()\n");
@@ -849,10 +895,13 @@
printk(KERN_INFO "Using 1TB segments\n");
}
+ if (stress_slb_enabled)
+ static_branch_enable(&stress_slb_key);
+
/*
* Calculate the required size of the htab. We want the number of
* PTEGs to equal one half the number of real pages.
- */
+ */
htab_size_bytes = htab_get_table_size();
pteg_count = htab_size_bytes >> 7;
@@ -862,7 +911,7 @@
firmware_has_feature(FW_FEATURE_PS3_LV1)) {
/* Using a hypervisor which owns the htab */
htab_address = NULL;
- _SDR1 = 0;
+ _SDR1 = 0;
#ifdef CONFIG_FA_DUMP
/*
* If firmware assisted dump is active firmware preserves
@@ -928,9 +977,9 @@
#endif /* CONFIG_DEBUG_PAGEALLOC */
/* create bolted the linear mapping in the hash table */
- for_each_memblock(memory, reg) {
- base = (unsigned long)__va(reg->base);
- size = reg->size;
+ for_each_mem_range(i, &base, &end) {
+ size = end - base;
+ base = (unsigned long)__va(base);
DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
base, size, prot);
@@ -1084,6 +1133,11 @@
if (cpu_has_feature(CPU_FTR_ARCH_206)
&& cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_all();
+
+#ifdef CONFIG_PPC_MEM_KEYS
+ if (mmu_has_feature(MMU_FTR_PKEY))
+ mtspr(SPRN_UAMOR, default_uamor);
+#endif
}
#endif /* CONFIG_SMP */
@@ -1329,8 +1383,15 @@
goto bail;
}
- /* Add _PAGE_PRESENT to the required access perm */
- access |= _PAGE_PRESENT;
+ /*
+ * Add _PAGE_PRESENT to the required access perm. If there are parallel
+ * updates to the pte that can possibly clear _PAGE_PTE, catch that too.
+ *
+ * We can safely use the return pte address in rest of the function
+ * because we do set H_PAGE_BUSY which prevents further updates to pte
+ * from generic code.
+ */
+ access |= _PAGE_PRESENT | _PAGE_PTE;
/*
* Pre-check access permissions (will be re-checked atomically
@@ -1518,16 +1579,14 @@
}
#endif
-static void hash_preload(struct mm_struct *mm, unsigned long ea,
+static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
bool is_exec, unsigned long trap)
{
- int hugepage_shift;
unsigned long vsid;
pgd_t *pgdir;
- pte_t *ptep;
- unsigned long flags;
int rc, ssize, update_flags = 0;
unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
+ unsigned long flags;
BUG_ON(get_region_id(ea) != USER_REGION_ID);
@@ -1547,32 +1606,42 @@
vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
- /*
- * Hash doesn't like irqs. Walking linux page table with irq disabled
- * saves us from holding multiple locks.
- */
- local_irq_save(flags);
- /*
- * THP pages use update_mmu_cache_pmd. We don't do
- * hash preload there. Hence can ignore THP here
- */
- ptep = find_current_mm_pte(pgdir, ea, NULL, &hugepage_shift);
- if (!ptep)
- goto out_exit;
-
- WARN_ON(hugepage_shift);
#ifdef CONFIG_PPC_64K_PAGES
/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
* a 64K kernel), then we don't preload, hash_page() will take
* care of it once we actually try to access the page.
* That way we don't have to duplicate all of the logic for segment
* page size demotion here
+ * Called with PTL held, hence can be sure the value won't change in
+ * between.
*/
if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
- goto out_exit;
+ return;
#endif /* CONFIG_PPC_64K_PAGES */
+ /*
+ * __hash_page_* must run with interrupts off, as it sets the
+ * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any
+ * time and may take a hash fault reading the user stack, see
+ * read_user_stack_slow() in the powerpc/perf code.
+ *
+ * If that takes a hash fault on the same page as we lock here, it
+ * will bail out when seeing H_PAGE_BUSY set, and retry the access
+ * leading to an infinite loop.
+ *
+ * Disabling interrupts here does not prevent perf interrupts, but it
+ * will prevent them taking hash faults (see the NMI test in
+ * do_hash_page), then read_user_stack's copy_from_user_nofault will
+ * fail and perf will fall back to read_user_stack_slow(), which
+ * walks the Linux page tables.
+ *
+ * Interrupts must also be off for the duration of the
+ * mm_is_thread_local test and update, to prevent preempt running the
+ * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
+ */
+ local_irq_save(flags);
+
/* Is that local to this CPU ? */
if (mm_is_thread_local(mm))
update_flags |= HPTE_LOCAL_UPDATE;
@@ -1595,7 +1664,7 @@
mm_ctx_user_psize(&mm->context),
mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
-out_exit:
+
local_irq_restore(flags);
}
@@ -1617,10 +1686,8 @@
unsigned long trap;
bool is_exec;
- if (radix_enabled()) {
- prefetch((void *)address);
+ if (radix_enabled())
return;
- }
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
@@ -1647,33 +1714,9 @@
return;
}
- hash_preload(vma->vm_mm, address, is_exec, trap);
+ hash_preload(vma->vm_mm, ptep, address, is_exec, trap);
}
-#ifdef CONFIG_PPC_MEM_KEYS
-/*
- * Return the protection key associated with the given address and the
- * mm_struct.
- */
-u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
-{
- pte_t *ptep;
- u16 pkey = 0;
- unsigned long flags;
-
- if (!mm || !mm->pgd)
- return 0;
-
- local_irq_save(flags);
- ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
- if (ptep)
- pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
- local_irq_restore(flags);
-
- return pkey;
-}
-#endif /* CONFIG_PPC_MEM_KEYS */
-
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void tm_flush_hash_page(int local)
{
@@ -1715,10 +1758,6 @@
return gslot;
}
-/*
- * WARNING: This is called from hash_low_64.S, if you change this prototype,
- * do not forget to update the assembly call site !
- */
void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
unsigned long flags)
{
@@ -1998,11 +2037,8 @@
static int __init hash64_debugfs(void)
{
- if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root,
- NULL, &fops_hpt_order)) {
- pr_err("lpar: unable to create hpt_order debugsfs file\n");
- }
-
+ debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL,
+ &fops_hpt_order);
return 0;
}
machine_device_initcall(pseries, hash64_debugfs);
diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h
new file mode 100644
index 0000000..c12d78e
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/internal.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+#define ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H
+
+#include <linux/jump_label.h>
+
+extern bool stress_slb_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_slb_key);
+
+static inline bool stress_slb(void)
+{
+ return static_branch_unlikely(&stress_slb_key);
+}
+
+void slb_setup_new_exec(void);
+
+#endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index ef16485..563faa1 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -96,14 +96,14 @@
goto unlock_exit;
}
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) /
sizeof(struct vm_area_struct *);
chunk = min(chunk, entries);
for (entry = 0; entry < entries; entry += chunk) {
unsigned long n = min(entries - entry, chunk);
- ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+ ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
FOLL_WRITE | FOLL_LONGTERM,
mem->hpages + entry, NULL);
if (ret == n) {
@@ -114,7 +114,7 @@
pinned += ret;
break;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pinned != entries) {
if (!ret)
ret = -EFAULT;
@@ -170,9 +170,8 @@
return 0;
free_exit:
- /* free the reference taken */
- for (i = 0; i < pinned; i++)
- put_page(mem->hpages[i]);
+ /* free the references taken */
+ unpin_user_pages(mem->hpages, pinned);
vfree(mem->hpas);
kfree(mem);
@@ -218,7 +217,8 @@
if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
SetPageDirty(page);
- put_page(page);
+ unpin_user_page(page);
+
mem->hpas[i] = 0;
}
}
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index 0ba30b8..0c85572 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -17,10 +17,13 @@
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/slab.h>
+#include <linux/cpu.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
+#include "internal.h"
+
static DEFINE_IDA(mmu_context_ida);
static int alloc_context_id(int min_id, int max_id)
@@ -48,8 +51,6 @@
}
EXPORT_SYMBOL_GPL(hash__alloc_context_id);
-void slb_setup_new_exec(void);
-
static int realloc_context_ids(mm_context_t *ctx)
{
int i, id;
@@ -307,3 +308,22 @@
isync();
}
#endif
+
+/**
+ * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined)
+ *
+ * This clears the CPU from mm_cpumask for all processes, and then flushes the
+ * local TLB to ensure TLB coherency in case the CPU is onlined again.
+ *
+ * KVM guest translations are not necessarily flushed here. If KVM started
+ * using mm_cpumask or the Linux APIs which do, this would have to be resolved.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+void cleanup_cpu_mmu_context(void)
+{
+ int cpu = smp_processor_id();
+
+ clear_tasks_mm_cpumask(cpu);
+ tlbiel_all();
+}
+#endif
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 2bf7e1b..e18ae50 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -15,6 +15,7 @@
#include <asm/powernv.h>
#include <asm/firmware.h>
#include <asm/ultravisor.h>
+#include <asm/kexec.h>
#include <mm/mmu_decl.h>
#include <trace/events/thp.h>
@@ -109,17 +110,27 @@
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- /*
- * This ensures that generic code that rely on IRQ disabling
- * to prevent a parallel THP split work as expected.
- *
- * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
- * a special case check in pmd_access_permitted.
- */
- serialize_against_pte_lookup(vma->vm_mm);
return __pmd(old_pmd);
}
+pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp, int full)
+{
+ pmd_t pmd;
+ VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
+ /*
+ * if it not a fullmm flush, then we can possibly end up converting
+ * this PMD pte entry to a regular level 0 PTE by a parallel page fault.
+ * Make sure we flush the tlb in this case.
+ */
+ if (!full)
+ flush_pmd_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ return pmd;
+}
+
static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
return __pmd(pmd_val(pmd) | pgprot_val(pgprot));
@@ -146,19 +157,6 @@
pmdv &= _HPAGE_CHG_MASK;
return pmd_set_protbits(__pmd(pmdv), newprot);
}
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
- if (radix_enabled())
- prefetch((void *)addr);
-}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/* For use by kexec */
@@ -168,15 +166,18 @@
radix__mmu_cleanup_all();
else if (mmu_hash_ops.hpte_clear_all)
mmu_hash_ops.hpte_clear_all();
+
+ reset_sprs();
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
if (radix_enabled())
- return radix__create_section_mapping(start, end, nid);
+ return radix__create_section_mapping(start, end, nid, prot);
- return hash__create_section_mapping(start, end, nid);
+ return hash__create_section_mapping(start, end, nid, prot);
}
int __meminit remove_section_mapping(unsigned long start, unsigned long end)
@@ -341,6 +342,9 @@
{
struct page *page = virt_to_page(pmd);
+ if (PageReserved(page))
+ return free_reserved_page(page);
+
BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
if (atomic_dec_and_test(&page->pt_frag_refcount)) {
pgtable_pmd_page_dtor(page);
@@ -358,7 +362,7 @@
pmd_fragment_free(table);
break;
case PUD_INDEX:
- kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+ __pud_free(table);
break;
#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
/* 16M hugepd directory at pud level */
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 432fd9f..b1d091a 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -10,58 +10,97 @@
#include <asm/mmu.h>
#include <asm/setup.h>
#include <linux/pkeys.h>
-#include <linux/of_device.h>
+#include <linux/of_fdt.h>
-DEFINE_STATIC_KEY_TRUE(pkey_disabled);
-int pkeys_total; /* Total pkeys as per device tree */
-u32 initial_allocation_mask; /* Bits set for the initially allocated keys */
-u32 reserved_allocation_mask; /* Bits set for reserved keys */
-static bool pkey_execute_disable_supported;
-static bool pkeys_devtree_defined; /* property exported by device tree */
-static u64 pkey_amr_mask; /* Bits in AMR not to be touched */
-static u64 pkey_iamr_mask; /* Bits in AMR not to be touched */
-static u64 pkey_uamor_mask; /* Bits in UMOR not to be touched */
+int num_pkey; /* Max number of pkeys supported */
+/*
+ * Keys marked in the reservation list cannot be allocated by userspace
+ */
+u32 reserved_allocation_mask __ro_after_init;
+
+/* Bits set for the initially allocated keys */
+static u32 initial_allocation_mask __ro_after_init;
+
+/*
+ * Even if we allocate keys with sys_pkey_alloc(), we need to make sure
+ * other thread still find the access denied using the same keys.
+ */
+static u64 default_amr = ~0x0UL;
+static u64 default_iamr = 0x5555555555555555UL;
+u64 default_uamor __ro_after_init;
+/*
+ * Key used to implement PROT_EXEC mmap. Denies READ/WRITE
+ * We pick key 2 because 0 is special key and 1 is reserved as per ISA.
+ */
static int execute_only_key = 2;
+static bool pkey_execute_disable_supported;
+
#define AMR_BITS_PER_PKEY 2
#define AMR_RD_BIT 0x1UL
#define AMR_WR_BIT 0x2UL
#define IAMR_EX_BIT 0x1UL
-#define PKEY_REG_BITS (sizeof(u64)*8)
+#define PKEY_REG_BITS (sizeof(u64) * 8)
#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
-static void scan_pkey_feature(void)
+static int __init dt_scan_storage_keys(unsigned long node,
+ const char *uname, int depth,
+ void *data)
{
- u32 vals[2];
- struct device_node *cpu;
+ const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ const __be32 *prop;
+ int *pkeys_total = (int *) data;
- cpu = of_find_node_by_type(NULL, "cpu");
- if (!cpu)
- return;
+ /* We are scanning "cpu" nodes only */
+ if (type == NULL || strcmp(type, "cpu") != 0)
+ return 0;
- if (of_property_read_u32_array(cpu,
- "ibm,processor-storage-keys", vals, 2))
- return;
+ prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys", NULL);
+ if (!prop)
+ return 0;
+ *pkeys_total = be32_to_cpu(prop[0]);
+ return 1;
+}
+
+static int scan_pkey_feature(void)
+{
+ int ret;
+ int pkeys_total = 0;
/*
- * Since any pkey can be used for data or execute, we will just treat
- * all keys as equal and track them as one entity.
+ * Pkey is not supported with Radix translation.
*/
- pkeys_total = vals[0];
- pkeys_devtree_defined = true;
+ if (early_radix_enabled())
+ return 0;
+
+ ret = of_scan_flat_dt(dt_scan_storage_keys, &pkeys_total);
+ if (ret == 0) {
+ /*
+ * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device
+ * tree. We make this exception since some version of skiboot forgot to
+ * expose this property on power8/9.
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+ unsigned long pvr = mfspr(SPRN_PVR);
+
+ if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
+ PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9)
+ pkeys_total = 32;
+ }
+ }
+
+ /*
+ * Adjust the upper limit, based on the number of bits supported by
+ * arch-neutral code.
+ */
+ pkeys_total = min_t(int, pkeys_total,
+ ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) + 1));
+ return pkeys_total;
}
-static inline bool pkey_mmu_enabled(void)
+void __init pkey_early_init_devtree(void)
{
- if (firmware_has_feature(FW_FEATURE_LPAR))
- return pkeys_total;
- else
- return cpu_has_feature(CPU_FTR_PKEY);
-}
-
-static int pkey_initialize(void)
-{
- int os_reserved, i;
+ int pkeys_total, i;
/*
* We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
@@ -79,36 +118,21 @@
__builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
!= (sizeof(u64) * BITS_PER_BYTE));
+ /*
+ * Only P7 and above supports SPRN_AMR update with MSR[PR] = 1
+ */
+ if (!early_cpu_has_feature(CPU_FTR_ARCH_206))
+ return;
+
/* scan the device tree for pkey feature */
- scan_pkey_feature();
+ pkeys_total = scan_pkey_feature();
+ if (!pkeys_total)
+ goto out;
- /*
- * Let's assume 32 pkeys on P8/P9 bare metal, if its not defined by device
- * tree. We make this exception since some version of skiboot forgot to
- * expose this property on power8/9.
- */
- if (!pkeys_devtree_defined && !firmware_has_feature(FW_FEATURE_LPAR)) {
- unsigned long pvr = mfspr(SPRN_PVR);
+ /* Allow all keys to be modified by default */
+ default_uamor = ~0x0UL;
- if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
- PVR_VER(pvr) == PVR_POWER8NVL || PVR_VER(pvr) == PVR_POWER9)
- pkeys_total = 32;
- }
-
- /*
- * Adjust the upper limit, based on the number of bits supported by
- * arch-neutral code.
- */
- pkeys_total = min_t(int, pkeys_total,
- ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
-
- if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
- static_branch_enable(&pkey_disabled);
- else
- static_branch_disable(&pkey_disabled);
-
- if (static_branch_likely(&pkey_disabled))
- return 0;
+ cur_cpu_spec->mmu_features |= MMU_FTR_PKEY;
/*
* The device tree cannot be relied to indicate support for
@@ -122,53 +146,86 @@
#ifdef CONFIG_PPC_4K_PAGES
/*
* The OS can manage only 8 pkeys due to its inability to represent them
- * in the Linux 4K PTE.
+ * in the Linux 4K PTE. Mark all other keys reserved.
*/
- os_reserved = pkeys_total - 8;
+ num_pkey = min(8, pkeys_total);
#else
- os_reserved = 0;
+ num_pkey = pkeys_total;
#endif
- /* Bits are in LE format. */
- reserved_allocation_mask = (0x1 << 1) | (0x1 << execute_only_key);
- /* register mask is in BE format */
- pkey_amr_mask = ~0x0ul;
- pkey_amr_mask &= ~(0x3ul << pkeyshift(0));
-
- pkey_iamr_mask = ~0x0ul;
- pkey_iamr_mask &= ~(0x3ul << pkeyshift(0));
- pkey_iamr_mask &= ~(0x3ul << pkeyshift(execute_only_key));
-
- pkey_uamor_mask = ~0x0ul;
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(execute_only_key));
-
- /* mark the rest of the keys as reserved and hence unavailable */
- for (i = (pkeys_total - os_reserved); i < pkeys_total; i++) {
- reserved_allocation_mask |= (0x1 << i);
- pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
- }
- initial_allocation_mask = reserved_allocation_mask | (0x1 << 0);
-
- if (unlikely((pkeys_total - os_reserved) <= execute_only_key)) {
+ if (unlikely(num_pkey <= execute_only_key) || !pkey_execute_disable_supported) {
/*
* Insufficient number of keys to support
* execute only key. Mark it unavailable.
- * Any AMR, UAMOR, IAMR bit set for
- * this key is irrelevant since this key
- * can never be allocated.
*/
execute_only_key = -1;
+ } else {
+ /*
+ * Mark the execute_only_pkey as not available for
+ * user allocation via pkey_alloc.
+ */
+ reserved_allocation_mask |= (0x1 << execute_only_key);
+
+ /*
+ * Deny READ/WRITE for execute_only_key.
+ * Allow execute in IAMR.
+ */
+ default_amr |= (0x3ul << pkeyshift(execute_only_key));
+ default_iamr &= ~(0x1ul << pkeyshift(execute_only_key));
+
+ /*
+ * Clear the uamor bits for this key.
+ */
+ default_uamor &= ~(0x3ul << pkeyshift(execute_only_key));
}
- return 0;
-}
+ /*
+ * Allow access for only key 0. And prevent any other modification.
+ */
+ default_amr &= ~(0x3ul << pkeyshift(0));
+ default_iamr &= ~(0x1ul << pkeyshift(0));
+ default_uamor &= ~(0x3ul << pkeyshift(0));
+ /*
+ * key 0 is special in that we want to consider it an allocated
+ * key which is preallocated. We don't allow changing AMR bits
+ * w.r.t key 0. But one can pkey_free(key0)
+ */
+ initial_allocation_mask |= (0x1 << 0);
-arch_initcall(pkey_initialize);
+ /*
+ * key 1 is recommended not to be used. PowerISA(3.0) page 1015,
+ * programming note.
+ */
+ reserved_allocation_mask |= (0x1 << 1);
+ default_uamor &= ~(0x3ul << pkeyshift(1));
+
+ /*
+ * Prevent the usage of OS reserved keys. Update UAMOR
+ * for those keys. Also mark the rest of the bits in the
+ * 32 bit mask as reserved.
+ */
+ for (i = num_pkey; i < 32 ; i++) {
+ reserved_allocation_mask |= (0x1 << i);
+ default_uamor &= ~(0x3ul << pkeyshift(i));
+ }
+ /*
+ * Prevent the allocation of reserved keys too.
+ */
+ initial_allocation_mask |= reserved_allocation_mask;
+
+ pr_info("Enabling pkeys with max key count %d\n", num_pkey);
+out:
+ /*
+ * Setup uamor on boot cpu
+ */
+ mtspr(SPRN_UAMOR, default_uamor);
+
+ return;
+}
void pkey_mm_init(struct mm_struct *mm)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return;
mm_pkey_allocation_map(mm) = initial_allocation_mask;
mm->context.execute_only_pkey = execute_only_key;
@@ -200,30 +257,6 @@
mtspr(SPRN_IAMR, value);
}
-static inline u64 read_uamor(void)
-{
- return mfspr(SPRN_UAMOR);
-}
-
-static inline void write_uamor(u64 value)
-{
- mtspr(SPRN_UAMOR, value);
-}
-
-static bool is_pkey_enabled(int pkey)
-{
- u64 uamor = read_uamor();
- u64 pkey_bits = 0x3ul << pkeyshift(pkey);
- u64 uamor_pkey_bits = (uamor & pkey_bits);
-
- /*
- * Both the bits in UAMOR corresponding to the key should be set or
- * reset.
- */
- WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
- return !!(uamor_pkey_bits);
-}
-
static inline void init_amr(int pkey, u8 init_bits)
{
u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
@@ -249,8 +282,18 @@
{
u64 new_amr_bits = 0x0ul;
u64 new_iamr_bits = 0x0ul;
+ u64 pkey_bits, uamor_pkey_bits;
- if (!is_pkey_enabled(pkey))
+ /*
+ * Check whether the key is disabled by UAMOR.
+ */
+ pkey_bits = 0x3ul << pkeyshift(pkey);
+ uamor_pkey_bits = (default_uamor & pkey_bits);
+
+ /*
+ * Both the bits in UAMOR corresponding to the key should be set
+ */
+ if (uamor_pkey_bits != pkey_bits)
return -EINVAL;
if (init_val & PKEY_DISABLE_EXECUTE) {
@@ -272,7 +315,7 @@
void thread_pkey_regs_save(struct thread_struct *thread)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return;
/*
@@ -280,48 +323,33 @@
*/
thread->amr = read_amr();
thread->iamr = read_iamr();
- thread->uamor = read_uamor();
}
void thread_pkey_regs_restore(struct thread_struct *new_thread,
struct thread_struct *old_thread)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return;
if (old_thread->amr != new_thread->amr)
write_amr(new_thread->amr);
if (old_thread->iamr != new_thread->iamr)
write_iamr(new_thread->iamr);
- if (old_thread->uamor != new_thread->uamor)
- write_uamor(new_thread->uamor);
}
void thread_pkey_regs_init(struct thread_struct *thread)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return;
- thread->amr = pkey_amr_mask;
- thread->iamr = pkey_iamr_mask;
- thread->uamor = pkey_uamor_mask;
+ thread->amr = default_amr;
+ thread->iamr = default_iamr;
- write_uamor(pkey_uamor_mask);
- write_amr(pkey_amr_mask);
- write_iamr(pkey_iamr_mask);
+ write_amr(default_amr);
+ write_iamr(default_iamr);
}
-static inline bool pkey_allows_readwrite(int pkey)
-{
- int pkey_shift = pkeyshift(pkey);
-
- if (!is_pkey_enabled(pkey))
- return true;
-
- return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
-}
-
-int __execute_only_pkey(struct mm_struct *mm)
+int execute_only_pkey(struct mm_struct *mm)
{
return mm->context.execute_only_pkey;
}
@@ -329,7 +357,7 @@
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
{
/* Do this check first since the vm_flags should be hot */
- if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC)
return false;
return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
@@ -367,9 +395,6 @@
int pkey_shift;
u64 amr;
- if (!is_pkey_enabled(pkey))
- return true;
-
pkey_shift = pkeyshift(pkey);
if (execute)
return !(read_iamr() & (IAMR_EX_BIT << pkey_shift));
@@ -383,7 +408,7 @@
bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return true;
return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
@@ -397,22 +422,10 @@
* So do not enforce things if the VMA is not from the current mm, or if we are
* in a kernel thread.
*/
-static inline bool vma_is_foreign(struct vm_area_struct *vma)
-{
- if (!current->mm)
- return true;
-
- /* if it is not our ->mm, it has to be foreign */
- if (current->mm != vma->vm_mm)
- return true;
-
- return false;
-}
-
bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
bool execute, bool foreign)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return true;
/*
* Do not enforce our key-permissions on a foreign vma.
@@ -425,7 +438,7 @@
void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
{
- if (static_branch_likely(&pkey_disabled))
+ if (!mmu_has_feature(MMU_FTR_PKEY))
return;
/* Duplicate the oldmm pkey state in mm: */
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index cab0633..cb91071 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -2,8 +2,6 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/security.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
#include <asm/mman.h>
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index bdcb07a..2959594 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -13,10 +13,10 @@
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
+#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
-#include <linux/stop_machine.h>
+#include <linux/memory.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
@@ -25,6 +25,7 @@
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
+#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
@@ -33,6 +34,7 @@
unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;
+unsigned long radix_mem_block_size __ro_after_init;
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
@@ -55,6 +57,13 @@
return ptr;
}
+/*
+ * When allocating pud or pmd pointers, we allocate a complete page
+ * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
+ * is to ensure that the page obtained from the memblock allocator
+ * can be completely used as page table page and can be freed
+ * correctly when the page table entries are removed.
+ */
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
pgprot_t flags,
unsigned int map_page_size,
@@ -63,24 +72,26 @@
{
unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
pgdp = pgd_offset_k(ea);
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
- region_start, region_end);
- pgd_populate(&init_mm, pgdp, pudp);
+ p4dp = p4d_offset(pgdp, ea);
+ if (p4d_none(*p4dp)) {
+ pudp = early_alloc_pgtable(PAGE_SIZE, nid,
+ region_start, region_end);
+ p4d_populate(&init_mm, p4dp, pudp);
}
- pudp = pud_offset(pgdp, ea);
+ pudp = pud_offset(p4dp, ea);
if (map_page_size == PUD_SIZE) {
ptep = (pte_t *)pudp;
goto set_the_pte;
}
if (pud_none(*pudp)) {
- pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
- region_start, region_end);
+ pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
+ region_end);
pud_populate(&init_mm, pudp, pmdp);
}
pmdp = pmd_offset(pudp, ea);
@@ -113,6 +124,7 @@
{
unsigned long pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -135,7 +147,8 @@
* boot.
*/
pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
if (map_page_size == PUD_SIZE) {
@@ -172,6 +185,7 @@
{
unsigned long idx;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -184,7 +198,8 @@
for (idx = start; idx < end; idx += PAGE_SIZE) {
pgdp = pgd_offset_k(idx);
- pudp = pud_alloc(&init_mm, pgdp, idx);
+ p4dp = p4d_offset(pgdp, idx);
+ pudp = pud_alloc(&init_mm, p4dp, idx);
if (!pudp)
continue;
if (pud_is_leaf(*pudp)) {
@@ -252,19 +267,23 @@
static int __meminit create_physical_mapping(unsigned long start,
unsigned long end,
- int nid)
+ unsigned long max_mapping_size,
+ int nid, pgprot_t _prot)
{
unsigned long vaddr, addr, mapping_size = 0;
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
- start = _ALIGN_UP(start, PAGE_SIZE);
+ start = ALIGN(start, PAGE_SIZE);
+ end = ALIGN_DOWN(end, PAGE_SIZE);
for (addr = start; addr < end; addr += mapping_size) {
unsigned long gap, previous_size;
int rc;
gap = next_boundary(addr, end) - addr;
+ if (gap > max_mapping_size)
+ gap = max_mapping_size;
previous_size = mapping_size;
prev_exec = exec;
@@ -288,7 +307,7 @@
prot = PAGE_KERNEL_X;
exec = true;
} else {
- prot = PAGE_KERNEL;
+ prot = _prot;
exec = false;
}
@@ -311,32 +330,38 @@
static void __init radix_init_pgtable(void)
{
unsigned long rts_field;
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
/* We don't support slb for radix */
mmu_slb_size = 0;
+
/*
- * Create the linear mapping, using standard page size for now
+ * Create the linear mapping
*/
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
/*
* The memblock allocator is up at this point, so the
* page tables will be allocated within the range. No
* need or a node (which we don't have yet).
*/
- if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+ if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n");
continue;
}
- WARN_ON(create_physical_mapping(reg->base,
- reg->base + reg->size,
- -1));
+ WARN_ON(create_physical_mapping(start, end,
+ radix_mem_block_size,
+ -1, PAGE_KERNEL));
}
/* Find out how many PID bits are supported */
- if (cpu_has_feature(CPU_FTR_HVMODE)) {
+ if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
+ if (!mmu_pid_bits)
+ mmu_pid_bits = 20;
+ mmu_base_pid = 1;
+ } else if (cpu_has_feature(CPU_FTR_HVMODE)) {
if (!mmu_pid_bits)
mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -468,6 +493,58 @@
return 1;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+ depth, void *data)
+{
+ unsigned long *mem_block_size = (unsigned long *)data;
+ const __be32 *prop;
+ int len;
+
+ if (depth != 1)
+ return 0;
+
+ if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
+ return 0;
+
+ prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
+
+ if (!prop || len < dt_root_size_cells * sizeof(__be32))
+ /*
+ * Nothing in the device tree
+ */
+ *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
+ else
+ *mem_block_size = of_read_number(prop, dt_root_size_cells);
+ return 1;
+}
+
+static unsigned long radix_memory_block_size(void)
+{
+ unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
+
+ /*
+ * OPAL firmware feature is set by now. Hence we are ok
+ * to test OPAL feature.
+ */
+ if (firmware_has_feature(FW_FEATURE_OPAL))
+ mem_block_size = 1UL * 1024 * 1024 * 1024;
+ else
+ of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
+
+ return mem_block_size;
+}
+
+#else /* CONFIG_MEMORY_HOTPLUG */
+
+static unsigned long radix_memory_block_size(void)
+{
+ return 1UL * 1024 * 1024 * 1024;
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+
void __init radix__early_init_devtree(void)
{
int rc;
@@ -476,17 +553,27 @@
* Try to find the available page sizes in the device-tree
*/
rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
- if (rc != 0) /* Found */
- goto found;
- /*
- * let's assume we have page 4k and 64k support
- */
- mmu_psize_defs[MMU_PAGE_4K].shift = 12;
- mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+ if (!rc) {
+ /*
+ * No page size details found in device tree.
+ * Let's assume we have page 4k and 64k support
+ */
+ mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+ mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
- mmu_psize_defs[MMU_PAGE_64K].shift = 16;
- mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
-found:
+ mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+ mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+ }
+
+ /*
+ * Max mapping size used when mapping pages. We don't use
+ * ppc_md.memory_block_size() here because this get called
+ * early and we don't have machine probe called yet. Also
+ * the pseries implementation only check for ibm,lmb-size.
+ * All hypervisor supporting radix do expose that device
+ * tree node.
+ */
+ radix_mem_block_size = radix_memory_block_size();
return;
}
@@ -508,8 +595,10 @@
if (disabled || !early_radix_enabled())
return;
- if (smp_processor_id() == boot_cpuid)
+ if (smp_processor_id() == boot_cpuid) {
pr_info("Activating Kernel Userspace Execution Prevention\n");
+ cur_cpu_spec->mmu_features |= MMU_FTR_KUEP;
+ }
/*
* Radix always uses key0 of the IAMR to determine if an access is
@@ -533,6 +622,10 @@
/* Make sure userspace can't change the AMR */
mtspr(SPRN_UAMOR, 0);
+
+ /*
+ * Set the default kernel AMR values on all cpus.
+ */
mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
isync();
}
@@ -674,28 +767,19 @@
pud_clear(pud);
}
-struct change_mapping_params {
- pte_t *pte;
- unsigned long start;
- unsigned long end;
- unsigned long aligned_start;
- unsigned long aligned_end;
-};
-
-static int __meminit stop_machine_change_mapping(void *data)
+static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
- struct change_mapping_params *params =
- (struct change_mapping_params *)data;
+ pud_t *pud;
+ int i;
- if (!data)
- return -1;
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
- spin_unlock(&init_mm.page_table_lock);
- pte_clear(&init_mm, params->aligned_start, params->pte);
- create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
- create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
- spin_lock(&init_mm.page_table_lock);
- return 0;
+ pud_free(&init_mm, pud_start);
+ p4d_clear(p4d);
}
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
@@ -726,53 +810,7 @@
}
}
-/*
- * clear the pte and potentially split the mapping helper
- */
-static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
- unsigned long size, pte_t *pte)
-{
- unsigned long mask = ~(size - 1);
- unsigned long aligned_start = addr & mask;
- unsigned long aligned_end = addr + size;
- struct change_mapping_params params;
- bool split_region = false;
-
- if ((end - addr) < size) {
- /*
- * We're going to clear the PTE, but not flushed
- * the mapping, time to remap and flush. The
- * effects if visible outside the processor or
- * if we are running in code close to the
- * mapping we cleared, we are in trouble.
- */
- if (overlaps_kernel_text(aligned_start, addr) ||
- overlaps_kernel_text(end, aligned_end)) {
- /*
- * Hack, just return, don't pte_clear
- */
- WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
- "text, not splitting\n", addr, end);
- return;
- }
- split_region = true;
- }
-
- if (split_region) {
- params.pte = pte;
- params.start = addr;
- params.end = end;
- params.aligned_start = addr & ~(size - 1);
- params.aligned_end = min_t(unsigned long, aligned_end,
- (unsigned long)__va(memblock_end_of_DRAM()));
- stop_machine(stop_machine_change_mapping, ¶ms, NULL);
- return;
- }
-
- pte_clear(&init_mm, addr, pte);
-}
-
-static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
unsigned long end)
{
unsigned long next;
@@ -787,7 +825,12 @@
continue;
if (pmd_is_leaf(*pmd)) {
- split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+ pte_clear(&init_mm, addr, (pte_t *)pmd);
continue;
}
@@ -797,7 +840,7 @@
}
}
-static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
unsigned long end)
{
unsigned long next;
@@ -812,7 +855,12 @@
continue;
if (pud_is_leaf(*pud)) {
- split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+ if (!IS_ALIGNED(addr, PUD_SIZE) ||
+ !IS_ALIGNED(next, PUD_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+ pte_clear(&init_mm, addr, (pte_t *)pud);
continue;
}
@@ -827,6 +875,7 @@
unsigned long addr, next;
pud_t *pud_base;
pgd_t *pgd;
+ p4d_t *p4d;
spin_lock(&init_mm.page_table_lock);
@@ -834,30 +883,41 @@
next = pgd_addr_end(addr, end);
pgd = pgd_offset_k(addr);
- if (!pgd_present(*pgd))
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
continue;
- if (pgd_is_leaf(*pgd)) {
- split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
+ if (p4d_is_leaf(*p4d)) {
+ if (!IS_ALIGNED(addr, P4D_SIZE) ||
+ !IS_ALIGNED(next, P4D_SIZE)) {
+ WARN_ONCE(1, "%s: unaligned range\n", __func__);
+ continue;
+ }
+
+ pte_clear(&init_mm, addr, (pte_t *)pgd);
continue;
}
- pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+ pud_base = (pud_t *)p4d_page_vaddr(*p4d);
remove_pud_table(pud_base, addr, next);
+ free_pud_table(pud_base, p4d);
}
spin_unlock(&init_mm.page_table_lock);
radix__flush_tlb_kernel_range(start, end);
}
-int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __meminit radix__create_section_mapping(unsigned long start,
+ unsigned long end, int nid,
+ pgprot_t prot)
{
if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n");
return -1;
}
- return create_physical_mapping(__pa(start), __pa(end), nid);
+ return create_physical_mapping(__pa(start), __pa(end),
+ radix_mem_block_size, nid, prot);
}
int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
@@ -937,7 +997,13 @@
pmd = *pmdp;
pmd_clear(pmdp);
- /*FIXME!! Verify whether we need this kick below */
+ /*
+ * pmdp collapse_flush need to ensure that there are no parallel gup
+ * walk after this call. This is needed so that we can have stable
+ * page ref count when collapsing a page. We don't allow a collapse page
+ * if we have gup taken on the page. We can ensure that by sending IPI
+ * because gup walk happens with IRQ disabled.
+ */
serialize_against_pte_lookup(vma->vm_mm);
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
@@ -998,17 +1064,6 @@
old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old);
- /*
- * Serialize against find_current_mm_pte which does lock-less
- * lookup in page tables with local interrupts disabled. For huge pages
- * it casts pmd_t to pte_t. Since format of pte_t is different from
- * pmd_t we want to prevent transit from pmd pointing to page table
- * to pmd pointing to huge page (and back) while interrupts are disabled.
- * We clear pmd to possibly replace it with page table pointer in
- * different code paths. So make sure we wait for the parallel
- * find_current_mm_pte to finish.
- */
- serialize_against_pte_lookup(mm);
return old_pmd;
}
@@ -1097,7 +1152,7 @@
int pud_clear_huge(pud_t *pud)
{
- if (pud_huge(*pud)) {
+ if (pud_is_leaf(*pud)) {
pud_clear(pud);
return 1;
}
@@ -1144,7 +1199,7 @@
int pmd_clear_huge(pmd_t *pmd)
{
- if (pmd_huge(*pmd)) {
+ if (pmd_is_leaf(*pmd)) {
pmd_clear(pmd);
return 1;
}
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b0f240a..4c2f759 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -16,6 +16,7 @@
#include <asm/tlbflush.h>
#include <asm/trace.h>
#include <asm/cputhreads.h>
+#include <asm/plpar_wrappers.h>
#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
@@ -64,7 +65,7 @@
for (set = 1; set < num_sets; set++)
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
}
void radix__tlbiel_all(unsigned int action)
@@ -281,29 +282,37 @@
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
-static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
{
int set;
asm volatile("ptesync": : :"memory");
- /*
- * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
- * also flush the entire Page Walk Cache.
- */
- __tlbiel_pid(pid, 0, ric);
+ switch (ric) {
+ case RIC_FLUSH_PWC:
- /* For PWC, only one flush is needed */
- if (ric == RIC_FLUSH_PWC) {
- asm volatile("ptesync": : :"memory");
+ /* For PWC, only one flush is needed */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+ ppc_after_tlbiel_barrier();
return;
+ case RIC_FLUSH_TLB:
+ __tlbiel_pid(pid, 0, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ /*
+ * Flush the first set of the TLB, and if
+ * we're doing a RIC_FLUSH_ALL, also flush
+ * the entire Page Walk Cache.
+ */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
}
/* For the remaining sets, just flush the TLB */
for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
asm volatile(PPC_RADIX_INVALIDATE_ERAT_USER "; isync" : : :"memory");
}
@@ -430,7 +439,7 @@
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
}
static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
@@ -441,7 +450,7 @@
if (also_pwc)
__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
__tlbiel_va_range(start, end, pid, page_size, psize);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
}
static inline void __tlbie_va_range(unsigned long start, unsigned long end,
@@ -587,6 +596,11 @@
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_all_mm);
+
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
+{
+ radix__local_flush_all_mm(mm);
+}
#endif /* CONFIG_SMP */
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
@@ -698,7 +712,14 @@
goto local;
}
- if (cputlb_use_tlbie()) {
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie()) {
if (mm_needs_flush_escalation(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
@@ -731,7 +752,16 @@
goto local;
}
}
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type,
+ H_RPTI_PAGE_ALL, 0, -1UL);
+ } else if (cputlb_use_tlbie())
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
@@ -741,18 +771,13 @@
}
preempt_enable();
}
+
void radix__flush_all_mm(struct mm_struct *mm)
{
__flush_all_mm(mm, false);
}
EXPORT_SYMBOL(radix__flush_all_mm);
-void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
-{
- tlb->need_flush_all = 1;
-}
-EXPORT_SYMBOL(radix__flush_tlb_pwc);
-
void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
@@ -769,7 +794,19 @@
exit_flush_lazy_tlbs(mm);
goto local;
}
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, pg_sizes, size;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ pg_sizes = psize_to_rpti_pgsize(psize);
+ size = 1UL << mmu_psize_to_shift(psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+ pg_sizes, vmaddr,
+ vmaddr + size);
+ } else if (cputlb_use_tlbie())
_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
@@ -791,7 +828,7 @@
EXPORT_SYMBOL(radix__flush_tlb_page);
#else /* CONFIG_SMP */
-#define radix__flush_all_mm radix__local_flush_all_mm
+static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
#endif /* CONFIG_SMP */
static void do_tlbiel_kernel(void *info)
@@ -819,7 +856,14 @@
*/
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU;
+ unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+
+ pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL,
+ start, end);
+ } else if (cputlb_use_tlbie())
_tlbie_pid(0, RIC_FLUSH_ALL);
else
_tlbiel_kernel_broadcast();
@@ -841,8 +885,7 @@
static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
static inline void __radix__flush_tlb_range(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- bool flush_all_sizes)
+ unsigned long start, unsigned long end)
{
unsigned long pid;
@@ -874,7 +917,17 @@
nr_pages > tlb_local_single_page_flush_ceiling);
}
- if (full) {
+ if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ pg_sizes |= psize_to_rpti_pgsize(MMU_PAGE_2M);
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, pg_sizes,
+ start, end);
+ } else if (full) {
if (local) {
_tlbiel_pid(pid, RIC_FLUSH_TLB);
} else {
@@ -888,26 +941,14 @@
}
}
} else {
- bool hflush = flush_all_sizes;
- bool gflush = flush_all_sizes;
+ bool hflush = false;
unsigned long hstart, hend;
- unsigned long gstart, gend;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
- hflush = true;
-
- if (hflush) {
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
hstart = (start + PMD_SIZE - 1) & PMD_MASK;
hend = end & PMD_MASK;
- if (hstart == hend)
- hflush = false;
- }
-
- if (gflush) {
- gstart = (start + PUD_SIZE - 1) & PUD_MASK;
- gend = end & PUD_MASK;
- if (gstart == gend)
- gflush = false;
+ if (hstart < hend)
+ hflush = true;
}
if (local) {
@@ -916,20 +957,13 @@
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
PMD_SIZE, MMU_PAGE_2M);
- if (gflush)
- __tlbiel_va_range(gstart, gend, pid,
- PUD_SIZE, MMU_PAGE_1G);
- asm volatile("ptesync": : :"memory");
+ ppc_after_tlbiel_barrier();
} else if (cputlb_use_tlbie()) {
asm volatile("ptesync": : :"memory");
__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
if (hflush)
__tlbie_va_range(hstart, hend, pid,
PMD_SIZE, MMU_PAGE_2M);
- if (gflush)
- __tlbie_va_range(gstart, gend, pid,
- PUD_SIZE, MMU_PAGE_1G);
-
asm volatile("eieio; tlbsync; ptesync": : :"memory");
} else {
_tlbiel_va_range_multicast(mm,
@@ -937,9 +971,6 @@
if (hflush)
_tlbiel_va_range_multicast(mm,
hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false);
- if (gflush)
- _tlbiel_va_range_multicast(mm,
- gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false);
}
}
preempt_enable();
@@ -954,7 +985,7 @@
return radix__flush_hugetlb_tlb_range(vma, start, end);
#endif
- __radix__flush_tlb_range(vma->vm_mm, start, end, false);
+ __radix__flush_tlb_range(vma->vm_mm, start, end);
}
EXPORT_SYMBOL(radix__flush_tlb_range);
@@ -1030,56 +1061,22 @@
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (tlb->fullmm) {
+ if (tlb->fullmm || tlb->need_flush_all) {
__flush_all_mm(mm, true);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
- } else if (mm_tlb_flush_nested(mm)) {
- /*
- * If there is a concurrent invalidation that is clearing ptes,
- * then it's possible this invalidation will miss one of those
- * cleared ptes and miss flushing the TLB. If this invalidate
- * returns before the other one flushes TLBs, that can result
- * in it returning while there are still valid TLBs inside the
- * range to be invalidated.
- *
- * See mm/memory.c:tlb_finish_mmu() for more details.
- *
- * The solution to this is ensure the entire range is always
- * flushed here. The problem for powerpc is that the flushes
- * are page size specific, so this "forced flush" would not
- * do the right thing if there are a mix of page sizes in
- * the range to be invalidated. So use __flush_tlb_range
- * which invalidates all possible page sizes in the range.
- *
- * PWC flush probably is not be required because the core code
- * shouldn't free page tables in this path, but accounting
- * for the possibility makes us a bit more robust.
- *
- * need_flush_all is an uncommon case because page table
- * teardown should be done with exclusive locks held (but
- * after locks are dropped another invalidate could come
- * in), it could be optimized further if necessary.
- */
- if (!tlb->need_flush_all)
- __radix__flush_tlb_range(mm, start, end, true);
- else
- radix__flush_all_mm(mm);
-#endif
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
- if (!tlb->need_flush_all)
+ if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
else
radix__flush_all_mm(mm);
} else {
- if (!tlb->need_flush_all)
+ if (!tlb->freed_tables)
radix__flush_tlb_range_psize(mm, start, end, psize);
else
radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
}
- tlb->need_flush_all = 0;
}
-static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned long start, unsigned long end,
int psize, bool also_pwc)
{
@@ -1112,7 +1109,17 @@
nr_pages > tlb_local_single_page_flush_ceiling);
}
- if (full) {
+ if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+ unsigned long tgt = H_RPTI_TARGET_CMMU;
+ unsigned long type = H_RPTI_TYPE_TLB;
+ unsigned long pg_sizes = psize_to_rpti_pgsize(psize);
+
+ if (also_pwc)
+ type |= H_RPTI_TYPE_PWC;
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes, start, end);
+ } else if (full) {
if (local) {
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
} else {
@@ -1177,7 +1184,19 @@
exit_flush_lazy_tlbs(mm);
goto local;
}
- if (cputlb_use_tlbie())
+ if (!mmu_has_feature(MMU_FTR_GTSE)) {
+ unsigned long tgt, type, pg_sizes;
+
+ tgt = H_RPTI_TARGET_CMMU;
+ type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PRT;
+ pg_sizes = psize_to_rpti_pgsize(mmu_virtual_psize);
+
+ if (atomic_read(&mm->context.copros) > 0)
+ tgt |= H_RPTI_TARGET_NMMU;
+ pseries_rpt_invalidate(pid, tgt, type, pg_sizes,
+ addr, end);
+ } else if (cputlb_use_tlbie())
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
else
_tlbiel_va_range_multicast(mm,
@@ -1230,6 +1249,9 @@
if (unlikely(pid == MMU_NO_CONTEXT))
return;
+ if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ return;
+
/*
* If this context hasn't run on that CPU before and KVM is
* around, there's a slim chance that the guest on another
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 716204a..c30fcbf 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -10,7 +10,6 @@
*/
#include <asm/asm-prototypes.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/paca.h>
@@ -21,10 +20,14 @@
#include <linux/compiler.h>
#include <linux/context_tracking.h>
#include <linux/mm_types.h>
+#include <linux/pgtable.h>
#include <asm/udbg.h>
#include <asm/code-patching.h>
+#include "internal.h"
+
+
enum slb_index {
LINEAR_INDEX = 0, /* Kernel linear map (0xc000000000000000) */
KSTACK_INDEX = 1, /* Kernel stack map */
@@ -54,6 +57,17 @@
return __mk_vsid_data(get_kernel_vsid(ea, ssize), ssize, flags);
}
+bool stress_slb_enabled __initdata;
+
+static int __init parse_stress_slb(char *p)
+{
+ stress_slb_enabled = true;
+ return 0;
+}
+early_param("stress_slb", parse_stress_slb);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key);
+
static void assert_slb_presence(bool present, unsigned long ea)
{
#ifdef CONFIG_DEBUG_VM
@@ -68,7 +82,7 @@
* slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
* ignores all other bits from 0-27, so just clear them all.
*/
- ea &= ~((1UL << 28) - 1);
+ ea &= ~((1UL << SID_SHIFT) - 1);
asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
WARN_ON(present == (tmp == 0));
@@ -153,14 +167,42 @@
asm volatile("slbmte %0,%0; slbia" : : "r" (0));
}
+static __always_inline void __slb_flush_and_restore_bolted(bool preserve_kernel_lookaside)
+{
+ struct slb_shadow *p = get_slb_shadow();
+ unsigned long ksp_esid_data, ksp_vsid_data;
+ u32 ih;
+
+ /*
+ * SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside
+ * information created with Class=0 entries, which we use for kernel
+ * SLB entries (the SLB entries themselves are still invalidated).
+ *
+ * Older processors will ignore this optimisation. Over-invalidation
+ * is fine because we never rely on lookaside information existing.
+ */
+ if (preserve_kernel_lookaside)
+ ih = 1;
+ else
+ ih = 0;
+
+ ksp_esid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
+ ksp_vsid_data = be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
+
+ asm volatile(PPC_SLBIA(%0)" \n"
+ "slbmte %1, %2 \n"
+ :: "i" (ih),
+ "r" (ksp_vsid_data),
+ "r" (ksp_esid_data)
+ : "memory");
+}
+
/*
* This flushes non-bolted entries, it can be run in virtual mode. Must
* be called with interrupts disabled.
*/
void slb_flush_and_restore_bolted(void)
{
- struct slb_shadow *p = get_slb_shadow();
-
BUILD_BUG_ON(SLB_NUM_BOLTED != 2);
WARN_ON(!irqs_disabled());
@@ -171,13 +213,10 @@
*/
hard_irq_disable();
- asm volatile("isync\n"
- "slbia\n"
- "slbmte %0, %1\n"
- "isync\n"
- :: "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].vsid)),
- "r" (be64_to_cpu(p->save_area[KSTACK_INDEX].esid))
- : "memory");
+ isync();
+ __slb_flush_and_restore_bolted(false);
+ isync();
+
assert_slb_presence(true, get_paca()->kstack);
get_paca()->slb_cache_ptr = 0;
@@ -400,6 +439,30 @@
local_irq_enable();
}
+static void slb_cache_slbie_kernel(unsigned int index)
+{
+ unsigned long slbie_data = get_paca()->slb_cache[index];
+ unsigned long ksp = get_paca()->kstack;
+
+ slbie_data <<= SID_SHIFT;
+ slbie_data |= 0xc000000000000000ULL;
+ if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data)
+ return;
+ slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT;
+
+ asm volatile("slbie %0" : : "r" (slbie_data));
+}
+
+static void slb_cache_slbie_user(unsigned int index)
+{
+ unsigned long slbie_data = get_paca()->slb_cache[index];
+
+ slbie_data <<= SID_SHIFT;
+ slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
+ slbie_data |= SLBIE_C; /* user slbs have C=1 */
+
+ asm volatile("slbie %0" : : "r" (slbie_data));
+}
/* Flush all user entries from the segment table of the current processor. */
void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
@@ -414,8 +477,14 @@
* which would update the slb_cache/slb_cache_ptr fields in the PACA.
*/
hard_irq_disable();
- asm volatile("isync" : : : "memory");
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ isync();
+ if (stress_slb()) {
+ __slb_flush_and_restore_bolted(false);
+ isync();
+ get_paca()->slb_cache_ptr = 0;
+ get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
+
+ } else if (cpu_has_feature(CPU_FTR_ARCH_300)) {
/*
* SLBIA IH=3 invalidates all Class=1 SLBEs and their
* associated lookaside structures, which matches what
@@ -423,47 +492,29 @@
* cache.
*/
asm volatile(PPC_SLBIA(3));
+
} else {
unsigned long offset = get_paca()->slb_cache_ptr;
if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) &&
offset <= SLB_CACHE_ENTRIES) {
- unsigned long slbie_data = 0;
+ /*
+ * Could assert_slb_presence(true) here, but
+ * hypervisor or machine check could have come
+ * in and removed the entry at this point.
+ */
- for (i = 0; i < offset; i++) {
- unsigned long ea;
-
- ea = (unsigned long)
- get_paca()->slb_cache[i] << SID_SHIFT;
- /*
- * Could assert_slb_presence(true) here, but
- * hypervisor or machine check could have come
- * in and removed the entry at this point.
- */
-
- slbie_data = ea;
- slbie_data |= user_segment_size(slbie_data)
- << SLBIE_SSIZE_SHIFT;
- slbie_data |= SLBIE_C; /* user slbs have C=1 */
- asm volatile("slbie %0" : : "r" (slbie_data));
- }
+ for (i = 0; i < offset; i++)
+ slb_cache_slbie_user(i);
/* Workaround POWER5 < DD2.1 issue */
if (!cpu_has_feature(CPU_FTR_ARCH_207S) && offset == 1)
- asm volatile("slbie %0" : : "r" (slbie_data));
+ slb_cache_slbie_user(0);
} else {
- struct slb_shadow *p = get_slb_shadow();
- unsigned long ksp_esid_data =
- be64_to_cpu(p->save_area[KSTACK_INDEX].esid);
- unsigned long ksp_vsid_data =
- be64_to_cpu(p->save_area[KSTACK_INDEX].vsid);
-
- asm volatile(PPC_SLBIA(1) "\n"
- "slbmte %0,%1\n"
- "isync"
- :: "r"(ksp_vsid_data),
- "r"(ksp_esid_data));
+ /* Flush but retain kernel lookaside information */
+ __slb_flush_and_restore_bolted(true);
+ isync();
get_paca()->slb_kern_bitmap = (1U << SLB_NUM_BOLTED) - 1;
}
@@ -503,7 +554,7 @@
* address accesses by the kernel (user mode won't happen until
* rfid, which is safe).
*/
- asm volatile("isync" : : : "memory");
+ isync();
}
void slb_set_size(u16 size)
@@ -571,6 +622,9 @@
if (cpu_has_feature(CPU_FTR_ARCH_300))
return; /* ISAv3.0B and later does not use slb_cache */
+ if (stress_slb())
+ return;
+
/*
* Now update slb cache entries
*/
@@ -580,7 +634,7 @@
* We have space in slb cache for optimized switch_slb().
* Top 36 bits from esid_data as per ISA
*/
- local_paca->slb_cache[slb_cache_index++] = esid_data >> 28;
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
local_paca->slb_cache_ptr++;
} else {
/*
@@ -671,6 +725,28 @@
* accesses user memory before it returns to userspace with rfid.
*/
assert_slb_presence(false, ea);
+ if (stress_slb()) {
+ int slb_cache_index = local_paca->slb_cache_ptr;
+
+ /*
+ * stress_slb() does not use slb cache, repurpose as a
+ * cache of inserted (non-bolted) kernel SLB entries. All
+ * non-bolted kernel entries are flushed on any user fault,
+ * or if there are already 3 non-boled kernel entries.
+ */
+ BUILD_BUG_ON(SLB_CACHE_ENTRIES < 3);
+ if (!kernel || slb_cache_index == 3) {
+ int i;
+
+ for (i = 0; i < slb_cache_index; i++)
+ slb_cache_slbie_kernel(i);
+ slb_cache_index = 0;
+ }
+
+ if (kernel)
+ local_paca->slb_cache[slb_cache_index++] = esid_data >> SID_SHIFT;
+ local_paca->slb_cache_ptr = slb_cache_index;
+ }
asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data));
barrier();
@@ -689,8 +765,8 @@
if (id == LINEAR_MAP_REGION_ID) {
- /* We only support upto MAX_PHYSMEM_BITS */
- if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS))
+ /* We only support upto H_MAX_PHYSMEM_BITS */
+ if ((ea & EA_MASK) > (1UL << H_MAX_PHYSMEM_BITS))
return -EFAULT;
flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp;
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 2ef24a5..60c6ea1 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -11,7 +11,7 @@
#include <linux/hugetlb.h>
#include <linux/syscalls.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
/*
@@ -54,15 +54,17 @@
int npages)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd))
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
return;
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return;
pmd = pmd_offset(pud, addr);
@@ -92,7 +94,7 @@
size_t nw;
unsigned long next, limit;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
spt = mm_ctx_subpage_prot(&mm->context);
if (!spt)
@@ -127,7 +129,7 @@
}
err_out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -217,13 +219,13 @@
if (!access_ok(map, (len >> PAGE_SHIFT) * sizeof(u32)))
return -EFAULT;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
spt = mm_ctx_subpage_prot(&mm->context);
if (!spt) {
/*
* Allocate subpage prot table if not already done.
- * Do this with mmap_sem held
+ * Do this with mmap_lock held
*/
spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL);
if (!spt) {
@@ -267,11 +269,11 @@
if (addr + (nw << PAGE_SHIFT) > next)
nw = (next - addr) >> PAGE_SHIFT;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (__copy_from_user(spp, map, nw * sizeof(u32)))
return -EFAULT;
map += nw;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
/* now flush any existing HPTEs for the range */
hpte_flush_range(mm, addr, nw);
@@ -280,6 +282,6 @@
spt->maxaddr = limit;
err = 0;
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return err;
}
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index beb060b..8acd001 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -33,7 +33,7 @@
if (mm->pgd == NULL)
return -EFAULT;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
ret = -EFAULT;
vma = find_vma(mm, ea);
if (!vma)
@@ -64,7 +64,7 @@
}
ret = 0;
- *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0);
+ *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
@@ -76,13 +76,8 @@
BUG();
}
- if (*flt & VM_FAULT_MAJOR)
- current->maj_flt++;
- else
- current->min_flt++;
-
out_unlock:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c
index 2a82984..30260b5 100644
--- a/arch/powerpc/mm/dma-noncoherent.c
+++ b/arch/powerpc/mm/dma-noncoherent.c
@@ -11,7 +11,7 @@
#include <linux/types.h>
#include <linux/highmem.h>
#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
+#include <linux/dma-map-ops.h>
#include <asm/tlbflush.h>
#include <asm/dma.h>
@@ -104,14 +104,14 @@
#endif
}
-void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
- size_t size, enum dma_data_direction dir)
+void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
{
__dma_sync_page(paddr, size, dir);
}
-void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
- size_t size, enum dma_data_direction dir)
+void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
{
__dma_sync_page(paddr, size, dir);
}
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 873fcfc..9af3832 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -14,6 +14,8 @@
#include <asm/prom.h>
#include <asm/drmem.h>
+static int n_root_addr_cells, n_root_size_cells;
+
static struct drmem_lmb_info __drmem_info;
struct drmem_lmb_info *drmem_info = &__drmem_info;
@@ -189,12 +191,13 @@
return rc;
}
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
const __be32 **prop)
{
const __be32 *p = *prop;
- lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+ lmb->base_addr = of_read_number(p, n_root_addr_cells);
+ p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);
p++; /* skip reserved field */
@@ -205,29 +208,33 @@
*prop = p;
}
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
- void (*func)(struct drmem_lmb *, const __be32 **))
+static int
+__walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
struct drmem_lmb lmb;
u32 i, n_lmbs;
+ int ret = 0;
n_lmbs = of_read_number(prop++, 1);
- if (n_lmbs == 0)
- return;
-
for (i = 0; i < n_lmbs; i++) {
read_drconf_v1_cell(&lmb, &prop);
- func(&lmb, &usm);
+ ret = func(&lmb, &usm, data);
+ if (ret)
+ break;
}
+
+ return ret;
}
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
const __be32 **prop)
{
const __be32 *p = *prop;
dr_cell->seq_lmbs = of_read_number(p++, 1);
- dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+ dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+ p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -235,17 +242,16 @@
*prop = p;
}
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
- void (*func)(struct drmem_lmb *, const __be32 **))
+static int
+__walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
struct of_drconf_cell_v2 dr_cell;
struct drmem_lmb lmb;
u32 i, j, lmb_sets;
+ int ret = 0;
lmb_sets = of_read_number(prop++, 1);
- if (lmb_sets == 0)
- return;
-
for (i = 0; i < lmb_sets; i++) {
read_drconf_v2_cell(&dr_cell, &prop);
@@ -259,21 +265,29 @@
lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
- func(&lmb, &usm);
+ ret = func(&lmb, &usm, data);
+ if (ret)
+ break;
}
}
+
+ return ret;
}
#ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
- void (*func)(struct drmem_lmb *, const __be32 **))
+int __init walk_drmem_lmbs_early(unsigned long node, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
const __be32 *prop, *usm;
- int len;
+ int len, ret = -ENODEV;
prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
if (!prop || len < dt_root_size_cells * sizeof(__be32))
- return;
+ return ret;
+
+ /* Get the address & size cells */
+ n_root_addr_cells = dt_root_addr_cells;
+ n_root_size_cells = dt_root_size_cells;
drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
@@ -281,20 +295,21 @@
prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len);
if (prop) {
- __walk_drmem_v1_lmbs(prop, usm, func);
+ ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
} else {
prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2",
&len);
if (prop)
- __walk_drmem_v2_lmbs(prop, usm, func);
+ ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
}
memblock_dump_all();
+ return ret;
}
#endif
-static int __init init_drmem_lmb_size(struct device_node *dn)
+static int init_drmem_lmb_size(struct device_node *dn)
{
const __be32 *prop;
int len;
@@ -303,12 +318,12 @@
return 0;
prop = of_get_property(dn, "ibm,lmb-size", &len);
- if (!prop || len < dt_root_size_cells * sizeof(__be32)) {
+ if (!prop || len < n_root_size_cells * sizeof(__be32)) {
pr_info("Could not determine LMB size\n");
return -1;
}
- drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
+ drmem_info->lmb_size = of_read_number(prop, n_root_size_cells);
return 0;
}
@@ -329,24 +344,36 @@
return prop;
}
-void __init walk_drmem_lmbs(struct device_node *dn,
- void (*func)(struct drmem_lmb *, const __be32 **))
+int walk_drmem_lmbs(struct device_node *dn, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
const __be32 *prop, *usm;
+ int ret = -ENODEV;
+
+ if (!of_root)
+ return ret;
+
+ /* Get the address & size cells */
+ of_node_get(of_root);
+ n_root_addr_cells = of_n_addr_cells(of_root);
+ n_root_size_cells = of_n_size_cells(of_root);
+ of_node_put(of_root);
if (init_drmem_lmb_size(dn))
- return;
+ return ret;
usm = of_get_usable_memory(dn);
prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
if (prop) {
- __walk_drmem_v1_lmbs(prop, usm, func);
+ ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
} else {
prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
if (prop)
- __walk_drmem_v2_lmbs(prop, usm, func);
+ ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
}
+
+ return ret;
}
static void __init init_drmem_v1_lmbs(const __be32 *prop)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 9f4a78e..55fdba8 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -35,46 +35,14 @@
#include <asm/firmware.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
#include <asm/siginfo.h>
#include <asm/debug.h>
#include <asm/kup.h>
+#include <asm/inst.h>
-/*
- * Check whether the instruction inst is a store using
- * an update addressing form which will update r1.
- */
-static bool store_updates_sp(unsigned int inst)
-{
- /* check for 1 in the rA field */
- if (((inst >> 16) & 0x1f) != 1)
- return false;
- /* check major opcode */
- switch (inst >> 26) {
- case OP_STWU:
- case OP_STBU:
- case OP_STHU:
- case OP_STFSU:
- case OP_STFDU:
- return true;
- case OP_STD: /* std or stdu */
- return (inst & 3) == 1;
- case OP_31:
- /* check minor opcode */
- switch ((inst >> 1) & 0x3ff) {
- case OP_31_XOP_STDUX:
- case OP_31_XOP_STWUX:
- case OP_31_XOP_STBUX:
- case OP_31_XOP_STHUX:
- case OP_31_XOP_STFSUX:
- case OP_31_XOP_STFDUX:
- return true;
- }
- }
- return false;
-}
+
/*
* do_page_fault error handling helpers
*/
@@ -108,7 +76,7 @@
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return __bad_area_nosemaphore(regs, address, si_code);
}
@@ -118,9 +86,34 @@
return __bad_area(regs, address, SEGV_MAPERR);
}
-static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
- int pkey)
+#ifdef CONFIG_PPC_MEM_KEYS
+static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address,
+ struct vm_area_struct *vma)
{
+ struct mm_struct *mm = current->mm;
+ int pkey;
+
+ /*
+ * We don't try to fetch the pkey from page table because reading
+ * page table without locking doesn't guarantee stable pte value.
+ * Hence the pkey value that we return to userspace can be different
+ * from the pkey that actually caused access error.
+ *
+ * It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set AMR to deny access to pkey=4, touches, page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_lock, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+ pkey = vma_pkey(vma);
+
+ mmap_read_unlock(mm);
+
/*
* If we are in kernel mode, bail out with a SEGV, this will
* be caught by the assembly which will restore the non-volatile
@@ -133,6 +126,7 @@
return 0;
}
+#endif
static noinline int bad_access(struct pt_regs *regs, unsigned long address)
{
@@ -239,63 +233,23 @@
return false;
}
-// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
-#define SIGFRAME_MAX_SIZE (4096 + 128)
-
-static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
- struct vm_area_struct *vma, unsigned int flags,
- bool *must_retry)
+#ifdef CONFIG_PPC_MEM_KEYS
+static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
+ struct vm_area_struct *vma)
{
/*
- * N.B. The POWER/Open ABI allows programs to access up to
- * 288 bytes below the stack pointer.
- * The kernel signal delivery code writes a bit over 4KB
- * below the stack pointer (r1) before decrementing it.
- * The exec code can write slightly over 640kB to the stack
- * before setting the user r1. Thus we allow the stack to
- * expand to 1MB without further checks.
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a pkey fault as soon as we fill in a
+ * page. Only called for current mm, hence foreign == 0
*/
- if (address + 0x100000 < vma->vm_end) {
- unsigned int __user *nip = (unsigned int __user *)regs->nip;
- /* get user regs even if this fault is in kernel mode */
- struct pt_regs *uregs = current->thread.regs;
- if (uregs == NULL)
- return true;
-
- /*
- * A user-mode access to an address a long way below
- * the stack pointer is only valid if the instruction
- * is one which would update the stack pointer to the
- * address accessed if the instruction completed,
- * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
- * (or the byte, halfword, float or double forms).
- *
- * If we don't check this then any write to the area
- * between the last mapped region and the stack will
- * expand the stack rather than segfaulting.
- */
- if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
- return false;
-
- if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
- access_ok(nip, sizeof(*nip))) {
- unsigned int inst;
- int res;
-
- pagefault_disable();
- res = __get_user_inatomic(inst, nip);
- pagefault_enable();
- if (!res)
- return !store_updates_sp(inst);
- *must_retry = true;
- }
+ if (!arch_vma_access_permitted(vma, is_write, is_exec, 0))
return true;
- }
+
return false;
}
+#endif
-static bool access_error(bool is_write, bool is_exec,
- struct vm_area_struct *vma)
+static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma)
{
/*
* Allow execution from readable areas if the MMU does not
@@ -319,7 +273,7 @@
return false;
}
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ if (unlikely(!vma_is_accessible(vma)))
return true;
/*
* We should ideally do the vma pkey access check here. But in the
@@ -437,12 +391,11 @@
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ unsigned int flags = FAULT_FLAG_DEFAULT;
int is_exec = TRAP(regs) == 0x400;
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
vm_fault_t fault, major = 0;
- bool must_retry = false;
bool kprobe_fault = kprobe_page_fault(regs, 11);
if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
@@ -486,14 +439,10 @@
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (error_code & DSISR_KEYFAULT)
- return bad_key_fault_exception(regs, address,
- get_mm_addr_key(mm, address));
-
/*
- * We want to do this outside mmap_sem, because reading code around nip
+ * We want to do this outside mmap_lock, because reading code around nip
* can result in fault, which will cause a deadlock when called with
- * mmap_sem held
+ * mmap_lock held
*/
if (is_user)
flags |= FAULT_FLAG_USER;
@@ -505,7 +454,7 @@
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
+ * erroneous fault occurring in a code path which already holds mmap_lock
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
@@ -517,12 +466,12 @@
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ if (unlikely(!mmap_read_trylock(mm))) {
if (!is_user && !search_exception_tables(regs->nip))
return bad_area_nosemaphore(regs, address);
retry:
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
} else {
/*
* The above down_read_trylock() might have succeeded in
@@ -535,29 +484,21 @@
vma = find_vma(mm, address);
if (unlikely(!vma))
return bad_area(regs, address);
- if (likely(vma->vm_start <= address))
- goto good_area;
- if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
- return bad_area(regs, address);
- /* The stack is being expanded, check if it's valid */
- if (unlikely(bad_stack_expansion(regs, address, vma, flags,
- &must_retry))) {
- if (!must_retry)
+ if (unlikely(vma->vm_start > address)) {
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
return bad_area(regs, address);
- up_read(&mm->mmap_sem);
- if (fault_in_pages_readable((const char __user *)regs->nip,
- sizeof(unsigned int)))
- return bad_area_nosemaphore(regs, address);
- goto retry;
+ if (unlikely(expand_stack(vma, address)))
+ return bad_area(regs, address);
}
- /* Try to expand it */
- if (unlikely(expand_stack(vma, address)))
- return bad_area(regs, address);
+#ifdef CONFIG_PPC_MEM_KEYS
+ if (unlikely(access_pkey_error(is_write, is_exec,
+ (error_code & DSISR_KEYFAULT), vma)))
+ return bad_access_pkey(regs, address, vma);
+#endif /* CONFIG_PPC_MEM_KEYS */
-good_area:
if (unlikely(access_error(is_write, is_exec, vma)))
return bad_access(regs, address);
@@ -566,50 +507,25 @@
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
-
-#ifdef CONFIG_PPC_MEM_KEYS
- /*
- * we skipped checking for access error due to key earlier.
- * Check that using handle_mm_fault error return.
- */
- if (unlikely(fault & VM_FAULT_SIGSEGV) &&
- !arch_vma_access_permitted(vma, is_write, is_exec, 0)) {
-
- int pkey = vma_pkey(vma);
-
- up_read(&mm->mmap_sem);
- return bad_key_fault_exception(regs, address, pkey);
- }
-#endif /* CONFIG_PPC_MEM_KEYS */
+ fault = handle_mm_fault(vma, address, flags, regs);
major |= fault & VM_FAULT_MAJOR;
+ if (fault_signal_pending(fault, regs))
+ return user_mode(regs) ? 0 : SIGBUS;
+
/*
- * Handle the retry right now, the mmap_sem has been released in that
+ * Handle the retry right now, the mmap_lock has been released in that
* case.
*/
if (unlikely(fault & VM_FAULT_RETRY)) {
- /* We retry only once */
if (flags & FAULT_FLAG_ALLOW_RETRY) {
- /*
- * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation.
- */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
flags |= FAULT_FLAG_TRIED;
- if (!fatal_signal_pending(current))
- goto retry;
+ goto retry;
}
-
- /*
- * User mode? Just return to handle the fatal exception otherwise
- * return to bad_page_fault
- */
- return is_user ? 0 : SIGBUS;
}
- up_read(¤t->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
@@ -617,14 +533,9 @@
/*
* Major/minor page fault accounting.
*/
- if (major) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ if (major)
cmo_account_page_fault();
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
- }
+
return 0;
}
NOKPROBE_SYMBOL(__do_page_fault);
@@ -647,6 +558,7 @@
void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
{
const struct exception_table_entry *entry;
+ int is_write = page_fault_is_write(regs->dsisr);
/* Are we prepared to handle this fault? */
if ((entry = search_exception_tables(regs->nip)) != NULL) {
@@ -660,9 +572,10 @@
case 0x300:
case 0x380:
case 0xe00:
- pr_alert("BUG: %s at 0x%08lx\n",
+ pr_alert("BUG: %s on %s at 0x%08lx\n",
regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" :
- "Unable to handle kernel data access", regs->dar);
+ "Unable to handle kernel data access",
+ is_write ? "write" : "read", regs->dar);
break;
case 0x400:
case 0x480:
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index 320c167..624b443 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -24,22 +24,11 @@
#include <linux/highmem.h>
#include <linux/module.h>
-/*
- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
- * gives a more generic (and caching) interface. But kmap_atomic can
- * be used in IRQ contexts, so in some (very limited) cases we need
- * it.
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -49,17 +38,14 @@
return (void*) vaddr;
}
-EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
- pagefault_enable();
- preempt_enable();
+ if (vaddr < __fix_to_virt(FIX_KMAP_END))
return;
- }
if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) {
int type = kmap_atomic_idx();
@@ -77,7 +63,5 @@
}
kmap_atomic_idx_pop();
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 33b3461..36c3800 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -19,7 +19,6 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
@@ -30,7 +29,8 @@
#define hugepd_none(hpd) (hpd_val(hpd) == 0)
-#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *)))
+#define PTE_T_ORDER (__builtin_ffs(sizeof(pte_basic_t)) - \
+ __builtin_ffs(sizeof(void *)))
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
@@ -53,24 +53,17 @@
if (pshift >= pdshift) {
cachep = PGT_CACHE(PTE_T_ORDER);
num_hugepd = 1 << (pshift - pdshift);
- new = NULL;
- } else if (IS_ENABLED(CONFIG_PPC_8xx)) {
- cachep = NULL;
- num_hugepd = 1;
- new = pte_alloc_one(mm);
} else {
cachep = PGT_CACHE(pdshift - pshift);
num_hugepd = 1;
- new = NULL;
}
- if (!cachep && !new) {
+ if (!cachep) {
WARN_ONCE(1, "No page table cache created for hugetlb tables");
return -ENOMEM;
}
- if (cachep)
- new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
+ new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
BUG_ON(pshift > HUGEPD_SHIFT_MASK);
BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -101,10 +94,7 @@
if (i < num_hugepd) {
for (i = i - 1 ; i >= 0; i--, hpdp--)
*hpdp = __hugepd(0);
- if (cachep)
- kmem_cache_free(cachep, new);
- else
- pte_free(mm, new);
+ kmem_cache_free(cachep, new);
} else {
kmemleak_ignore(new);
}
@@ -119,6 +109,7 @@
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
+ p4d_t *p4;
pud_t *pu;
pmd_t *pm;
hugepd_t *hpdp = NULL;
@@ -128,20 +119,21 @@
addr &= ~(sz-1);
pg = pgd_offset(mm, addr);
+ p4 = p4d_offset(pg, addr);
#ifdef CONFIG_PPC_BOOK3S_64
if (pshift == PGDIR_SHIFT)
/* 16GB huge page */
- return (pte_t *) pg;
+ return (pte_t *) p4;
else if (pshift > PUD_SHIFT) {
/*
* We need to use hugepd table
*/
ptl = &mm->page_table_lock;
- hpdp = (hugepd_t *)pg;
+ hpdp = (hugepd_t *)p4;
} else {
pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, pg, addr);
+ pu = pud_alloc(mm, p4, addr);
if (!pu)
return NULL;
if (pshift == PUD_SHIFT)
@@ -166,10 +158,10 @@
#else
if (pshift >= PGDIR_SHIFT) {
ptl = &mm->page_table_lock;
- hpdp = (hugepd_t *)pg;
+ hpdp = (hugepd_t *)p4;
} else {
pdshift = PUD_SHIFT;
- pu = pud_alloc(mm, pg, addr);
+ pu = pud_alloc(mm, p4, addr);
if (!pu)
return NULL;
if (pshift >= PUD_SHIFT) {
@@ -188,6 +180,9 @@
if (!hpdp)
return NULL;
+ if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
+ return pte_alloc_map(mm, (pmd_t *)hpdp, addr);
+
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
@@ -253,7 +248,7 @@
struct hugepd_freelist {
struct rcu_head rcu;
unsigned int index;
- void *ptes[0];
+ void *ptes[];
};
static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
@@ -330,13 +325,34 @@
if (shift >= pdshift)
hugepd_free(tlb, hugepte);
- else if (IS_ENABLED(CONFIG_PPC_8xx))
- pgtable_free_tlb(tlb, hugepte, 0);
else
pgtable_free_tlb(tlb, hugepte,
get_hugepd_cache_index(pdshift - shift));
}
+static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ unsigned long start = addr;
+ pgtable_t token = pmd_pgtable(*pmd);
+
+ start &= PMD_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PMD_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pmd_clear(pmd);
+ pte_free_tlb(tlb, token, addr);
+ mm_dec_nr_ptes(tlb->mm);
+}
+
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
@@ -352,11 +368,17 @@
pmd = pmd_offset(pud, addr);
next = pmd_addr_end(addr, end);
if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+
/*
* if it is not hugepd pointer, we should already find
* it cleared.
*/
- WARN_ON(!pmd_none_or_clear_bad(pmd));
+ WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));
+
+ hugetlb_free_pte_range(tlb, pmd, addr, end, floor, ceiling);
+
continue;
}
/*
@@ -390,7 +412,7 @@
mm_dec_nr_pmds(tlb->mm);
}
-static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
@@ -400,7 +422,7 @@
start = addr;
do {
- pud = pud_offset(pgd, addr);
+ pud = pud_offset(p4d, addr);
next = pud_addr_end(addr, end);
if (!is_hugepd(__hugepd(pud_val(*pud)))) {
if (pud_none_or_clear_bad(pud))
@@ -435,8 +457,8 @@
if (end - 1 > ceiling - 1)
return;
- pud = pud_offset(pgd, start);
- pgd_clear(pgd);
+ pud = pud_offset(p4d, start);
+ p4d_clear(p4d);
pud_free_tlb(tlb, pud, start);
mm_dec_nr_puds(tlb->mm);
}
@@ -449,6 +471,7 @@
unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
+ p4d_t *p4d;
unsigned long next;
/*
@@ -471,10 +494,11 @@
do {
next = pgd_addr_end(addr, end);
pgd = pgd_offset(tlb->mm, addr);
+ p4d = p4d_offset(pgd, addr);
if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
- if (pgd_none_or_clear_bad(pgd))
+ if (p4d_none_or_clear_bad(p4d))
continue;
- hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
+ hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
} else {
unsigned long more;
/*
@@ -487,7 +511,7 @@
if (more > next)
next = more;
- free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+ free_hugepd_range(tlb, (hugepd_t *)p4d, PGDIR_SHIFT,
addr, next, floor, ceiling);
}
} while (addr = next, addr != end);
@@ -558,7 +582,7 @@
return vma_kernel_pagesize(vma);
}
-static int __init add_huge_page_size(unsigned long long size)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
int shift = __ffs(size);
int mmu_psize;
@@ -566,38 +590,28 @@
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable and slice limits. */
if (size <= PAGE_SIZE || !is_power_of_2(size))
- return -EINVAL;
+ return false;
mmu_psize = check_and_get_huge_psize(shift);
if (mmu_psize < 0)
- return -EINVAL;
+ return false;
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
- /* Return if huge page size has already been setup */
- if (size_to_hstate(size))
- return 0;
+ return true;
+}
+
+static int __init add_huge_page_size(unsigned long long size)
+{
+ int shift = __ffs(size);
+
+ if (!arch_hugetlb_valid_size((unsigned long)size))
+ return -EINVAL;
hugetlb_add_hstate(shift - PAGE_SHIFT);
-
return 0;
}
-static int __init hugepage_setup_sz(char *str)
-{
- unsigned long long size;
-
- size = memparse(str, &str);
-
- if (add_huge_page_size(size) != 0) {
- hugetlb_bad_size();
- pr_err("Invalid huge page size specified(%llu)\n", size);
- }
-
- return 1;
-}
-__setup("hugepagesz=", hugepage_setup_sz);
-
static int __init hugetlbpage_init(void)
{
bool configured = false;
@@ -684,3 +698,21 @@
}
}
}
+
+void __init gigantic_hugetlb_cma_reserve(void)
+{
+ unsigned long order = 0;
+
+ if (radix_enabled())
+ order = PUD_SHIFT - PAGE_SHIFT;
+ else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift)
+ /*
+ * For pseries we do use ibm,expected#pages for reserving 16G pages.
+ */
+ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
+
+ if (order) {
+ VM_WARN_ON(order < MAX_ORDER);
+ hugetlb_cma_reserve(order);
+ }
+}
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index a84da92..8e0d792 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -17,10 +17,17 @@
#undef DEBUG
#include <linux/string.h>
+#include <linux/pgtable.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/kup.h>
+phys_addr_t memstart_addr __ro_after_init = (phys_addr_t)~0ull;
+EXPORT_SYMBOL_GPL(memstart_addr);
+phys_addr_t kernstart_addr __ro_after_init;
+EXPORT_SYMBOL_GPL(kernstart_addr);
+unsigned long kernstart_virt_addr __ro_after_init = KERNELBASE;
+EXPORT_SYMBOL_GPL(kernstart_virt_addr);
+
static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index b04896a..02c7db4 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -29,10 +29,8 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
@@ -56,11 +54,6 @@
phys_addr_t total_memory;
phys_addr_t total_lowmem;
-phys_addr_t memstart_addr = (phys_addr_t)~0ull;
-EXPORT_SYMBOL(memstart_addr);
-phys_addr_t kernstart_addr;
-EXPORT_SYMBOL(kernstart_addr);
-
#ifdef CONFIG_RELOCATABLE
/* Used in __va()/__pa() */
long long virt_phys_offset;
@@ -101,11 +94,13 @@
if (strstr(boot_command_line, "noltlbs")) {
__map_without_ltlbs = 1;
}
- if (debug_pagealloc_enabled()) {
- __map_without_bats = 1;
+ if (IS_ENABLED(CONFIG_PPC_8xx))
+ return;
+
+ if (debug_pagealloc_enabled())
__map_without_ltlbs = 1;
- }
- if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx))
+
+ if (strict_kernel_rwx_enabled())
__map_without_ltlbs = 1;
}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 210f1c2..386be13 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -47,7 +47,6 @@
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <linux/uaccess.h>
#include <asm/smp.h>
@@ -63,38 +62,48 @@
#include <mm/mmu_decl.h>
-phys_addr_t memstart_addr = ~0;
-EXPORT_SYMBOL_GPL(memstart_addr);
-phys_addr_t kernstart_addr;
-EXPORT_SYMBOL_GPL(kernstart_addr);
-
#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
- * Given an address within the vmemmap, determine the pfn of the page that
- * represents the start of the section it is within. Note that we have to
+ * Given an address within the vmemmap, determine the page that
+ * represents the start of the subsection it is within. Note that we have to
* do this by hand as the proffered address may not be correctly aligned.
* Subtraction of non-aligned pointers produces undefined results.
*/
-static unsigned long __meminit vmemmap_section_start(unsigned long page)
+static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr)
{
- unsigned long offset = page - ((unsigned long)(vmemmap));
+ unsigned long start_pfn;
+ unsigned long offset = vmemmap_addr - ((unsigned long)(vmemmap));
/* Return the pfn of the start of the section. */
- return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
+ start_pfn = (offset / sizeof(struct page)) & PAGE_SUBSECTION_MASK;
+ return pfn_to_page(start_pfn);
}
/*
- * Check if this vmemmap page is already initialised. If any section
- * which overlaps this vmemmap page is initialised then this page is
- * initialised already.
+ * Since memory is added in sub-section chunks, before creating a new vmemmap
+ * mapping, the kernel should check whether there is an existing memmap mapping
+ * covering the new subsection added. This is needed because kernel can map
+ * vmemmap area using 16MB pages which will cover a memory range of 16G. Such
+ * a range covers multiple subsections (2M)
+ *
+ * If any subsection in the 16G range mapped by vmemmap is valid we consider the
+ * vmemmap populated (There is a page table entry already present). We can't do
+ * a page table lookup here because with the hash translation we don't keep
+ * vmemmap details in linux page table.
*/
-static int __meminit vmemmap_populated(unsigned long start, int page_size)
+static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
- unsigned long end = start + page_size;
- start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));
+ struct page *start;
+ unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
+ start = vmemmap_subsection_start(vmemmap_addr);
- for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
- if (pfn_valid(page_to_pfn((struct page *)start)))
+ for (; (unsigned long)start < vmemmap_end; start += PAGES_PER_SUBSECTION)
+ /*
+ * pfn valid check here is intended to really check
+ * whether we have any subsection already initialized
+ * in this range.
+ */
+ if (pfn_valid(page_to_pfn(start)))
return 1;
return 0;
@@ -153,16 +162,16 @@
return next++;
}
-static __meminit void vmemmap_list_populate(unsigned long phys,
- unsigned long start,
- int node)
+static __meminit int vmemmap_list_populate(unsigned long phys,
+ unsigned long start,
+ int node)
{
struct vmemmap_backing *vmem_back;
vmem_back = vmemmap_list_alloc(node);
if (unlikely(!vmem_back)) {
- WARN_ON(1);
- return;
+ pr_debug("vmemap list allocation failed\n");
+ return -ENOMEM;
}
vmem_back->phys = phys;
@@ -170,6 +179,7 @@
vmem_back->list = vmemmap_list;
vmemmap_list = vmem_back;
+ return 0;
}
static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
@@ -190,10 +200,11 @@
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
+ bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
/* Align to the page size of the linear mapping. */
- start = _ALIGN_DOWN(start, page_size);
+ start = ALIGN_DOWN(start, page_size);
pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);
@@ -201,6 +212,12 @@
void *p = NULL;
int rc;
+ /*
+ * This vmemmap range is backing different subsections. If any
+ * of that subsection is marked valid, that means we already
+ * have initialized a page table covering this range and hence
+ * the vmemmap range is populated.
+ */
if (vmemmap_populated(start, page_size))
continue;
@@ -210,16 +227,35 @@
* fall back to system memory if the altmap allocation fail.
*/
if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
- p = altmap_alloc_block_buf(page_size, altmap);
+ p = vmemmap_alloc_block_buf(page_size, node, altmap);
if (!p)
pr_debug("altmap block allocation failed, falling back to system memory");
+ else
+ altmap_alloc = true;
}
- if (!p)
- p = vmemmap_alloc_block_buf(page_size, node);
+ if (!p) {
+ p = vmemmap_alloc_block_buf(page_size, node, NULL);
+ altmap_alloc = false;
+ }
if (!p)
return -ENOMEM;
- vmemmap_list_populate(__pa(p), start, node);
+ if (vmemmap_list_populate(__pa(p), start, node)) {
+ /*
+ * If we don't populate vmemap list, we don't have
+ * the ability to free the allocated vmemmap
+ * pages in section_deactivate. Hence free them
+ * here.
+ */
+ int nr_pfns = page_size >> PAGE_SHIFT;
+ unsigned long page_order = get_order(page_size);
+
+ if (altmap_alloc)
+ vmem_altmap_free(altmap, nr_pfns);
+ else
+ free_pages((unsigned long)p, page_order);
+ return -ENOMEM;
+ }
pr_debug(" * %016lx..%016lx allocated at %p\n",
start, start + page_size, p);
@@ -249,10 +285,8 @@
vmem_back_prev = vmem_back;
}
- if (unlikely(!vmem_back)) {
- WARN_ON(1);
+ if (unlikely(!vmem_back))
return 0;
- }
/* remove it from vmemmap_list */
if (vmem_back == vmemmap_list) /* remove head */
@@ -276,7 +310,7 @@
unsigned long alt_start = ~0, alt_end = ~0;
unsigned long base_pfn;
- start = _ALIGN_DOWN(start, page_size);
+ start = ALIGN_DOWN(start, page_size);
if (altmap) {
alt_start = altmap->base_pfn;
alt_end = altmap->base_pfn + altmap->reserve +
@@ -290,9 +324,10 @@
struct page *page;
/*
- * the section has already be marked as invalid, so
- * vmemmap_populated() true means some other sections still
- * in this page, so skip it.
+ * We have already marked the subsection we are trying to remove
+ * invalid. So if we want to remove the vmemmap range, we
+ * need to make sure there is no subsection marked valid
+ * in this range.
*/
if (vmemmap_populated(start, page_size))
continue;
@@ -390,13 +425,15 @@
}
if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
OV5_FEAT(OV5_RADIX_GTSE))) {
- pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
- }
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
+ } else
+ cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
/* Do radix anyway - the hypervisor said we had to */
cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
/* Hypervisor only supports hash - disable radix */
cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+ cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
}
}
diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
index fc66964..b1a0aeb 100644
--- a/arch/powerpc/mm/ioremap.c
+++ b/arch/powerpc/mm/ioremap.c
@@ -2,6 +2,7 @@
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/mmzone.h>
#include <linux/vmalloc.h>
#include <asm/io-workarounds.h>
@@ -97,3 +98,23 @@
return NULL;
}
+
+#ifdef CONFIG_ZONE_DEVICE
+/*
+ * Override the generic version in mm/memremap.c.
+ *
+ * With hash translation, the direct-map range is mapped with just one
+ * page size selected by htab_init_page_sizes(). Consult
+ * mmu_psize_defs[] to determine the minimum page size alignment.
+*/
+unsigned long memremap_compat_align(void)
+{
+ unsigned int shift = mmu_psize_defs[mmu_linear_psize].shift;
+
+ if (radix_enabled())
+ return SUBSECTION_SIZE;
+ return max(SUBSECTION_SIZE, 1UL << shift);
+
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c
index f36121f..743e113 100644
--- a/arch/powerpc/mm/ioremap_32.c
+++ b/arch/powerpc/mm/ioremap_32.c
@@ -68,6 +68,7 @@
/*
* Should check if it is a candidate for a BAT mapping
*/
+ pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
err = early_ioremap_range(ioremap_bot - size, p, size, prot);
if (err)
diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c
index fd29e51..ba5cbb0 100644
--- a/arch/powerpc/mm/ioremap_64.c
+++ b/arch/powerpc/mm/ioremap_64.c
@@ -4,56 +4,6 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
-/**
- * Low level function to establish the page tables for an IO mapping
- */
-void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot)
-{
- int ret;
- unsigned long va = (unsigned long)ea;
-
- /* We don't support the 4K PFN hack with ioremap */
- if (pgprot_val(prot) & H_PAGE_4K_PFN)
- return NULL;
-
- if ((ea + size) >= (void *)IOREMAP_END) {
- pr_warn("Outside the supported range\n");
- return NULL;
- }
-
- WARN_ON(pa & ~PAGE_MASK);
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- if (slab_is_available()) {
- ret = ioremap_page_range(va, va + size, pa, prot);
- if (ret)
- unmap_kernel_range(va, size);
- } else {
- ret = early_ioremap_range(va, pa, size, prot);
- }
-
- if (ret)
- return NULL;
-
- return (void __iomem *)ea;
-}
-EXPORT_SYMBOL(__ioremap_at);
-
-/**
- * Low level function to tear down the page tables for an IO mapping. This is
- * used for mappings that are manipulated manually, like partial unmapping of
- * PCI IOs or ISA space.
- */
-void __iounmap_at(void *ea, unsigned long size)
-{
- WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
- WARN_ON(size & ~PAGE_MASK);
-
- unmap_kernel_range((unsigned long)ea, size);
-}
-EXPORT_SYMBOL(__iounmap_at);
-
void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size,
pgprot_t prot, void *caller)
{
@@ -81,6 +31,8 @@
if (slab_is_available())
return do_ioremap(paligned, offset, size, prot, caller);
+ pr_warn("ioremap() called early from %pS. Use early_ioremap() instead\n", caller);
+
err = early_ioremap_range(ioremap_bot, paligned, size, prot);
if (err)
return NULL;
diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
new file mode 100644
index 0000000..2784224
--- /dev/null
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/memblock.h>
+#include <linux/hugetlb.h>
+
+static int __init
+kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
+{
+ pmd_t *pmd = pmd_off_k(k_start);
+ unsigned long k_cur, k_next;
+
+ for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) {
+ pte_basic_t *new;
+
+ k_next = pgd_addr_end(k_cur, k_end);
+ k_next = pgd_addr_end(k_next, k_end);
+ if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
+ continue;
+
+ new = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
+ if (!new)
+ return -ENOMEM;
+
+ *new = pte_val(pte_mkhuge(pfn_pte(PHYS_PFN(__pa(block)), PAGE_KERNEL)));
+
+ hugepd_populate_kernel((hugepd_t *)pmd, (pte_t *)new, PAGE_SHIFT_8M);
+ hugepd_populate_kernel((hugepd_t *)pmd + 1, (pte_t *)new, PAGE_SHIFT_8M);
+ }
+ return 0;
+}
+
+int __init kasan_init_region(void *start, size_t size)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+ unsigned long k_cur;
+ int ret;
+ void *block;
+
+ block = memblock_alloc(k_end - k_start, SZ_8M);
+ if (!block)
+ return -ENOMEM;
+
+ if (IS_ALIGNED(k_start, SZ_8M)) {
+ kasan_init_shadow_8M(k_start, ALIGN_DOWN(k_end, SZ_8M), block);
+ k_cur = ALIGN_DOWN(k_end, SZ_8M);
+ if (k_cur == k_end)
+ goto finish;
+ } else {
+ k_cur = k_start;
+ }
+
+ ret = kasan_init_shadow_page_tables(k_start, k_end);
+ if (ret)
+ return ret;
+
+ for (; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
+ void *va = block + k_cur - k_start;
+ pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+
+ if (k_cur < ALIGN_DOWN(k_end, SZ_512K))
+ pte = pte_mkhuge(pte);
+
+ __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+ }
+finish:
+ flush_tlb_kernel_range(k_start, k_end);
+ return 0;
+}
diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
index 6577897..bb1a540 100644
--- a/arch/powerpc/mm/kasan/Makefile
+++ b/arch/powerpc/mm/kasan/Makefile
@@ -3,3 +3,5 @@
KASAN_SANITIZE := n
obj-$(CONFIG_PPC32) += kasan_init_32.o
+obj-$(CONFIG_PPC_8xx) += 8xx.o
+obj-$(CONFIG_PPC_BOOK3S_32) += book3s_32.o
diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c
new file mode 100644
index 0000000..450a67e
--- /dev/null
+++ b/arch/powerpc/mm/kasan/book3s_32.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/kasan.h>
+#include <linux/memblock.h>
+#include <mm/mmu_decl.h>
+
+int __init kasan_init_region(void *start, size_t size)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+ unsigned long k_nobat = k_start;
+ unsigned long k_cur;
+ phys_addr_t phys;
+ int ret;
+
+ while (k_nobat < k_end) {
+ unsigned int k_size = bat_block_size(k_nobat, k_end);
+ int idx = find_free_bat();
+
+ if (idx == -1)
+ break;
+ if (k_size < SZ_128K)
+ break;
+ phys = memblock_phys_alloc_range(k_size, k_size, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ break;
+
+ setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL);
+ k_nobat += k_size;
+ }
+ if (k_nobat != k_start)
+ update_bats();
+
+ if (k_nobat < k_end) {
+ phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ return -ENOMEM;
+ }
+
+ ret = kasan_init_shadow_page_tables(k_start, k_end);
+ if (ret)
+ return ret;
+
+ kasan_update_early_region(k_start, k_nobat, __pte(0));
+
+ for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
+ pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL);
+
+ __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+ }
+ flush_tlb_kernel_range(k_start, k_end);
+ memset(kasan_mem_to_shadow(start), 0, k_end - k_start);
+
+ return 0;
+}
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 1cfe57b..cf8770b 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -5,14 +5,12 @@
#include <linux/kasan.h>
#include <linux/printk.h>
#include <linux/memblock.h>
-#include <linux/moduleloader.h>
#include <linux/sched/task.h>
-#include <linux/vmalloc.h>
#include <asm/pgalloc.h>
#include <asm/code-patching.h>
#include <mm/mmu_decl.h>
-static pgprot_t kasan_prot_ro(void)
+static pgprot_t __init kasan_prot_ro(void)
{
if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
return PAGE_READONLY;
@@ -20,7 +18,7 @@
return PAGE_KERNEL_RO;
}
-static void kasan_populate_pte(pte_t *ptep, pgprot_t prot)
+static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
{
unsigned long va = (unsigned long)kasan_early_shadow_page;
phys_addr_t pa = __pa(kasan_early_shadow_page);
@@ -30,13 +28,12 @@
__set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
}
-static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
+int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
{
pmd_t *pmd;
unsigned long k_cur, k_next;
- pgprot_t prot = slab_is_available() ? kasan_prot_ro() : PAGE_KERNEL;
- pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start);
+ pmd = pmd_off_k(k_start);
for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
pte_t *new;
@@ -45,94 +42,90 @@
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
continue;
- if (slab_is_available())
- new = pte_alloc_one_kernel(&init_mm);
- else
- new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
+ new = memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE);
if (!new)
return -ENOMEM;
- kasan_populate_pte(new, prot);
-
- smp_wmb(); /* See comment in __pte_alloc */
-
- spin_lock(&init_mm.page_table_lock);
- /* Has another populated it ? */
- if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) {
- pmd_populate_kernel(&init_mm, pmd, new);
- new = NULL;
- }
- spin_unlock(&init_mm.page_table_lock);
-
- if (new && slab_is_available())
- pte_free_kernel(&init_mm, new);
+ kasan_populate_pte(new, PAGE_KERNEL);
+ pmd_populate_kernel(&init_mm, pmd, new);
}
return 0;
}
-static void __ref *kasan_get_one_page(void)
-{
- if (slab_is_available())
- return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-
- return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-}
-
-static int __ref kasan_init_region(void *start, size_t size)
+int __init __weak kasan_init_region(void *start, size_t size)
{
unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
unsigned long k_cur;
int ret;
- void *block = NULL;
+ void *block;
ret = kasan_init_shadow_page_tables(k_start, k_end);
if (ret)
return ret;
- if (!slab_is_available())
- block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+ block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+ if (!block)
+ return -ENOMEM;
for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
- pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
- void *va = block ? block + k_cur - k_start : kasan_get_one_page();
+ pmd_t *pmd = pmd_off_k(k_cur);
+ void *va = block + k_cur - k_start;
pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
- if (!va)
- return -ENOMEM;
-
__set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
}
flush_tlb_kernel_range(k_start, k_end);
return 0;
}
-static void __init kasan_remap_early_shadow_ro(void)
+void __init
+kasan_update_early_region(unsigned long k_start, unsigned long k_end, pte_t pte)
{
- pgprot_t prot = kasan_prot_ro();
- unsigned long k_start = KASAN_SHADOW_START;
- unsigned long k_end = KASAN_SHADOW_END;
unsigned long k_cur;
phys_addr_t pa = __pa(kasan_early_shadow_page);
- kasan_populate_pte(kasan_early_shadow_pte, prot);
-
- for (k_cur = k_start & PAGE_MASK; k_cur != k_end; k_cur += PAGE_SIZE) {
- pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+ for (k_cur = k_start; k_cur != k_end; k_cur += PAGE_SIZE) {
+ pmd_t *pmd = pmd_off_k(k_cur);
pte_t *ptep = pte_offset_kernel(pmd, k_cur);
if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
continue;
- __set_pte_at(&init_mm, k_cur, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
+ __set_pte_at(&init_mm, k_cur, ptep, pte, 0);
}
- flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ flush_tlb_kernel_range(k_start, k_end);
+}
+
+static void __init kasan_remap_early_shadow_ro(void)
+{
+ pgprot_t prot = kasan_prot_ro();
+ phys_addr_t pa = __pa(kasan_early_shadow_page);
+
+ kasan_populate_pte(kasan_early_shadow_pte, prot);
+
+ kasan_update_early_region(KASAN_SHADOW_START, KASAN_SHADOW_END,
+ pfn_pte(PHYS_PFN(pa), prot));
+}
+
+static void __init kasan_unmap_early_shadow_vmalloc(void)
+{
+ unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START);
+ unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END);
+
+ kasan_update_early_region(k_start, k_end, __pte(0));
+
+#ifdef MODULES_VADDR
+ k_start = (unsigned long)kasan_mem_to_shadow((void *)MODULES_VADDR);
+ k_end = (unsigned long)kasan_mem_to_shadow((void *)MODULES_END);
+ kasan_update_early_region(k_start, k_end, __pte(0));
+#endif
}
void __init kasan_mmu_init(void)
{
int ret;
- struct memblock_region *reg;
if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -140,10 +133,16 @@
if (ret)
panic("kasan: kasan_init_shadow_page_tables() failed");
}
+}
- for_each_memblock(memory, reg) {
- phys_addr_t base = reg->base;
- phys_addr_t top = min(base + reg->size, total_lowmem);
+void __init kasan_init(void)
+{
+ phys_addr_t base, end;
+ u64 i;
+ int ret;
+
+ for_each_mem_range(i, &base, &end) {
+ phys_addr_t top = min(end, total_lowmem);
if (base >= top)
continue;
@@ -152,10 +151,14 @@
if (ret)
panic("kasan: kasan_init_region() failed");
}
-}
-void __init kasan_init(void)
-{
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+ ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ if (ret)
+ panic("kasan: kasan_init_shadow_page_tables() failed");
+ }
+
kasan_remap_early_shadow_ro();
clear_page(kasan_early_shadow_page);
@@ -165,47 +168,18 @@
pr_info("KASAN init done\n");
}
-#ifdef CONFIG_MODULES
-void *module_alloc(unsigned long size)
+void __init kasan_late_init(void)
{
- void *base;
-
- base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-
- if (!base)
- return NULL;
-
- if (!kasan_init_region(base, size))
- return base;
-
- vfree(base);
-
- return NULL;
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ kasan_unmap_early_shadow_vmalloc();
}
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_32
-u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0};
-
-static void __init kasan_early_hash_table(void)
-{
- modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16);
- modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16);
-
- Hash = (struct hash_pte *)early_hash;
-}
-#else
-static void __init kasan_early_hash_table(void) {}
-#endif
void __init kasan_early_init(void)
{
unsigned long addr = KASAN_SHADOW_START;
unsigned long end = KASAN_SHADOW_END;
unsigned long next;
- pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
+ pmd_t *pmd = pmd_off_k(addr);
BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
@@ -215,7 +189,4 @@
next = pgd_addr_end(addr, end);
pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
} while (pmd++, addr = next, addr != end);
-
- if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
- kasan_early_hash_table();
}
diff --git a/arch/powerpc/mm/maccess.c b/arch/powerpc/mm/maccess.c
new file mode 100644
index 0000000..fa9a7a7
--- /dev/null
+++ b/arch/powerpc/mm/maccess.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+
+bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
+{
+ return is_kernel_addr((unsigned long)unsafe_src);
+}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c48705c..22eb1c7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -31,12 +31,12 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/memremap.h>
+#include <linux/dma-direct.h>
+#include <linux/kprobes.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
@@ -48,6 +48,9 @@
#include <asm/fixmap.h>
#include <asm/swiotlb.h>
#include <asm/rtas.h>
+#include <asm/kasan.h>
+#include <asm/svm.h>
+#include <asm/mmzone.h>
#include <mm/mmu_decl.h>
@@ -62,14 +65,6 @@
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
EXPORT_SYMBOL(kmap_pte);
-pgprot_t kmap_prot;
-EXPORT_SYMBOL(kmap_prot);
-
-static inline pte_t *virt_to_kpte(unsigned long vaddr)
-{
- return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
- vaddr), vaddr), vaddr);
-}
#endif
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
@@ -94,7 +89,8 @@
}
#endif
-int __weak create_section_mapping(unsigned long start, unsigned long end, int nid)
+int __weak create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot)
{
return -ENODEV;
}
@@ -126,25 +122,22 @@
}
int __ref arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int rc;
- resize_hpt_for_hotplug(memblock_phys_mem_size());
-
start = (unsigned long)__va(start);
- rc = create_section_mapping(start, start + size, nid);
+ rc = create_section_mapping(start, start + size, nid,
+ params->pgprot);
if (rc) {
pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
start, start + size, rc);
return -EFAULT;
}
- flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
-
- return __add_pages(nid, start_pfn, nr_pages, restrictions);
+ return __add_pages(nid, start_pfn, nr_pages, params);
}
void __ref arch_remove_memory(int nid, u64 start, u64 size,
@@ -167,9 +160,6 @@
* hit that section of memory
*/
vm_unmap_aliases();
-
- if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
- pr_warn("Hash collision while resizing HPT\n");
}
#endif
@@ -190,23 +180,22 @@
void __init initmem_init(void)
{
- /* XXX need to clip this if using highmem? */
- sparse_memory_present_with_active_regions(0);
sparse_init();
}
/* mark pages that don't exist as nosave */
static int __init mark_nonram_nosave(void)
{
- struct memblock_region *reg, *prev = NULL;
+ unsigned long spfn, epfn, prev = 0;
+ int i;
- for_each_memblock(memory, reg) {
- if (prev &&
- memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
- register_nosave_region(memblock_region_memory_end_pfn(prev),
- memblock_region_memory_base_pfn(reg));
- prev = reg;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &spfn, &epfn, NULL) {
+ if (prev && prev < spfn)
+ register_nosave_region(prev, spfn);
+
+ prev = epfn;
}
+
return 0;
}
#else /* CONFIG_NEED_MULTIPLE_NODES */
@@ -223,10 +212,10 @@
* everything else. GFP_DMA32 page allocations automatically fall back to
* ZONE_DMA.
*
- * By using 31-bit unconditionally, we can exploit ARCH_ZONE_DMA_BITS to
- * inform the generic DMA mapping code. 32-bit only devices (if not handled
- * by an IOMMU anyway) will take a first dip into ZONE_NORMAL and get
- * otherwise served by ZONE_DMA.
+ * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
+ * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
+ * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
+ * ZONE_DMA.
*/
static unsigned long max_zone_pfns[MAX_NR_ZONES];
@@ -238,20 +227,17 @@
unsigned long long total_ram = memblock_phys_mem_size();
phys_addr_t top_of_ram = memblock_end_of_DRAM();
-#ifdef CONFIG_PPC32
- unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1);
- unsigned long end = __fix_to_virt(FIX_HOLE);
+#ifdef CONFIG_HIGHMEM
+ unsigned long v = __fix_to_virt(FIX_KMAP_END);
+ unsigned long end = __fix_to_virt(FIX_KMAP_BEGIN);
for (; v < end; v += PAGE_SIZE)
map_kernel_page(v, 0, __pgprot(0)); /* XXX gross */
-#endif
-#ifdef CONFIG_HIGHMEM
map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
- kmap_prot = PAGE_KERNEL;
#endif /* CONFIG_HIGHMEM */
printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n",
@@ -259,16 +245,25 @@
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(long int)((top_of_ram - total_ram) >> 20));
+ /*
+ * Allow 30-bit DMA for very limited Broadcom wifi chips on many
+ * powerbooks.
+ */
+ if (IS_ENABLED(CONFIG_PPC32))
+ zone_dma_bits = 30;
+ else
+ zone_dma_bits = 31;
+
#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
- 1UL << (ARCH_ZONE_DMA_BITS - PAGE_SHIFT));
+ 1UL << (zone_dma_bits - PAGE_SHIFT));
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
mark_nonram_nosave();
}
@@ -290,11 +285,17 @@
* back to to-down.
*/
memblock_set_bottom_up(true);
- swiotlb_init(0);
+ if (is_secure_guest())
+ svm_swiotlb_init();
+ else
+ swiotlb_init(0);
#endif
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
set_max_mapnr(max_pfn);
+
+ kasan_late_init();
+
memblock_free_all();
#ifdef CONFIG_HIGHMEM
@@ -462,6 +463,7 @@
: "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
: "ctr", "memory");
}
+NOKPROBE_SYMBOL(flush_dcache_icache_phys)
#endif // !defined(CONFIG_PPC_8xx) && !defined(CONFIG_PPC64)
/*
@@ -572,7 +574,7 @@
flush_dcache_page(pg);
}
-void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
+void flush_icache_user_page(struct vm_area_struct *vma, struct page *page,
unsigned long addr, int len)
{
unsigned long maddr;
@@ -581,7 +583,6 @@
flush_icache_range(maddr, maddr + len);
kunmap(page);
}
-EXPORT_SYMBOL(flush_icache_user_range);
/*
* System memory should not be in /proc/iomem but various tools expect it
@@ -589,20 +590,24 @@
*/
static int __init add_system_ram_resources(void)
{
- struct memblock_region *reg;
+ phys_addr_t start, end;
+ u64 i;
- for_each_memblock(memory, reg) {
+ for_each_mem_range(i, &start, &end) {
struct resource *res;
- unsigned long base = reg->base;
- unsigned long size = reg->size;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
WARN_ON(!res);
if (res) {
res->name = "System RAM";
- res->start = base;
- res->end = base + size - 1;
+ res->start = start;
+ /*
+ * In memblock, end points to the first byte after
+ * the range while in resourses, end points to the
+ * last byte in the range.
+ */
+ res->end = end - 1;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
WARN_ON(request_resource(&iomem_resource, res) < 0);
}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index c750ac9..1b6d39e 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -139,10 +139,21 @@
extern void adjust_total_lowmem(void);
extern int switch_to_as1(void);
extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
+void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys);
+void reloc_kernel_entry(void *fdt, int addr);
+extern int is_second_reloc;
#endif
extern void loadcam_entry(unsigned int index);
extern void loadcam_multi(int first_idx, int num, int tmp_idx);
+#ifdef CONFIG_RANDOMIZE_BASE
+void kaslr_early_init(void *dt_ptr, phys_addr_t size);
+void kaslr_late_init(void);
+#else
+static inline void kaslr_early_init(void *dt_ptr, phys_addr_t size) {}
+static inline void kaslr_late_init(void) {}
+#endif
+
struct tlbcam {
u32 MAS0;
u32 MAS1;
@@ -170,3 +181,13 @@
static inline void mmu_mark_initmem_nx(void) { }
static inline void mmu_mark_rodata_ro(void) { }
#endif
+
+#ifdef CONFIG_PPC_8xx
+void __init mmu_mapin_immr(void);
+#endif
+
+#ifdef CONFIG_PPC_DEBUG_WX
+void ptdump_check_wx(void);
+#else
+static inline void ptdump_check_wx(void) { }
+#endif
diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c
index f348104..95751c3 100644
--- a/arch/powerpc/mm/nohash/40x.c
+++ b/arch/powerpc/mm/nohash/40x.c
@@ -32,11 +32,9 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <linux/uaccess.h>
#include <asm/smp.h>
@@ -102,9 +100,9 @@
while (s >= LARGE_PAGE_SIZE_16M) {
pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE;
+ unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_RW;
- pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+ pmdp = pmd_off_k(v);
*pmdp++ = __pmd(val);
*pmdp++ = __pmd(val);
*pmdp++ = __pmd(val);
@@ -117,9 +115,9 @@
while (s >= LARGE_PAGE_SIZE_4M) {
pmd_t *pmdp;
- unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE;
+ unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_RW;
- pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v);
+ pmdp = pmd_off_k(v);
*pmdp = __pmd(val);
v += LARGE_PAGE_SIZE_4M;
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 4a06cb3..231ca95 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -9,8 +9,10 @@
#include <linux/memblock.h>
#include <linux/mmu_context.h>
+#include <linux/hugetlb.h>
#include <asm/fixmap.h>
#include <asm/code-patching.h>
+#include <asm/inst.h>
#include <mm/mmu_decl.h>
@@ -21,141 +23,181 @@
static unsigned long block_mapped_ram;
/*
- * Return PA for this VA if it is in an area mapped with LTLBs.
+ * Return PA for this VA if it is in an area mapped with LTLBs or fixmap.
* Otherwise, returns 0
*/
phys_addr_t v_block_mapped(unsigned long va)
{
unsigned long p = PHYS_IMMR_BASE;
- if (__map_without_ltlbs)
- return 0;
if (va >= VIRT_IMMR_BASE && va < VIRT_IMMR_BASE + IMMR_SIZE)
return p + va - VIRT_IMMR_BASE;
+ if (__map_without_ltlbs)
+ return 0;
if (va >= PAGE_OFFSET && va < PAGE_OFFSET + block_mapped_ram)
return __pa(va);
return 0;
}
/*
- * Return VA for a given PA mapped with LTLBs or 0 if not mapped
+ * Return VA for a given PA mapped with LTLBs or fixmap
+ * Return 0 if not mapped
*/
unsigned long p_block_mapped(phys_addr_t pa)
{
unsigned long p = PHYS_IMMR_BASE;
- if (__map_without_ltlbs)
- return 0;
if (pa >= p && pa < p + IMMR_SIZE)
return VIRT_IMMR_BASE + pa - p;
+ if (__map_without_ltlbs)
+ return 0;
if (pa < block_mapped_ram)
return (unsigned long)__va(pa);
return 0;
}
-#define LARGE_PAGE_SIZE_8M (1<<23)
+static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va)
+{
+ if (hpd_val(*pmdp) == 0) {
+ pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
+
+ if (!ptep)
+ return NULL;
+
+ hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M);
+ hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M);
+ }
+ return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
+}
+
+static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
+ pgprot_t prot, int psize, bool new)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep;
+
+ if (WARN_ON(psize != MMU_PAGE_512K && psize != MMU_PAGE_8M))
+ return -EINVAL;
+
+ if (new) {
+ if (WARN_ON(slab_is_available()))
+ return -EINVAL;
+
+ if (psize == MMU_PAGE_512K)
+ ptep = early_pte_alloc_kernel(pmdp, va);
+ else
+ ptep = early_hugepd_alloc_kernel((hugepd_t *)pmdp, va);
+ } else {
+ if (psize == MMU_PAGE_512K)
+ ptep = pte_offset_kernel(pmdp, va);
+ else
+ ptep = hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
+ }
+
+ if (WARN_ON(!ptep))
+ return -ENOMEM;
+
+ /* The PTE should never be already present */
+ if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot)))
+ return -EINVAL;
+
+ set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)));
+
+ return 0;
+}
/*
* MMU_init_hw does the chip-specific initialization of the MMU hardware.
*/
void __init MMU_init_hw(void)
{
- /* PIN up to the 3 first 8Mb after IMMR in DTLB table */
- if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) {
- unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000;
- unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
- int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28;
- unsigned long addr = 0;
- unsigned long mem = total_lowmem;
-
- for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
- mtspr(SPRN_MD_CTR, ctr | (i << 8));
- mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
- mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
- mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
- addr += LARGE_PAGE_SIZE_8M;
- mem -= LARGE_PAGE_SIZE_8M;
- }
- }
}
-static void __init mmu_mapin_immr(void)
-{
- unsigned long p = PHYS_IMMR_BASE;
- unsigned long v = VIRT_IMMR_BASE;
- int offset;
+static bool immr_is_mapped __initdata;
- for (offset = 0; offset < IMMR_SIZE; offset += PAGE_SIZE)
- map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG);
+void __init mmu_mapin_immr(void)
+{
+ if (immr_is_mapped)
+ return;
+
+ immr_is_mapped = true;
+
+ __early_map_kernel_hugepage(VIRT_IMMR_BASE, PHYS_IMMR_BASE,
+ PAGE_KERNEL_NCG, MMU_PAGE_512K, true);
}
-static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
+static void mmu_mapin_ram_chunk(unsigned long offset, unsigned long top,
+ pgprot_t prot, bool new)
{
- modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16);
-}
+ unsigned long v = PAGE_OFFSET + offset;
+ unsigned long p = offset;
-static void mmu_patch_addis(s32 *site, long simm)
-{
- unsigned int instr = *(unsigned int *)patch_site_addr(site);
+ WARN_ON(!IS_ALIGNED(offset, SZ_512K) || !IS_ALIGNED(top, SZ_512K));
- instr &= 0xffff0000;
- instr |= ((unsigned long)simm) >> 16;
- patch_instruction_site(site, instr);
+ for (; p < ALIGN(p, SZ_8M) && p < top; p += SZ_512K, v += SZ_512K)
+ __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+ for (; p < ALIGN_DOWN(top, SZ_8M) && p < top; p += SZ_8M, v += SZ_8M)
+ __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
+ for (; p < ALIGN_DOWN(top, SZ_512K) && p < top; p += SZ_512K, v += SZ_512K)
+ __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
+
+ if (!new)
+ flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top);
}
unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top)
{
- unsigned long mapped;
+ unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M);
+ unsigned long sinittext = __pa(_sinittext);
+ bool strict_boundary = strict_kernel_rwx_enabled() || debug_pagealloc_enabled();
+ unsigned long boundary = strict_boundary ? sinittext : etext8;
+ unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M);
- if (__map_without_ltlbs) {
- mapped = 0;
- mmu_mapin_immr();
- if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR))
- patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
- if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
- mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
+ WARN_ON(top < einittext8);
+
+ mmu_mapin_immr();
+
+ if (__map_without_ltlbs)
+ return 0;
+
+ mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, true);
+ if (debug_pagealloc_enabled()) {
+ top = boundary;
} else {
- mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
- if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
- mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top,
- _ALIGN(__pa(_einittext), 8 << 20));
+ mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL_TEXT, true);
+ mmu_mapin_ram_chunk(einittext8, top, PAGE_KERNEL, true);
}
- mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped);
- mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped);
+ if (top > SZ_32M)
+ memblock_set_current_limit(top);
- /* If the size of RAM is not an exact power of two, we may not
- * have covered RAM in its entirety with 8 MiB
- * pages. Consequently, restrict the top end of RAM currently
- * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail"
- * coverage with normal-sized pages (or other reasons) do not
- * attempt to allocate outside the allowed range.
- */
- if (mapped)
- memblock_set_current_limit(mapped);
+ block_mapped_ram = top;
- block_mapped_ram = mapped;
-
- return mapped;
+ return top;
}
void mmu_mark_initmem_nx(void)
{
- if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23)
- mmu_patch_addis(&patch__itlbmiss_linmem_top8,
- -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1)));
- if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
- mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext));
+ unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M);
+ unsigned long sinittext = __pa(_sinittext);
+ unsigned long boundary = strict_kernel_rwx_enabled() ? sinittext : etext8;
+ unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M);
+
+ mmu_mapin_ram_chunk(0, boundary, PAGE_KERNEL_TEXT, false);
+ mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false);
+
+ if (IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+ mmu_pin_tlb(block_mapped_ram, false);
}
#ifdef CONFIG_STRICT_KERNEL_RWX
void mmu_mark_rodata_ro(void)
{
- if (CONFIG_DATA_SHIFT < 23)
- mmu_patch_addis(&patch__dtlbmiss_romem_top8,
- -__pa(((unsigned long)_sinittext) &
- ~(LARGE_PAGE_SIZE_8M - 1)));
- mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext));
+ unsigned long sinittext = __pa(_sinittext);
+
+ mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false);
+ if (IS_ENABLED(CONFIG_PIN_TLB_DATA))
+ mmu_pin_tlb(block_mapped_ram, true);
}
#endif
@@ -168,7 +210,7 @@
BUG_ON(first_memblock_base != 0);
/* 8xx can only access 32MB at the moment */
- memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000));
+ memblock_set_current_limit(min_t(u64, first_memblock_size, SZ_32M));
}
/*
@@ -202,13 +244,6 @@
mb();
}
-void flush_instruction_cache(void)
-{
- isync();
- mtspr(SPRN_IC_CST, IDC_INVALL);
- isync();
-}
-
#ifdef CONFIG_PPC_KUEP
void __init setup_kuep(bool disabled)
{
diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index 33b6f6f..0424f6c 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -8,6 +8,7 @@
obj-$(CONFIG_44x) += 44x.o
obj-$(CONFIG_PPC_8xx) += 8xx.o
obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o
+obj-$(CONFIG_RANDOMIZE_BASE) += kaslr_booke.o
ifdef CONFIG_HUGETLB_PAGE
obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o
endif
diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c
index 4637fdd..77884e2 100644
--- a/arch/powerpc/mm/nohash/book3e_pgtable.c
+++ b/arch/powerpc/mm/nohash/book3e_pgtable.c
@@ -73,6 +73,7 @@
int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -80,7 +81,8 @@
BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE);
if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
- pudp = pud_alloc(&init_mm, pgdp, ea);
+ p4dp = p4d_offset(pgdp, ea);
+ pudp = pud_alloc(&init_mm, p4dp, ea);
if (!pudp)
return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea);
@@ -91,13 +93,12 @@
return -ENOMEM;
} else {
pgdp = pgd_offset_k(ea);
-#ifndef __PAGETABLE_PUD_FOLDED
- if (pgd_none(*pgdp)) {
- pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
- pgd_populate(&init_mm, pgdp, pudp);
+ p4dp = p4d_offset(pgdp, ea);
+ if (p4d_none(*p4dp)) {
+ pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
+ p4d_populate(&init_mm, p4dp, pmdp);
}
-#endif /* !__PAGETABLE_PUD_FOLDED */
- pudp = pud_offset(pgdp, ea);
+ pudp = pud_offset(p4dp, ea);
if (pud_none(*pudp)) {
pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
pud_populate(&init_mm, pudp, pmdp);
diff --git a/arch/powerpc/mm/nohash/fsl_booke.c b/arch/powerpc/mm/nohash/fsl_booke.c
index 556e3cd..36bda96 100644
--- a/arch/powerpc/mm/nohash/fsl_booke.c
+++ b/arch/powerpc/mm/nohash/fsl_booke.c
@@ -37,11 +37,9 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
-#include <asm/pgalloc.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <linux/uaccess.h>
#include <asm/smp.h>
@@ -221,6 +219,22 @@
return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1;
}
+void flush_instruction_cache(void)
+{
+ unsigned long tmp;
+
+ if (IS_ENABLED(CONFIG_E200)) {
+ tmp = mfspr(SPRN_L1CSR0);
+ tmp |= L1CSR0_CFI | L1CSR0_CLFC;
+ mtspr(SPRN_L1CSR0, tmp);
+ } else {
+ tmp = mfspr(SPRN_L1CSR1);
+ tmp |= L1CSR1_ICFI | L1CSR1_ICLFR;
+ mtspr(SPRN_L1CSR1, tmp);
+ }
+ isync();
+}
+
/*
* MMU_init_hw does the chip-specific initialization of the MMU hardware.
*/
@@ -263,11 +277,13 @@
int __initdata is_second_reloc;
notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start)
{
- unsigned long base = KERNELBASE;
+ unsigned long base = kernstart_virt_addr;
+ phys_addr_t size;
kernstart_addr = start;
if (is_second_reloc) {
virt_phys_offset = PAGE_OFFSET - memstart_addr;
+ kaslr_late_init();
return;
}
@@ -291,7 +307,7 @@
start &= ~0x3ffffff;
base &= ~0x3ffffff;
virt_phys_offset = base - start;
- early_get_first_memblock_info(__va(dt_ptr), NULL);
+ early_get_first_memblock_info(__va(dt_ptr), &size);
/*
* We now get the memstart_addr, then we should check if this
* address is the same as what the PAGE_OFFSET map to now. If
@@ -316,6 +332,8 @@
/* We should never reach here */
panic("Relocation error");
}
+
+ kaslr_early_init(__va(dt_ptr), size);
}
#endif
#endif
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
new file mode 100644
index 0000000..4c74e8a
--- /dev/null
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright (C) 2019 Jason Yan <yanaijie@huawei.com>
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+#include <linux/crash_core.h>
+#include <asm/cacheflush.h>
+#include <asm/prom.h>
+#include <asm/kdump.h>
+#include <mm/mmu_decl.h>
+#include <generated/compile.h>
+#include <generated/utsrelease.h>
+
+struct regions {
+ unsigned long pa_start;
+ unsigned long pa_end;
+ unsigned long kernel_size;
+ unsigned long dtb_start;
+ unsigned long dtb_end;
+ unsigned long initrd_start;
+ unsigned long initrd_end;
+ unsigned long crash_start;
+ unsigned long crash_end;
+ int reserved_mem;
+ int reserved_mem_addr_cells;
+ int reserved_mem_size_cells;
+};
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+struct regions __initdata regions;
+
+static __init void kaslr_get_cmdline(void *fdt)
+{
+ int node = fdt_path_offset(fdt, "/chosen");
+
+ early_init_dt_scan_chosen(node, "chosen", 1, boot_command_line);
+}
+
+static unsigned long __init rotate_xor(unsigned long hash, const void *area,
+ size_t size)
+{
+ size_t i;
+ const unsigned long *ptr = area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+/* Attempt to create a simple starting entropy. This can make it defferent for
+ * every build but it is still not enough. Stronger entropy should
+ * be added to make it change for every boot.
+ */
+static unsigned long __init get_boot_seed(void *fdt)
+{
+ unsigned long hash = 0;
+
+ hash = rotate_xor(hash, build_str, sizeof(build_str));
+ hash = rotate_xor(hash, fdt, fdt_totalsize(fdt));
+
+ return hash;
+}
+
+static __init u64 get_kaslr_seed(void *fdt)
+{
+ int node, len;
+ fdt64_t *prop;
+ u64 ret;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ return 0;
+
+ prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len);
+ if (!prop || len != sizeof(u64))
+ return 0;
+
+ ret = fdt64_to_cpu(*prop);
+ *prop = 0;
+ return ret;
+}
+
+static __init bool regions_overlap(u32 s1, u32 e1, u32 s2, u32 e2)
+{
+ return e1 >= s2 && e2 >= s1;
+}
+
+static __init bool overlaps_reserved_region(const void *fdt, u32 start,
+ u32 end)
+{
+ int subnode, len, i;
+ u64 base, size;
+
+ /* check for overlap with /memreserve/ entries */
+ for (i = 0; i < fdt_num_mem_rsv(fdt); i++) {
+ if (fdt_get_mem_rsv(fdt, i, &base, &size) < 0)
+ continue;
+ if (regions_overlap(start, end, base, base + size))
+ return true;
+ }
+
+ if (regions.reserved_mem < 0)
+ return false;
+
+ /* check for overlap with static reservations in /reserved-memory */
+ for (subnode = fdt_first_subnode(fdt, regions.reserved_mem);
+ subnode >= 0;
+ subnode = fdt_next_subnode(fdt, subnode)) {
+ const fdt32_t *reg;
+ u64 rsv_end;
+
+ len = 0;
+ reg = fdt_getprop(fdt, subnode, "reg", &len);
+ while (len >= (regions.reserved_mem_addr_cells +
+ regions.reserved_mem_size_cells)) {
+ base = fdt32_to_cpu(reg[0]);
+ if (regions.reserved_mem_addr_cells == 2)
+ base = (base << 32) | fdt32_to_cpu(reg[1]);
+
+ reg += regions.reserved_mem_addr_cells;
+ len -= 4 * regions.reserved_mem_addr_cells;
+
+ size = fdt32_to_cpu(reg[0]);
+ if (regions.reserved_mem_size_cells == 2)
+ size = (size << 32) | fdt32_to_cpu(reg[1]);
+
+ reg += regions.reserved_mem_size_cells;
+ len -= 4 * regions.reserved_mem_size_cells;
+
+ if (base >= regions.pa_end)
+ continue;
+
+ rsv_end = min(base + size, (u64)U32_MAX);
+
+ if (regions_overlap(start, end, base, rsv_end))
+ return true;
+ }
+ }
+ return false;
+}
+
+static __init bool overlaps_region(const void *fdt, u32 start,
+ u32 end)
+{
+ if (regions_overlap(start, end, __pa(_stext), __pa(_end)))
+ return true;
+
+ if (regions_overlap(start, end, regions.dtb_start,
+ regions.dtb_end))
+ return true;
+
+ if (regions_overlap(start, end, regions.initrd_start,
+ regions.initrd_end))
+ return true;
+
+ if (regions_overlap(start, end, regions.crash_start,
+ regions.crash_end))
+ return true;
+
+ return overlaps_reserved_region(fdt, start, end);
+}
+
+static void __init get_crash_kernel(void *fdt, unsigned long size)
+{
+#ifdef CONFIG_CRASH_CORE
+ unsigned long long crash_size, crash_base;
+ int ret;
+
+ ret = parse_crashkernel(boot_command_line, size, &crash_size,
+ &crash_base);
+ if (ret != 0 || crash_size == 0)
+ return;
+ if (crash_base == 0)
+ crash_base = KDUMP_KERNELBASE;
+
+ regions.crash_start = (unsigned long)crash_base;
+ regions.crash_end = (unsigned long)(crash_base + crash_size);
+
+ pr_debug("crash_base=0x%llx crash_size=0x%llx\n", crash_base, crash_size);
+#endif
+}
+
+static void __init get_initrd_range(void *fdt)
+{
+ u64 start, end;
+ int node, len;
+ const __be32 *prop;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ return;
+
+ prop = fdt_getprop(fdt, node, "linux,initrd-start", &len);
+ if (!prop)
+ return;
+ start = of_read_number(prop, len / 4);
+
+ prop = fdt_getprop(fdt, node, "linux,initrd-end", &len);
+ if (!prop)
+ return;
+ end = of_read_number(prop, len / 4);
+
+ regions.initrd_start = (unsigned long)start;
+ regions.initrd_end = (unsigned long)end;
+
+ pr_debug("initrd_start=0x%llx initrd_end=0x%llx\n", start, end);
+}
+
+static __init unsigned long get_usable_address(const void *fdt,
+ unsigned long start,
+ unsigned long offset)
+{
+ unsigned long pa;
+ unsigned long pa_end;
+
+ for (pa = offset; (long)pa > (long)start; pa -= SZ_16K) {
+ pa_end = pa + regions.kernel_size;
+ if (overlaps_region(fdt, pa, pa_end))
+ continue;
+
+ return pa;
+ }
+ return 0;
+}
+
+static __init void get_cell_sizes(const void *fdt, int node, int *addr_cells,
+ int *size_cells)
+{
+ const int *prop;
+ int len;
+
+ /*
+ * Retrieve the #address-cells and #size-cells properties
+ * from the 'node', or use the default if not provided.
+ */
+ *addr_cells = *size_cells = 1;
+
+ prop = fdt_getprop(fdt, node, "#address-cells", &len);
+ if (len == 4)
+ *addr_cells = fdt32_to_cpu(*prop);
+ prop = fdt_getprop(fdt, node, "#size-cells", &len);
+ if (len == 4)
+ *size_cells = fdt32_to_cpu(*prop);
+}
+
+static unsigned long __init kaslr_legal_offset(void *dt_ptr, unsigned long index,
+ unsigned long offset)
+{
+ unsigned long koffset = 0;
+ unsigned long start;
+
+ while ((long)index >= 0) {
+ offset = memstart_addr + index * SZ_64M + offset;
+ start = memstart_addr + index * SZ_64M;
+ koffset = get_usable_address(dt_ptr, start, offset);
+ if (koffset)
+ break;
+ index--;
+ }
+
+ if (koffset != 0)
+ koffset -= memstart_addr;
+
+ return koffset;
+}
+
+static inline __init bool kaslr_disabled(void)
+{
+ return strstr(boot_command_line, "nokaslr") != NULL;
+}
+
+static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size,
+ unsigned long kernel_sz)
+{
+ unsigned long offset, random;
+ unsigned long ram, linear_sz;
+ u64 seed;
+ unsigned long index;
+
+ kaslr_get_cmdline(dt_ptr);
+ if (kaslr_disabled())
+ return 0;
+
+ random = get_boot_seed(dt_ptr);
+
+ seed = get_tb() << 32;
+ seed ^= get_tb();
+ random = rotate_xor(random, &seed, sizeof(seed));
+
+ /*
+ * Retrieve (and wipe) the seed from the FDT
+ */
+ seed = get_kaslr_seed(dt_ptr);
+ if (seed)
+ random = rotate_xor(random, &seed, sizeof(seed));
+ else
+ pr_warn("KASLR: No safe seed for randomizing the kernel base.\n");
+
+ ram = min_t(phys_addr_t, __max_low_memory, size);
+ ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true);
+ linear_sz = min_t(unsigned long, ram, SZ_512M);
+
+ /* If the linear size is smaller than 64M, do not randmize */
+ if (linear_sz < SZ_64M)
+ return 0;
+
+ /* check for a reserved-memory node and record its cell sizes */
+ regions.reserved_mem = fdt_path_offset(dt_ptr, "/reserved-memory");
+ if (regions.reserved_mem >= 0)
+ get_cell_sizes(dt_ptr, regions.reserved_mem,
+ ®ions.reserved_mem_addr_cells,
+ ®ions.reserved_mem_size_cells);
+
+ regions.pa_start = memstart_addr;
+ regions.pa_end = memstart_addr + linear_sz;
+ regions.dtb_start = __pa(dt_ptr);
+ regions.dtb_end = __pa(dt_ptr) + fdt_totalsize(dt_ptr);
+ regions.kernel_size = kernel_sz;
+
+ get_initrd_range(dt_ptr);
+ get_crash_kernel(dt_ptr, ram);
+
+ /*
+ * Decide which 64M we want to start
+ * Only use the low 8 bits of the random seed
+ */
+ index = random & 0xFF;
+ index %= linear_sz / SZ_64M;
+
+ /* Decide offset inside 64M */
+ offset = random % (SZ_64M - kernel_sz);
+ offset = round_down(offset, SZ_16K);
+
+ return kaslr_legal_offset(dt_ptr, index, offset);
+}
+
+/*
+ * To see if we need to relocate the kernel to a random offset
+ * void *dt_ptr - address of the device tree
+ * phys_addr_t size - size of the first memory block
+ */
+notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size)
+{
+ unsigned long tlb_virt;
+ phys_addr_t tlb_phys;
+ unsigned long offset;
+ unsigned long kernel_sz;
+
+ kernel_sz = (unsigned long)_end - (unsigned long)_stext;
+
+ offset = kaslr_choose_location(dt_ptr, size, kernel_sz);
+ if (offset == 0)
+ return;
+
+ kernstart_virt_addr += offset;
+ kernstart_addr += offset;
+
+ is_second_reloc = 1;
+
+ if (offset >= SZ_64M) {
+ tlb_virt = round_down(kernstart_virt_addr, SZ_64M);
+ tlb_phys = round_down(kernstart_addr, SZ_64M);
+
+ /* Create kernel map to relocate in */
+ create_kaslr_tlb_entry(1, tlb_virt, tlb_phys);
+ }
+
+ /* Copy the kernel to it's new location and run */
+ memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz);
+ flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz);
+
+ reloc_kernel_entry(dt_ptr, kernstart_virt_addr);
+}
+
+void __init kaslr_late_init(void)
+{
+ /* If randomized, clear the original kernel */
+ if (kernstart_virt_addr != KERNELBASE) {
+ unsigned long kernel_sz;
+
+ kernel_sz = (unsigned long)_end - kernstart_virt_addr;
+ memzero_explicit((void *)KERNELBASE, kernel_sz);
+ }
+}
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 696f568..5872f69 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -34,6 +34,7 @@
#include <linux/of_fdt.h>
#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/code-patching.h>
@@ -82,16 +83,12 @@
};
#elif defined(CONFIG_PPC_8xx)
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
- /* we only manage 4k and 16k pages as normal pages */
-#ifdef CONFIG_PPC_4K_PAGES
[MMU_PAGE_4K] = {
.shift = 12,
},
-#else
[MMU_PAGE_16K] = {
.shift = 14,
},
-#endif
[MMU_PAGE_512K] = {
.shift = 19,
},
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
index 1f110c3..bf24451 100644
--- a/arch/powerpc/mm/nohash/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -6,6 +6,7 @@
* Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp.
*/
+#include <linux/pgtable.h>
#include <asm/processor.h>
#include <asm/reg.h>
#include <asm/page.h>
@@ -13,7 +14,6 @@
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/cputable.h>
-#include <asm/pgtable.h>
#include <asm/exception-64e.h>
#include <asm/ppc-opcode.h>
#include <asm/kvm_asm.h>
@@ -71,7 +71,6 @@
END_BTB_FLUSH_SECTION
std r7,EX_TLB_R7(r12)
#endif
- TLB_MISS_PROLOG_STATS
.endm
.macro tlb_epilog_bolted
@@ -85,7 +84,6 @@
mtcr r14
ld r14,EX_TLB_R14(r12)
ld r15,EX_TLB_R15(r12)
- TLB_MISS_RESTORE_STATS
ld r16,EX_TLB_R16(r12)
mfspr r12,SPRN_SPRG_GEN_SCRATCH
.endm
@@ -128,7 +126,6 @@
ori r10,r10,_PAGE_PRESENT
oris r11,r10,_PAGE_ACCESSED@h
- TLB_MISS_STATS_SAVE_INFO_BOLTED
bne tlb_miss_kernel_bolted
tlb_miss_common_bolted:
@@ -209,7 +206,6 @@
tlbwe
tlb_miss_done_bolted:
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
tlb_epilog_bolted
rfi
@@ -229,11 +225,9 @@
andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX
bne itlb_miss_fault_bolted
dtlb_miss_fault_bolted:
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
tlb_epilog_bolted
b exc_data_storage_book3e
itlb_miss_fault_bolted:
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
tlb_epilog_bolted
b exc_instruction_storage_book3e
@@ -243,7 +237,6 @@
rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
srdi r15,r16,60 /* get region */
- TLB_MISS_STATS_SAVE_INFO_BOLTED
bne- itlb_miss_fault_bolted
li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */
@@ -276,7 +269,6 @@
srdi. r15,r16,60 /* get region */
ori r16,r16,1
- TLB_MISS_STATS_SAVE_INFO_BOLTED
bne tlb_miss_kernel_e6500 /* user/kernel test */
b tlb_miss_common_e6500
@@ -288,7 +280,6 @@
srdi. r15,r16,60 /* get region */
rldicr r16,r16,0,62
- TLB_MISS_STATS_SAVE_INFO_BOLTED
bne tlb_miss_kernel_e6500 /* user vs kernel check */
/*
@@ -460,7 +451,6 @@
.endm
tlb_unlock_e6500
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
tlb_epilog_bolted
rfi
@@ -519,11 +509,9 @@
andi. r16,r16,1
bne itlb_miss_fault_e6500
dtlb_miss_fault_e6500:
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
tlb_epilog_bolted
b exc_data_storage_book3e
itlb_miss_fault_e6500:
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
tlb_epilog_bolted
b exc_instruction_storage_book3e
#endif /* CONFIG_PPC_FSL_BOOK3E */
@@ -548,7 +536,6 @@
mfspr r16,SPRN_DEAR /* get faulting address */
srdi r15,r16,60 /* get region */
cmpldi cr0,r15,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
beq tlb_load_linear /* yes -> go to linear map load */
/* The page tables are mapped virtually linear. At this point, though,
@@ -600,7 +587,6 @@
/* We got a crappy address, just fault with whatever DEAR and ESR
* are here
*/
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
@@ -624,7 +610,6 @@
*/
srdi r15,r16,60 /* get region */
cmpldi cr0,r15,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
beq tlb_load_linear /* yes -> go to linear map load */
/* We do the user/kernel test for the PID here along with the RW test
@@ -646,7 +631,6 @@
beq+ normal_tlb_miss
/* We got a crappy address, just fault */
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
TLB_MISS_EPILOG_ERROR
b exc_instruction_storage_book3e
@@ -745,7 +729,6 @@
* level 0 and just going back to userland. They are only needed
* if you are going to take an access fault
*/
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
TLB_MISS_EPILOG_SUCCESS
rfi
@@ -757,11 +740,9 @@
ld r15,EX_TLB_ESR(r12)
mtspr SPRN_DEAR,r14
mtspr SPRN_ESR,r15
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
-1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
- TLB_MISS_EPILOG_ERROR
+1: TLB_MISS_EPILOG_ERROR
b exc_instruction_storage_book3e
@@ -899,7 +880,6 @@
1:
END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV)
/* Return to caller, normal case */
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK);
TLB_MISS_EPILOG_SUCCESS
rfi
@@ -935,18 +915,15 @@
beq 1f
mtspr SPRN_DEAR,r15
mtspr SPRN_ESR,r16
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT);
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
-1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT);
- TLB_MISS_EPILOG_ERROR
+1: TLB_MISS_EPILOG_ERROR
b exc_instruction_storage_book3e
virt_page_table_tlb_miss_whacko_fault:
/* The linear fault will restart everything so ESR and DEAR will
* not have been clobbered, let's just fault with what we have
*/
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT);
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
@@ -971,7 +948,6 @@
mfspr r16,SPRN_DEAR /* get faulting address */
srdi r11,r16,60 /* get region */
cmpldi cr0,r11,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
beq tlb_load_linear /* yes -> go to linear map load */
/* We do the user/kernel test for the PID here along with the RW test
@@ -991,7 +967,6 @@
/* We got a crappy address, just fault with whatever DEAR and ESR
* are here
*/
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
@@ -1015,7 +990,6 @@
*/
srdi r11,r16,60 /* get region */
cmpldi cr0,r11,0xc /* linear mapping ? */
- TLB_MISS_STATS_SAVE_INFO
beq tlb_load_linear /* yes -> go to linear map load */
/* We do the user/kernel test for the PID here along with the RW test
@@ -1033,7 +1007,6 @@
beq+ htw_tlb_miss
/* We got a crappy address, just fault */
- TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
TLB_MISS_EPILOG_ERROR
b exc_instruction_storage_book3e
@@ -1130,7 +1103,6 @@
* level 0 and just going back to userland. They are only needed
* if you are going to take an access fault
*/
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK)
TLB_MISS_EPILOG_SUCCESS
rfi
@@ -1142,11 +1114,9 @@
beq 1f
mtspr SPRN_DEAR,r16
mtspr SPRN_ESR,r14
- TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT)
TLB_MISS_EPILOG_ERROR
b exc_data_storage_book3e
-1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT)
- TLB_MISS_EPILOG_ERROR
+1: TLB_MISS_EPILOG_ERROR
b exc_instruction_storage_book3e
/*
@@ -1221,7 +1191,6 @@
* We do that because we can't resume a fault within a TLB
* miss handler, due to MAS and TLB reservation being clobbered.
*/
- TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR)
TLB_MISS_EPILOG_ERROR
rfi
@@ -1233,13 +1202,3 @@
b exc_data_storage_book3e
1: TLB_MISS_EPILOG_ERROR_SPECIAL
b exc_instruction_storage_book3e
-
-
-#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
-.tlb_stat_inc:
-1: ldarx r8,0,r9
- addi r8,r8,1
- stdcx. r8,0,r9
- bne- 1b
- blr
-#endif
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 50d68d2..094a107 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -221,7 +221,8 @@
}
}
-/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
+/*
+ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
* info is found.
*/
static int associativity_to_nid(const __be32 *associativity)
@@ -235,7 +236,7 @@
nid = of_read_number(&associativity[min_common_depth], 1);
/* POWER4 LPAR uses 0xffff as invalid node */
- if (nid == 0xffff || nid >= MAX_NUMNODES)
+ if (nid == 0xffff || nid >= nr_node_ids)
nid = NUMA_NO_NODE;
if (nid > 0 &&
@@ -429,7 +430,7 @@
* This is like of_node_to_nid_single() for memory represented in the
* ibm,dynamic-reconfiguration-memory node.
*/
-static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
+int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
struct assoc_arrays aa = { .arrays = NULL };
int default_nid = NUMA_NO_NODE;
@@ -448,7 +449,7 @@
index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
nid = of_read_number(&aa.arrays[index], 1);
- if (nid == 0xffff || nid >= MAX_NUMNODES)
+ if (nid == 0xffff || nid >= nr_node_ids)
nid = default_nid;
if (nid > 0) {
@@ -461,25 +462,74 @@
return nid;
}
+#ifdef CONFIG_PPC_SPLPAR
+static int vphn_get_nid(long lcpu)
+{
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ long rc, hwid;
+
+ /*
+ * On a shared lpar, device tree will not have node associativity.
+ * At this time lppaca, or its __old_status field may not be
+ * updated. Hence kernel cannot detect if its on a shared lpar. So
+ * request an explicit associativity irrespective of whether the
+ * lpar is shared or dedicated. Use the device tree property as a
+ * fallback. cpu_to_phys_id is only valid between
+ * smp_setup_cpu_maps() and smp_setup_pacas().
+ */
+ if (firmware_has_feature(FW_FEATURE_VPHN)) {
+ if (cpu_to_phys_id)
+ hwid = cpu_to_phys_id[lcpu];
+ else
+ hwid = get_hard_smp_processor_id(lcpu);
+
+ rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
+ if (rc == H_SUCCESS)
+ return associativity_to_nid(associativity);
+ }
+
+ return NUMA_NO_NODE;
+}
+#else
+static int vphn_get_nid(long unused)
+{
+ return NUMA_NO_NODE;
+}
+#endif /* CONFIG_PPC_SPLPAR */
+
/*
* Figure out to which domain a cpu belongs and stick it there.
* Return the id of the domain used.
*/
static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = NUMA_NO_NODE;
struct device_node *cpu;
+ int fcpu = cpu_first_thread_sibling(lcpu);
+ int nid = NUMA_NO_NODE;
+
+ if (!cpu_present(lcpu)) {
+ set_cpu_numa_node(lcpu, first_online_node);
+ return first_online_node;
+ }
/*
* If a valid cpu-to-node mapping is already available, use it
* directly instead of querying the firmware, since it represents
* the most recent mapping notified to us by the platform (eg: VPHN).
+ * Since cpu_to_node binding remains the same for all threads in the
+ * core. If a valid cpu-to-node mapping is already available, for
+ * the first thread in the core, use it.
*/
- if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+ nid = numa_cpu_lookup_table[fcpu];
+ if (nid >= 0) {
map_cpu_to_node(lcpu, nid);
return nid;
}
+ nid = vphn_get_nid(lcpu);
+ if (nid != NUMA_NO_NODE)
+ goto out_present;
+
cpu = of_get_cpu_node(lcpu, NULL);
if (!cpu) {
@@ -491,13 +541,26 @@
}
nid = of_node_to_nid_single(cpu);
+ of_node_put(cpu);
out_present:
if (nid < 0 || !node_possible(nid))
nid = first_online_node;
+ /*
+ * Update for the first thread of the core. All threads of a core
+ * have to be part of the same node. This not only avoids querying
+ * for every other thread in the core, but always avoids a case
+ * where virtual node associativity change causes subsequent threads
+ * of a core to be associated with different nid. However if first
+ * thread is already online, expect it to have a valid mapping.
+ */
+ if (fcpu != lcpu) {
+ WARN_ON(cpu_online(fcpu));
+ map_cpu_to_node(fcpu, nid);
+ }
+
map_cpu_to_node(lcpu, nid);
- of_node_put(cpu);
out:
return nid;
}
@@ -587,8 +650,9 @@
* Extract NUMA information from the ibm,dynamic-reconfiguration-memory
* node. This assumes n_mem_{addr,size}_cells have been set.
*/
-static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
- const __be32 **usm)
+static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+ const __be32 **usm,
+ void *data)
{
unsigned int ranges, is_kexec_kdump = 0;
unsigned long base, size, sz;
@@ -600,7 +664,7 @@
*/
if ((lmb->flags & DRCONF_MEM_RESERVED)
|| !(lmb->flags & DRCONF_MEM_ASSIGNED))
- return;
+ return 0;
if (*usm)
is_kexec_kdump = 1;
@@ -612,7 +676,7 @@
if (is_kexec_kdump) {
ranges = read_usm_ranges(usm);
if (!ranges) /* there are no (base, size) duple */
- return;
+ return 0;
}
do {
@@ -629,6 +693,8 @@
if (sz)
memblock_set_node(base, sz, &memblock.memory, nid);
} while (--ranges);
+
+ return 0;
}
static int __init parse_numa_properties(void)
@@ -662,20 +728,20 @@
*/
for_each_present_cpu(i) {
struct device_node *cpu;
- int nid;
-
- cpu = of_get_cpu_node(i, NULL);
- BUG_ON(!cpu);
- nid = of_node_to_nid_single(cpu);
- of_node_put(cpu);
+ int nid = vphn_get_nid(i);
/*
* Don't fall back to default_nid yet -- we will plug
* cpus into nodes once the memory scan has discovered
* the topology.
*/
- if (nid < 0)
- continue;
+ if (nid == NUMA_NO_NODE) {
+ cpu = of_get_cpu_node(i, NULL);
+ BUG_ON(!cpu);
+ nid = of_node_to_nid_single(cpu);
+ of_node_put(cpu);
+ }
+
node_set_online(nid);
}
@@ -730,7 +796,7 @@
*/
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
- walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
+ walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
of_node_put(memory);
}
@@ -743,17 +809,14 @@
unsigned long total_ram = memblock_phys_mem_size();
unsigned long start_pfn, end_pfn;
unsigned int nid = 0;
- struct memblock_region *reg;
+ int i;
printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(top_of_ram - total_ram) >> 20);
- for_each_memblock(memory, reg) {
- start_pfn = memblock_region_memory_base_pfn(reg);
- end_pfn = memblock_region_memory_end_pfn(reg);
-
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
fake_numa_create_new_node(end_pfn, &nid);
memblock_set_node(PFN_PHYS(start_pfn),
PFN_PHYS(end_pfn - start_pfn),
@@ -830,7 +893,9 @@
static void __init find_possible_nodes(void)
{
struct device_node *rtas;
- u32 numnodes, i;
+ const __be32 *domains = NULL;
+ int prop_length, max_nodes;
+ u32 i;
if (!numa_enabled)
return;
@@ -839,16 +904,38 @@
if (!rtas)
return;
- if (of_property_read_u32_index(rtas,
- "ibm,max-associativity-domains",
- min_common_depth, &numnodes))
- goto out;
+ /*
+ * ibm,current-associativity-domains is a fairly recent property. If
+ * it doesn't exist, then fallback on ibm,max-associativity-domains.
+ * Current denotes what the platform can support compared to max
+ * which denotes what the Hypervisor can support.
+ *
+ * If the LPAR is migratable, new nodes might be activated after a LPM,
+ * so we should consider the max number in that case.
+ */
+ if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
+ domains = of_get_property(rtas,
+ "ibm,current-associativity-domains",
+ &prop_length);
+ if (!domains) {
+ domains = of_get_property(rtas, "ibm,max-associativity-domains",
+ &prop_length);
+ if (!domains)
+ goto out;
+ }
- for (i = 0; i < numnodes; i++) {
+ max_nodes = of_read_number(&domains[min_common_depth], 1);
+ pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
+
+ for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
+ prop_length /= sizeof(int);
+ if (prop_length > min_common_depth + 2)
+ coregroup_enabled = 1;
+
out:
of_node_put(rtas);
}
@@ -857,6 +944,16 @@
{
int cpu;
+ /*
+ * Linux/mm assumes node 0 to be online at boot. However this is not
+ * true on PowerPC, where node 0 is similar to any other node, it
+ * could be cpuless, memoryless node. So force node 0 to be offline
+ * for now. This will prevent cpuless, memoryless node 0 showing up
+ * unnecessarily as online. If a node has cpus or memory that need
+ * to be online, then node will anyway be marked online.
+ */
+ node_set_offline(0);
+
if (parse_numa_properties())
setup_nonnuma();
@@ -874,8 +971,17 @@
reset_numa_cpu_lookup_table();
- for_each_present_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ /*
+ * Powerpc with CONFIG_NUMA always used to have a node 0,
+ * even if it was memoryless or cpuless. For all cpus that
+ * are possible but not present, cpu_to_node() would point
+ * to node 0. To remove a cpuless, memoryless dummy node,
+ * powerpc need to make sure all possible but not present
+ * cpu_to_node are set to a proper node.
+ */
numa_setup_cpu(cpu);
+ }
}
void __init initmem_init(void)
@@ -892,7 +998,6 @@
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
setup_node_data(nid, start_pfn, end_pfn);
- sparse_memory_present_with_active_regions(nid);
}
sparse_init();
@@ -927,28 +1032,6 @@
}
early_param("numa", early_numa);
-/*
- * The platform can inform us through one of several mechanisms
- * (post-migration device tree updates, PRRN or VPHN) that the NUMA
- * assignment of a resource has changed. This controls whether we act
- * on that. Disabled by default.
- */
-static bool topology_updates_enabled;
-
-static int __init early_topology_updates(char *p)
-{
- if (!p)
- return 0;
-
- if (!strcmp(p, "on")) {
- pr_warn("Caution: enabling topology updates\n");
- topology_updates_enabled = true;
- }
-
- return 0;
-}
-early_param("topology_updates", early_topology_updates);
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Find the node associated with a hot added memory section for
@@ -1087,98 +1170,9 @@
/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
-struct topology_update_data {
- struct topology_update_data *next;
- unsigned int cpu;
- int old_nid;
- int new_nid;
-};
-
-#define TOPOLOGY_DEF_TIMER_SECS 60
-
-static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
-static cpumask_t cpu_associativity_changes_mask;
-static int vphn_enabled;
-static int prrn_enabled;
-static void reset_topology_timer(void);
-static int topology_timer_secs = 1;
static int topology_inited;
/*
- * Change polling interval for associativity changes.
- */
-int timed_topology_update(int nsecs)
-{
- if (vphn_enabled) {
- if (nsecs > 0)
- topology_timer_secs = nsecs;
- else
- topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
-
- reset_topology_timer();
- }
-
- return 0;
-}
-
-/*
- * Store the current values of the associativity change counters in the
- * hypervisor.
- */
-static void setup_cpu_associativity_change_counters(void)
-{
- int cpu;
-
- /* The VPHN feature supports a maximum of 8 reference points */
- BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
-
- for_each_possible_cpu(cpu) {
- int i;
- u8 *counts = vphn_cpu_change_counts[cpu];
- volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
-
- for (i = 0; i < distance_ref_points_depth; i++)
- counts[i] = hypervisor_counts[i];
- }
-}
-
-/*
- * The hypervisor maintains a set of 8 associativity change counters in
- * the VPA of each cpu that correspond to the associativity levels in the
- * ibm,associativity-reference-points property. When an associativity
- * level changes, the corresponding counter is incremented.
- *
- * Set a bit in cpu_associativity_changes_mask for each cpu whose home
- * node associativity levels have changed.
- *
- * Returns the number of cpus with unhandled associativity changes.
- */
-static int update_cpu_associativity_changes_mask(void)
-{
- int cpu;
- cpumask_t *changes = &cpu_associativity_changes_mask;
-
- for_each_possible_cpu(cpu) {
- int i, changed = 0;
- u8 *counts = vphn_cpu_change_counts[cpu];
- volatile u8 *hypervisor_counts = lppaca_of(cpu).vphn_assoc_counts;
-
- for (i = 0; i < distance_ref_points_depth; i++) {
- if (hypervisor_counts[i] != counts[i]) {
- counts[i] = hypervisor_counts[i];
- changed = 1;
- }
- }
- if (changed) {
- cpumask_or(changes, changes, cpu_sibling_mask(cpu));
- cpu = cpu_last_thread_sibling(cpu);
- }
- }
-
- return cpumask_weight(changes);
-}
-
-/*
* Retrieve the new associativity information for a virtual processor's
* home node.
*/
@@ -1191,23 +1185,27 @@
VPHN_FLAG_VCPU, associativity);
switch (rc) {
- case H_FUNCTION:
- printk_once(KERN_INFO
- "VPHN is not supported. Disabling polling...\n");
- stop_topology_update();
- break;
- case H_HARDWARE:
- printk(KERN_ERR
- "hcall_vphn() experienced a hardware fault "
- "preventing VPHN. Disabling polling...\n");
- stop_topology_update();
- break;
case H_SUCCESS:
dbg("VPHN hcall succeeded. Reset polling...\n");
- timed_topology_update(0);
+ goto out;
+
+ case H_FUNCTION:
+ pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
+ break;
+ case H_HARDWARE:
+ pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
+ "preventing VPHN. Disabling polling...\n");
+ break;
+ case H_PARAMETER:
+ pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
+ "Disabling polling...\n");
+ break;
+ default:
+ pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n"
+ , rc);
break;
}
-
+out:
return rc;
}
@@ -1250,389 +1248,33 @@
return new_nid;
}
-/*
- * Update the CPU maps and sysfs entries for a single CPU when its NUMA
- * characteristics change. This function doesn't perform any locking and is
- * only safe to call from stop_machine().
- */
-static int update_cpu_topology(void *data)
+int cpu_to_coregroup_id(int cpu)
{
- struct topology_update_data *update;
- unsigned long cpu;
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ int index;
- if (!data)
- return -EINVAL;
+ if (cpu < 0 || cpu > nr_cpu_ids)
+ return -1;
- cpu = smp_processor_id();
-
- for (update = data; update; update = update->next) {
- int new_nid = update->new_nid;
- if (cpu != update->cpu)
- continue;
-
- unmap_cpu_from_node(cpu);
- map_cpu_to_node(cpu, new_nid);
- set_cpu_numa_node(cpu, new_nid);
- set_cpu_numa_mem(cpu, local_memory_node(new_nid));
- vdso_getcpu_init();
- }
-
- return 0;
-}
-
-static int update_lookup_table(void *data)
-{
- struct topology_update_data *update;
-
- if (!data)
- return -EINVAL;
-
- /*
- * Upon topology update, the numa-cpu lookup table needs to be updated
- * for all threads in the core, including offline CPUs, to ensure that
- * future hotplug operations respect the cpu-to-node associativity
- * properly.
- */
- for (update = data; update; update = update->next) {
- int nid, base, j;
-
- nid = update->new_nid;
- base = cpu_first_thread_sibling(update->cpu);
-
- for (j = 0; j < threads_per_core; j++) {
- update_numa_cpu_lookup_table(base + j, nid);
- }
- }
-
- return 0;
-}
-
-/*
- * Update the node maps and sysfs entries for each cpu whose home node
- * has changed. Returns 1 when the topology has changed, and 0 otherwise.
- *
- * cpus_locked says whether we already hold cpu_hotplug_lock.
- */
-int numa_update_cpu_topology(bool cpus_locked)
-{
- unsigned int cpu, sibling, changed = 0;
- struct topology_update_data *updates, *ud;
- cpumask_t updated_cpus;
- struct device *dev;
- int weight, new_nid, i = 0;
-
- if (!prrn_enabled && !vphn_enabled && topology_inited)
- return 0;
-
- weight = cpumask_weight(&cpu_associativity_changes_mask);
- if (!weight)
- return 0;
-
- updates = kcalloc(weight, sizeof(*updates), GFP_KERNEL);
- if (!updates)
- return 0;
-
- cpumask_clear(&updated_cpus);
-
- for_each_cpu(cpu, &cpu_associativity_changes_mask) {
- /*
- * If siblings aren't flagged for changes, updates list
- * will be too short. Skip on this update and set for next
- * update.
- */
- if (!cpumask_subset(cpu_sibling_mask(cpu),
- &cpu_associativity_changes_mask)) {
- pr_info("Sibling bits not set for associativity "
- "change, cpu%d\n", cpu);
- cpumask_or(&cpu_associativity_changes_mask,
- &cpu_associativity_changes_mask,
- cpu_sibling_mask(cpu));
- cpu = cpu_last_thread_sibling(cpu);
- continue;
- }
-
- new_nid = find_and_online_cpu_nid(cpu);
-
- if (new_nid == numa_cpu_lookup_table[cpu]) {
- cpumask_andnot(&cpu_associativity_changes_mask,
- &cpu_associativity_changes_mask,
- cpu_sibling_mask(cpu));
- dbg("Assoc chg gives same node %d for cpu%d\n",
- new_nid, cpu);
- cpu = cpu_last_thread_sibling(cpu);
- continue;
- }
-
- for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
- ud = &updates[i++];
- ud->next = &updates[i];
- ud->cpu = sibling;
- ud->new_nid = new_nid;
- ud->old_nid = numa_cpu_lookup_table[sibling];
- cpumask_set_cpu(sibling, &updated_cpus);
- }
- cpu = cpu_last_thread_sibling(cpu);
- }
-
- /*
- * Prevent processing of 'updates' from overflowing array
- * where last entry filled in a 'next' pointer.
- */
- if (i)
- updates[i-1].next = NULL;
-
- pr_debug("Topology update for the following CPUs:\n");
- if (cpumask_weight(&updated_cpus)) {
- for (ud = &updates[0]; ud; ud = ud->next) {
- pr_debug("cpu %d moving from node %d "
- "to %d\n", ud->cpu,
- ud->old_nid, ud->new_nid);
- }
- }
-
- /*
- * In cases where we have nothing to update (because the updates list
- * is too short or because the new topology is same as the old one),
- * skip invoking update_cpu_topology() via stop-machine(). This is
- * necessary (and not just a fast-path optimization) since stop-machine
- * can end up electing a random CPU to run update_cpu_topology(), and
- * thus trick us into setting up incorrect cpu-node mappings (since
- * 'updates' is kzalloc()'ed).
- *
- * And for the similar reason, we will skip all the following updating.
- */
- if (!cpumask_weight(&updated_cpus))
+ if (!coregroup_enabled)
goto out;
- if (cpus_locked)
- stop_machine_cpuslocked(update_cpu_topology, &updates[0],
- &updated_cpus);
- else
- stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
+ if (!firmware_has_feature(FW_FEATURE_VPHN))
+ goto out;
- /*
- * Update the numa-cpu lookup table with the new mappings, even for
- * offline CPUs. It is best to perform this update from the stop-
- * machine context.
- */
- if (cpus_locked)
- stop_machine_cpuslocked(update_lookup_table, &updates[0],
- cpumask_of(raw_smp_processor_id()));
- else
- stop_machine(update_lookup_table, &updates[0],
- cpumask_of(raw_smp_processor_id()));
+ if (vphn_get_associativity(cpu, associativity))
+ goto out;
- for (ud = &updates[0]; ud; ud = ud->next) {
- unregister_cpu_under_node(ud->cpu, ud->old_nid);
- register_cpu_under_node(ud->cpu, ud->new_nid);
-
- dev = get_cpu_device(ud->cpu);
- if (dev)
- kobject_uevent(&dev->kobj, KOBJ_CHANGE);
- cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
- changed = 1;
- }
+ index = of_read_number(associativity, 1);
+ if (index > min_common_depth + 1)
+ return of_read_number(&associativity[index - 1], 1);
out:
- kfree(updates);
- return changed;
+ return cpu_to_core_id(cpu);
}
-int arch_update_cpu_topology(void)
-{
- return numa_update_cpu_topology(true);
-}
-
-static void topology_work_fn(struct work_struct *work)
-{
- rebuild_sched_domains();
-}
-static DECLARE_WORK(topology_work, topology_work_fn);
-
-static void topology_schedule_update(void)
-{
- schedule_work(&topology_work);
-}
-
-static void topology_timer_fn(struct timer_list *unused)
-{
- if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
- topology_schedule_update();
- else if (vphn_enabled) {
- if (update_cpu_associativity_changes_mask() > 0)
- topology_schedule_update();
- reset_topology_timer();
- }
-}
-static struct timer_list topology_timer;
-
-static void reset_topology_timer(void)
-{
- if (vphn_enabled)
- mod_timer(&topology_timer, jiffies + topology_timer_secs * HZ);
-}
-
-#ifdef CONFIG_SMP
-
-static int dt_update_callback(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct of_reconfig_data *update = data;
- int rc = NOTIFY_DONE;
-
- switch (action) {
- case OF_RECONFIG_UPDATE_PROPERTY:
- if (of_node_is_type(update->dn, "cpu") &&
- !of_prop_cmp(update->prop->name, "ibm,associativity")) {
- u32 core_id;
- of_property_read_u32(update->dn, "reg", &core_id);
- rc = dlpar_cpu_readd(core_id);
- rc = NOTIFY_OK;
- }
- break;
- }
-
- return rc;
-}
-
-static struct notifier_block dt_update_nb = {
- .notifier_call = dt_update_callback,
-};
-
-#endif
-
-/*
- * Start polling for associativity changes.
- */
-int start_topology_update(void)
-{
- int rc = 0;
-
- if (!topology_updates_enabled)
- return 0;
-
- if (firmware_has_feature(FW_FEATURE_PRRN)) {
- if (!prrn_enabled) {
- prrn_enabled = 1;
-#ifdef CONFIG_SMP
- rc = of_reconfig_notifier_register(&dt_update_nb);
-#endif
- }
- }
- if (firmware_has_feature(FW_FEATURE_VPHN) &&
- lppaca_shared_proc(get_lppaca())) {
- if (!vphn_enabled) {
- vphn_enabled = 1;
- setup_cpu_associativity_change_counters();
- timer_setup(&topology_timer, topology_timer_fn,
- TIMER_DEFERRABLE);
- reset_topology_timer();
- }
- }
-
- pr_info("Starting topology update%s%s\n",
- (prrn_enabled ? " prrn_enabled" : ""),
- (vphn_enabled ? " vphn_enabled" : ""));
-
- return rc;
-}
-
-/*
- * Disable polling for VPHN associativity changes.
- */
-int stop_topology_update(void)
-{
- int rc = 0;
-
- if (!topology_updates_enabled)
- return 0;
-
- if (prrn_enabled) {
- prrn_enabled = 0;
-#ifdef CONFIG_SMP
- rc = of_reconfig_notifier_unregister(&dt_update_nb);
-#endif
- }
- if (vphn_enabled) {
- vphn_enabled = 0;
- rc = del_timer_sync(&topology_timer);
- }
-
- pr_info("Stopping topology update\n");
-
- return rc;
-}
-
-int prrn_is_enabled(void)
-{
- return prrn_enabled;
-}
-
-void __init shared_proc_topology_init(void)
-{
- if (lppaca_shared_proc(get_lppaca())) {
- bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
- nr_cpumask_bits);
- numa_update_cpu_topology(false);
- }
-}
-
-static int topology_read(struct seq_file *file, void *v)
-{
- if (vphn_enabled || prrn_enabled)
- seq_puts(file, "on\n");
- else
- seq_puts(file, "off\n");
-
- return 0;
-}
-
-static int topology_open(struct inode *inode, struct file *file)
-{
- return single_open(file, topology_read, NULL);
-}
-
-static ssize_t topology_write(struct file *file, const char __user *buf,
- size_t count, loff_t *off)
-{
- char kbuf[4]; /* "on" or "off" plus null. */
- int read_len;
-
- read_len = count < 3 ? count : 3;
- if (copy_from_user(kbuf, buf, read_len))
- return -EINVAL;
-
- kbuf[read_len] = '\0';
-
- if (!strncmp(kbuf, "on", 2)) {
- topology_updates_enabled = true;
- start_topology_update();
- } else if (!strncmp(kbuf, "off", 3)) {
- stop_topology_update();
- topology_updates_enabled = false;
- } else
- return -EINVAL;
-
- return count;
-}
-
-static const struct file_operations topology_ops = {
- .read = seq_read,
- .write = topology_write,
- .open = topology_open,
- .release = single_release
-};
-
static int topology_update_init(void)
{
- start_topology_update();
-
- if (vphn_enabled)
- topology_schedule_update();
-
- if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
- return -ENOMEM;
-
topology_inited = 1;
return 0;
}
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index ee4bd6d..97ae493 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -110,6 +110,9 @@
{
struct page *page = virt_to_page(table);
+ if (PageReserved(page))
+ return free_reserved_page(page);
+
BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
if (atomic_dec_and_test(&page->pt_frag_refcount)) {
if (!kernel)
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index e3759b6..faaf33e 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -23,7 +23,6 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/hugetlb.h>
@@ -100,7 +99,7 @@
* as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
* instead we "filter out" the exec permission for non clean pages.
*/
-static pte_t set_pte_filter(pte_t pte)
+static inline pte_t set_pte_filter(pte_t pte)
{
struct page *pg;
@@ -185,9 +184,6 @@
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
- /* Add the pte bit when trying to set a pte */
- pte = pte_mkpte(pte);
-
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
@@ -198,6 +194,15 @@
__set_pte_at(mm, addr, ptep, pte, 0);
}
+void unmap_kernel_page(unsigned long va)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep = pte_offset_kernel(pmdp, va);
+
+ pte_clear(&init_mm, va, ptep);
+ flush_tlb_kernel_range(va, va + PAGE_SIZE);
+}
+
/*
* This is called when relaxing access to a PTE. It's also called in the page
* fault path when we don't hit any of the major fault cases, ie, a minor
@@ -249,22 +254,49 @@
#else
/*
- * Not used on non book3s64 platforms. But 8xx
- * can possibly use tsize derived from hstate.
+ * Not used on non book3s64 platforms.
+ * 8xx compares it with mmu_virtual_psize to
+ * know if it is a huge page or not.
*/
- psize = 0;
+ psize = MMU_PAGE_COUNT;
#endif
__ptep_set_access_flags(vma, ptep, pte, addr, psize);
}
return changed;
#endif
}
+
+#if defined(CONFIG_PPC_8xx)
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ pmd_t *pmd = pmd_off(mm, addr);
+ pte_basic_t val;
+ pte_basic_t *entry = &ptep->pte;
+ int num, i;
+
+ /*
+ * Make sure hardware valid bit is not set. We don't do
+ * tlb flush for this update.
+ */
+ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+ pte = set_pte_filter(pte);
+
+ val = pte_val(pte);
+
+ num = number_of_cells_per_pte(pmd, val, 1);
+
+ for (i = 0; i < num; i++, entry++, val += SZ_4K)
+ *entry = val;
+}
+#endif
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -272,12 +304,14 @@
return;
pgd = mm->pgd + pgd_index(addr);
BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ BUG_ON(p4d_none(*p4d));
+ pud = pud_offset(p4d, addr);
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, addr);
/*
* khugepaged to collapse normal pages to hugepage, first set
- * pmd to none to force page fault/gup to take mmap_sem. After
+ * pmd to none to force page fault/gup to take mmap_lock. After
* pmd is set to none, we do a pte_clear which does this assertion
* so if we find pmd none, return.
*/
@@ -312,12 +346,13 @@
pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
bool *is_thp, unsigned *hpage_shift)
{
- pgd_t pgd, *pgdp;
+ pgd_t *pgdp;
+ p4d_t p4d, *p4dp;
pud_t pud, *pudp;
pmd_t pmd, *pmdp;
pte_t *ret_pte;
hugepd_t *hpdp = NULL;
- unsigned pdshift = PGDIR_SHIFT;
+ unsigned pdshift;
if (hpage_shift)
*hpage_shift = 0;
@@ -325,24 +360,28 @@
if (is_thp)
*is_thp = false;
- pgdp = pgdir + pgd_index(ea);
- pgd = READ_ONCE(*pgdp);
/*
* Always operate on the local stack value. This make sure the
* value don't get updated by a parallel THP split/collapse,
* page fault or a page unmap. The return pte_t * is still not
* stable. So should be checked there for above conditions.
+ * Top level is an exception because it is folded into p4d.
*/
- if (pgd_none(pgd))
+ pgdp = pgdir + pgd_index(ea);
+ p4dp = p4d_offset(pgdp, ea);
+ p4d = READ_ONCE(*p4dp);
+ pdshift = P4D_SHIFT;
+
+ if (p4d_none(p4d))
return NULL;
- if (pgd_is_leaf(pgd)) {
- ret_pte = (pte_t *)pgdp;
+ if (p4d_is_leaf(p4d)) {
+ ret_pte = (pte_t *)p4dp;
goto out;
}
- if (is_hugepd(__hugepd(pgd_val(pgd)))) {
- hpdp = (hugepd_t *)&pgd;
+ if (is_hugepd(__hugepd(p4d_val(p4d)))) {
+ hpdp = (hugepd_t *)&p4d;
goto out_huge;
}
@@ -352,7 +391,7 @@
* irq disabled
*/
pdshift = PUD_SHIFT;
- pudp = pud_offset(&pgd, ea);
+ pudp = pud_offset(&p4d, ea);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index da9f722..079159e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -24,16 +24,31 @@
#include <linux/memblock.h>
#include <linux/slab.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/setup.h>
#include <asm/sections.h>
+#include <asm/early_ioremap.h>
#include <mm/mmu_decl.h>
extern char etext[], _stext[], _sinittext[], _einittext[];
+static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data;
+
+notrace void __init early_ioremap_init(void)
+{
+ unsigned long addr = ALIGN_DOWN(FIXADDR_START, PGDIR_SIZE);
+ pte_t *ptep = (pte_t *)early_fixmap_pagetable;
+ pmd_t *pmdp = pmd_off_k(addr);
+
+ for (; (s32)(FIXADDR_TOP - addr) > 0;
+ addr += PGDIR_SIZE, ptep += PTRS_PER_PTE, pmdp++)
+ pmd_populate_kernel(&init_mm, pmdp, ptep);
+
+ early_ioremap_setup();
+}
+
static void __init *early_alloc_pgtable(unsigned long size)
{
void *ptr = memblock_alloc(size, size);
@@ -45,7 +60,7 @@
return ptr;
}
-static pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
+pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
{
if (pmd_none(*pmdp)) {
pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE);
@@ -63,7 +78,7 @@
int err = -ENOMEM;
/* Use upper 10 bits of VA to index the first level map */
- pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
+ pd = pmd_off_k(va);
/* Use middle 10 bits of VA to index the second-level map */
if (likely(slab_is_available()))
pg = pte_alloc_kernel(pd, va);
@@ -108,60 +123,22 @@
void __init mapin_ram(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg) {
- phys_addr_t base = reg->base;
- phys_addr_t top = min(base + reg->size, total_lowmem);
+ for_each_mem_range(i, &base, &end) {
+ phys_addr_t top = min(end, total_lowmem);
if (base >= top)
continue;
base = mmu_mapin_ram(base, top);
- if (IS_ENABLED(CONFIG_BDI_SWITCH))
- __mapin_ram_chunk(reg->base, top);
- else
- __mapin_ram_chunk(base, top);
+ __mapin_ram_chunk(base, top);
}
}
-/* Scan the real Linux page tables and return a PTE pointer for
- * a virtual address in a context.
- * Returns true (1) if PTE was found, zero otherwise. The pointer to
- * the PTE pointer is unmodified if PTE is not found.
- */
-static int
-get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int retval = 0;
-
- pgd = pgd_offset(mm, addr & PAGE_MASK);
- if (pgd) {
- pud = pud_offset(pgd, addr & PAGE_MASK);
- if (pud && pud_present(*pud)) {
- pmd = pmd_offset(pud, addr & PAGE_MASK);
- if (pmd_present(*pmd)) {
- pte = pte_offset_map(pmd, addr & PAGE_MASK);
- if (pte) {
- retval = 1;
- *ptep = pte;
- if (pmdp)
- *pmdp = pmd;
- /* XXX caller needs to do pte_unmap, yuck */
- }
- }
- }
- }
- return(retval);
-}
-
static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
{
pte_t *kpte;
- pmd_t *kpmd;
unsigned long address;
BUG_ON(PageHighMem(page));
@@ -169,10 +146,10 @@
if (v_block_mapped(address))
return 0;
- if (!get_pteptr(&init_mm, address, &kpte, &kpmd))
+ kpte = virt_to_kpte(address);
+ if (!kpte)
return -EINVAL;
__set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
- pte_unmap(kpte);
return 0;
}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index e78832d..aefc2bf 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -31,11 +31,9 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
-#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
@@ -101,20 +99,22 @@
#ifndef __PAGETABLE_PUD_FOLDED
/* 4 level page table */
-struct page *pgd_page(pgd_t pgd)
+struct page *p4d_page(p4d_t p4d)
{
- if (pgd_is_leaf(pgd)) {
- VM_WARN_ON(!pgd_huge(pgd));
- return pte_page(pgd_pte(pgd));
+ if (p4d_is_leaf(p4d)) {
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!p4d_huge(p4d));
+ return pte_page(p4d_pte(p4d));
}
- return virt_to_page(pgd_page_vaddr(pgd));
+ return virt_to_page(p4d_page_vaddr(p4d));
}
#endif
struct page *pud_page(pud_t pud)
{
if (pud_is_leaf(pud)) {
- VM_WARN_ON(!pud_huge(pud));
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!pud_huge(pud));
return pte_page(pud_pte(pud));
}
return virt_to_page(pud_page_vaddr(pud));
@@ -127,7 +127,13 @@
struct page *pmd_page(pmd_t pmd)
{
if (pmd_is_leaf(pmd)) {
- VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
+ /*
+ * vmalloc_to_page may be called on any vmap address (not only
+ * vmalloc), and it uses pmd_page() etc., when huge vmap is
+ * enabled so these checks can't be used.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ VM_WARN_ON(!(pmd_large(pmd) || pmd_huge(pmd)));
return pte_page(pmd_pte(pmd));
}
return virt_to_page(pmd_page_vaddr(pmd));
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 9e2d8e8..86da2a6 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -5,12 +5,22 @@
*
*/
#include <linux/kernel.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include "ptdump.h"
static const struct flag_info flag_array[] = {
{
+#ifdef CONFIG_PPC_16K_PAGES
+ .mask = _PAGE_HUGE,
+ .val = _PAGE_HUGE,
+#else
+ .mask = _PAGE_SPS,
+ .val = _PAGE_SPS,
+#endif
+ .set = "huge",
+ .clear = " ",
+ }, {
.mask = _PAGE_SH,
.val = 0,
.set = "user",
diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index 4154fea..c4c628b 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -6,67 +6,11 @@
* This dumps the content of BATS
*/
+#include <linux/pgtable.h>
#include <asm/debugfs.h>
-#include <asm/pgtable.h>
#include <asm/cpu_has_feature.h>
-static char *pp_601(int k, int pp)
-{
- if (pp == 0)
- return k ? "NA" : "RWX";
- if (pp == 1)
- return k ? "ROX" : "RWX";
- if (pp == 2)
- return k ? "RWX" : "RWX";
- return k ? "ROX" : "ROX";
-}
-
-static void bat_show_601(struct seq_file *m, int idx, u32 lower, u32 upper)
-{
- u32 blpi = upper & 0xfffe0000;
- u32 k = (upper >> 2) & 3;
- u32 pp = upper & 3;
- phys_addr_t pbn = PHYS_BAT_ADDR(lower);
- u32 bsm = lower & 0x3ff;
- u32 size = (bsm + 1) << 17;
-
- seq_printf(m, "%d: ", idx);
- if (!(lower & 0x40)) {
- seq_puts(m, " -\n");
- return;
- }
-
- seq_printf(m, "0x%08x-0x%08x ", blpi, blpi + size - 1);
-#ifdef CONFIG_PHYS_64BIT
- seq_printf(m, "0x%016llx ", pbn);
-#else
- seq_printf(m, "0x%08x ", pbn);
-#endif
-
- seq_printf(m, "Kernel %s User %s", pp_601(k & 2, pp), pp_601(k & 1, pp));
-
- if (lower & _PAGE_WRITETHRU)
- seq_puts(m, "write through ");
- if (lower & _PAGE_NO_CACHE)
- seq_puts(m, "no cache ");
- if (lower & _PAGE_COHERENT)
- seq_puts(m, "coherent ");
- seq_puts(m, "\n");
-}
-
-#define BAT_SHOW_601(_m, _n, _l, _u) bat_show_601(_m, _n, mfspr(_l), mfspr(_u))
-
-static int bats_show_601(struct seq_file *m, void *v)
-{
- seq_puts(m, "---[ Block Address Translation ]---\n");
-
- BAT_SHOW_601(m, 0, SPRN_IBAT0L, SPRN_IBAT0U);
- BAT_SHOW_601(m, 1, SPRN_IBAT1L, SPRN_IBAT1U);
- BAT_SHOW_601(m, 2, SPRN_IBAT2L, SPRN_IBAT2U);
- BAT_SHOW_601(m, 3, SPRN_IBAT3L, SPRN_IBAT3U);
-
- return 0;
-}
+#include "ptdump.h"
static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool is_d)
{
@@ -88,6 +32,7 @@
#else
seq_printf(m, "0x%08x ", brpn);
#endif
+ pt_dump_size(m, size);
if (k == 1)
seq_puts(m, "User ");
@@ -97,20 +42,16 @@
seq_puts(m, "Kernel/User ");
if (lower & BPP_RX)
- seq_puts(m, is_d ? "RO " : "EXEC ");
+ seq_puts(m, is_d ? "r " : " x ");
else if (lower & BPP_RW)
- seq_puts(m, is_d ? "RW " : "EXEC ");
+ seq_puts(m, is_d ? "rw " : " x ");
else
- seq_puts(m, is_d ? "NA " : "NX ");
+ seq_puts(m, is_d ? " " : " ");
- if (lower & _PAGE_WRITETHRU)
- seq_puts(m, "write through ");
- if (lower & _PAGE_NO_CACHE)
- seq_puts(m, "no cache ");
- if (lower & _PAGE_COHERENT)
- seq_puts(m, "coherent ");
- if (lower & _PAGE_GUARDED)
- seq_puts(m, "guarded ");
+ seq_puts(m, lower & _PAGE_WRITETHRU ? "w " : " ");
+ seq_puts(m, lower & _PAGE_NO_CACHE ? "i " : " ");
+ seq_puts(m, lower & _PAGE_COHERENT ? "m " : " ");
+ seq_puts(m, lower & _PAGE_GUARDED ? "g " : " ");
seq_puts(m, "\n");
}
@@ -149,9 +90,6 @@
static int bats_open(struct inode *inode, struct file *file)
{
- if (IS_ENABLED(CONFIG_PPC_BOOK3S_601))
- return single_open(file, bats_show_601, NULL);
-
return single_open(file, bats_show_603, NULL);
}
@@ -164,10 +102,8 @@
static int __init bats_init(void)
{
- struct dentry *debugfs_file;
-
- debugfs_file = debugfs_create_file("block_address_translation", 0400,
- powerpc_debugfs_root, NULL, &bats_fops);
- return debugfs_file ? 0 : -ENOMEM;
+ debugfs_create_file("block_address_translation", 0400,
+ powerpc_debugfs_root, NULL, &bats_fops);
+ return 0;
}
device_initcall(bats_init);
diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c
index 0dfca72..14f7386 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -5,7 +5,7 @@
*
*/
#include <linux/kernel.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include "ptdump.h"
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c
index a2e8c3b..ad6df9a 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -15,13 +15,12 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
-#include <asm/pgtable.h>
#include <linux/const.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/plpar_wrappers.h>
#include <linux/memblock.h>
#include <asm/firmware.h>
+#include <asm/pgalloc.h>
struct pg_state {
struct seq_file *seq;
@@ -417,9 +416,9 @@
}
}
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
{
- pud_t *pud = pud_offset(pgd, 0);
+ pud_t *pud = pud_offset(p4d, 0);
unsigned long addr;
unsigned int i;
@@ -431,6 +430,20 @@
}
}
+static void walk_p4d(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+ p4d_t *p4d = p4d_offset(pgd, 0);
+ unsigned long addr;
+ unsigned int i;
+
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+ addr = start + i * P4D_SIZE;
+ if (!p4d_none(*p4d))
+ /* p4d exists */
+ walk_pud(st, p4d, addr);
+ }
+}
+
static void walk_pagetables(struct pg_state *st)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -445,7 +458,7 @@
addr = KERN_VIRT_START + i * PGDIR_SIZE;
if (!pgd_none(*pgd))
/* pgd exists */
- walk_pud(st, pgd, addr);
+ walk_p4d(st, pgd, addr);
}
}
@@ -527,13 +540,10 @@
static int ptdump_init(void)
{
- struct dentry *debugfs_file;
-
if (!radix_enabled()) {
populate_markers();
- debugfs_file = debugfs_create_file("kernel_hash_pagetable",
- 0400, NULL, NULL, &ptdump_fops);
- return debugfs_file ? 0 : -ENOMEM;
+ debugfs_create_file("kernel_hash_pagetable", 0400, NULL, NULL,
+ &ptdump_fops);
}
return 0;
}
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 633711b..aca354f 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -19,10 +19,11 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
-#include <asm/pgtable.h>
#include <linux/const.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
+#include <asm/hugetlb.h>
+
+#include <mm/mmu_decl.h>
#include "ptdump.h"
@@ -72,6 +73,10 @@
static struct addr_marker address_markers[] = {
{ 0, "Start of kernel VM" },
+#ifdef MODULES_VADDR
+ { 0, "modules start" },
+ { 0, "modules end" },
+#endif
{ 0, "vmalloc() Area" },
{ 0, "vmalloc() End" },
#ifdef CONFIG_PPC64
@@ -111,6 +116,19 @@
seq_putc(m, c); \
})
+void pt_dump_size(struct seq_file *m, unsigned long size)
+{
+ static const char units[] = "KMGTPE";
+ const char *unit = units;
+
+ /* Work out what appropriate unit to use */
+ while (!(size & 1023) && unit[1]) {
+ size >>= 10;
+ unit++;
+ }
+ pt_dump_seq_printf(m, "%9lu%c ", size, *unit);
+}
+
static void dump_flag_info(struct pg_state *st, const struct flag_info
*flag, u64 pte, int num)
{
@@ -145,8 +163,6 @@
static void dump_addr(struct pg_state *st, unsigned long addr)
{
- static const char units[] = "KMGTPE";
- const char *unit = units;
unsigned long delta;
#ifdef CONFIG_PPC64
@@ -163,13 +179,7 @@
pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa);
delta = (addr - st->start_address) >> 10;
}
- /* Work out what appropriate unit to use */
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
- }
- pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit);
-
+ pt_dump_size(st->seq, delta);
}
static void note_prot_wx(struct pg_state *st, unsigned long addr)
@@ -188,6 +198,24 @@
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
+static void note_page_update_state(struct pg_state *st, unsigned long addr,
+ unsigned int level, u64 val, unsigned long page_size)
+{
+ u64 flag = val & pg_level[level].mask;
+ u64 pa = val & PTE_RPN_MASK;
+
+ st->level = level;
+ st->current_flags = flag;
+ st->start_address = addr;
+ st->start_pa = pa;
+ st->page_size = page_size;
+
+ while (addr >= st->marker[1].start_address) {
+ st->marker++;
+ pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ }
+}
+
static void note_page(struct pg_state *st, unsigned long addr,
unsigned int level, u64 val, unsigned long page_size)
{
@@ -196,13 +224,8 @@
/* At first no level is set */
if (!st->level) {
- st->level = level;
- st->current_flags = flag;
- st->start_address = addr;
- st->start_pa = pa;
- st->last_pa = pa;
- st->page_size = page_size;
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+ note_page_update_state(st, addr, level, val, page_size);
/*
* Dump the section of virtual memory when:
* - the PTE flags from one entry to the next differs.
@@ -234,19 +257,9 @@
* Address indicates we have passed the end of the
* current section of virtual memory
*/
- while (addr >= st->marker[1].start_address) {
- st->marker++;
- pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
- }
- st->start_address = addr;
- st->start_pa = pa;
- st->last_pa = pa;
- st->page_size = page_size;
- st->current_flags = flag;
- st->level = level;
- } else {
- st->last_pa = pa;
+ note_page_update_state(st, addr, level, val, page_size);
}
+ st->last_pa = pa;
}
static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
@@ -262,6 +275,26 @@
}
}
+static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start,
+ int pdshift, int level)
+{
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+ unsigned int i;
+ int shift = hugepd_shift(*phpd);
+ int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1;
+
+ if (start & ((1 << shift) - 1))
+ return;
+
+ for (i = 0; i < ptrs_per_hpd; i++) {
+ unsigned long addr = start + (i << shift);
+ pte_t *pte = hugepte_offset(*phpd, addr, pdshift);
+
+ note_page(st, addr, level + 1, pte_val(*pte), 1 << shift);
+ }
+#endif
+}
+
static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
{
pmd_t *pmd = pmd_offset(pud, 0);
@@ -278,9 +311,9 @@
}
}
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start)
{
- pud_t *pud = pud_offset(pgd, 0);
+ pud_t *pud = pud_offset(p4d, 0);
unsigned long addr;
unsigned int i;
@@ -305,11 +338,15 @@
* the hash pagetable.
*/
for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) {
- if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd))
- /* pgd exists */
- walk_pud(st, pgd, addr);
+ p4d_t *p4d = p4d_offset(pgd, 0);
+
+ if (p4d_none(*p4d) || p4d_is_leaf(*p4d))
+ note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE);
+ else if (is_hugepd(__hugepd(p4d_val(*p4d))))
+ walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1);
else
- note_page(st, addr, 1, pgd_val(*pgd), PGDIR_SIZE);
+ /* p4d exists */
+ walk_pud(st, p4d, addr);
}
}
@@ -317,7 +354,15 @@
{
int i = 0;
+#ifdef CONFIG_PPC64
address_markers[i++].start_address = PAGE_OFFSET;
+#else
+ address_markers[i++].start_address = TASK_SIZE;
+#endif
+#ifdef MODULES_VADDR
+ address_markers[i++].start_address = MODULES_VADDR;
+ address_markers[i++].start_address = MODULES_END;
+#endif
address_markers[i++].start_address = VMALLOC_START;
address_markers[i++].start_address = VMALLOC_END;
#ifdef CONFIG_PPC64
@@ -354,7 +399,7 @@
struct pg_state st = {
.seq = m,
.marker = address_markers,
- .start_address = PAGE_OFFSET,
+ .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
};
#ifdef CONFIG_PPC64
@@ -398,7 +443,7 @@
.seq = NULL,
.marker = address_markers,
.check_wx = true,
- .start_address = PAGE_OFFSET,
+ .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE,
};
#ifdef CONFIG_PPC64
@@ -418,12 +463,10 @@
static int ptdump_init(void)
{
- struct dentry *debugfs_file;
-
populate_markers();
build_pgtable_complete_mask();
- debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL,
- NULL, &ptdump_fops);
- return debugfs_file ? 0 : -ENOMEM;
+ debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
+ &ptdump_fops);
+ return 0;
}
device_initcall(ptdump_init);
diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h
index 5d51363..154efae 100644
--- a/arch/powerpc/mm/ptdump/ptdump.h
+++ b/arch/powerpc/mm/ptdump/ptdump.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/types.h>
+#include <linux/seq_file.h>
struct flag_info {
u64 mask;
@@ -17,3 +18,5 @@
};
extern struct pgtable_level pg_level[5];
+
+void pt_dump_size(struct seq_file *m, unsigned long delta);
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c
index 5018436..dde2fe8 100644
--- a/arch/powerpc/mm/ptdump/segment_regs.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -55,10 +55,8 @@
static int __init sr_init(void)
{
- struct dentry *debugfs_file;
-
- debugfs_file = debugfs_create_file("segment_registers", 0400,
- powerpc_debugfs_root, NULL, &sr_fops);
- return debugfs_file ? 0 : -ENOMEM;
+ debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root,
+ NULL, &sr_fops);
+ return 0;
}
device_initcall(sr_init);
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index 784f8df..c005fe0 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -5,7 +5,7 @@
*
*/
#include <linux/kernel.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include "ptdump.h"
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index dffe1a4..82b45b1 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -478,7 +478,7 @@
/* If hint, make sure it matches our alignment restrictions */
if (!fixed && addr) {
- addr = _ALIGN_UP(addr, page_size);
+ addr = ALIGN(addr, page_size);
slice_dbg(" aligned addr=%lx\n", addr);
/* Ignore hint if it's too large or overlaps a VMA */
if (addr > high_limit - len || addr < mmap_min_addr ||