Update Linux to v5.10.109

Sourced from [1]

[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz

Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 849c1df..5ead3c3 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -4,10 +4,11 @@
ioremap.o mmap.o pgd.o mmu.o \
context.o proc.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_ARM64_PTDUMP_CORE) += dump.o
-obj-$(CONFIG_ARM64_PTDUMP_DEBUGFS) += ptdump_debugfs.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
+obj-$(CONFIG_ARM64_MTE) += mteswap.o
KASAN_SANITIZE_physaddr.o += n
obj-$(CONFIG_KASAN) += kasan_init.o
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index db767b0..2d881f3 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -24,7 +24,7 @@
* - start - virtual start address of region
* - end - virtual end address of region
*/
-ENTRY(__flush_icache_range)
+SYM_FUNC_START(__flush_icache_range)
/* FALLTHROUGH */
/*
@@ -37,7 +37,7 @@
* - start - virtual start address of region
* - end - virtual end address of region
*/
-ENTRY(__flush_cache_user_range)
+SYM_FUNC_START(__flush_cache_user_range)
uaccess_ttbr0_enable x2, x3, x4
alternative_if ARM64_HAS_CACHE_IDC
dsb ishst
@@ -66,8 +66,8 @@
9:
mov x0, #-EFAULT
b 1b
-ENDPROC(__flush_icache_range)
-ENDPROC(__flush_cache_user_range)
+SYM_FUNC_END(__flush_icache_range)
+SYM_FUNC_END(__flush_cache_user_range)
/*
* invalidate_icache_range(start,end)
@@ -77,7 +77,7 @@
* - start - virtual start address of region
* - end - virtual end address of region
*/
-ENTRY(invalidate_icache_range)
+SYM_FUNC_START(invalidate_icache_range)
alternative_if ARM64_HAS_CACHE_DIC
mov x0, xzr
isb
@@ -94,7 +94,7 @@
2:
mov x0, #-EFAULT
b 1b
-ENDPROC(invalidate_icache_range)
+SYM_FUNC_END(invalidate_icache_range)
/*
* __flush_dcache_area(kaddr, size)
@@ -105,10 +105,10 @@
* - kaddr - kernel address
* - size - size in question
*/
-ENTRY(__flush_dcache_area)
+SYM_FUNC_START_PI(__flush_dcache_area)
dcache_by_line_op civac, sy, x0, x1, x2, x3
ret
-ENDPIPROC(__flush_dcache_area)
+SYM_FUNC_END_PI(__flush_dcache_area)
/*
* __clean_dcache_area_pou(kaddr, size)
@@ -119,14 +119,14 @@
* - kaddr - kernel address
* - size - size in question
*/
-ENTRY(__clean_dcache_area_pou)
+SYM_FUNC_START(__clean_dcache_area_pou)
alternative_if ARM64_HAS_CACHE_IDC
dsb ishst
ret
alternative_else_nop_endif
dcache_by_line_op cvau, ish, x0, x1, x2, x3
ret
-ENDPROC(__clean_dcache_area_pou)
+SYM_FUNC_END(__clean_dcache_area_pou)
/*
* __inval_dcache_area(kaddr, size)
@@ -138,7 +138,8 @@
* - kaddr - kernel address
* - size - size in question
*/
-ENTRY(__inval_dcache_area)
+SYM_FUNC_START_LOCAL(__dma_inv_area)
+SYM_FUNC_START_PI(__inval_dcache_area)
/* FALLTHROUGH */
/*
@@ -146,7 +147,6 @@
* - start - virtual start address of region
* - size - size in question
*/
-__dma_inv_area:
add x1, x1, x0
dcache_line_size x2, x3
sub x3, x2, #1
@@ -165,8 +165,8 @@
b.lo 2b
dsb sy
ret
-ENDPIPROC(__inval_dcache_area)
-ENDPROC(__dma_inv_area)
+SYM_FUNC_END_PI(__inval_dcache_area)
+SYM_FUNC_END(__dma_inv_area)
/*
* __clean_dcache_area_poc(kaddr, size)
@@ -177,7 +177,8 @@
* - kaddr - kernel address
* - size - size in question
*/
-ENTRY(__clean_dcache_area_poc)
+SYM_FUNC_START_LOCAL(__dma_clean_area)
+SYM_FUNC_START_PI(__clean_dcache_area_poc)
/* FALLTHROUGH */
/*
@@ -185,11 +186,10 @@
* - start - virtual start address of region
* - size - size in question
*/
-__dma_clean_area:
dcache_by_line_op cvac, sy, x0, x1, x2, x3
ret
-ENDPIPROC(__clean_dcache_area_poc)
-ENDPROC(__dma_clean_area)
+SYM_FUNC_END_PI(__clean_dcache_area_poc)
+SYM_FUNC_END(__dma_clean_area)
/*
* __clean_dcache_area_pop(kaddr, size)
@@ -200,13 +200,13 @@
* - kaddr - kernel address
* - size - size in question
*/
-ENTRY(__clean_dcache_area_pop)
+SYM_FUNC_START_PI(__clean_dcache_area_pop)
alternative_if_not ARM64_HAS_DCPOP
b __clean_dcache_area_poc
alternative_else_nop_endif
dcache_by_line_op cvap, sy, x0, x1, x2, x3
ret
-ENDPIPROC(__clean_dcache_area_pop)
+SYM_FUNC_END_PI(__clean_dcache_area_pop)
/*
* __dma_flush_area(start, size)
@@ -216,10 +216,10 @@
* - start - virtual start address of region
* - size - size in question
*/
-ENTRY(__dma_flush_area)
+SYM_FUNC_START_PI(__dma_flush_area)
dcache_by_line_op civac, sy, x0, x1, x2, x3
ret
-ENDPIPROC(__dma_flush_area)
+SYM_FUNC_END_PI(__dma_flush_area)
/*
* __dma_map_area(start, size, dir)
@@ -227,11 +227,11 @@
* - size - size of region
* - dir - DMA direction
*/
-ENTRY(__dma_map_area)
+SYM_FUNC_START_PI(__dma_map_area)
cmp w2, #DMA_FROM_DEVICE
b.eq __dma_inv_area
b __dma_clean_area
-ENDPIPROC(__dma_map_area)
+SYM_FUNC_END_PI(__dma_map_area)
/*
* __dma_unmap_area(start, size, dir)
@@ -239,8 +239,8 @@
* - size - size of region
* - dir - DMA direction
*/
-ENTRY(__dma_unmap_area)
+SYM_FUNC_START_PI(__dma_unmap_area)
cmp w2, #DMA_TO_DEVICE
b.ne __dma_inv_area
ret
-ENDPIPROC(__dma_unmap_area)
+SYM_FUNC_END_PI(__dma_unmap_area)
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index b5e329f..001737a 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -6,6 +6,7 @@
* Copyright (C) 2012 ARM Ltd.
*/
+#include <linux/bitfield.h>
#include <linux/bitops.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -26,18 +27,16 @@
static DEFINE_PER_CPU(u64, reserved_asids);
static cpumask_t tlb_flush_pending;
+static unsigned long max_pinned_asids;
+static unsigned long nr_pinned_asids;
+static unsigned long *pinned_asid_map;
+
#define ASID_MASK (~GENMASK(asid_bits - 1, 0))
#define ASID_FIRST_VERSION (1UL << asid_bits)
-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-#define NUM_USER_ASIDS (ASID_FIRST_VERSION >> 1)
-#define asid2idx(asid) (((asid) & ~ASID_MASK) >> 1)
-#define idx2asid(idx) (((idx) << 1) & ~ASID_MASK)
-#else
-#define NUM_USER_ASIDS (ASID_FIRST_VERSION)
+#define NUM_USER_ASIDS ASID_FIRST_VERSION
#define asid2idx(asid) ((asid) & ~ASID_MASK)
#define idx2asid(idx) asid2idx(idx)
-#endif
/* Get the ASIDBits supported by the current CPU */
static u32 get_cpu_asid_bits(void)
@@ -50,7 +49,7 @@
default:
pr_warn("CPU%d: Unknown ASID size (%d); assuming 8-bit\n",
smp_processor_id(), fld);
- /* Fallthrough */
+ fallthrough;
case 0:
asid = 8;
break;
@@ -77,13 +76,38 @@
}
}
+static void set_kpti_asid_bits(unsigned long *map)
+{
+ unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(unsigned long);
+ /*
+ * In case of KPTI kernel/user ASIDs are allocated in
+ * pairs, the bottom bit distinguishes the two: if it
+ * is set, then the ASID will map only userspace. Thus
+ * mark even as reserved for kernel.
+ */
+ memset(map, 0xaa, len);
+}
+
+static void set_reserved_asid_bits(void)
+{
+ if (pinned_asid_map)
+ bitmap_copy(asid_map, pinned_asid_map, NUM_USER_ASIDS);
+ else if (arm64_kernel_unmapped_at_el0())
+ set_kpti_asid_bits(asid_map);
+ else
+ bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+}
+
+#define asid_gen_match(asid) \
+ (!(((asid) ^ atomic64_read(&asid_generation)) >> asid_bits))
+
static void flush_context(void)
{
int i;
u64 asid;
/* Update the list of reserved ASIDs and the ASID bitmap. */
- bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+ set_reserved_asid_bits();
for_each_possible_cpu(i) {
asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);
@@ -148,6 +172,14 @@
return newasid;
/*
+ * If it is pinned, we can keep using it. Note that reserved
+ * takes priority, because even if it is also pinned, we need to
+ * update the generation into the reserved_asids.
+ */
+ if (refcount_read(&mm->context.pinned))
+ return newasid;
+
+ /*
* We had a valid ASID in a previous life, so try to re-use
* it if possible.
*/
@@ -180,9 +212,10 @@
return idx2asid(asid) | generation;
}
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
{
unsigned long flags;
+ unsigned int cpu;
u64 asid, old_active_asid;
if (system_supports_cnp())
@@ -204,25 +237,25 @@
* relaxed xchg in flush_context will treat us as reserved
* because atomic RmWs are totally ordered for a given location.
*/
- old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
- if (old_active_asid &&
- !((asid ^ atomic64_read(&asid_generation)) >> asid_bits) &&
- atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+ old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
+ if (old_active_asid && asid_gen_match(asid) &&
+ atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
old_active_asid, asid))
goto switch_mm_fastpath;
raw_spin_lock_irqsave(&cpu_asid_lock, flags);
/* Check that our ASID belongs to the current generation. */
asid = atomic64_read(&mm->context.id);
- if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) {
+ if (!asid_gen_match(asid)) {
asid = new_context(mm);
atomic64_set(&mm->context.id, asid);
}
+ cpu = smp_processor_id();
if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
local_flush_tlb_all();
- atomic64_set(&per_cpu(active_asids, cpu), asid);
+ atomic64_set(this_cpu_ptr(&active_asids), asid);
raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
switch_mm_fastpath:
@@ -237,23 +270,137 @@
cpu_switch_mm(mm->pgd, mm);
}
+unsigned long arm64_mm_context_get(struct mm_struct *mm)
+{
+ unsigned long flags;
+ u64 asid;
+
+ if (!pinned_asid_map)
+ return 0;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+
+ asid = atomic64_read(&mm->context.id);
+
+ if (refcount_inc_not_zero(&mm->context.pinned))
+ goto out_unlock;
+
+ if (nr_pinned_asids >= max_pinned_asids) {
+ asid = 0;
+ goto out_unlock;
+ }
+
+ if (!asid_gen_match(asid)) {
+ /*
+ * We went through one or more rollover since that ASID was
+ * used. Ensure that it is still valid, or generate a new one.
+ */
+ asid = new_context(mm);
+ atomic64_set(&mm->context.id, asid);
+ }
+
+ nr_pinned_asids++;
+ __set_bit(asid2idx(asid), pinned_asid_map);
+ refcount_set(&mm->context.pinned, 1);
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+
+ asid &= ~ASID_MASK;
+
+ /* Set the equivalent of USER_ASID_BIT */
+ if (asid && arm64_kernel_unmapped_at_el0())
+ asid |= 1;
+
+ return asid;
+}
+EXPORT_SYMBOL_GPL(arm64_mm_context_get);
+
+void arm64_mm_context_put(struct mm_struct *mm)
+{
+ unsigned long flags;
+ u64 asid = atomic64_read(&mm->context.id);
+
+ if (!pinned_asid_map)
+ return;
+
+ raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+
+ if (refcount_dec_and_test(&mm->context.pinned)) {
+ __clear_bit(asid2idx(asid), pinned_asid_map);
+ nr_pinned_asids--;
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+}
+EXPORT_SYMBOL_GPL(arm64_mm_context_put);
+
/* Errata workaround post TTBRx_EL1 update. */
asmlinkage void post_ttbr_update_workaround(void)
{
+ if (!IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456))
+ return;
+
asm(ALTERNATIVE("nop; nop; nop",
"ic iallu; dsb nsh; isb",
- ARM64_WORKAROUND_CAVIUM_27456,
- CONFIG_CAVIUM_ERRATUM_27456));
+ ARM64_WORKAROUND_CAVIUM_27456));
}
-static int asids_init(void)
+void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm)
{
- asid_bits = get_cpu_asid_bits();
+ unsigned long ttbr1 = read_sysreg(ttbr1_el1);
+ unsigned long asid = ASID(mm);
+ unsigned long ttbr0 = phys_to_ttbr(pgd_phys);
+
+ /* Skip CNP for the reserved ASID */
+ if (system_supports_cnp() && asid)
+ ttbr0 |= TTBR_CNP_BIT;
+
+ /* SW PAN needs a copy of the ASID in TTBR0 for entry */
+ if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN))
+ ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+
+ /* Set ASID in TTBR1 since TCR.A1 is set */
+ ttbr1 &= ~TTBR_ASID_MASK;
+ ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+
+ write_sysreg(ttbr1, ttbr1_el1);
+ isb();
+ write_sysreg(ttbr0, ttbr0_el1);
+ isb();
+ post_ttbr_update_workaround();
+}
+
+static int asids_update_limit(void)
+{
+ unsigned long num_available_asids = NUM_USER_ASIDS;
+
+ if (arm64_kernel_unmapped_at_el0()) {
+ num_available_asids /= 2;
+ if (pinned_asid_map)
+ set_kpti_asid_bits(pinned_asid_map);
+ }
/*
* Expect allocation after rollover to fail if we don't have at least
* one more ASID than CPUs. ASID #0 is reserved for init_mm.
*/
- WARN_ON(NUM_USER_ASIDS - 1 <= num_possible_cpus());
+ WARN_ON(num_available_asids - 1 <= num_possible_cpus());
+ pr_info("ASID allocator initialised with %lu entries\n",
+ num_available_asids);
+
+ /*
+ * There must always be an ASID available after rollover. Ensure that,
+ * even if all CPUs have a reserved ASID and the maximum number of ASIDs
+ * are pinned, there still is at least one empty slot in the ASID map.
+ */
+ max_pinned_asids = num_available_asids - num_possible_cpus() - 2;
+ return 0;
+}
+arch_initcall(asids_update_limit);
+
+static int asids_init(void)
+{
+ asid_bits = get_cpu_asid_bits();
atomic64_set(&asid_generation, ASID_FIRST_VERSION);
asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS), sizeof(*asid_map),
GFP_KERNEL);
@@ -261,7 +408,17 @@
panic("Failed to allocate bitmap for %lu ASIDs\n",
NUM_USER_ASIDS);
- pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS);
+ pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS),
+ sizeof(*pinned_asid_map), GFP_KERNEL);
+ nr_pinned_asids = 0;
+
+ /*
+ * We cannot call set_reserved_asid_bits() here because CPU
+ * caps are not finalized yet, so it is safer to assume KPTI
+ * and reserve kernel ASID's from beginning.
+ */
+ if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0))
+ set_kpti_asid_bits(asid_map);
return 0;
}
early_initcall(asids_init);
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 2ee7b73..70a71f3 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -6,21 +6,32 @@
* Copyright (C) 2012 ARM Ltd.
*/
+#include <linux/bitops.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/mte.h>
-void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr)
+void copy_highpage(struct page *to, struct page *from)
{
- struct page *page = virt_to_page(kto);
+ struct page *kto = page_address(to);
+ struct page *kfrom = page_address(from);
+
copy_page(kto, kfrom);
- flush_dcache_page(page);
-}
-EXPORT_SYMBOL_GPL(__cpu_copy_user_page);
-void __cpu_clear_user_page(void *kaddr, unsigned long vaddr)
-{
- clear_page(kaddr);
+ if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
+ set_bit(PG_mte_tagged, &to->flags);
+ mte_copy_page_tags(kto, kfrom);
+ }
}
-EXPORT_SYMBOL_GPL(__cpu_clear_user_page);
+EXPORT_SYMBOL(copy_highpage);
+
+void copy_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ copy_highpage(to, from);
+ flush_dcache_page(to);
+}
+EXPORT_SYMBOL_GPL(copy_user_highpage);
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 9239416..93e87b2 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -6,21 +6,21 @@
#include <linux/gfp.h>
#include <linux/cache.h>
-#include <linux/dma-noncoherent.h>
+#include <linux/dma-map-ops.h>
#include <linux/dma-iommu.h>
#include <xen/xen.h>
#include <xen/swiotlb-xen.h>
#include <asm/cacheflush.h>
-void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr,
- size_t size, enum dma_data_direction dir)
+void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
{
__dma_map_area(phys_to_virt(paddr), size, dir);
}
-void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
- size_t size, enum dma_data_direction dir)
+void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+ enum dma_data_direction dir)
{
__dma_unmap_area(phys_to_virt(paddr), size, dir);
}
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 81e694a..aa00601 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -11,8 +11,12 @@
const struct exception_table_entry *fixup;
fixup = search_exception_tables(instruction_pointer(regs));
- if (fixup)
- regs->pc = (unsigned long)&fixup->fixup + fixup->fixup;
+ if (!fixup)
+ return 0;
- return fixup != NULL;
+ if (in_bpf_jit(regs))
+ return arm64_bpf_fixup_exception(fixup, regs);
+
+ regs->pc = (unsigned long)&fixup->fixup + fixup->fixup;
+ return 1;
}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2a7339a..795d224 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -32,10 +32,10 @@
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
-#include <asm/kasan.h>
+#include <asm/kprobes.h>
+#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
@@ -101,18 +101,6 @@
data_abort_decode(esr);
}
-static inline bool is_ttbr0_addr(unsigned long addr)
-{
- /* entry assembly clears tags for TTBR0 addrs */
- return addr < TASK_SIZE;
-}
-
-static inline bool is_ttbr1_addr(unsigned long addr)
-{
- /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
- return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
-}
-
static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
/* Either init_pg_dir or swapper_pg_dir */
@@ -156,6 +144,7 @@
pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
do {
+ p4d_t *p4dp, p4d;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep, pte;
@@ -163,7 +152,13 @@
if (pgd_none(pgd) || pgd_bad(pgd))
break;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ pr_cont(", p4d=%016llx", p4d_val(p4d));
+ if (p4d_none(p4d) || p4d_bad(p4d))
+ break;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
pr_cont(", pud=%016llx", pud_val(pud));
if (pud_none(pud) || pud_bad(pud))
@@ -223,7 +218,9 @@
pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
} while (pteval != old_pteval);
- flush_tlb_fix_spurious_fault(vma, address);
+ /* Invalidate a stale read-only entry */
+ if (dirty)
+ flush_tlb_page(vma, address);
return 1;
}
@@ -265,7 +262,7 @@
local_irq_save(flags);
asm volatile("at s1e1r, %0" :: "r" (addr));
isb();
- par = read_sysreg(par_el1);
+ par = read_sysreg_par();
local_irq_restore(flags);
/*
@@ -318,6 +315,8 @@
if (is_el1_permission_fault(addr, esr, regs)) {
if (esr & ESR_ELx_WNR)
msg = "write to read-only memory";
+ else if (is_el1_instruction_abort(esr))
+ msg = "execute from non-executable memory";
else
msg = "read from unreadable memory";
} else if (addr < PAGE_SIZE) {
@@ -407,7 +406,8 @@
#define VM_FAULT_BADACCESS 0x020000
static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
- unsigned int mm_flags, unsigned long vm_flags)
+ unsigned int mm_flags, unsigned long vm_flags,
+ struct pt_regs *regs)
{
struct vm_area_struct *vma = find_vma(mm, addr);
@@ -431,7 +431,7 @@
*/
if (!(vma->vm_flags & vm_flags))
return VM_FAULT_BADACCESS;
- return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
+ return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs);
}
static bool is_el0_instruction_abort(unsigned int esr)
@@ -453,9 +453,9 @@
{
const struct fault_info *inf;
struct mm_struct *mm = current->mm;
- vm_fault_t fault, major = 0;
- unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
- unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ vm_fault_t fault;
+ unsigned long vm_flags = VM_ACCESS_FLAGS;
+ unsigned int mm_flags = FAULT_FLAG_DEFAULT;
if (kprobe_page_fault(regs, esr))
return 0;
@@ -500,11 +500,11 @@
* validly references user space from well defined areas of the code,
* we can bug out early if this is from code which shouldn't.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!mmap_read_trylock(mm)) {
if (!user_mode(regs) && !search_exception_tables(regs->pc))
goto no_context;
retry:
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
} else {
/*
* The above down_read_trylock() might have succeeded in which
@@ -513,63 +513,35 @@
might_sleep();
#ifdef CONFIG_DEBUG_VM
if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
goto no_context;
}
#endif
}
- fault = __do_page_fault(mm, addr, mm_flags, vm_flags);
- major |= fault & VM_FAULT_MAJOR;
+ fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ goto no_context;
+ return 0;
+ }
if (fault & VM_FAULT_RETRY) {
- /*
- * If we need to retry but a fatal signal is pending,
- * handle the signal first. We do not need to release
- * the mmap_sem because it would already be released
- * in __lock_page_or_retry in mm/filemap.c.
- */
- if (fatal_signal_pending(current)) {
- if (!user_mode(regs))
- goto no_context;
- return 0;
- }
-
- /*
- * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
- * starvation.
- */
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
- mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
mm_flags |= FAULT_FLAG_TRIED;
goto retry;
}
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* Handle the "normal" (no error) case first.
*/
if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
- VM_FAULT_BADACCESS)))) {
- /*
- * Major/minor page fault accounting is only done
- * once. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at
- * that point.
- */
- if (major) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
- addr);
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
- addr);
- }
-
+ VM_FAULT_BADACCESS))))
return 0;
- }
/*
* If we are in kernel mode at this point, we have no context to
@@ -671,6 +643,13 @@
return 0;
}
+static int do_tag_check_fault(unsigned long addr, unsigned int esr,
+ struct pt_regs *regs)
+{
+ do_bad_area(addr, esr, regs);
+ return 0;
+}
+
static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
@@ -689,7 +668,7 @@
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
- { do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
+ { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
@@ -738,8 +717,7 @@
{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
};
-asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
- struct pt_regs *regs)
+void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_fault_info(esr);
@@ -755,43 +733,21 @@
arm64_notify_die(inf->name, regs,
inf->sig, inf->code, (void __user *)addr, esr);
}
+NOKPROBE_SYMBOL(do_mem_abort);
-asmlinkage void __exception do_el0_irq_bp_hardening(void)
+void do_el0_irq_bp_hardening(void)
{
/* PC has already been checked in entry.S */
arm64_apply_bp_hardening();
}
+NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
-asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
- unsigned int esr,
- struct pt_regs *regs)
+void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
{
- /*
- * We've taken an instruction abort from userspace and not yet
- * re-enabled IRQs. If the address is a kernel address, apply
- * BP hardening prior to enabling IRQs and pre-emption.
- */
- if (!is_ttbr0_addr(addr))
- arm64_apply_bp_hardening();
-
- local_daif_restore(DAIF_PROCCTX);
- do_mem_abort(addr, esr, regs);
-}
-
-
-asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
- unsigned int esr,
- struct pt_regs *regs)
-{
- if (user_mode(regs)) {
- if (!is_ttbr0_addr(instruction_pointer(regs)))
- arm64_apply_bp_hardening();
- local_daif_restore(DAIF_PROCCTX);
- }
-
arm64_notify_die("SP/PC alignment exception", regs,
SIGBUS, BUS_ADRALN, (void __user *)addr, esr);
}
+NOKPROBE_SYMBOL(do_sp_pc_abort);
int __init early_brk64(unsigned long addr, unsigned int esr,
struct pt_regs *regs);
@@ -833,25 +789,6 @@
*/
static void debug_exception_enter(struct pt_regs *regs)
{
- /*
- * Tell lockdep we disabled irqs in entry.S. Do nothing if they were
- * already disabled to preserve the last enabled/disabled addresses.
- */
- if (interrupts_enabled(regs))
- trace_hardirqs_off();
-
- if (user_mode(regs)) {
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
- } else {
- /*
- * We might have interrupted pretty much anything. In
- * fact, if we're a debug exception, we can even interrupt
- * NMI processing. We don't want this code makes in_nmi()
- * to return true, but we need to notify RCU.
- */
- rcu_nmi_enter();
- }
-
preempt_disable();
/* This code is a bit fragile. Test it. */
@@ -862,20 +799,13 @@
static void debug_exception_exit(struct pt_regs *regs)
{
preempt_enable_no_resched();
-
- if (!user_mode(regs))
- rcu_nmi_exit();
-
- if (interrupts_enabled(regs))
- trace_hardirqs_on();
}
NOKPROBE_SYMBOL(debug_exception_exit);
#ifdef CONFIG_ARM64_ERRATUM_1463225
DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
{
if (user_mode(regs))
return 0;
@@ -894,16 +824,15 @@
return 1;
}
#else
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
{
return 0;
}
#endif /* CONFIG_ARM64_ERRATUM_1463225 */
+NOKPROBE_SYMBOL(cortex_a76_erratum_1463225_debug_handler);
-asmlinkage void __exception do_debug_exception(unsigned long addr_if_watchpoint,
- unsigned int esr,
- struct pt_regs *regs)
+void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
+ struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_debug_fault_info(esr);
unsigned long pc = instruction_pointer(regs);
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index ac48516..6d44c02 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -55,8 +55,10 @@
{
struct page *page = pte_page(pte);
- if (!test_and_set_bit(PG_dcache_clean, &page->flags))
+ if (!test_bit(PG_dcache_clean, &page->flags)) {
sync_icache_aliases(page_address(page), page_size(page));
+ set_bit(PG_dcache_clean, &page->flags);
+ }
}
EXPORT_SYMBOL_GPL(__sync_icache_dcache);
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0be3355..99cd6e7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -17,7 +17,44 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
+
+/*
+ * HugeTLB Support Matrix
+ *
+ * ---------------------------------------------------
+ * | Page Size | CONT PTE | PMD | CONT PMD | PUD |
+ * ---------------------------------------------------
+ * | 4K | 64K | 2M | 32M | 1G |
+ * | 16K | 2M | 32M | 1G | |
+ * | 64K | 2M | 512M | 16G | |
+ * ---------------------------------------------------
+ */
+
+/*
+ * Reserve CMA areas for the largest supported gigantic
+ * huge page when requested. Any other smaller gigantic
+ * huge pages could still be served from those areas.
+ */
+#ifdef CONFIG_CMA
+void __init arm64_hugetlb_cma_reserve(void)
+{
+ int order;
+
+#ifdef CONFIG_ARM64_4K_PAGES
+ order = PUD_SHIFT - PAGE_SHIFT;
+#else
+ order = CONT_PMD_SHIFT - PAGE_SHIFT;
+#endif
+ /*
+ * HugeTLB CMA reservation is required for gigantic
+ * huge pages which could not be allocated via the
+ * page allocator. Just warn if there is any change
+ * breaking this assumption.
+ */
+ WARN_ON(order <= MAX_ORDER);
+ hugetlb_cma_reserve(order);
+}
+#endif /* CONFIG_CMA */
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
bool arch_hugetlb_migration_supported(struct hstate *h)
@@ -67,11 +104,13 @@
pte_t *ptep, size_t *pgsize)
{
pgd_t *pgdp = pgd_offset(mm, addr);
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
*pgsize = PAGE_SIZE;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ pudp = pud_offset(p4dp, addr);
pmdp = pmd_offset(pudp, addr);
if ((pte_t *)pmdp == ptep) {
*pgsize = PMD_SIZE;
@@ -217,12 +256,14 @@
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep = NULL;
pgdp = pgd_offset(mm, addr);
- pudp = pud_alloc(mm, pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ pudp = pud_alloc(mm, p4dp, addr);
if (!pudp)
return NULL;
@@ -261,6 +302,7 @@
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
@@ -268,7 +310,11 @@
if (!pgd_present(READ_ONCE(*pgdp)))
return NULL;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ if (!p4d_present(READ_ONCE(*p4dp)))
+ return NULL;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
if (sz != PUD_SIZE && pud_none(pud))
return NULL;
@@ -443,44 +489,30 @@
clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
}
-static void __init add_huge_page_size(unsigned long size)
-{
- if (size_to_hstate(size))
- return;
-
- hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-}
-
static int __init hugetlbpage_init(void)
{
#ifdef CONFIG_ARM64_4K_PAGES
- add_huge_page_size(PUD_SIZE);
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
#endif
- add_huge_page_size(CONT_PMD_SIZE);
- add_huge_page_size(PMD_SIZE);
- add_huge_page_size(CONT_PTE_SIZE);
+ hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT);
return 0;
}
arch_initcall(hugetlbpage_init);
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
-
- switch (ps) {
+ switch (size) {
#ifdef CONFIG_ARM64_4K_PAGES
case PUD_SIZE:
#endif
case CONT_PMD_SIZE:
case PMD_SIZE:
case CONT_PTE_SIZE:
- add_huge_page_size(ps);
- return 1;
+ return true;
}
- hugetlb_bad_size();
- pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
- return 0;
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index cbcac03..c0a7f0d 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -20,14 +20,16 @@
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-contiguous.h>
+#include <linux/dma-direct.h>
+#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
+#include <linux/hugetlb.h>
+#include <linux/acpi_iort.h>
#include <asm/boot.h>
#include <asm/fixmap.h>
@@ -50,6 +52,13 @@
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);
+/*
+ * If the corresponding config options are enabled, we create both ZONE_DMA
+ * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
+ * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
+ * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
+ * otherwise it is empty.
+ */
phys_addr_t arm64_dma_phys_limit __ro_after_init;
#ifdef CONFIG_KEXEC_CORE
@@ -75,7 +84,7 @@
if (crash_base == 0) {
/* Current arm64 boot protocol requires 2MB alignment */
- crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT,
+ crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
crash_size, SZ_2M);
if (crash_base == 0) {
pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
@@ -163,75 +172,51 @@
{
}
#endif /* CONFIG_CRASH_DUMP */
-/*
- * Return the maximum physical address for ZONE_DMA32 (DMA_BIT_MASK(32)). It
- * currently assumes that for memory starting above 4G, 32-bit devices will
- * use a DMA offset.
- */
-static phys_addr_t __init max_zone_dma_phys(void)
-{
- phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, 32);
- return min(offset + (1ULL << 32), memblock_end_of_DRAM());
-}
-#ifdef CONFIG_NUMA
+/*
+ * Return the maximum physical address for a zone accessible by the given bits
+ * limit. If DRAM starts above 32-bit, expand the zone to the maximum
+ * available memory, otherwise cap it at 32-bit.
+ */
+static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
+{
+ phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
+ phys_addr_t phys_start = memblock_start_of_DRAM();
+
+ if (phys_start > U32_MAX)
+ zone_mask = PHYS_ADDR_MAX;
+ else if (phys_start > zone_mask)
+ zone_mask = U32_MAX;
+
+ return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
+}
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
+ unsigned int __maybe_unused acpi_zone_dma_bits;
+ unsigned int __maybe_unused dt_zone_dma_bits;
+ phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
-#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = PFN_DOWN(max_zone_dma_phys());
+#ifdef CONFIG_ZONE_DMA
+ acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
+ dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
+ zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
+ arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
+ max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
+ if (!arm64_dma_phys_limit)
+ arm64_dma_phys_limit = dma32_phys_limit;
+#endif
+ if (!arm64_dma_phys_limit)
+ arm64_dma_phys_limit = PHYS_MASK + 1;
max_zone_pfns[ZONE_NORMAL] = max;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
-#else
-
-static void __init zone_sizes_init(unsigned long min, unsigned long max)
-{
- struct memblock_region *reg;
- unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- unsigned long max_dma = min;
-
- memset(zone_size, 0, sizeof(zone_size));
-
- /* 4GB maximum for 32-bit only capable devices */
-#ifdef CONFIG_ZONE_DMA32
- max_dma = PFN_DOWN(arm64_dma_phys_limit);
- zone_size[ZONE_DMA32] = max_dma - min;
-#endif
- zone_size[ZONE_NORMAL] = max - max_dma;
-
- memcpy(zhole_size, zone_size, sizeof(zhole_size));
-
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
- if (start >= max)
- continue;
-
-#ifdef CONFIG_ZONE_DMA32
- if (start < max_dma) {
- unsigned long dma_end = min(end, max_dma);
- zhole_size[ZONE_DMA32] -= dma_end - start;
- }
-#endif
- if (end > max_dma) {
- unsigned long normal_end = min(end, max);
- unsigned long normal_start = max(start, max_dma);
- zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
- }
- }
-
- free_area_init_node(0, zone_size, min, zhole_size);
-}
-
-#endif /* CONFIG_NUMA */
-
int pfn_valid(unsigned long pfn)
{
phys_addr_t addr = pfn << PAGE_SHIFT;
@@ -243,7 +228,7 @@
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
- if (!valid_section(__nr_to_section(pfn_to_section_nr(pfn))))
+ if (!valid_section(__pfn_to_section(pfn)))
return 0;
/*
@@ -420,19 +405,9 @@
early_init_fdt_scan_reserved_mem();
- /* 4GB maximum for 32-bit only capable devices */
- if (IS_ENABLED(CONFIG_ZONE_DMA32))
- arm64_dma_phys_limit = max_zone_dma_phys();
- else
- arm64_dma_phys_limit = PHYS_MASK + 1;
-
- reserve_crashkernel();
-
reserve_elfcorehdr();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
-
- dma_contiguous_reserve(arm64_dma_phys_limit);
}
void __init bootmem_init(void)
@@ -448,15 +423,36 @@
min_low_pfn = min;
arm64_numa_init();
- /*
- * Sparsemem tries to allocate bootmem in memory_present(), so must be
- * done after the fixed reservations.
- */
- memblocks_present();
+ /*
+ * must be done after arm64_numa_init() which calls numa_init() to
+ * initialize node_online_map that gets used in hugetlb_cma_reserve()
+ * while allocating required CMA size across online nodes.
+ */
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
+ arm64_hugetlb_cma_reserve();
+#endif
+
+ dma_pernuma_cma_reserve();
+
+ /*
+ * sparse_init() tries to allocate memory from memblock, so must be
+ * done after the fixed reservations
+ */
sparse_init();
zone_sizes_init(min, max);
+ /*
+ * Reserve the CMA area after arm64_dma_phys_limit was initialised.
+ */
+ dma_contiguous_reserve(arm64_dma_phys_limit);
+
+ /*
+ * request_standard_resources() depends on crashkernel's memory being
+ * reserved, so do it here.
+ */
+ reserve_crashkernel();
+
memblock_dump_all();
}
@@ -492,12 +488,10 @@
*/
static void __init free_unused_memmap(void)
{
- unsigned long start, prev_end = 0;
- struct memblock_region *reg;
+ unsigned long start, end, prev_end = 0;
+ int i;
- for_each_memblock(memory, reg) {
- start = __phys_to_pfn(reg->base);
-
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
#ifdef CONFIG_SPARSEMEM
/*
* Take care not to free memmap entries that don't exist due
@@ -517,8 +511,7 @@
* memmap entries are valid from the bank end aligned to
* MAX_ORDER_NR_PAGES.
*/
- prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
- MAX_ORDER_NR_PAGES);
+ prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
}
#ifdef CONFIG_SPARSEMEM
@@ -536,7 +529,7 @@
void __init mem_init(void)
{
if (swiotlb_force == SWIOTLB_FORCE ||
- max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
+ max_pfn > PFN_DOWN(arm64_dma_phys_limit))
swiotlb_init(1);
else
swiotlb_force = SWIOTLB_NO_FORCE;
@@ -573,7 +566,7 @@
{
free_reserved_area(lm_alias(__init_begin),
lm_alias(__init_end),
- 0, "unused kernel");
+ POISON_FREE_INITMEM, "unused kernel");
/*
* Unmap the __init region but leave the VM area in place. This
* prevents the region from being reused for kernel modules, which
@@ -582,39 +575,11 @@
unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- unsigned long aligned_start, aligned_end;
-
- aligned_start = __virt_to_phys(start) & PAGE_MASK;
- aligned_end = PAGE_ALIGN(__virt_to_phys(end));
- memblock_free(aligned_start, aligned_end - aligned_start);
- free_reserved_area((void *)start, (void *)end, 0, "initrd");
-}
-#endif
-
-/*
- * Dump out memory limit information on panic.
- */
-static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p)
+void dump_mem_limit(void)
{
if (memory_limit != PHYS_ADDR_MAX) {
pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
} else {
pr_emerg("Memory Limit: none\n");
}
- return 0;
}
-
-static struct notifier_block mem_limit_notifier = {
- .notifier_call = dump_mem_limit,
-};
-
-static int __init register_mem_limit_dumper(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list,
- &mem_limit_notifier);
- return 0;
-}
-__initcall(register_mem_limit_dumper);
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index 9be71be..b5e83c4 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -16,7 +16,6 @@
#include <asm/fixmap.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
pgprot_t prot, void *caller)
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index f87a324..b24e43d 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -18,7 +18,6 @@
#include <asm/kernel-pgtable.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -84,17 +83,17 @@
return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr);
}
-static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node,
+static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node,
bool early)
{
- if (pgd_none(READ_ONCE(*pgdp))) {
+ if (p4d_none(READ_ONCE(*p4dp))) {
phys_addr_t pud_phys = early ?
__pa_symbol(kasan_early_shadow_pud)
: kasan_alloc_zeroed_page(node);
- __pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE);
+ __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE);
}
- return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr);
+ return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr);
}
static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
@@ -126,11 +125,11 @@
} while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp)));
}
-static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr,
+static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr,
unsigned long end, int node, bool early)
{
unsigned long next;
- pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early);
+ pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early);
do {
next = pud_addr_end(addr, end);
@@ -138,6 +137,18 @@
} while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp)));
}
+static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, int node, bool early)
+{
+ unsigned long next;
+ p4d_t *p4dp = p4d_offset(pgdp, addr);
+
+ do {
+ next = p4d_addr_end(addr, end);
+ kasan_pud_populate(p4dp, addr, next, node, early);
+ } while (p4dp++, addr = next, addr != end);
+}
+
static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
int node, bool early)
{
@@ -147,7 +158,7 @@
pgdp = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- kasan_pud_populate(pgdp, addr, next, node, early);
+ kasan_p4d_populate(pgdp, addr, next, node, early);
} while (pgdp++, addr = next, addr != end);
}
@@ -179,7 +190,7 @@
pgdp = pgd_offset_k(KASAN_SHADOW_START);
pgdp_end = pgd_offset_k(KASAN_SHADOW_END);
- pgdp_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START);
+ pgdp_new = pgd_offset_pgd(pgdir, KASAN_SHADOW_START);
do {
set_pgd(pgdp_new, READ_ONCE(*pgdp));
} while (pgdp++, pgdp_new++, pgdp != pgdp_end);
@@ -201,8 +212,8 @@
{
u64 kimg_shadow_start, kimg_shadow_end;
u64 mod_shadow_start, mod_shadow_end;
- struct memblock_region *reg;
- int i;
+ phys_addr_t pa_start, pa_end;
+ u64 i;
kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end));
@@ -235,9 +246,9 @@
kasan_populate_early_shadow((void *)mod_shadow_end,
(void *)kimg_shadow_start);
- for_each_memblock(memory, reg) {
- void *start = (void *)__phys_to_virt(reg->base);
- void *end = (void *)__phys_to_virt(reg->base + reg->size);
+ for_each_mem_range(i, &pa_start, &pa_end) {
+ void *start = (void *)__phys_to_virt(pa_start);
+ void *end = (void *)__phys_to_virt(pa_end);
if (start >= end)
break;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 99bc028..991e599 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -17,6 +17,7 @@
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
+#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -34,6 +35,7 @@
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
#define NO_BLOCK_MAPPINGS BIT(0)
#define NO_CONT_MAPPINGS BIT(1)
@@ -120,7 +122,7 @@
* The following mapping attributes may be updated in live
* kernel mappings without the need for break-before-make.
*/
- static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
+ pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
/* creating or taking down mappings is always safe */
if (old == 0 || new == 0)
@@ -134,6 +136,17 @@
if (old & ~new & PTE_NG)
return false;
+ /*
+ * Changing the memory type between Normal and Normal-Tagged is safe
+ * since Tagged is considered a permission attribute from the
+ * mismatched attribute aliases perspective.
+ */
+ if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
+ (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
+ ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
+ (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
+ mask |= PTE_ATTRINDX_MASK;
+
return ((old ^ new) & ~mask) == 0;
}
@@ -289,18 +302,19 @@
{
unsigned long next;
pud_t *pudp;
- pgd_t pgd = READ_ONCE(*pgdp);
+ p4d_t *p4dp = p4d_offset(pgdp, addr);
+ p4d_t p4d = READ_ONCE(*p4dp);
- if (pgd_none(pgd)) {
+ if (p4d_none(p4d)) {
phys_addr_t pud_phys;
BUG_ON(!pgtable_alloc);
pud_phys = pgtable_alloc(PUD_SHIFT);
- __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
- pgd = READ_ONCE(*pgdp);
+ __p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE);
+ p4d = READ_ONCE(*p4dp);
}
- BUG_ON(pgd_bad(pgd));
+ BUG_ON(p4d_bad(p4d));
- pudp = pud_set_fixmap_offset(pgdp, addr);
+ pudp = pud_set_fixmap_offset(p4dp, addr);
do {
pud_t old_pud = READ_ONCE(*pudp);
@@ -338,8 +352,8 @@
phys_addr_t (*pgtable_alloc)(int),
int flags)
{
- unsigned long addr, length, end, next;
- pgd_t *pgdp = pgd_offset_raw(pgdir, virt);
+ unsigned long addr, end, next;
+ pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
/*
* If the virtual and physical address don't have the same offset
@@ -350,9 +364,8 @@
phys &= PAGE_MASK;
addr = virt & PAGE_MASK;
- length = PAGE_ALIGN(size + (virt & ~PAGE_MASK));
+ end = PAGE_ALIGN(virt + size);
- end = addr + length;
do {
next = pgd_addr_end(addr, end);
alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
@@ -456,14 +469,30 @@
PAGE_KERNEL_RO);
}
+static bool crash_mem_map __initdata;
+
+static int __init enable_crash_mem_map(char *arg)
+{
+ /*
+ * Proper parameter parsing is done by reserve_crashkernel(). We only
+ * need to know if the linear map has to avoid block mappings so that
+ * the crashkernel reservations can be unmapped later.
+ */
+ crash_mem_map = true;
+
+ return 0;
+}
+early_param("crashkernel", enable_crash_mem_map);
+
static void __init map_mem(pgd_t *pgdp)
{
phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
- struct memblock_region *reg;
+ phys_addr_t start, end;
int flags = 0;
+ u64 i;
- if (rodata_full || debug_pagealloc_enabled())
+ if (rodata_full || crash_mem_map || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -473,23 +502,18 @@
* the following for-loop
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
-#ifdef CONFIG_KEXEC_CORE
- if (crashk_res.end)
- memblock_mark_nomap(crashk_res.start,
- resource_size(&crashk_res));
-#endif
/* map all the memory banks */
- for_each_memblock(memory, reg) {
- phys_addr_t start = reg->base;
- phys_addr_t end = start + reg->size;
-
+ for_each_mem_range(i, &start, &end) {
if (start >= end)
break;
- if (memblock_is_nomap(reg))
- continue;
-
- __map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
+ /*
+ * The linear map must allow allocation tags reading/writing
+ * if MTE is present. Otherwise, it has the same attributes as
+ * PAGE_KERNEL.
+ */
+ __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
+ flags);
}
/*
@@ -505,21 +529,6 @@
__map_memblock(pgdp, kernel_start, kernel_end,
PAGE_KERNEL, NO_CONT_MAPPINGS);
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
-
-#ifdef CONFIG_KEXEC_CORE
- /*
- * Use page-level mappings here so that we can shrink the region
- * in page granularity and put back unused memory to buddy system
- * through /sys/kernel/kexec_crash_size interface.
- */
- if (crashk_res.end) {
- __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
- PAGE_KERNEL,
- NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
- memblock_clear_nomap(crashk_res.start,
- resource_size(&crashk_res));
- }
-#endif
}
void mark_rodata_ro(void)
@@ -583,6 +592,8 @@
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
static int __init map_entry_trampoline(void)
{
+ int i;
+
pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
@@ -591,11 +602,15 @@
/* Map only the text into the trampoline page table */
memset(tramp_pg_dir, 0, PGD_SIZE);
- __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
- prot, __pgd_pgtable_alloc, 0);
+ __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
+ entry_tramp_text_size(), prot,
+ __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
/* Map both the text and data into the kernel page table */
- __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
+ for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
+ __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
+ pa_start + i * PAGE_SIZE, prot);
+
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
extern char __entry_tramp_data_start[];
@@ -610,6 +625,22 @@
#endif
/*
+ * Open coded check for BTI, only for use to determine configuration
+ * for early mappings for before the cpufeature code has run.
+ */
+static bool arm64_early_this_cpu_has_bti(void)
+{
+ u64 pfr1;
+
+ if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
+ return false;
+
+ pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1);
+ return cpuid_feature_extract_unsigned_field(pfr1,
+ ID_AA64PFR1_BT_SHIFT);
+}
+
+/*
* Create fine-grained mappings for the kernel.
*/
static void __init map_kernel(pgd_t *pgdp)
@@ -625,6 +656,14 @@
pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
/*
+ * If we have a CPU that supports BTI and a kernel built for
+ * BTI then mark the kernel executable text as guarded pages
+ * now so we don't have to rewrite the page tables later.
+ */
+ if (arm64_early_this_cpu_has_bti())
+ text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
+
+ /*
* Only rodata will be remapped with different permissions later on,
* all other segments are allowed to use contiguous mappings.
*/
@@ -638,16 +677,17 @@
&vmlinux_initdata, 0, VM_NO_GUARD);
map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
- if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) {
+ if (!READ_ONCE(pgd_val(*pgd_offset_pgd(pgdp, FIXADDR_START)))) {
/*
* The fixmap falls in a separate pgd to the kernel, and doesn't
* live in the carveout for the swapper_pg_dir. We can simply
* re-use the existing dir for the fixmap.
*/
- set_pgd(pgd_offset_raw(pgdp, FIXADDR_START),
+ set_pgd(pgd_offset_pgd(pgdp, FIXADDR_START),
READ_ONCE(*pgd_offset_k(FIXADDR_START)));
} else if (CONFIG_PGTABLE_LEVELS > 3) {
pgd_t *bm_pgdp;
+ p4d_t *bm_p4dp;
pud_t *bm_pudp;
/*
* The fixmap shares its top level pgd entry with the kernel
@@ -656,8 +696,9 @@
* entry instead.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
- bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START);
- bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START);
+ bm_pgdp = pgd_offset_pgd(pgdp, FIXADDR_START);
+ bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
+ bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
pud_clear_fixmap();
} else {
@@ -691,10 +732,12 @@
int kern_addr_valid(unsigned long addr)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep, pte;
+ addr = arch_kasan_reset_tag(addr);
if ((((long)addr) >> VA_BITS) != -1UL)
return 0;
@@ -702,7 +745,11 @@
if (pgd_none(READ_ONCE(*pgdp)))
return 0;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ if (p4d_none(READ_ONCE(*p4dp)))
+ return 0;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return 0;
@@ -725,12 +772,336 @@
return pfn_valid(pte_pfn(pte));
}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_hotplug_page_range(struct page *page, size_t size,
+ struct vmem_altmap *altmap)
+{
+ if (altmap) {
+ vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+ } else {
+ WARN_ON(PageReserved(page));
+ free_pages((unsigned long)page_address(page), get_order(size));
+ }
+}
+
+static void free_hotplug_pgtable_page(struct page *page)
+{
+ free_hotplug_page_range(page, PAGE_SIZE, NULL);
+}
+
+static bool pgtable_range_aligned(unsigned long start, unsigned long end,
+ unsigned long floor, unsigned long ceiling,
+ unsigned long mask)
+{
+ start &= mask;
+ if (start < floor)
+ return false;
+
+ if (ceiling) {
+ ceiling &= mask;
+ if (!ceiling)
+ return false;
+ }
+
+ if (end - 1 > ceiling - 1)
+ return false;
+ return true;
+}
+
+static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
+{
+ pte_t *ptep, pte;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+ if (pte_none(pte))
+ continue;
+
+ WARN_ON(!pte_present(pte));
+ pte_clear(&init_mm, addr, ptep);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pte_page(pte),
+ PAGE_SIZE, altmap);
+ } while (addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pmd_t *pmdp, pmd;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd));
+ if (pmd_sect(pmd)) {
+ pmd_clear(pmdp);
+
+ /*
+ * One TLBI should be sufficient here as the PMD_SIZE
+ * range is mapped with a single block entry.
+ */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pmd_page(pmd),
+ PMD_SIZE, altmap);
+ continue;
+ }
+ WARN_ON(!pmd_table(pmd));
+ unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(p4dp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud));
+ if (pud_sect(pud)) {
+ pud_clear(pudp);
+
+ /*
+ * One TLBI should be sufficient here as the PUD_SIZE
+ * range is mapped with a single block entry.
+ */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ if (free_mapped)
+ free_hotplug_page_range(pud_page(pud),
+ PUD_SIZE, altmap);
+ continue;
+ }
+ WARN_ON(!pud_table(pud));
+ unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, bool free_mapped,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d))
+ continue;
+
+ WARN_ON(!p4d_present(p4d));
+ unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
+ } while (addr = next, addr < end);
+}
+
+static void unmap_hotplug_range(unsigned long addr, unsigned long end,
+ bool free_mapped, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ /*
+ * altmap can only be used as vmemmap mapping backing memory.
+ * In case the backing memory itself is not being freed, then
+ * altmap is irrelevant. Warn about this inconsistency when
+ * encountered.
+ */
+ WARN_ON(!free_mapped && altmap);
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pte_t *ptep, pte;
+ unsigned long i, start = addr;
+
+ do {
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = READ_ONCE(*ptep);
+
+ /*
+ * This is just a sanity check here which verifies that
+ * pte clearing has been done by earlier unmap loops.
+ */
+ WARN_ON(!pte_none(pte));
+ } while (addr += PAGE_SIZE, addr < end);
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pte page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ ptep = pte_offset_kernel(pmdp, 0UL);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ if (!pte_none(READ_ONCE(ptep[i])))
+ return;
+ }
+
+ pmd_clear(pmdp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(ptep));
+}
+
+static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pmd_t *pmdp, pmd;
+ unsigned long i, next, start = addr;
+
+ do {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_offset(pudp, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
+ continue;
+
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+ free_empty_pte_table(pmdp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+
+ if (CONFIG_PGTABLE_LEVELS <= 2)
+ return;
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pmd page if the rest of the
+ * entries are empty. Overlap with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ pmdp = pmd_offset(pudp, 0UL);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(READ_ONCE(pmdp[i])))
+ return;
+ }
+
+ pud_clear(pudp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(pmdp));
+}
+
+static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ pud_t *pudp, pud;
+ unsigned long i, next, start = addr;
+
+ do {
+ next = pud_addr_end(addr, end);
+ pudp = pud_offset(p4dp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ continue;
+
+ WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+ free_empty_pmd_table(pudp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+
+ if (CONFIG_PGTABLE_LEVELS <= 3)
+ return;
+
+ if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
+ return;
+
+ /*
+ * Check whether we can free the pud page if the rest of the
+ * entries are empty. Overlaps with other regions have been
+ * handled by the floor/ceiling check.
+ */
+ pudp = pud_offset(p4dp, 0UL);
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (!pud_none(READ_ONCE(pudp[i])))
+ return;
+ }
+
+ p4d_clear(p4dp);
+ __flush_tlb_kernel_pgtable(start);
+ free_hotplug_pgtable_page(virt_to_page(pudp));
+}
+
+static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, unsigned long floor,
+ unsigned long ceiling)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+
+ do {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d))
+ continue;
+
+ WARN_ON(!p4d_present(p4d));
+ free_empty_pud_table(p4dp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+}
+
+static void free_empty_tables(unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ unsigned long next;
+ pgd_t *pgdp, pgd;
+
+ do {
+ next = pgd_addr_end(addr, end);
+ pgdp = pgd_offset_k(addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ continue;
+
+ WARN_ON(!pgd_present(pgd));
+ free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
+ } while (addr = next, addr < end);
+}
+#endif
+
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#if !ARM64_SWAPPER_USES_SECTION_MAPS
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
- return vmemmap_populate_basepages(start, end, node);
+ return vmemmap_populate_basepages(start, end, node, altmap);
}
#else /* !ARM64_SWAPPER_USES_SECTION_MAPS */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -739,6 +1110,7 @@
unsigned long addr = start;
unsigned long next;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
@@ -749,7 +1121,11 @@
if (!pgdp)
return -ENOMEM;
- pudp = vmemmap_pud_populate(pgdp, addr, node);
+ p4dp = vmemmap_p4d_populate(pgdp, addr, node);
+ if (!p4dp)
+ return -ENOMEM;
+
+ pudp = vmemmap_pud_populate(p4dp, addr, node);
if (!pudp)
return -ENOMEM;
@@ -757,7 +1133,7 @@
if (pmd_none(READ_ONCE(*pmdp))) {
void *p = NULL;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (!p)
return -ENOMEM;
@@ -772,17 +1148,24 @@
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+
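+ /* Unmap and free the vmemmap backing pages, then any page table pages left empty */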
+ unmap_hotplug_range(start, end, true, altmap);
+ free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
+#endif
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
static inline pud_t * fixmap_pud(unsigned long addr)
{
pgd_t *pgdp = pgd_offset_k(addr);
- pgd_t pgd = READ_ONCE(*pgdp);
+ p4d_t *p4dp = p4d_offset(pgdp, addr);
+ p4d_t p4d = READ_ONCE(*p4dp);
- BUG_ON(pgd_none(pgd) || pgd_bad(pgd));
+ BUG_ON(p4d_none(p4d) || p4d_bad(p4d));
- return pud_offset_kimg(pgdp, addr);
+ return pud_offset_kimg(p4dp, addr);
}
static inline pmd_t * fixmap_pmd(unsigned long addr)
@@ -808,25 +1191,27 @@
*/
void __init early_fixmap_init(void)
{
- pgd_t *pgdp, pgd;
+ pgd_t *pgdp;
+ p4d_t *p4dp, p4d;
pud_t *pudp;
pmd_t *pmdp;
unsigned long addr = FIXADDR_START;
pgdp = pgd_offset_k(addr);
- pgd = READ_ONCE(*pgdp);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
if (CONFIG_PGTABLE_LEVELS > 3 &&
- !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
+ !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
/*
* We only end up here if the kernel mapping and the fixmap
* share the top level pgd entry, which should only happen on
* 16k/4 levels configurations.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
- pudp = pud_offset_kimg(pgdp, addr);
+ pudp = pud_offset_kimg(p4dp, addr);
} else {
- if (pgd_none(pgd))
- __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
+ if (p4d_none(p4d))
+ __p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
pudp = fixmap_pud(addr);
}
if (pud_none(READ_ONCE(*pudp)))
@@ -944,13 +1329,13 @@
* SW table walks can't handle removal of intermediate entries.
*/
return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
- !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS);
+ !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}
int __init arch_ioremap_pmd_supported(void)
{
/* See arch_ioremap_pud_supported() */
- return !IS_ENABLED(CONFIG_ARM64_PTDUMP_DEBUGFS);
+ return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
@@ -1050,33 +1435,122 @@
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
- int flags = 0;
+ unsigned long end = start + size;
+
+ WARN_ON(pgdir != init_mm.pgd);
+ WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
+
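+ /*
+ * Unmap the linear map entries without freeing the hot-removed pages
+ * themselves, then release any page table pages that became empty.
+ */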
+ unmap_hotplug_range(start, end, false, NULL);
+ free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
+}
+
+static bool inside_linear_region(u64 start, u64 size)
+{
+ u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
+ u64 end_linear_pa = __pa(PAGE_END - 1);
+
+ if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
+ /*
+ * Check for a wrap: because of the randomized linear mapping,
+ * the start physical address can end up bigger than the end
+ * physical address. In this case set start to zero, because the
+ * [0, end_linear_pa] range must still be able to cover all
+ * addressable physical addresses.
+ */
+ if (start_linear_pa > end_linear_pa)
+ start_linear_pa = 0;
+ }
+
+ WARN_ON(start_linear_pa > end_linear_pa);
+
+ /*
+ * The linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)],
+ * including both of its ends but excluding PAGE_END itself. The maximum
+ * physical range that can be mapped inside this linear mapping region
+ * must likewise be derived from its end points.
+ */
+ return start >= start_linear_pa && (start + size - 1) <= end_linear_pa;
+}
+
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_params *params)
+{
+ int ret, flags = 0;
+
+ if (!inside_linear_region(start, size)) {
+ pr_err("[%llx %llx] is outside linear mapping region\n", start, start + size);
+ return -EINVAL;
+ }
if (rodata_full || debug_pagealloc_enabled())
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
- size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
+ size, params->pgprot, __pgd_pgtable_alloc,
+ flags);
- return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
- restrictions);
+ memblock_clear_nomap(start, size);
+
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
+ params);
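+ /* Roll back the linear mapping if adding the pages failed */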
+ if (ret)
+ __remove_pgd_mapping(swapper_pg_dir,
+ __phys_to_virt(start), size);
+ else {
+ max_pfn = PFN_UP(start + size);
+ max_low_pfn = max_pfn;
+ }
+
+ return ret;
}
+
void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- /*
- * FIXME: Cleanup page tables (also in arch_add_memory() in case
- * adding fails). Until then, this function should only be used
- * during memory hotplug (adding memory), not for memory
- * unplug. ARCH_ENABLE_MEMORY_HOTREMOVE must not be
- * unlocked yet.
- */
__remove_pages(start_pfn, nr_pages, altmap);
+ __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
+
+/*
+ * This memory hotplug notifier helps prevent boot memory from being
+ * inadvertently removed, by blocking the pfn range offlining process in
+ * __offline_pages(). Hence it prevents both offlining and removal of
+ * boot memory, which is initially always online. If and when boot
+ * memory becomes removable in the future, this notifier should be
+ * dropped and free_hotplug_page_range() should handle any reserved
+ * pages allocated during boot.
+ */
+static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct mem_section *ms;
+ struct memory_notify *arg = data;
+ unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
+ unsigned long pfn = arg->start_pfn;
+
+ if (action != MEM_GOING_OFFLINE)
+ return NOTIFY_OK;
+
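+ /* Reject offlining if any section in the range is early (boot) memory */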
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ ms = __pfn_to_section(pfn);
+ if (early_section(ms))
+ return NOTIFY_BAD;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block prevent_bootmem_remove_nb = {
+ .notifier_call = prevent_bootmem_remove_notifier,
+};
+
+static int __init prevent_bootmem_remove_init(void)
+{
+ return register_memory_notifier(&prevent_bootmem_remove_nb);
+}
+device_initcall(prevent_bootmem_remove_init);
#endif
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
new file mode 100644
index 0000000..c52c184
--- /dev/null
+++ b/arch/arm64/mm/mteswap.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/pagemap.h>
+#include <linux/xarray.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <asm/mte.h>
+
+static DEFINE_XARRAY(mte_pages);
+
+void *mte_allocate_tag_storage(void)
+{
+ /* tags granule is 16 bytes, 2 tags stored per byte */
+ return kmalloc(PAGE_SIZE / 16 / 2, GFP_KERNEL);
+}
+
+void mte_free_tag_storage(char *storage)
+{
+ kfree(storage);
+}
+
+int mte_save_tags(struct page *page)
+{
+ void *tag_storage, *ret;
+
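+ /* Pages that were never mapped as tagged have no tags to save */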
+ if (!test_bit(PG_mte_tagged, &page->flags))
+ return 0;
+
+ tag_storage = mte_allocate_tag_storage();
+ if (!tag_storage)
+ return -ENOMEM;
+
+ mte_save_page_tags(page_address(page), tag_storage);
+
+ /* page_private contains the swap entry.val set in do_swap_page */
+ ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL);
+ if (WARN(xa_is_err(ret), "Failed to store MTE tags")) {
+ mte_free_tag_storage(tag_storage);
+ return xa_err(ret);
+ } else if (ret) {
+ /* Entry is being replaced, free the old entry */
+ mte_free_tag_storage(ret);
+ }
+
+ return 0;
+}
+
+bool mte_restore_tags(swp_entry_t entry, struct page *page)
+{
+ void *tags = xa_load(&mte_pages, entry.val);
+
+ if (!tags)
+ return false;
+
+ mte_restore_page_tags(page_address(page), tags);
+
+ return true;
+}
+
+void mte_invalidate_tags(int type, pgoff_t offset)
+{
+ swp_entry_t entry = swp_entry(type, offset);
+ void *tags = xa_erase(&mte_pages, entry.val);
+
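+ /* xa_erase() returns the stored tag buffer or NULL; freeing NULL is a no-op */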
+ mte_free_tag_storage(tags);
+}
+
+void mte_invalidate_tags_area(int type)
+{
+ swp_entry_t entry = swp_entry(type, 0);
+ swp_entry_t last_entry = swp_entry(type + 1, 0);
+ void *tags;
+
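+ /*
+ * Swap entries of a given device form a contiguous range of values,
+ * so walking [entry.val, last_entry.val) covers every offset of
+ * this swap type.
+ */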
+ XA_STATE(xa_state, &mte_pages, entry.val);
+
+ xa_lock(&mte_pages);
+ xas_for_each(&xa_state, tags, last_entry.val - 1) {
+ __xa_erase(&mte_pages, xa_state.xa_index);
+ mte_free_tag_storage(tags);
+ }
+ xa_unlock(&mte_pages);
+}
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 53ebb4b..a8303bc 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -354,13 +354,16 @@
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
- for_each_memblock(memory, mblk)
- if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) {
+ for_each_mem_region(mblk) {
+ int mblk_nid = memblock_get_region_node(mblk);
+
+ if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- mblk->nid, mblk->base,
+ mblk_nid, mblk->base,
mblk->base + mblk->size - 1);
return -EINVAL;
}
+ }
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
@@ -424,19 +427,16 @@
*/
static int __init dummy_numa_init(void)
{
+ phys_addr_t start = memblock_start_of_DRAM();
+ phys_addr_t end = memblock_end_of_DRAM();
int ret;
- struct memblock_region *mblk;
if (numa_off)
pr_info("NUMA disabled\n"); /* Forced off on command line. */
- pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n",
- memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1);
+ pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
- for_each_memblock(memory, mblk) {
- ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
- if (!ret)
- continue;
-
+ ret = numa_add_memblk(0, start, end);
+ if (ret) {
pr_err("NUMA init failed\n");
return ret;
}
@@ -449,7 +449,7 @@
* arm64_numa_init() - Initialize NUMA
*
* Try each configured NUMA initialization method until one succeeds. The
- * last fallback is dummy single node config encomapssing whole memory.
+ * last fallback is dummy single node config encompassing whole memory.
*/
void __init arm64_numa_init(void)
{
@@ -462,13 +462,3 @@
numa_init(dummy_numa_init);
}
-
-/*
- * We hope that we will be hotplugging memory on nodes we already know about,
- * such that acpi_get_node() succeeds and we never fall back to this...
- */
-int memory_add_physaddr_to_nid(u64 addr)
-{
- pr_warn("Unknown node for memory at 0x%llx, assuming node 0\n", addr);
- return 0;
-}
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 9ce7bd9..1b94f5b 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -8,7 +8,7 @@
#include <linux/sched.h>
#include <linux/vmalloc.h>
-#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>
@@ -54,7 +54,7 @@
pgprot_t set_mask, pgprot_t clear_mask)
{
unsigned long start = addr;
- unsigned long size = PAGE_SIZE*numpages;
+ unsigned long size = PAGE_SIZE * numpages;
unsigned long end = start + size;
struct vm_struct *area;
int i;
@@ -126,13 +126,13 @@
{
return change_memory_common(addr, numpages,
__pgprot(PTE_PXN),
- __pgprot(0));
+ __pgprot(PTE_MAYBE_GP));
}
int set_memory_x(unsigned long addr, int numpages)
{
return change_memory_common(addr, numpages,
- __pgprot(0),
+ __pgprot(PTE_MAYBE_GP),
__pgprot(PTE_PXN));
}
@@ -198,6 +198,7 @@
bool kernel_page_present(struct page *page)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep;
@@ -210,7 +211,11 @@
if (pgd_none(READ_ONCE(*pgdp)))
return false;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ if (p4d_none(READ_ONCE(*p4dp)))
+ return false;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return false;
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 13e78a5..aacc7ea 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -9,13 +9,16 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <linux/pgtable.h>
#include <asm/assembler.h>
#include <asm/asm-offsets.h>
+#include <asm/asm_pointer_auth.h>
#include <asm/hwcap.h>
-#include <asm/pgtable.h>
#include <asm/pgtable-hwdef.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
+#include <asm/smp.h>
+#include <asm/sysreg.h>
#ifdef CONFIG_ARM64_64K_PAGES
#define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K
@@ -42,15 +45,28 @@
#define TCR_KASAN_FLAGS 0
#endif
-#define MAIR(attr, mt) ((attr) << ((mt) * 8))
+/*
+ * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and
+ * changed during __cpu_setup to Normal Tagged if the system supports MTE.
+ */
+#define MAIR_EL1_SET \
+ (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \
+ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \
+ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \
+ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED))
#ifdef CONFIG_CPU_PM
/**
* cpu_do_suspend - save CPU registers context
*
* x0: virtual address of context pointer
+ *
+ * This must be kept in sync with struct cpu_suspend_ctx in <asm/suspend.h>.
*/
-ENTRY(cpu_do_suspend)
+SYM_FUNC_START(cpu_do_suspend)
mrs x2, tpidr_el0
mrs x3, tpidrro_el0
mrs x4, contextidr_el1
@@ -73,8 +89,13 @@
stp x8, x9, [x0, #48]
stp x10, x11, [x0, #64]
stp x12, x13, [x0, #80]
+ /*
+ * Save x18 as it may be used as a platform register, e.g. by shadow
+ * call stack.
+ */
+ str x18, [x0, #96]
ret
-ENDPROC(cpu_do_suspend)
+SYM_FUNC_END(cpu_do_suspend)
/**
* cpu_do_resume - restore CPU register context
@@ -82,13 +103,20 @@
* x0: Address of context pointer
*/
.pushsection ".idmap.text", "awx"
-ENTRY(cpu_do_resume)
+SYM_FUNC_START(cpu_do_resume)
ldp x2, x3, [x0]
ldp x4, x5, [x0, #16]
ldp x6, x8, [x0, #32]
ldp x9, x10, [x0, #48]
ldp x11, x12, [x0, #64]
ldp x13, x14, [x0, #80]
+ /*
+ * Restore x18, as it may be used as a platform register, and clear
+ * the buffer to minimize the risk of exposure when used for shadow
+ * call stack.
+ */
+ ldr x18, [x0, #96]
+ str xzr, [x0, #96]
msr tpidr_el0, x2
msr tpidrro_el0, x3
msr contextidr_el1, x4
@@ -124,45 +152,19 @@
ubfx x11, x11, #1, #1
msr oslar_el1, x11
reset_pmuserenr_el0 x0 // Disable PMU access from EL0
+ reset_amuserenr_el0 x0 // Disable AMU access from EL0
alternative_if ARM64_HAS_RAS_EXTN
msr_s SYS_DISR_EL1, xzr
alternative_else_nop_endif
+ ptrauth_keys_install_kernel_nosync x14, x1, x2, x3
isb
ret
-ENDPROC(cpu_do_resume)
+SYM_FUNC_END(cpu_do_resume)
.popsection
#endif
-/*
- * cpu_do_switch_mm(pgd_phys, tsk)
- *
- * Set the translation table base pointer to be pgd_phys.
- *
- * - pgd_phys - physical address of new TTB
- */
-ENTRY(cpu_do_switch_mm)
- mrs x2, ttbr1_el1
- mmid x1, x1 // get mm->context.id
- phys_to_ttbr x3, x0
-
-alternative_if ARM64_HAS_CNP
- cbz x1, 1f // skip CNP for reserved ASID
- orr x3, x3, #TTBR_CNP_BIT
-1:
-alternative_else_nop_endif
-#ifdef CONFIG_ARM64_SW_TTBR0_PAN
- bfi x3, x1, #48, #16 // set the ASID field in TTBR0
-#endif
- bfi x2, x1, #48, #16 // set the ASID
- msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set)
- isb
- msr ttbr0_el1, x3 // now update TTBR0
- isb
- b post_ttbr_update_workaround // Back to C code...
-ENDPROC(cpu_do_switch_mm)
-
.pushsection ".idmap.text", "awx"
.macro __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2
@@ -182,7 +184,7 @@
* This is the low-level counterpart to cpu_replace_ttbr1, and should not be
* called by anything else. It can only be executed from a TTBR0 mapping.
*/
-ENTRY(idmap_cpu_replace_ttbr1)
+SYM_FUNC_START(idmap_cpu_replace_ttbr1)
save_and_disable_daif flags=x2
__idmap_cpu_set_reserved_ttbr1 x1, x3
@@ -194,7 +196,7 @@
restore_daif x2
ret
-ENDPROC(idmap_cpu_replace_ttbr1)
+SYM_FUNC_END(idmap_cpu_replace_ttbr1)
.popsection
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
@@ -222,7 +224,7 @@
*/
__idmap_kpti_flag:
.long 1
-ENTRY(idmap_kpti_install_ng_mappings)
+SYM_FUNC_START(idmap_kpti_install_ng_mappings)
cpu .req w0
num_cpus .req w1
swapper_pa .req x2
@@ -250,15 +252,15 @@
/* We're the boot CPU. Wait for the others to catch up */
sevl
1: wfe
- ldaxr w18, [flag_ptr]
- eor w18, w18, num_cpus
- cbnz w18, 1b
+ ldaxr w17, [flag_ptr]
+ eor w17, w17, num_cpus
+ cbnz w17, 1b
/* We need to walk swapper, so turn off the MMU. */
pre_disable_mmu_workaround
- mrs x18, sctlr_el1
- bic x18, x18, #SCTLR_ELx_M
- msr sctlr_el1, x18
+ mrs x17, sctlr_el1
+ bic x17, x17, #SCTLR_ELx_M
+ msr sctlr_el1, x17
isb
/* Everybody is enjoying the idmap, so we can rewrite swapper. */
@@ -281,9 +283,9 @@
isb
/* We're done: fire up the MMU again */
- mrs x18, sctlr_el1
- orr x18, x18, #SCTLR_ELx_M
- msr sctlr_el1, x18
+ mrs x17, sctlr_el1
+ orr x17, x17, #SCTLR_ELx_M
+ msr sctlr_el1, x17
isb
/*
@@ -353,34 +355,9 @@
b.ne do_pte
b next_pmd
- /* Secondary CPUs end up here */
-__idmap_kpti_secondary:
- /* Uninstall swapper before surgery begins */
- __idmap_cpu_set_reserved_ttbr1 x18, x17
-
- /* Increment the flag to let the boot CPU we're ready */
-1: ldxr w18, [flag_ptr]
- add w18, w18, #1
- stxr w17, w18, [flag_ptr]
- cbnz w17, 1b
-
- /* Wait for the boot CPU to finish messing around with swapper */
- sevl
-1: wfe
- ldxr w18, [flag_ptr]
- cbnz w18, 1b
-
- /* All done, act like nothing happened */
- offset_ttbr1 swapper_ttb, x18
- msr ttbr1_el1, swapper_ttb
- isb
- ret
-
.unreq cpu
.unreq num_cpus
.unreq swapper_pa
- .unreq swapper_ttb
- .unreq flag_ptr
.unreq cur_pgdp
.unreq end_pgdp
.unreq pgd
@@ -393,56 +370,103 @@
.unreq cur_ptep
.unreq end_ptep
.unreq pte
-ENDPROC(idmap_kpti_install_ng_mappings)
+
+ /* Secondary CPUs end up here */
+__idmap_kpti_secondary:
+ /* Uninstall swapper before surgery begins */
+ __idmap_cpu_set_reserved_ttbr1 x16, x17
+
+ /* Increment the flag to let the boot CPU know we're ready */
+1: ldxr w16, [flag_ptr]
+ add w16, w16, #1
+ stxr w17, w16, [flag_ptr]
+ cbnz w17, 1b
+
+ /* Wait for the boot CPU to finish messing around with swapper */
+ sevl
+1: wfe
+ ldxr w16, [flag_ptr]
+ cbnz w16, 1b
+
+ /* All done, act like nothing happened */
+ offset_ttbr1 swapper_ttb, x16
+ msr ttbr1_el1, swapper_ttb
+ isb
+ ret
+
+ .unreq swapper_ttb
+ .unreq flag_ptr
+SYM_FUNC_END(idmap_kpti_install_ng_mappings)
.popsection
#endif
/*
* __cpu_setup
*
- * Initialise the processor for turning the MMU on. Return in x0 the
- * value of the SCTLR_EL1 register.
+ * Initialise the processor for turning the MMU on.
+ *
+ * Output:
+ * Return in x0 the value of the SCTLR_EL1 register.
*/
.pushsection ".idmap.text", "awx"
-ENTRY(__cpu_setup)
+SYM_FUNC_START(__cpu_setup)
tlbi vmalle1 // Invalidate local TLB
dsb nsh
- mov x0, #3 << 20
- msr cpacr_el1, x0 // Enable FP/ASIMD
- mov x0, #1 << 12 // Reset mdscr_el1 and disable
- msr mdscr_el1, x0 // access to the DCC from EL0
+ mov x1, #3 << 20
+ msr cpacr_el1, x1 // Enable FP/ASIMD
+ mov x1, #1 << 12 // Reset mdscr_el1 and disable
+ msr mdscr_el1, x1 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
- reset_pmuserenr_el0 x0 // Disable PMU access from EL0
+ reset_pmuserenr_el0 x1 // Disable PMU access from EL0
+ reset_amuserenr_el0 x1 // Disable AMU access from EL0
+
/*
- * Memory region attributes for LPAE:
- *
- * n = AttrIndx[2:0]
- * n MAIR
- * DEVICE_nGnRnE 000 00000000
- * DEVICE_nGnRE 001 00000100
- * DEVICE_GRE 010 00001100
- * NORMAL_NC 011 01000100
- * NORMAL 100 11111111
- * NORMAL_WT 101 10111011
+ * Memory region attributes
*/
- ldr x5, =MAIR(0x00, MT_DEVICE_nGnRnE) | \
- MAIR(0x04, MT_DEVICE_nGnRE) | \
- MAIR(0x0c, MT_DEVICE_GRE) | \
- MAIR(0x44, MT_NORMAL_NC) | \
- MAIR(0xff, MT_NORMAL) | \
- MAIR(0xbb, MT_NORMAL_WT)
+ mov_q x5, MAIR_EL1_SET
+#ifdef CONFIG_ARM64_MTE
+ /*
+ * Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported
+ * (ID_AA64PFR1_EL1[11:8] > 1).
+ */
+ mrs x10, ID_AA64PFR1_EL1
+ ubfx x10, x10, #ID_AA64PFR1_MTE_SHIFT, #4
+ cmp x10, #ID_AA64PFR1_MTE
+ b.lt 1f
+
+ /* Normal Tagged memory type at the corresponding MAIR index */
+ mov x10, #MAIR_ATTR_NORMAL_TAGGED
+ bfi x5, x10, #(8 * MT_NORMAL_TAGGED), #8
+
+ /* initialize GCR_EL1: all non-zero tags excluded by default */
+ mov x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK)
+ msr_s SYS_GCR_EL1, x10
+
+ /*
+ * If GCR_EL1.RRND=1 is implemented the same way as RRND=0, then
+ * RGSR_EL1.SEED must be non-zero for IRG to produce
+ * pseudorandom numbers. As RGSR_EL1 is UNKNOWN out of reset, we
+ * must initialize it.
+ */
+ mrs x10, CNTVCT_EL0
+ ands x10, x10, #SYS_RGSR_EL1_SEED_MASK
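+ // If the masked counter value is zero, csinc substitutes 1 so the seed is never zero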
+ csinc x10, x10, xzr, ne
+ lsl x10, x10, #SYS_RGSR_EL1_SEED_SHIFT
+ msr_s SYS_RGSR_EL1, x10
+
+ /* clear any pending tag check faults in TFSR*_EL1 */
+ msr_s SYS_TFSR_EL1, xzr
+ msr_s SYS_TFSRE0_EL1, xzr
+1:
+#endif
msr mair_el1, x5
/*
- * Prepare SCTLR
- */
- mov_q x0, SCTLR_EL1_SET
- /*
* Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
* both user and kernel.
*/
- ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
+ mov_q x10, TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
tcr_clear_errata_bits x10, x9, x5
@@ -474,5 +498,9 @@
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr tcr_el1, x10
+ /*
+ * Prepare SCTLR
+ */
+ mov_q x0, SCTLR_EL1_SET
ret // return to head.S
-ENDPROC(__cpu_setup)
+SYM_FUNC_END(__cpu_setup)
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/ptdump.c
similarity index 74%
rename from arch/arm64/mm/dump.c
rename to arch/arm64/mm/ptdump.c
index 93f9f77..807dc63 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -15,13 +15,13 @@
#include <linux/io.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/ptdump.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/memory.h>
-#include <asm/pgtable.h>
#include <asm/pgtable-hwdef.h>
#include <asm/ptdump.h>
@@ -41,6 +41,8 @@
{ 0 /* KASAN_SHADOW_START */, "Kasan shadow start" },
{ KASAN_SHADOW_END, "Kasan shadow end" },
#endif
+ { BPF_JIT_REGION_START, "BPF start" },
+ { BPF_JIT_REGION_END, "BPF end" },
{ MODULES_VADDR, "Modules start" },
{ MODULES_END, "Modules end" },
{ VMALLOC_START, "vmalloc() area" },
@@ -75,10 +77,11 @@
* dumps out a description of the range.
*/
struct pg_state {
+ struct ptdump_state ptdump;
struct seq_file *seq;
const struct addr_marker *marker;
unsigned long start_address;
- unsigned level;
+ int level;
u64 current_prot;
bool check_wx;
unsigned long wx_pages;
@@ -142,6 +145,12 @@
.mask = PTE_UXN,
.val = PTE_UXN,
.set = "UXN",
+ .clear = " ",
+ }, {
+ .mask = PTE_GP,
+ .val = PTE_GP,
+ .set = "GP",
+ .clear = " ",
}, {
.mask = PTE_ATTRINDX_MASK,
.val = PTE_ATTRINDX(MT_DEVICE_nGnRnE),
@@ -162,6 +171,10 @@
.mask = PTE_ATTRINDX_MASK,
.val = PTE_ATTRINDX(MT_NORMAL),
.set = "MEM/NORMAL",
+ }, {
+ .mask = PTE_ATTRINDX_MASK,
+ .val = PTE_ATTRINDX(MT_NORMAL_TAGGED),
+ .set = "MEM/NORMAL-TAGGED",
}
};
@@ -173,11 +186,14 @@
};
static struct pg_level pg_level[] = {
- {
- }, { /* pgd */
+ { /* pgd */
.name = "PGD",
.bits = pte_bits,
.num = ARRAY_SIZE(pte_bits),
+ }, { /* p4d */
+ .name = "P4D",
+ .bits = pte_bits,
+ .num = ARRAY_SIZE(pte_bits),
}, { /* pud */
.name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD",
.bits = pte_bits,
@@ -240,13 +256,17 @@
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
}
-static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
- u64 val)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+ u64 val)
{
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
static const char units[] = "KMGTPE";
- u64 prot = val & pg_level[level].mask;
+ u64 prot = 0;
- if (!st->level) {
+ if (level >= 0)
+ prot = val & pg_level[level].mask;
+
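+ /* No range is being tracked yet; start one at this entry */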
+ if (st->level == -1) {
st->level = level;
st->current_prot = prot;
st->start_address = addr;
@@ -259,21 +279,22 @@
if (st->current_prot) {
note_prot_uxn(st, addr);
note_prot_wx(st, addr);
- pt_dump_seq_printf(st->seq, "0x%016lx-0x%016lx ",
+ }
+
+ pt_dump_seq_printf(st->seq, "0x%016lx-0x%016lx ",
st->start_address, addr);
- delta = (addr - st->start_address) >> 10;
- while (!(delta & 1023) && unit[1]) {
- delta >>= 10;
- unit++;
- }
- pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit,
- pg_level[st->level].name);
- if (pg_level[st->level].bits)
- dump_prot(st, pg_level[st->level].bits,
- pg_level[st->level].num);
- pt_dump_seq_puts(st->seq, "\n");
+ delta = (addr - st->start_address) >> 10;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
}
+ pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit,
+ pg_level[st->level].name);
+ if (st->current_prot && pg_level[st->level].bits)
+ dump_prot(st, pg_level[st->level].bits,
+ pg_level[st->level].num);
+ pt_dump_seq_puts(st->seq, "\n");
if (addr >= st->marker[1].start_address) {
st->marker++;
@@ -292,85 +313,27 @@
}
-static void walk_pte(struct pg_state *st, pmd_t *pmdp, unsigned long start,
- unsigned long end)
+void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
- unsigned long addr = start;
- pte_t *ptep = pte_offset_kernel(pmdp, start);
+ unsigned long end = ~0UL;
+ struct pg_state st;
- do {
- note_page(st, addr, 4, READ_ONCE(pte_val(*ptep)));
- } while (ptep++, addr += PAGE_SIZE, addr != end);
-}
+ if (info->base_addr < TASK_SIZE_64)
+ end = TASK_SIZE_64;
-static void walk_pmd(struct pg_state *st, pud_t *pudp, unsigned long start,
- unsigned long end)
-{
- unsigned long next, addr = start;
- pmd_t *pmdp = pmd_offset(pudp, start);
-
- do {
- pmd_t pmd = READ_ONCE(*pmdp);
- next = pmd_addr_end(addr, end);
-
- if (pmd_none(pmd) || pmd_sect(pmd)) {
- note_page(st, addr, 3, pmd_val(pmd));
- } else {
- BUG_ON(pmd_bad(pmd));
- walk_pte(st, pmdp, addr, next);
- }
- } while (pmdp++, addr = next, addr != end);
-}
-
-static void walk_pud(struct pg_state *st, pgd_t *pgdp, unsigned long start,
- unsigned long end)
-{
- unsigned long next, addr = start;
- pud_t *pudp = pud_offset(pgdp, start);
-
- do {
- pud_t pud = READ_ONCE(*pudp);
- next = pud_addr_end(addr, end);
-
- if (pud_none(pud) || pud_sect(pud)) {
- note_page(st, addr, 2, pud_val(pud));
- } else {
- BUG_ON(pud_bad(pud));
- walk_pmd(st, pudp, addr, next);
- }
- } while (pudp++, addr = next, addr != end);
-}
-
-static void walk_pgd(struct pg_state *st, struct mm_struct *mm,
- unsigned long start)
-{
- unsigned long end = (start < TASK_SIZE_64) ? TASK_SIZE_64 : 0;
- unsigned long next, addr = start;
- pgd_t *pgdp = pgd_offset(mm, start);
-
- do {
- pgd_t pgd = READ_ONCE(*pgdp);
- next = pgd_addr_end(addr, end);
-
- if (pgd_none(pgd)) {
- note_page(st, addr, 1, pgd_val(pgd));
- } else {
- BUG_ON(pgd_bad(pgd));
- walk_pud(st, pgdp, addr, next);
- }
- } while (pgdp++, addr = next, addr != end);
-}
-
-void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info)
-{
- struct pg_state st = {
- .seq = m,
+ st = (struct pg_state){
+ .seq = s,
.marker = info->markers,
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]){
+ {info->base_addr, end},
+ {0, 0}
+ }
+ }
};
- walk_pgd(&st, info->mm, info->base_addr);
-
- note_page(&st, 0, 0, 0);
+ ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
}
static void ptdump_initialize(void)
@@ -397,11 +360,19 @@
{ 0, NULL},
{ -1, NULL},
},
+ .level = -1,
.check_wx = true,
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {PAGE_OFFSET, ~0UL},
+ {0, 0}
+ }
+ }
};
- walk_pgd(&st, &init_mm, PAGE_OFFSET);
- note_page(&st, 0, 0, 0);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+
if (st.wx_pages || st.uxn_pages)
pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",
st.wx_pages, st.uxn_pages);
diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c
index 064163f..d29d722 100644
--- a/arch/arm64/mm/ptdump_debugfs.c
+++ b/arch/arm64/mm/ptdump_debugfs.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/debugfs.h>
+#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <asm/ptdump.h>
@@ -7,7 +8,10 @@
static int ptdump_show(struct seq_file *m, void *v)
{
struct ptdump_info *info = m->private;
- ptdump_walk_pgd(m, info);
+
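+ /* Hold the memory hotplug lock so page tables are not freed while being walked */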
+ get_online_mems();
+ ptdump_walk(m, info);
+ put_online_mems();
return 0;
}
DEFINE_SHOW_ATTRIBUTE(ptdump);