Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 3175413..cd67e94 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -8,7 +8,7 @@
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index a51c892..1141c8d 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -19,10 +19,8 @@
#include <linux/swap.h>
#include <linux/kthread.h>
#include <linux/oom.h>
-#include <linux/suspend.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/diag.h>
#ifdef CONFIG_CMM_IUCV
@@ -49,7 +47,6 @@
static volatile long cmm_timed_pages_target;
static long cmm_timeout_pages;
static long cmm_timeout_seconds;
-static int cmm_suspended;
static struct cmm_page_array *cmm_page_list;
static struct cmm_page_array *cmm_timed_page_list;
@@ -151,9 +148,9 @@
while (1) {
rc = wait_event_interruptible(cmm_thread_wait,
- (!cmm_suspended && (cmm_pages != cmm_pages_target ||
- cmm_timed_pages != cmm_timed_pages_target)) ||
- kthread_should_stop());
+ cmm_pages != cmm_pages_target ||
+ cmm_timed_pages != cmm_timed_pages_target ||
+ kthread_should_stop());
if (kthread_should_stop() || rc == -ERESTARTSYS) {
cmm_pages_target = cmm_pages;
cmm_timed_pages_target = cmm_timed_pages;
@@ -191,7 +188,7 @@
del_timer(&cmm_timer);
return;
}
- mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ);
+ mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
}
static void cmm_timer_fn(struct timer_list *unused)
@@ -247,7 +244,7 @@
}
static int cmm_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long nr = cmm_get_pages();
struct ctl_table ctl_entry = {
@@ -266,7 +263,7 @@
}
static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
long nr = cmm_get_timed_pages();
@@ -286,7 +283,7 @@
}
static int cmm_timeout_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char buf[64], *p;
long nr, seconds;
@@ -299,8 +296,7 @@
if (write) {
len = min(*lenp, sizeof(buf));
- if (copy_from_user(buf, buffer, len))
- return -EFAULT;
+ memcpy(buf, buffer, len);
buf[len - 1] = '\0';
cmm_skip_blanks(buf, &p);
nr = simple_strtoul(p, &p, 0);
@@ -313,8 +309,7 @@
cmm_timeout_pages, cmm_timeout_seconds);
if (len > *lenp)
len = *lenp;
- if (copy_to_user(buffer, buf, len))
- return -EFAULT;
+ memcpy(buffer, buf, len);
*lenp = len;
*ppos += len;
}
@@ -390,38 +385,6 @@
static struct ctl_table_header *cmm_sysctl_header;
-static int cmm_suspend(void)
-{
- cmm_suspended = 1;
- cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
- cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
- return 0;
-}
-
-static int cmm_resume(void)
-{
- cmm_suspended = 0;
- cmm_kick_thread();
- return 0;
-}
-
-static int cmm_power_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- switch (event) {
- case PM_POST_HIBERNATION:
- return cmm_resume();
- case PM_HIBERNATION_PREPARE:
- return cmm_suspend();
- default:
- return NOTIFY_DONE;
- }
-}
-
-static struct notifier_block cmm_power_notifier = {
- .notifier_call = cmm_power_event,
-};
-
static int __init cmm_init(void)
{
int rc = -ENOMEM;
@@ -446,16 +409,11 @@
rc = register_oom_notifier(&cmm_oom_nb);
if (rc < 0)
goto out_oom_notify;
- rc = register_pm_notifier(&cmm_power_notifier);
- if (rc)
- goto out_pm;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (!IS_ERR(cmm_thread_ptr))
return 0;
rc = PTR_ERR(cmm_thread_ptr);
- unregister_pm_notifier(&cmm_power_notifier);
-out_pm:
unregister_oom_notifier(&cmm_oom_nb);
out_oom_notify:
#ifdef CONFIG_CMM_IUCV
@@ -475,7 +433,6 @@
#ifdef CONFIG_CMM_IUCV
smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
#endif
- unregister_pm_notifier(&cmm_power_notifier);
unregister_oom_notifier(&cmm_oom_nb);
kthread_stop(cmm_thread_ptr);
del_timer_sync(&cmm_timer);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 5d67b81..8f9ff7e 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -1,12 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/set_memory.h>
+#include <linux/ptdump.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
-#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kasan.h>
+#include <asm/ptdump.h>
#include <asm/kasan.h>
#include <asm/sections.h>
-#include <asm/pgtable.h>
static unsigned long max_addr;
@@ -16,263 +17,234 @@
};
enum address_markers_idx {
- IDENTITY_NR = 0,
+ IDENTITY_BEFORE_NR = 0,
+ IDENTITY_BEFORE_END_NR,
KERNEL_START_NR,
KERNEL_END_NR,
+ IDENTITY_AFTER_NR,
+ IDENTITY_AFTER_END_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
VMEMMAP_NR,
+ VMEMMAP_END_NR,
VMALLOC_NR,
+ VMALLOC_END_NR,
MODULES_NR,
+ MODULES_END_NR,
};
static struct addr_marker address_markers[] = {
- [IDENTITY_NR] = {0, "Identity Mapping"},
+ [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"},
+ [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"},
[KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
[KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
+ [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"},
+ [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"},
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"},
[KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"},
#endif
- [VMEMMAP_NR] = {0, "vmemmap Area"},
- [VMALLOC_NR] = {0, "vmalloc Area"},
- [MODULES_NR] = {0, "Modules Area"},
+ [VMEMMAP_NR] = {0, "vmemmap Area Start"},
+ [VMEMMAP_END_NR] = {0, "vmemmap Area End"},
+ [VMALLOC_NR] = {0, "vmalloc Area Start"},
+ [VMALLOC_END_NR] = {0, "vmalloc Area End"},
+ [MODULES_NR] = {0, "Modules Area Start"},
+ [MODULES_END_NR] = {0, "Modules Area End"},
{ -1, NULL }
};
struct pg_state {
+ struct ptdump_state ptdump;
+ struct seq_file *seq;
int level;
unsigned int current_prot;
+ bool check_wx;
+ unsigned long wx_pages;
unsigned long start_address;
- unsigned long current_address;
const struct addr_marker *marker;
};
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt, ##args); \
+})
+
+#define pt_dump_seq_puts(m, fmt) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt); \
+})
+
static void print_prot(struct seq_file *m, unsigned int pr, int level)
{
static const char * const level_name[] =
{ "ASCE", "PGD", "PUD", "PMD", "PTE" };
- seq_printf(m, "%s ", level_name[level]);
+ pt_dump_seq_printf(m, "%s ", level_name[level]);
if (pr & _PAGE_INVALID) {
- seq_printf(m, "I\n");
+ pt_dump_seq_printf(m, "I\n");
return;
}
- seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
- seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
+ pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
+ pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
}
-static void note_page(struct seq_file *m, struct pg_state *st,
- unsigned int new_prot, int level)
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
{
- static const char units[] = "KMGTPE";
+#ifdef CONFIG_DEBUG_WX
+ if (!st->check_wx)
+ return;
+ if (st->current_prot & _PAGE_INVALID)
+ return;
+ if (st->current_prot & _PAGE_PROTECT)
+ return;
+ if (st->current_prot & _PAGE_NOEXEC)
+ return;
+ /* The first lowcore page is currently still W+X. */
+ if (addr == PAGE_SIZE)
+ return;
+ WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+#endif /* CONFIG_DEBUG_WX */
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
+{
int width = sizeof(unsigned long) * 2;
+ static const char units[] = "KMGTPE";
const char *unit = units;
- unsigned int prot, cur;
unsigned long delta;
+ struct pg_state *st;
+ struct seq_file *m;
+ unsigned int prot;
- /*
- * If we have a "break" in the series, we need to flush the state
- * that we have now. "break" is either changing perms, levels or
- * address space marker.
- */
- prot = new_prot;
- cur = st->current_prot;
-
- if (!st->level) {
- /* First entry */
- st->current_prot = new_prot;
+ st = container_of(pt_st, struct pg_state, ptdump);
+ m = st->seq;
+ prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC);
+ if (level == 4 && (val & _PAGE_INVALID))
+ prot = _PAGE_INVALID;
+ /* For pmd_none() & friends val gets passed as zero. */
+ if (level != 4 && !val)
+ prot = _PAGE_INVALID;
+ /* Final flush from generic code. */
+ if (level == -1)
+ addr = max_addr;
+ if (st->level == -1) {
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
- st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
- } else if (prot != cur || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
- /* Print the actual finished series */
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
- delta = (st->current_address - st->start_address) >> 10;
+ } else if (prot != st->current_prot || level != st->level ||
+ addr >= st->marker[1].start_address) {
+ note_prot_wx(st, addr);
+ pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, addr);
+ delta = (addr - st->start_address) >> 10;
while (!(delta & 0x3ff) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(m, "%9lu%c ", delta, *unit);
+ pt_dump_seq_printf(m, "%9lu%c ", delta, *unit);
print_prot(m, st->current_prot, st->level);
- while (st->current_address >= st->marker[1].start_address) {
+ while (addr >= st->marker[1].start_address) {
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
- st->start_address = st->current_address;
- st->current_prot = new_prot;
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
}
}
-#ifdef CONFIG_KASAN
-static void note_kasan_early_shadow_page(struct seq_file *m,
- struct pg_state *st)
+#ifdef CONFIG_DEBUG_WX
+void ptdump_check_wx(void)
{
- unsigned int prot;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = NULL,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = true,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = (struct addr_marker[]) {
+ { .start_address = 0, .name = NULL},
+ { .start_address = -1, .name = NULL},
+ },
+ };
- prot = pte_val(*kasan_early_shadow_pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
-}
-#endif
-
-/*
- * The actual page table walker functions. In order to keep the
- * implementation of print_prot() short, we only check and pass
- * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region,
- * segment or page table entry is invalid or read-only.
- * After all it's just a hint that the current level being walked
- * contains an invalid or read-only entry.
- */
-static void walk_pte_level(struct seq_file *m, struct pg_state *st,
- pmd_t *pmd, unsigned long addr)
-{
- unsigned int prot;
- pte_t *pte;
- int i;
-
- for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) {
- st->current_address = addr;
- pte = pte_offset_kernel(pmd, addr);
- prot = pte_val(*pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
- addr += PAGE_SIZE;
- }
-}
-
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
- pud_t *pud, unsigned long addr)
-{
- unsigned int prot;
- pmd_t *pmd;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) {
- note_kasan_early_shadow_page(m, st);
+ if (!MACHINE_HAS_NX)
return;
- }
-#endif
-
- pmd = pmd_offset(pud, addr);
- for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) {
- st->current_address = addr;
- if (!pmd_none(*pmd)) {
- if (pmd_large(*pmd)) {
- prot = pmd_val(*pmd) &
- (_SEGMENT_ENTRY_PROTECT |
- _SEGMENT_ENTRY_NOEXEC);
- note_page(m, st, prot, 3);
- } else
- walk_pte_level(m, st, pmd, addr);
- } else
- note_page(m, st, _PAGE_INVALID, 3);
- addr += PMD_SIZE;
- }
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ if (st.wx_pages)
+ pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
+ else
+ pr_info("Checked W+X mappings: passed, no unexpected W+X pages found\n");
}
+#endif /* CONFIG_DEBUG_WX */
-static void walk_pud_level(struct seq_file *m, struct pg_state *st,
- p4d_t *p4d, unsigned long addr)
-{
- unsigned int prot;
- pud_t *pud;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- pud = pud_offset(p4d, addr);
- for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) {
- st->current_address = addr;
- if (!pud_none(*pud))
- if (pud_large(*pud)) {
- prot = pud_val(*pud) &
- (_REGION_ENTRY_PROTECT |
- _REGION_ENTRY_NOEXEC);
- note_page(m, st, prot, 2);
- } else
- walk_pmd_level(m, st, pud, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += PUD_SIZE;
- }
-}
-
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st,
- pgd_t *pgd, unsigned long addr)
-{
- p4d_t *p4d;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- p4d = p4d_offset(pgd, addr);
- for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) {
- st->current_address = addr;
- if (!p4d_none(*p4d))
- walk_pud_level(m, st, p4d, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += P4D_SIZE;
- }
-}
-
-static void walk_pgd_level(struct seq_file *m)
-{
- unsigned long addr = 0;
- struct pg_state st;
- pgd_t *pgd;
- int i;
-
- memset(&st, 0, sizeof(st));
- for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) {
- st.current_address = addr;
- pgd = pgd_offset_k(addr);
- if (!pgd_none(*pgd))
- walk_p4d_level(m, &st, pgd, addr);
- else
- note_page(m, &st, _PAGE_INVALID, 1);
- addr += PGDIR_SIZE;
- cond_resched();
- }
- /* Flush out the last page */
- st.current_address = max_addr;
- note_page(m, &st, 0, 0);
-}
-
+#ifdef CONFIG_PTDUMP_DEBUGFS
static int ptdump_show(struct seq_file *m, void *v)
{
- walk_pgd_level(m);
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = m,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = false,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = address_markers,
+ };
+
+ get_online_mems();
+ mutex_lock(&cpa_mutex);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ mutex_unlock(&cpa_mutex);
+ put_online_mems();
return 0;
}
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
-static int ptdump_open(struct inode *inode, struct file *filp)
+/*
+ * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple
+ * insertion sort to preserve the original order of markers with the same
+ * start address.
+ */
+static void sort_address_markers(void)
{
- return single_open(filp, ptdump_show, NULL);
-}
+ struct addr_marker tmp;
+ int i, j;
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+ for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) {
+ tmp = address_markers[i];
+ for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--)
+ address_markers[j + 1] = address_markers[j];
+ address_markers[j + 1] = tmp;
+ }
+}
static int pt_dump_init(void)
{
@@ -283,10 +255,17 @@
*/
max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
max_addr = 1UL << (max_addr * 11 + 31);
+ address_markers[IDENTITY_AFTER_END_NR].start_address = memory_end;
address_markers[MODULES_NR].start_address = MODULES_VADDR;
+ address_markers[MODULES_END_NR].start_address = MODULES_END;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+ address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+ sort_address_markers();
+#ifdef CONFIG_PTDUMP_DEBUGFS
debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
return 0;
}
device_initcall(pt_dump_init);
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index fd0dae9..5060956 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -20,9 +20,9 @@
#include <linux/ctype.h>
#include <linux/ioport.h>
#include <linux/refcount.h>
+#include <linux/pgtable.h>
#include <asm/diag.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/ebcdic.h>
#include <asm/errno.h>
#include <asm/extmem.h>
@@ -313,15 +313,10 @@
goto out_free;
}
- rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
-
- if (rc)
- goto out_free;
-
seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (seg->res == NULL) {
rc = -ENOMEM;
- goto out_shared;
+ goto out_free;
}
seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
seg->res->start = seg->start_addr;
@@ -335,12 +330,17 @@
if (rc == SEG_TYPE_SC ||
((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared))
seg->res->flags |= IORESOURCE_READONLY;
+
+ /* Check for overlapping resources before adding the mapping. */
if (request_resource(&iomem_resource, seg->res)) {
rc = -EBUSY;
- kfree(seg->res);
- goto out_shared;
+ goto out_free_resource;
}
+ rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ if (rc)
+ goto out_resource;
+
if (do_nonshared)
diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
&start_addr, &end_addr);
@@ -351,14 +351,14 @@
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
rc = diag_cc;
- goto out_resource;
+ goto out_mapping;
}
if (diag_cc > 1) {
pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
rc = dcss_diag_translate_rc(end_addr);
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
- goto out_resource;
+ goto out_mapping;
}
seg->start_addr = start_addr;
seg->end = end_addr;
@@ -377,11 +377,12 @@
(void*) seg->end, segtype_string[seg->vm_segtype]);
}
goto out;
+ out_mapping:
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_resource:
release_resource(seg->res);
+ out_free_resource:
kfree(seg->res);
- out_shared:
- vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_free:
kfree(seg);
out:
@@ -400,8 +401,7 @@
* -EIO : could not perform query or load diagnose
* -ENOENT : no such segment
* -EOPNOTSUPP: multi-part segment cannot be used with linux
- * -ENOSPC : segment cannot be used (overlaps with storage)
- * -EBUSY : segment can temporarily not be used (overlaps with dcss)
+ * -EBUSY : segment cannot be used (overlaps with dcss or storage)
* -ERANGE : segment cannot be used (exceeds kernel mapping range)
* -EPERM : segment is currently loaded with incompatible permissions
* -ENOMEM : out of memory
@@ -626,10 +626,6 @@
pr_err("DCSS %s has multiple page ranges and cannot be "
"loaded or queried\n", seg_name);
break;
- case -ENOSPC:
- pr_err("DCSS %s overlaps with used storage and cannot "
- "be loaded\n", seg_name);
- break;
case -EBUSY:
pr_err("%s needs used memory resources and cannot be "
"loaded or queried\n", seg_name);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7b0bb47..ed517fa 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -33,22 +33,22 @@
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
+#include <asm/uv.h>
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
-#define VM_FAULT_BADCONTEXT 0x010000
-#define VM_FAULT_BADMAP 0x020000
-#define VM_FAULT_BADACCESS 0x040000
-#define VM_FAULT_SIGNAL 0x080000
-#define VM_FAULT_PFAULT 0x100000
+#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000)
enum fault_type {
KERNEL_FAULT,
@@ -105,7 +105,7 @@
{
unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long asce, unsigned long address)
@@ -122,7 +122,7 @@
if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
if (bad_address(table))
@@ -131,7 +131,7 @@
if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
if (bad_address(table))
@@ -140,7 +140,7 @@
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (bad_address(table))
@@ -255,10 +255,8 @@
/* Are we prepared to handle this kernel fault? */
fixup = s390_search_extables(regs->psw.addr);
- if (fixup) {
- regs->psw.addr = extable_fixup(fixup);
+ if (fixup && ex_handle(fixup, regs))
return;
- }
/*
* Oops. The kernel tried to access some bad page. We'll have to
@@ -327,7 +325,7 @@
case VM_FAULT_BADACCESS:
if (access == VM_EXEC && signal_return(regs) == 0)
break;
- /* fallthrough */
+ fallthrough;
case VM_FAULT_BADMAP:
/* Bad memory access. Check if it is kernel or user space. */
if (user_mode(regs)) {
@@ -337,9 +335,8 @@
do_sigsegv(regs, si_code);
break;
}
- /* fallthrough */
+ fallthrough;
case VM_FAULT_BADCONTEXT:
- /* fallthrough */
case VM_FAULT_PFAULT:
do_no_context(regs);
break;
@@ -377,7 +374,7 @@
* routines.
*
* interruption code (int_code):
- * 04 Protection -> Write-Protection (suprression)
+ * 04 Protection -> Write-Protection (suppression)
* 10 Segment translation -> Not present (nullification)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
@@ -429,12 +426,12 @@
address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
flags |= FAULT_FLAG_WRITE;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
@@ -479,9 +476,8 @@
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
- /* No reason to continue if interrupted by SIGKILL. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ fault = handle_mm_fault(vma, address, flags, regs);
+ if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
goto out_up;
@@ -490,36 +486,19 @@
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
if (fault & VM_FAULT_RETRY) {
if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
(flags & FAULT_FLAG_RETRY_NOWAIT)) {
/* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_sem has not been released */
+ * mmap_lock has not been released */
current->thread.gmap_pfault = 1;
fault = VM_FAULT_PFAULT;
goto out_up;
}
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~(FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT);
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
goto retry;
}
}
@@ -537,7 +516,7 @@
}
fault = 0;
out_up:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return fault;
}
@@ -584,7 +563,7 @@
int access;
vm_fault_t fault;
- access = VM_READ | VM_EXEC | VM_WRITE;
+ access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
do_fault_error(regs, access, fault);
@@ -816,3 +795,124 @@
early_initcall(pfault_irq_init);
#endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct page *page;
+ int rc;
+
+ /*
+ * bit 61 tells us if the address is valid, if it's not we
+ * have a major problem and should stop the kernel or send a
+ * SIGSEGV to the process. Unfortunately bit 61 is not
+ * reliable without the misc UV feature so we need to check
+ * for that as well.
+ */
+ if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+ !test_bit_inv(61, ®s->int_parm_long)) {
+ /*
+ * When this happens, userspace did something that it
+ * was not supposed to do, e.g. branching into secure
+ * memory. Trigger a segmentation fault.
+ */
+ if (user_mode(regs)) {
+ send_sig(SIGSEGV, current, 0);
+ return;
+ }
+
+ /*
+ * The kernel should never run into this case and we
+ * have no way out of this situation.
+ */
+ panic("Unexpected PGM 0x3d with TEID bit 61=0");
+ }
+
+ switch (get_fault_type(regs)) {
+ case USER_FAULT:
+ mm = current->mm;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ break;
+ }
+ page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page)) {
+ mmap_read_unlock(mm);
+ break;
+ }
+ if (arch_make_page_accessible(page))
+ send_sig(SIGSEGV, current, 0);
+ put_page(page);
+ mmap_read_unlock(mm);
+ break;
+ case KERNEL_FAULT:
+ page = phys_to_page(addr);
+ if (unlikely(!try_get_page(page)))
+ break;
+ rc = arch_make_page_accessible(page);
+ put_page(page);
+ if (rc)
+ BUG();
+ break;
+ case VDSO_FAULT:
+ case GMAP_FAULT:
+ default:
+ do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ }
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ if (get_fault_type(regs) != GMAP_FAULT) {
+ do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+ /*
+ * Either KVM messed up the secure guest mapping or the same
+ * page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is running
+ * and can therefore never occur in kernel context.
+ */
+ printk_ratelimited(KERN_WARNING
+ "Secure storage violation in task: %s, pid %d\n",
+ current->comm, current->pid);
+ send_sig(SIGSEGV, current, 0);
+}
+
+#else
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ default_trap_handler(regs);
+}
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+ default_trap_handler(regs);
+}
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+ default_trap_handler(regs);
+}
+#endif
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 4fa7a56..f2d19d4 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -17,8 +17,8 @@
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
@@ -300,7 +300,7 @@
EXPORT_SYMBOL_GPL(gmap_get_enabled);
/*
- * gmap_alloc_table is assumed to be called with mmap_sem held
+ * gmap_alloc_table is assumed to be called with mmap_lock held
*/
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long init, unsigned long gaddr)
@@ -405,10 +405,10 @@
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE)
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
return 0;
@@ -438,7 +438,7 @@
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE) {
/* Remove old translation */
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
@@ -448,7 +448,7 @@
(void *) from + off))
break;
}
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
if (off >= len)
@@ -466,7 +466,7 @@
* Returns user space address which corresponds to the guest address or
* -EFAULT if no such mapping exists.
* This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*
* Note: Can also be called for shadow gmaps.
@@ -495,9 +495,9 @@
{
unsigned long rc;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = __gmap_translate(gmap, gaddr);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
@@ -534,7 +534,7 @@
*
* Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
* if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*/
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
@@ -640,7 +640,7 @@
int rc;
bool unlocked;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
retry:
unlocked = false;
@@ -649,13 +649,13 @@
rc = vmaddr;
goto out_up;
}
- if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+ if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
&unlocked)) {
rc = -EFAULT;
goto out_up;
}
/*
- * In the case that fixup_user_fault unlocked the mmap_sem during
+ * In the case that fixup_user_fault unlocked the mmap_lock during
* faultin redo __gmap_translate to not race with a map/unmap_segment.
*/
if (unlocked)
@@ -663,13 +663,13 @@
rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
/*
- * this function is assumed to be called with mmap_sem held
+ * this function is assumed to be called with mmap_lock held
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
@@ -684,9 +684,10 @@
vmaddr |= gaddr & ~PMD_MASK;
/* Get pointer to the page table entry */
ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep))
+ if (likely(ptep)) {
ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(ptep, ptl);
+ }
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
@@ -696,7 +697,7 @@
unsigned long gaddr, vmaddr, size;
struct vm_area_struct *vma;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
for (gaddr = from; gaddr < to;
gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
/* Find the vm address for the guest address */
@@ -719,7 +720,7 @@
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
zap_page_range(vma, vmaddr, size);
}
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);
@@ -788,19 +789,19 @@
unsigned long gaddr, int level)
{
const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
- unsigned long *table;
+ unsigned long *table = gmap->table;
- if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
- return NULL;
if (gmap_is_shadow(gmap) && gmap->removed)
return NULL;
+ if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
+ return NULL;
+
if (asce_type != _ASCE_TYPE_REGION1 &&
gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
return NULL;
- table = gmap->table;
- switch (gmap->asce & _ASCE_TYPE_MASK) {
+ switch (asce_type) {
case _ASCE_TYPE_REGION1:
table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
if (level == 4)
@@ -808,7 +809,7 @@
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
if (level == 3)
@@ -816,7 +817,7 @@
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
if (level == 2)
@@ -824,7 +825,7 @@
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (level == 1)
@@ -879,10 +880,10 @@
BUG_ON(gmap_is_shadow(gmap));
fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
- if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
return -EFAULT;
if (unlocked)
- /* lost mmap_sem, caller has to retry __gmap_translate */
+ /* lost mmap_lock, caller has to retry __gmap_translate */
return 0;
/* Connect the page tables */
return __gmap_link(gmap, gaddr, vmaddr);
@@ -953,7 +954,7 @@
* -EAGAIN if a fixup is needed
* -EINVAL if unsupported notifier bits have been specified
*
- * Expected to be called with sg->mm->mmap_sem in read and
+ * Expected to be called with sg->mm->mmap_lock in read and
* guest_table_lock held.
*/
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
@@ -999,7 +1000,7 @@
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EAGAIN if a fixup is needed.
*
- * Expected to be called with sg->mm->mmap_sem in read
+ * Expected to be called with sg->mm->mmap_lock in read
*/
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
pmd_t *pmdp, int prot, unsigned long bits)
@@ -1035,7 +1036,7 @@
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EFAULT if gaddr is invalid (or mapping for shadows is missing).
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
unsigned long len, int prot, unsigned long bits)
@@ -1106,9 +1107,9 @@
return -EINVAL;
if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
@@ -1124,7 +1125,7 @@
* if reading using the virtual address failed. -EINVAL if called on a gmap
* shadow.
*
- * Called with gmap->mm->mmap_sem in read.
+ * Called with gmap->mm->mmap_lock in read.
*/
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
@@ -1696,11 +1697,11 @@
}
spin_unlock(&parent->shadow_lock);
/* protect after insertion, so it will get properly invalidated */
- down_read(&parent->mm->mmap_sem);
+ mmap_read_lock(parent->mm);
rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
PROT_READ, GMAP_NOTIFY_SHADOW);
- up_read(&parent->mm->mmap_sem);
+ mmap_read_unlock(parent->mm);
spin_lock(&parent->shadow_lock);
new->initialized = true;
if (rc) {
@@ -1729,7 +1730,7 @@
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
int fake)
@@ -1813,7 +1814,7 @@
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
int fake)
@@ -1897,7 +1898,7 @@
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
int fake)
@@ -1981,7 +1982,7 @@
* Returns 0 if the shadow page table was found and -EAGAIN if the page
* table was not found.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
unsigned long *pgt, int *dat_protection,
@@ -2021,7 +2022,7 @@
* shadow table structure is incomplete, -ENOMEM if out of memory,
* -EFAULT if an address in the parent gmap could not be resolved and
*
- * Called with gmap->mm->mmap_sem in read
+ * Called with gmap->mm->mmap_lock in read
*/
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
int fake)
@@ -2100,7 +2101,7 @@
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
@@ -2556,16 +2557,33 @@
/* Fail if the page tables are 2K */
if (!mm_alloc_pgste(mm))
return -EINVAL;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
+int gmap_mark_unmergeable(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int ret;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ MADV_UNMERGEABLE, &vma->vm_flags);
+ if (ret)
+ return ret;
+ }
+ mm->def_flags &= ~VM_MERGEABLE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2611,28 +2629,22 @@
int s390_enable_skey(void)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
int rc = 0;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
if (mm_uses_skeys(mm))
goto out_up;
mm->context.uses_skeys = 1;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags)) {
- mm->context.uses_skeys = 0;
- rc = -ENOMEM;
- goto out_up;
- }
+ rc = gmap_mark_unmergeable();
+ if (rc) {
+ mm->context.uses_skeys = 0;
+ goto out_up;
}
- mm->def_flags &= ~VM_MERGEABLE;
-
walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
out_up:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);
@@ -2653,8 +2665,45 @@
void s390_reset_cmma(struct mm_struct *mm)
{
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
+
+/*
+ * make inaccessible pages accessible again
+ */
+static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte))
+ WARN_ON_ONCE(uv_destroy_page(pte_val(pte) & PAGE_MASK));
+ return 0;
+}
+
+static const struct mm_walk_ops reset_acc_walk_ops = {
+ .pte_entry = __s390_reset_acc,
+};
+
+#include <linux/sched/mm.h>
+void s390_reset_acc(struct mm_struct *mm)
+{
+ if (!mm_is_protected(mm))
+ return;
+ /*
+ * we might be called during
+ * reset: we walk the pages and clear
+ * close of all kvm file descriptors: we walk the pages and clear
+ * exit of process on fd closure: vma already gone, do nothing
+ */
+ if (!mmget_not_zero(mm))
+ return;
+ mmap_read_lock(mm);
+ walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
+ mmap_read_unlock(mm);
+ mmput(mm);
+}
+EXPORT_SYMBOL_GPL(s390_reset_acc);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index ff8234b..3b5a4d2 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -254,25 +254,15 @@
return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long size;
- char *string = opt;
-
- size = memparse(opt, &opt);
- if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- pr_err("hugepagesz= specifies an unsupported page size %s\n",
- string);
- return 0;
- }
- return 1;
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
+ return true;
+ else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
@@ -329,7 +319,6 @@
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- int rc;
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -356,15 +345,9 @@
else
addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
- return addr;
+ return check_asce_limit(mm, addr, len);
}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 5521f59..9d5960b 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -33,8 +33,8 @@
#include <linux/dma-direct.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/ptdump.h>
#include <asm/dma.h>
#include <asm/lowcore.h>
#include <asm/tlb.h>
@@ -46,8 +46,9 @@
#include <asm/kasan.h>
#include <asm/dma-mapping.h>
#include <asm/uv.h>
+#include <linux/virtio_config.h>
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
@@ -116,12 +117,12 @@
__load_psw_mask(psw.mask);
kasan_free_early_identity();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
+ zone_dma_bits = 31;
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
void mark_rodata_ro(void)
@@ -130,6 +131,7 @@
set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
+ debug_checkwx();
}
int set_memory_encrypted(unsigned long addr, int numpages)
@@ -161,6 +163,16 @@
return is_prot_virt_guest();
}
+#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+
+int arch_has_restricted_virtio_memory_access(void)
+{
+ return is_prot_virt_guest();
+}
+EXPORT_SYMBOL(arch_has_restricted_virtio_memory_access);
+
+#endif
+
/* protected virtualization */
static void pv_init(void)
{
@@ -267,20 +279,23 @@
#endif /* CONFIG_CMA */
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(restrictions->altmap))
+ if (WARN_ON_ONCE(params->altmap))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+ rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c
index 5182e08..e9a9b7b 100644
--- a/arch/s390/mm/kasan_init.c
+++ b/arch/s390/mm/kasan_init.c
@@ -2,8 +2,8 @@
#include <linux/kasan.h>
#include <linux/sched/task.h>
#include <linux/memblock.h>
+#include <linux/pgtable.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/kasan.h>
#include <asm/mem_detect.h>
#include <asm/processor.h>
@@ -11,7 +11,9 @@
#include <asm/facility.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/uv.h>
+unsigned long kasan_vmax;
static unsigned long segment_pos __initdata;
static unsigned long segment_low __initdata;
static unsigned long pgalloc_pos __initdata;
@@ -82,7 +84,8 @@
enum populate_mode {
POPULATE_ONE2ONE,
POPULATE_MAP,
- POPULATE_ZERO_SHADOW
+ POPULATE_ZERO_SHADOW,
+ POPULATE_SHALLOW
};
static void __init kasan_early_vmemmap_populate(unsigned long address,
unsigned long end,
@@ -98,8 +101,12 @@
pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO);
if (!has_nx)
pgt_prot_zero &= ~_PAGE_NOEXEC;
- pgt_prot = pgprot_val(PAGE_KERNEL_EXEC);
- sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC);
+ pgt_prot = pgprot_val(PAGE_KERNEL);
+ sgt_prot = pgprot_val(SEGMENT_KERNEL);
+ if (!has_nx || mode == POPULATE_ONE2ONE) {
+ pgt_prot &= ~_PAGE_NOEXEC;
+ sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
+ }
/*
* The first 1MB of 1:1 mapping is mapped with 4KB pages
@@ -119,6 +126,12 @@
pgd_populate(&init_mm, pg_dir, p4_dir);
}
+ if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) &&
+ mode == POPULATE_SHALLOW) {
+ address = (address + P4D_SIZE) & P4D_MASK;
+ continue;
+ }
+
p4_dir = p4d_offset(pg_dir, address);
if (p4d_none(*p4_dir)) {
if (mode == POPULATE_ZERO_SHADOW &&
@@ -133,6 +146,12 @@
p4d_populate(&init_mm, p4_dir, pu_dir);
}
+ if (!IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) &&
+ mode == POPULATE_SHALLOW) {
+ address = (address + PUD_SIZE) & PUD_MASK;
+ continue;
+ }
+
pu_dir = pud_offset(p4_dir, address);
if (pud_none(*pu_dir)) {
if (mode == POPULATE_ZERO_SHADOW &&
@@ -194,6 +213,9 @@
page = kasan_early_shadow_page;
pte_val(*pt_dir) = __pa(page) | pgt_prot_zero;
break;
+ case POPULATE_SHALLOW:
+ /* should never happen */
+ break;
}
}
address += PAGE_SIZE;
@@ -235,14 +257,31 @@
}
}
+static bool __init has_uv_sec_stor_limit(void)
+{
+ /*
+ * keep these conditions in line with setup_uv()
+ */
+ if (!is_prot_virt_host())
+ return false;
+
+ if (is_prot_virt_guest())
+ return false;
+
+ if (!test_facility(158))
+ return false;
+
+ return !!uv_info.max_sec_stor_addr;
+}
+
void __init kasan_early_init(void)
{
unsigned long untracked_mem_end;
unsigned long shadow_alloc_size;
+ unsigned long vmax_unlimited;
unsigned long initrd_end;
unsigned long asce_type;
unsigned long memsize;
- unsigned long vmax;
unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO);
pte_t pte_z;
pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
@@ -270,7 +309,9 @@
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
crst_table_init((unsigned long *)early_pg_dir,
_REGION2_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION1_SIZE;
+ untracked_mem_end = kasan_vmax = vmax_unlimited = _REGION1_SIZE;
+ if (has_uv_sec_stor_limit())
+ kasan_vmax = min(vmax_unlimited, uv_info.max_sec_stor_addr);
asce_type = _ASCE_TYPE_REGION2;
} else {
/* 3 level paging */
@@ -278,7 +319,7 @@
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE));
crst_table_init((unsigned long *)early_pg_dir,
_REGION3_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION2_SIZE;
+ untracked_mem_end = kasan_vmax = vmax_unlimited = _REGION2_SIZE;
asce_type = _ASCE_TYPE_REGION3;
}
@@ -312,25 +353,56 @@
init_mm.pgd = early_pg_dir;
/*
* Current memory layout:
- * +- 0 -------------+ +- shadow start -+
- * | 1:1 ram mapping | /| 1/8 ram |
- * +- end of ram ----+ / +----------------+
- * | ... gap ... |/ | kasan |
- * +- shadow start --+ | zero |
- * | 1/8 addr space | | page |
- * +- shadow end -+ | mapping |
- * | ... gap ... |\ | (untracked) |
- * +- modules vaddr -+ \ +----------------+
- * | 2Gb | \| unmapped | allocated per module
- * +-----------------+ +- shadow end ---+
+ * +- 0 -------------+ +- shadow start -+
+ * | 1:1 ram mapping | /| 1/8 ram |
+ * | | / | |
+ * +- end of ram ----+ / +----------------+
+ * | ... gap ... | / | |
+ * | |/ | kasan |
+ * +- shadow start --+ | zero |
+ * | 1/8 addr space | | page |
+ * +- shadow end -+ | mapping |
+ * | ... gap ... |\ | (untracked) |
+ * +- vmalloc area -+ \ | |
+ * | vmalloc_size | \ | |
+ * +- modules vaddr -+ \ +----------------+
+ * | 2Gb | \| unmapped | allocated per module
+ * +-----------------+ +- shadow end ---+
+ *
+ * Current memory layout (KASAN_VMALLOC):
+ * +- 0 -------------+ +- shadow start -+
+ * | 1:1 ram mapping | /| 1/8 ram |
+ * | | / | |
+ * +- end of ram ----+ / +----------------+
+ * | ... gap ... | / | kasan |
+ * | |/ | zero |
+ * +- shadow start --+ | page |
+ * | 1/8 addr space | | mapping |
+ * +- shadow end -+ | (untracked) |
+ * | ... gap ... |\ | |
+ * +- vmalloc area -+ \ +- vmalloc area -+
+ * | vmalloc_size | \ |shallow populate|
+ * +- modules vaddr -+ \ +- modules area -+
+ * | 2Gb | \|shallow populate|
+ * +-----------------+ +- shadow end ---+
*/
/* populate kasan shadow (for identity mapping and zero page mapping) */
kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP);
if (IS_ENABLED(CONFIG_MODULES))
- untracked_mem_end = vmax - MODULES_LEN;
+ untracked_mem_end = kasan_vmax - MODULES_LEN;
+ if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+ untracked_mem_end = kasan_vmax - vmalloc_size - MODULES_LEN;
+ /* shallowly populate kasan shadow for vmalloc and modules */
+ kasan_early_vmemmap_populate(__sha(untracked_mem_end),
+ __sha(kasan_vmax), POPULATE_SHALLOW);
+ }
+ /* populate kasan shadow for untracked memory */
kasan_early_vmemmap_populate(__sha(max_physmem_end),
__sha(untracked_mem_end),
POPULATE_ZERO_SHADOW);
+ kasan_early_vmemmap_populate(__sha(kasan_vmax),
+ __sha(vmax_unlimited),
+ POPULATE_ZERO_SHADOW);
/* memory allocated for identity mapping structs will be freed later */
pgalloc_freeable = pgalloc_pos;
/* populate identity mapping */
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index cbc718b..e54f928 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,7 +17,6 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
@@ -72,14 +71,13 @@
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- int rc;
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
@@ -105,30 +103,20 @@
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
- int rc;
/* requested length too big for entire address space */
if (len > TASK_SIZE - mmap_min_addr)
@@ -163,25 +151,18 @@
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
}
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
/*
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index fc14189..567c69f 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -183,9 +183,9 @@
void __init cmma_init_nodat(void)
{
- struct memblock_region *reg;
struct page *page;
unsigned long start, end, ix;
+ int i;
if (cmma_flag < 2)
return;
@@ -193,9 +193,7 @@
mark_kernel_pgd();
/* Set all kernel pages not used for page tables to stable/no-dat */
- for_each_memblock(memory, reg) {
- start = memblock_region_memory_base_pfn(reg);
- end = memblock_region_memory_end_pfn(reg);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
page = pfn_to_page(start);
for (ix = start; ix < end; ix++, page++) {
if (__test_and_clear_bit(PG_arch_1, &page->flags))
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index f8c6faa..ed8e5b3 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/facility.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/set_memory.h>
@@ -86,7 +85,7 @@
{
pte_t *ptep, new;
- ptep = pte_offset(pmdp, addr);
+ ptep = pte_offset_kernel(pmdp, addr);
do {
new = *ptep;
if (pte_none(new))
@@ -279,7 +278,7 @@
return rc;
}
-static DEFINE_MUTEX(cpa_mutex);
+DEFINE_MUTEX(cpa_mutex);
static int change_page_attr(unsigned long addr, unsigned long end,
unsigned long flags)
@@ -338,19 +337,11 @@
{
unsigned long address;
int nr, i, j;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
pte_t *pte;
for (i = 0; i < numpages;) {
address = page_to_phys(page + i);
- pgd = pgd_offset_k(address);
- p4d = p4d_offset(pgd, address);
- pud = pud_offset(p4d, address);
- pmd = pmd_offset(pud, address);
- pte = pte_offset_kernel(pmd, address);
+ pte = virt_to_kpte(address);
nr = (unsigned long)pte >> ilog2(sizeof(long));
nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1));
nr = min(numpages - i, nr);
@@ -367,20 +358,4 @@
}
}
-#ifdef CONFIG_HIBERNATION
-bool kernel_page_present(struct page *page)
-{
- unsigned long addr;
- int cc;
-
- addr = page_to_phys(page);
- asm volatile(
- " lra %1,0(%1)\n"
- " ipm %0\n"
- " srl %0,28"
- : "=d" (cc), "+a" (addr) : : "cc");
- return cc == 0;
-}
-#endif /* CONFIG_HIBERNATION */
-
#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 46071be..6d99b1b 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -89,67 +89,65 @@
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
- unsigned long *table, *pgd;
- int rc, notify;
+ unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
+ unsigned long asce_limit = mm->context.asce_limit;
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
- VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
- rc = 0;
- notify = 0;
- while (mm->context.asce_limit < end) {
- table = crst_table_alloc(mm);
- if (!table) {
- rc = -ENOMEM;
- break;
- }
- spin_lock_bh(&mm->page_table_lock);
- pgd = (unsigned long *) mm->pgd;
- if (mm->context.asce_limit == _REGION2_SIZE) {
- crst_table_init(table, _REGION2_ENTRY_EMPTY);
- p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = _REGION1_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
- mm_inc_nr_puds(mm);
- } else {
- crst_table_init(table, _REGION1_ENTRY_EMPTY);
- pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = -PAGE_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
- }
- notify = 1;
- spin_unlock_bh(&mm->page_table_lock);
+ VM_BUG_ON(asce_limit < _REGION2_SIZE);
+
+ if (end <= asce_limit)
+ return 0;
+
+ if (asce_limit == _REGION2_SIZE) {
+ p4d = crst_table_alloc(mm);
+ if (unlikely(!p4d))
+ goto err_p4d;
+ crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
}
- if (notify)
- on_each_cpu(__crst_table_upgrade, mm, 0);
- return rc;
-}
-
-void crst_table_downgrade(struct mm_struct *mm)
-{
- pgd_t *pgd;
-
- /* downgrade should only happen from 3 to 2 levels (compat only) */
- VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
-
- if (current->active_mm == mm) {
- clear_user_asce();
- __tlb_flush_mm(mm);
+ if (end > _REGION1_SIZE) {
+ pgd = crst_table_alloc(mm);
+ if (unlikely(!pgd))
+ goto err_pgd;
+ crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
}
- pgd = mm->pgd;
- mm_dec_nr_pmds(mm);
- mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
- mm->context.asce_limit = _REGION3_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
- crst_table_free(mm, (unsigned long *) pgd);
+ spin_lock_bh(&mm->page_table_lock);
- if (current->active_mm == mm)
- set_user_asce(mm);
+ /*
+ * This routine gets called with mmap_lock lock held and there is
+ * no reason to optimize for the case of otherwise. However, if
+ * that would ever change, the below check will let us know.
+ */
+ VM_BUG_ON(asce_limit != mm->context.asce_limit);
+
+ if (p4d) {
+ __pgd = (unsigned long *) mm->pgd;
+ p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
+ mm->pgd = (pgd_t *) p4d;
+ mm->context.asce_limit = _REGION1_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ mm_inc_nr_puds(mm);
+ }
+ if (pgd) {
+ __pgd = (unsigned long *) mm->pgd;
+ pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
+ mm->pgd = (pgd_t *) pgd;
+ mm->context.asce_limit = TASK_SIZE_MAX;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
+ }
+
+ spin_unlock_bh(&mm->page_table_lock);
+
+ on_each_cpu(__crst_table_upgrade, mm, 0);
+
+ return 0;
+
+err_pgd:
+ crst_table_free(mm, p4d);
+err_p4d:
+ return -ENOMEM;
}
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
@@ -255,13 +253,15 @@
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
+ mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask >>= 24;
if (mask != 0)
return;
} else {
@@ -316,7 +316,7 @@
mask >>= 24;
if (mask != 0)
break;
- /* fallthrough */
+ fallthrough;
case 3: /* 4K page table with pgstes */
if (mask & 3)
atomic_xor_bits(&page->_refcount, 3 << 24);
@@ -541,7 +541,7 @@
base_region2_walk(table, 0, _REGION1_SIZE, 0);
break;
case _ASCE_TYPE_REGION1:
- base_region1_walk(table, 0, -_PAGE_SIZE, 0);
+ base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
break;
}
base_crst_free(table);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 9ebd012..fabaedd 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -19,13 +19,31 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, int nodat)
{
@@ -970,6 +988,7 @@
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste)
{
+ struct vm_area_struct *vma;
unsigned long pgstev;
spinlock_t *ptl;
pgste_t pgste;
@@ -979,6 +998,10 @@
WARN_ON_ONCE(orc > ESSA_MAX);
if (unlikely(orc > ESSA_MAX))
return -EINVAL;
+
+ vma = find_vma(mm, hva);
+ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1071,10 +1094,14 @@
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
unsigned long bits, unsigned long value)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pgste_t new;
pte_t *ptep;
+ vma = find_vma(mm, hva);
+ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1099,9 +1126,13 @@
*/
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pte_t *ptep;
+ vma = find_vma(mm, hva);
+ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index f810930..b239f2b 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -13,7 +13,6 @@
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -21,14 +20,6 @@
static DEFINE_MUTEX(vmem_mutex);
-struct memory_segment {
- struct list_head list;
- unsigned long start;
- unsigned long size;
-};
-
-static LIST_HEAD(mem_segs);
-
static void __ref *vmem_alloc_pages(unsigned int order)
{
unsigned long size = PAGE_SIZE << order;
@@ -38,6 +29,15 @@
return (void *) memblock_phys_alloc(size, size);
}
+static void vmem_free_pages(unsigned long addr, int order)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(phys_to_page(addr))))
+ return;
+ free_pages(addr, order);
+}
+
void *vmem_crst_alloc(unsigned long val)
{
unsigned long *table;
@@ -63,332 +63,487 @@
return pte;
}
-/*
- * Add a physical memory range to the 1:1 mapping.
- */
-static int vmem_add_mem(unsigned long start, unsigned long size)
+static void vmem_pte_free(unsigned long *table)
{
- unsigned long pgt_prot, sgt_prot, r3_prot;
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
- int ret = -ENOMEM;
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page(table))))
+ return;
+ page_table_free(&init_mm, table);
+}
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- r3_prot = pgprot_val(REGION3_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- r3_prot &= ~_REGION_ENTRY_NOEXEC;
+#define PAGE_UNUSED 0xFD
+
+/*
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
+ * from unused_pmd_start to next PMD_SIZE boundary.
+ */
+static unsigned long unused_pmd_start;
+
+static void vmemmap_flush_unused_pmd(void)
+{
+ if (!unused_pmd_start)
+ return;
+ memset(__va(unused_pmd_start), PAGE_UNUSED,
+ ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
+ unused_pmd_start = 0;
+}
+
+static void __vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed (just in case the memmap never gets initialized,
+ * e.g., because the memory block never gets onlined).
+ */
+ memset(__va(start), 0, sizeof(struct page));
+}
+
+static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_pmd_start == start) {
+ unused_pmd_start = end;
+ if (likely(IS_ALIGNED(unused_pmd_start, PMD_SIZE)))
+ unused_pmd_start = 0;
+ return;
}
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
- pu_dir = pud_offset(p4_dir, address);
- if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
- !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pud_val(*pu_dir) = address | r3_prot;
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
- goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
- !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pmd_val(*pm_dir) = address | sgt_prot;
- address += PMD_SIZE;
- pages1m++;
- continue;
- }
- if (pmd_none(*pm_dir)) {
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
- goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- }
+ vmemmap_flush_unused_pmd();
+ __vmemmap_use_sub_pmd(start, end);
+}
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_val(*pt_dir) = address | pgt_prot;
- address += PAGE_SIZE;
- pages4k++;
+static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ void *page = __va(ALIGN_DOWN(start, PMD_SIZE));
+
+ vmemmap_flush_unused_pmd();
+
+ /* Could be our memmap page is filled with PAGE_UNUSED already ... */
+ __vmemmap_use_sub_pmd(start, end);
+
+ /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset(page, PAGE_UNUSED, start - __pa(page));
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD the last
+ * unused range in the populated PMD.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_pmd_start = end;
+}
+
+/* Returns true if the PMD is completely unused and can be freed. */
+static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
+{
+ void *page = __va(ALIGN_DOWN(start, PMD_SIZE));
+
+ vmemmap_flush_unused_pmd();
+ memset(__va(start), PAGE_UNUSED, end - start);
+ return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE);
+}
+
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long prot, pages = 0;
+ int ret = -ENOMEM;
+ pte_t *pte;
+
+ prot = pgprot_val(PAGE_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_PAGE_NOEXEC;
+
+ pte = pte_offset_kernel(pmd, addr);
+ for (; addr < end; addr += PAGE_SIZE, pte++) {
+ if (!add) {
+ if (pte_none(*pte))
+ continue;
+ if (!direct)
+ vmem_free_pages(pfn_to_phys(pte_pfn(*pte)), 0);
+ pte_clear(&init_mm, addr, pte);
+ } else if (pte_none(*pte)) {
+ if (!direct) {
+ void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+
+ if (!new_page)
+ goto out;
+ pte_val(*pte) = __pa(new_page) | prot;
+ } else {
+ pte_val(*pte) = addr | prot;
+ }
+ } else {
+ continue;
+ }
+ pages++;
}
ret = 0;
out:
- update_page_count(PG_DIRECT_MAP_4K, pages4k);
- update_page_count(PG_DIRECT_MAP_1M, pages1m);
- update_page_count(PG_DIRECT_MAP_2G, pages2g);
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
return ret;
}
+static void try_free_pte_table(pmd_t *pmd, unsigned long start)
+{
+ pte_t *pte;
+ int i;
+
+ /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
+ pte = pte_offset_kernel(pmd, start);
+ for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(*pte))
+ return;
+ }
+ vmem_pte_free(__va(pmd_deref(*pmd)));
+ pmd_clear(pmd);
+}
+
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
+ int ret = -ENOMEM;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ prot = pgprot_val(SEGMENT_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_SEGMENT_ENTRY_NOEXEC;
+
+ pmd = pmd_offset(pud, addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+ if (!add) {
+ if (pmd_none(*pmd))
+ continue;
+ if (pmd_large(*pmd) && !add) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ pages++;
+ } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ }
+ continue;
+ }
+ } else if (pmd_none(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE) &&
+ MACHINE_HAS_EDAT1 && addr && direct &&
+ !debug_pagealloc_enabled()) {
+ pmd_val(*pmd) = addr | prot;
+ pages++;
+ continue;
+ } else if (!direct && MACHINE_HAS_EDAT1) {
+ void *new_page;
+
+ /*
+ * Use 1MB frames for vmemmap if available. We
+ * always use large frames even if they are only
+ * partially used. Otherwise we would have also
+ * page tables since vmemmap_populate gets
+ * called for each section separately.
+ */
+ new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+ if (new_page) {
+ pmd_val(*pmd) = __pa(new_page) | prot;
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE)) {
+ vmemmap_use_new_sub_pmd(addr, next);
+ }
+ continue;
+ }
+ }
+ pte = vmem_pte_alloc();
+ if (!pte)
+ goto out;
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (pmd_large(*pmd)) {
+ if (!direct)
+ vmemmap_use_sub_pmd(addr, next);
+ continue;
+ }
+ ret = modify_pte_table(pmd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pte_table(pmd, addr & PMD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
+ return ret;
+}
+
+static void try_free_pmd_table(pud_t *pud, unsigned long start)
+{
+ const unsigned long end = start + PUD_SIZE;
+ pmd_t *pmd;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+ pmd = pmd_offset(pud, start);
+ for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
+ if (!pmd_none(*pmd))
+ return;
+ vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+ pud_clear(pud);
+}
+
+static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
+ int ret = -ENOMEM;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ prot = pgprot_val(REGION3_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_REGION_ENTRY_NOEXEC;
+ pud = pud_offset(p4d, addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+ if (!add) {
+ if (pud_none(*pud))
+ continue;
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ pud_clear(pud);
+ pages++;
+ }
+ continue;
+ }
+ } else if (pud_none(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE) &&
+ MACHINE_HAS_EDAT2 && addr && direct &&
+ !debug_pagealloc_enabled()) {
+ pud_val(*pud) = addr | prot;
+ pages++;
+ continue;
+ }
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
+ goto out;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (pud_large(*pud)) {
+ continue;
+ }
+ ret = modify_pmd_table(pud, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pmd_table(pud, addr & PUD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
+ return ret;
+}
+
+static void try_free_pud_table(p4d_t *p4d, unsigned long start)
+{
+ const unsigned long end = start + P4D_SIZE;
+ pud_t *pud;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+
+ pud = pud_offset(p4d, start);
+ for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+ if (!pud_none(*pud))
+ return;
+ }
+ vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+ p4d_clear(p4d);
+}
+
+static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next;
+ int ret = -ENOMEM;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = p4d_offset(pgd, addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+ if (!add) {
+ if (p4d_none(*p4d))
+ continue;
+ } else if (p4d_none(*p4d)) {
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
+ goto out;
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ ret = modify_pud_table(p4d, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pud_table(p4d, addr & P4D_MASK);
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
+{
+ const unsigned long end = start + PGDIR_SIZE;
+ p4d_t *p4d;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+
+ p4d = p4d_offset(pgd, start);
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+ if (!p4d_none(*p4d))
+ return;
+ }
+ vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+ pgd_clear(pgd);
+}
+
+static int modify_pagetable(unsigned long start, unsigned long end, bool add,
+ bool direct)
+{
+ unsigned long addr, next;
+ int ret = -ENOMEM;
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
+ return -EINVAL;
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+ pgd = pgd_offset_k(addr);
+
+ if (!add) {
+ if (pgd_none(*pgd))
+ continue;
+ } else if (pgd_none(*pgd)) {
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ ret = modify_p4d_table(pgd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_p4d_table(pgd, addr & PGDIR_MASK);
+ }
+ ret = 0;
+out:
+ if (!add)
+ flush_tlb_kernel_range(start, end);
+ return ret;
+}
+
+static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, true, direct);
+}
+
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, false, direct);
+}
+
+/*
+ * Add a physical memory range to the 1:1 mapping.
+ */
+static int vmem_add_range(unsigned long start, unsigned long size)
+{
+ return add_pagetable(start, start + size, true);
+}
+
/*
* Remove a physical memory range from the 1:1 mapping.
- * Currently only invalidates page table entries.
*/
static void vmem_remove_range(unsigned long start, unsigned long size)
{
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
-
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- address += PGDIR_SIZE;
- continue;
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- address += P4D_SIZE;
- continue;
- }
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- address += PUD_SIZE;
- continue;
- }
- if (pud_large(*pu_dir)) {
- pud_clear(pu_dir);
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- address += PMD_SIZE;
- continue;
- }
- if (pmd_large(*pm_dir)) {
- pmd_clear(pm_dir);
- address += PMD_SIZE;
- pages1m++;
- continue;
- }
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_clear(&init_mm, address, pt_dir);
- address += PAGE_SIZE;
- pages4k++;
- }
- flush_tlb_kernel_range(start, end);
- update_page_count(PG_DIRECT_MAP_4K, -pages4k);
- update_page_count(PG_DIRECT_MAP_1M, -pages1m);
- update_page_count(PG_DIRECT_MAP_2G, -pages2g);
+ remove_pagetable(start, start + size, true);
}
/*
* Add a backed mem_map array to the virtual mem_map array.
*/
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap)
{
- unsigned long pgt_prot, sgt_prot;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
- int ret = -ENOMEM;
-
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- }
- for (address = start; address < end;) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
-
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
-
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
- goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
- }
-
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- /* Use 1MB frames for vmemmap if available. We always
- * use large frames even if they are only partially
- * used.
- * Otherwise we would have also page tables since
- * vmemmap_populate gets called for each section
- * separately. */
- if (MACHINE_HAS_EDAT1) {
- void *new_page;
-
- new_page = vmemmap_alloc_block(PMD_SIZE, node);
- if (!new_page)
- goto out;
- pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
- goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- } else if (pmd_large(*pm_dir)) {
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
-
- pt_dir = pte_offset_kernel(pm_dir, address);
- if (pte_none(*pt_dir)) {
- void *new_page;
-
- new_page = vmemmap_alloc_block(PAGE_SIZE, node);
- if (!new_page)
- goto out;
- pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
- }
- address += PAGE_SIZE;
- }
- ret = 0;
-out:
- return ret;
-}
-
-void vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
-{
-}
-
-/*
- * Add memory segment to the segment list if it doesn't overlap with
- * an already present segment.
- */
-static int insert_memory_segment(struct memory_segment *seg)
-{
- struct memory_segment *tmp;
-
- if (seg->start + seg->size > VMEM_MAX_PHYS ||
- seg->start + seg->size < seg->start)
- return -ERANGE;
-
- list_for_each_entry(tmp, &mem_segs, list) {
- if (seg->start >= tmp->start + tmp->size)
- continue;
- if (seg->start + seg->size <= tmp->start)
- continue;
- return -ENOSPC;
- }
- list_add(&seg->list, &mem_segs);
- return 0;
-}
-
-/*
- * Remove memory segment from the segment list.
- */
-static void remove_memory_segment(struct memory_segment *seg)
-{
- list_del(&seg->list);
-}
-
-static void __remove_shared_memory(struct memory_segment *seg)
-{
- remove_memory_segment(seg);
- vmem_remove_range(seg->start, seg->size);
-}
-
-int vmem_remove_mapping(unsigned long start, unsigned long size)
-{
- struct memory_segment *seg;
int ret;
mutex_lock(&vmem_mutex);
-
- ret = -ENOENT;
- list_for_each_entry(seg, &mem_segs, list) {
- if (seg->start == start && seg->size == size)
- break;
- }
-
- if (seg->start != start || seg->size != size)
- goto out;
-
- ret = 0;
- __remove_shared_memory(seg);
- kfree(seg);
-out:
+ /* We don't care about the node, just use NUMA_NO_NODE on allocations */
+ ret = add_pagetable(start, end, false);
+ if (ret)
+ remove_pagetable(start, end, false);
mutex_unlock(&vmem_mutex);
return ret;
}
+void vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ mutex_lock(&vmem_mutex);
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+}
+
+void vmem_remove_mapping(unsigned long start, unsigned long size)
+{
+ mutex_lock(&vmem_mutex);
+ vmem_remove_range(start, size);
+ mutex_unlock(&vmem_mutex);
+}
+
int vmem_add_mapping(unsigned long start, unsigned long size)
{
- struct memory_segment *seg;
int ret;
+ if (start + size > VMEM_MAX_PHYS ||
+ start + size < start)
+ return -ERANGE;
+
mutex_lock(&vmem_mutex);
- ret = -ENOMEM;
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- goto out;
- seg->start = start;
- seg->size = size;
-
- ret = insert_memory_segment(seg);
+ ret = vmem_add_range(start, size);
if (ret)
- goto out_free;
-
- ret = vmem_add_mem(start, size);
- if (ret)
- goto out_remove;
- goto out;
-
-out_remove:
- __remove_shared_memory(seg);
-out_free:
- kfree(seg);
-out:
+ vmem_remove_range(start, size);
mutex_unlock(&vmem_mutex);
return ret;
}
@@ -400,10 +555,11 @@
*/
void __init vmem_map_init(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg)
- vmem_add_mem(reg->base, reg->size);
+ for_each_mem_range(i, &base, &end)
+ vmem_add_range(base, end - base);
__set_memory((unsigned long)_stext,
(unsigned long)(_etext - _stext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
@@ -422,27 +578,3 @@
pr_info("Write protected kernel read-only data: %luk\n",
(unsigned long)(__end_rodata - _stext) >> 10);
}
-
-/*
- * Convert memblock.memory to a memory segment list so there is a single
- * list that contains all memory segments.
- */
-static int __init vmem_convert_memory_chunk(void)
-{
- struct memblock_region *reg;
- struct memory_segment *seg;
-
- mutex_lock(&vmem_mutex);
- for_each_memblock(memory, reg) {
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- panic("Out of memory...\n");
- seg->start = reg->base;
- seg->size = reg->size;
- insert_memory_segment(seg);
- }
- mutex_unlock(&vmem_mutex);
- return 0;
-}
-
-core_initcall(vmem_convert_memory_chunk);