Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5797e1e..fff03a3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/vmalloc.c
- *
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
@@ -25,7 +24,7 @@
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -41,6 +40,15 @@
#include <asm/shmparam.h>
#include "internal.h"
+#include "pgalloc-track.h"
+
+bool is_vmalloc_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)x;
+
+ return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+EXPORT_SYMBOL(is_vmalloc_addr);
struct vfree_deferred {
struct llist_head list;
@@ -61,7 +69,8 @@
/*** Page table manipulation functions ***/
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -70,75 +79,119 @@
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
}
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
+ int cleared;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_clear_huge(pmd))
+
+ cleared = pmd_clear_huge(pmd);
+ if (cleared || pmd_bad(*pmd))
+ *mask |= PGTBL_PMD_MODIFIED;
+
+ if (cleared)
continue;
if (pmd_none_or_clear_bad(pmd))
continue;
- vunmap_pte_range(pmd, addr, next);
+ vunmap_pte_range(pmd, addr, next, mask);
cond_resched();
} while (pmd++, addr = next, addr != end);
}
-static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
+ int cleared;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_clear_huge(pud))
+
+ cleared = pud_clear_huge(pud);
+ if (cleared || pud_bad(*pud))
+ *mask |= PGTBL_PUD_MODIFIED;
+
+ if (cleared)
continue;
if (pud_none_or_clear_bad(pud))
continue;
- vunmap_pmd_range(pud, addr, next);
+ vunmap_pmd_range(pud, addr, next, mask);
} while (pud++, addr = next, addr != end);
}
-static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
+ int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- if (p4d_clear_huge(p4d))
+
+ cleared = p4d_clear_huge(p4d);
+ if (cleared || p4d_bad(*p4d))
+ *mask |= PGTBL_P4D_MODIFIED;
+
+ if (cleared)
continue;
if (p4d_none_or_clear_bad(p4d))
continue;
- vunmap_pud_range(p4d, addr, next);
+ vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @start: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible
+ * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
+ * function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
- pgd_t *pgd;
+ unsigned long end = start + size;
unsigned long next;
+ pgd_t *pgd;
+ unsigned long addr = start;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
if (pgd_none_or_clear_bad(pgd))
continue;
- vunmap_p4d_range(pgd, addr, next);
+ vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -147,7 +200,7 @@
* callers keep track of where we're up to.
*/
- pte = pte_alloc_kernel(pmd, addr);
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
@@ -160,94 +213,117 @@
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
return 0;
}
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
- pmd = pmd_alloc(&init_mm, pud, addr);
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, p4d, addr);
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
- p4d = p4d_alloc(&init_mm, pgd, addr);
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
*
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
{
- pgd_t *pgd;
+ unsigned long start = addr;
+ unsigned long end = addr + size;
unsigned long next;
- unsigned long addr = start;
+ pgd_t *pgd;
int err = 0;
int nr = 0;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
- return nr;
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return 0;
}
-static int vmap_page_range(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+ struct page **pages)
{
int ret;
- ret = vmap_page_range_noflush(start, end, prot, pages);
- flush_cache_vmap(start, end);
+ ret = map_kernel_range_noflush(start, size, prot, pages);
+ flush_cache_vmap(start, start + size);
return ret;
}
@@ -334,6 +410,7 @@
static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
@@ -435,6 +512,10 @@
/*
* This function returns back addresses of parent node
* and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
*/
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
@@ -473,8 +554,12 @@
else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
- else
- BUG();
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
} while (*link);
*parent = &tmp_va->rb_node;
@@ -556,43 +641,17 @@
#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
{
struct vmap_area *va;
- struct rb_node *node;
- unsigned long size;
- bool found = false;
+ unsigned long computed_size;
- if (n == NULL)
- return;
-
- va = rb_entry(n, struct vmap_area, rb_node);
- size = va->subtree_max_size;
- node = n;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
-
- if (get_subtree_max_size(node->rb_left) == size) {
- node = node->rb_left;
- } else {
- if (va_size(va) == size) {
- found = true;
- break;
- }
-
- node = node->rb_right;
- }
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
}
-
- if (!found) {
- va = rb_entry(n, struct vmap_area, rb_node);
- pr_emerg("tree is corrupted: %lu, %lu\n",
- va_size(va), va->subtree_max_size);
- }
-
- augment_tree_propagate_check(n->rb_left);
- augment_tree_propagate_check(n->rb_right);
}
#endif
@@ -626,28 +685,15 @@
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
- struct rb_node *node = &va->rb_node;
- unsigned long new_va_sub_max_size;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
- new_va_sub_max_size = compute_subtree_max_size(va);
-
- /*
- * If the newly calculated maximum available size of the
- * subtree is equal to the current one, then it means that
- * the tree is propagated correctly. So we have to stop at
- * this point to save cycles.
- */
- if (va->subtree_max_size == new_va_sub_max_size)
- break;
-
- va->subtree_max_size = new_va_sub_max_size;
- node = rb_parent(&va->rb_node);
- }
+ /*
+ * Populate the tree from bottom towards the root until
+ * the calculated maximum available size of checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
- augment_tree_propagate_check(free_vmap_area_root.rb_node);
+ augment_tree_propagate_check();
#endif
}
@@ -659,7 +705,8 @@
struct rb_node *parent;
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
+ if (link)
+ link_va(va, root, parent, link, head);
}
static void
@@ -675,8 +722,10 @@
else
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
+ if (link) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
}
/*
@@ -684,8 +733,13 @@
* and next free blocks. If coalesce is not done a new
* free area is inserted. If VA has been merged, it is
* freed.
+ *
+ * Please note, it can return NULL in case of overlap
+ * ranges, followed by WARN() report. Despite it is a
+ * buggy behaviour, a system can be alive and keep
+ * ongoing.
*/
-static __always_inline void
+static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
@@ -700,6 +754,8 @@
* inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
/*
* Get next node of VA to check if merging can be done.
@@ -720,9 +776,6 @@
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -742,25 +795,36 @@
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
- sibling->va_end = va->va_end;
-
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first, followed by merging
+ * with "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
if (merged)
unlink_va(va, root);
+ sibling->va_end = va->va_end;
+
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
- return;
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
}
}
insert:
- if (!merged) {
+ if (!merged)
link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
- }
+
+ /*
+ * Last step is to check and update the tree.
+ */
+ augment_tree_propagate_from(va);
+ return va;
}
static __always_inline bool
@@ -971,6 +1035,19 @@
* There are a few exceptions though, as an example it is
* a first allocation (early boot up) when we have "one"
* big free space that has to be split.
+ *
+ * Also we can hit this path in case of regular "vmap"
+ * allocations, if "this" current CPU was not preloaded.
+ * See the comment in alloc_vmap_area() why. If so, then
+ * GFP_NOWAIT is used instead to get an extra object for
+ * split purpose. That is rare and most time does not
+ * occur.
+ *
+ * What happens if an allocation gets failed. Basically,
+ * an "overflow" path is triggered to purge lazily freed
+ * areas to free some memory, then, the "retry" path is
+ * triggered to repeat one more time. See more details
+ * in alloc_vmap_area() function.
*/
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
if (!lva)
@@ -1046,6 +1123,26 @@
}
/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+ /*
+ * Remove from the busy tree/list.
+ */
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
+}
+
+/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
*/
@@ -1057,6 +1154,7 @@
struct vmap_area *va, *pva;
unsigned long addr;
int purged = 0;
+ int ret;
BUG_ON(!size);
BUG_ON(offset_in_page(size));
@@ -1066,9 +1164,9 @@
return ERR_PTR(-EBUSY);
might_sleep();
+ gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
- va = kmem_cache_alloc_node(vmap_area_cachep,
- gfp_mask & GFP_RECLAIM_MASK, node);
+ va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -1076,59 +1174,71 @@
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+ kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
retry:
/*
- * Preload this CPU with one extra vmap_area object to ensure
- * that we have it available when fit type of free area is
- * NE_FIT_TYPE.
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
+ * does not guarantee that an allocation occurs on a CPU that
+ * is preloaded, instead we minimize the case when it is not.
+ * It can happen because of cpu migration, because there is a
+ * race until the below spinlock is taken.
*
* The preload is done in non-atomic context, thus it allows us
* to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure.
+ * low memory condition and high memory pressure. In rare case,
+ * if not preloaded, GFP_NOWAIT is used.
*
- * Even if it fails we do not really care about that. Just proceed
- * as it is. "overflow" path will refill the cache we allocate from.
+ * Set "pva" to NULL here, because of "retry" path.
*/
- preempt_disable();
- if (!__this_cpu_read(ne_fit_preload_node)) {
- preempt_enable();
- pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
- preempt_disable();
+ pva = NULL;
- if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
- if (pva)
- kmem_cache_free(vmap_area_cachep, pva);
- }
- }
+ if (!this_cpu_read(ne_fit_preload_node))
+ /*
+ * Even if it fails we do not really care about that.
+ * Just proceed as it is. If needed "overflow" path
+ * will refill the cache we allocate from.
+ */
+ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
- spin_lock(&vmap_area_lock);
- preempt_enable();
+ spin_lock(&free_vmap_area_lock);
+
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+ kmem_cache_free(vmap_area_cachep, pva);
/*
* If an allocation fails, the "vend" address is
* returned. Therefore trigger the overflow path.
*/
addr = __alloc_vmap_area(size, align, vstart, vend);
+ spin_unlock(&free_vmap_area_lock);
+
if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->vm = NULL;
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
+ ret = kasan_populate_vmalloc(addr, size);
+ if (ret) {
+ free_vmap_area(va);
+ return ERR_PTR(ret);
+ }
+
return va;
overflow:
- spin_unlock(&vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
@@ -1164,38 +1274,6 @@
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
-static void __free_vmap_area(struct vmap_area *va)
-{
- /*
- * Remove from the busy tree/list.
- */
- unlink_va(va, &vmap_area_root);
-
- /*
- * Merge VA with its neighbors, otherwise just add it.
- */
- merge_or_add_vmap_area(va,
- &free_vmap_area_root, &free_vmap_area_list);
-}
-
-/*
- * Free a region of KVA allocated by alloc_vmap_area
- */
-static void free_vmap_area(struct vmap_area *va)
-{
- spin_lock(&vmap_area_lock);
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
-}
-
-/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
-{
- vunmap_page_range(va->va_start, va->va_end);
-}
-
/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
@@ -1259,12 +1337,6 @@
return false;
/*
- * First make sure the mappings are removed from all page-tables
- * before they are freed.
- */
- vmalloc_sync_unmappings();
-
- /*
* TODO: to calculate a flush range without looping.
* The list can be up to lazy_max_pages() elements.
*/
@@ -1278,24 +1350,33 @@
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long orig_start = va->va_start;
+ unsigned long orig_end = va->va_end;
/*
* Finally insert or merge lazily-freed area. It is
* detached and there is no need to "unlink" it from
* anything.
*/
- merge_or_add_vmap_area(va,
- &free_vmap_area_root, &free_vmap_area_list);
+ va = merge_or_add_vmap_area(va, &free_vmap_area_root,
+ &free_vmap_area_list);
+
+ if (!va)
+ continue;
+
+ if (is_vmalloc_or_module_addr((void *)orig_start))
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
atomic_long_sub(nr, &vmap_lazy_nr);
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
- cond_resched_lock(&vmap_area_lock);
+ cond_resched_lock(&free_vmap_area_lock);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
return true;
}
@@ -1351,7 +1432,7 @@
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_vmap_area(va);
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1418,12 +1499,11 @@
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
* in the free path. Could get rid of this if we change the API to return a
* "cookie" from alloc, to be passed to free. But no big deal yet.
*/
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1480,13 +1560,6 @@
return ERR_CAST(va);
}
- err = radix_tree_preload(gfp_mask);
- if (unlikely(err)) {
- kfree(vb);
- free_vmap_area(va);
- return ERR_PTR(err);
- }
-
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -1499,11 +1572,12 @@
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
- spin_lock(&vmap_block_tree_lock);
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
- spin_unlock(&vmap_block_tree_lock);
- BUG_ON(err);
- radix_tree_preload_end();
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
@@ -1517,12 +1591,8 @@
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
- unsigned long vb_idx;
- vb_idx = addr_to_vb_idx(vb->va->va_start);
- spin_lock(&vmap_block_tree_lock);
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
- spin_unlock(&vmap_block_tree_lock);
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
@@ -1625,34 +1695,25 @@
return vaddr;
}
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
- unsigned long vb_idx;
unsigned int order;
struct vmap_block *vb;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+ flush_cache_vunmap(addr, addr + size);
order = get_order(size);
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
- offset >>= PAGE_SHIFT;
-
- vb_idx = addr_to_vb_idx((unsigned long)addr);
- rcu_read_lock();
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
- rcu_read_unlock();
- BUG_ON(!vb);
-
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ unmap_kernel_range_noflush(addr, size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range((unsigned long)addr,
- (unsigned long)addr + size);
+ flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
@@ -1748,9 +1809,11 @@
BUG_ON(addr > VMALLOC_END);
BUG_ON(!PAGE_ALIGNED(addr));
+ kasan_poison_vmalloc(mem, size);
+
if (likely(count <= VMAP_MAX_ALLOC)) {
debug_check_no_locks_freed(mem, size);
- vb_free(mem, size);
+ vb_free(addr, size);
return;
}
@@ -1767,7 +1830,6 @@
* @pages: an array of pointers to the pages to be mapped
* @count: number of pages
* @node: prefer to allocate data structures on this node
- * @prot: memory protection to use. PAGE_KERNEL for regular RAM
*
* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
* faster than vmap so it's good. But if you mix long-life and short-life
@@ -1777,7 +1839,7 @@
*
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr;
@@ -1798,7 +1860,10 @@
addr = va->va_start;
mem = (void *)addr;
}
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+
+ kasan_unpoison_vmalloc(mem, size);
+
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -1943,51 +2008,6 @@
}
/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
- * before calling this function.
- *
- * RETURNS:
- * The number of pages mapped on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
-{
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
-}
-
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
- * before calling this function and flush_tlb_kernel_range() after.
- */
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
- vunmap_page_range(addr, addr + size);
-}
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
-
-/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
* @addr: start of the VM area to unmap
* @size: size of the VM area to unmap
@@ -2000,32 +2020,25 @@
unsigned long end = addr + size;
flush_cache_vunmap(addr, end);
- vunmap_page_range(addr, end);
+ unmap_kernel_range_noflush(addr, size);
flush_tlb_kernel_range(addr, end);
}
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
{
- unsigned long addr = (unsigned long)area->addr;
- unsigned long end = addr + get_vm_area_size(area);
- int err;
-
- err = vmap_page_range(addr, end, prot, pages);
-
- return err > 0 ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(map_vm_area);
-
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
+}
+
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller)
+{
+ spin_lock(&vmap_area_lock);
+ setup_vmalloc_vm_locked(vm, va, flags, caller);
spin_unlock(&vmap_area_lock);
}
@@ -2046,6 +2059,7 @@
{
struct vmap_area *va;
struct vm_struct *area;
+ unsigned long requested_size = size;
BUG_ON(in_interrupt());
size = PAGE_ALIGN(size);
@@ -2069,19 +2083,13 @@
return NULL;
}
+ kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
+
setup_vmalloc_vm(area, va, flags, caller);
return area;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
-{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
@@ -2123,7 +2131,7 @@
* It is up to the caller to do all required locking to keep the returned
* pointer valid.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *find_vm_area(const void *addr)
{
@@ -2144,7 +2152,7 @@
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -2248,6 +2256,8 @@
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+ kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
+
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
@@ -2274,7 +2284,7 @@
* Use raw_cpu_ptr() because this can be called from preemptible
* context. Preemption is absolutely fine here, because the llist_add()
* implementation is lockless, so it works even if we are adding to
- * nother cpu's list. schedule_work() should be fine with this too.
+ * another cpu's list. schedule_work() should be fine with this too.
*/
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
@@ -2309,20 +2319,21 @@
}
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * If @addr is NULL, no operation is performed.
*
+ * Context:
* May sleep if called *not* from interrupt context.
- *
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-depenedent would be a really bad idea).
*/
void vfree(const void *addr)
{
@@ -2364,8 +2375,11 @@
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
@@ -2385,36 +2399,81 @@
if (!area)
return NULL;
- if (map_vm_area(area, prot, pages)) {
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
+ pages) < 0) {
vunmap(area->addr);
return NULL;
}
+ if (flags & VM_MAP_PUT_PAGES) {
+ area->pages = pages;
+ area->nr_pages = count;
+ }
return area->addr;
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ return -EINVAL;
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
- struct page **pages;
- unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
- 0 :
- __GFP_HIGHMEM;
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
+ struct page **pages;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
+ gfp_mask |= __GFP_NOWARN;
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- PAGE_KERNEL, node, area->caller);
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2432,24 +2491,26 @@
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
+ page = alloc_page(gfp_mask);
else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+ page = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vunmap() */
+ /* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
}
area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
+ if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_vm_area(area, prot, pages))
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
+ prot, pages) < 0)
goto fail;
+
return area->addr;
fail:
@@ -2491,7 +2552,7 @@
if (!size || (size >> PAGE_SHIFT) > totalram_pages())
goto fail;
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
+ area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;
@@ -2517,27 +2578,16 @@
return NULL;
}
-/*
- * This is only for performance analysis of vmalloc and stress purpose.
- * It is required by vmalloc test module, therefore do not use it other
- * than that.
- */
-#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node_range);
-#endif
-
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
*
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
@@ -2547,35 +2597,28 @@
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
+void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, 0, node, caller);
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
-}
-
-
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
- void *caller)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
-}
-
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -2590,8 +2633,8 @@
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL);
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -2610,8 +2653,8 @@
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc);
@@ -2648,8 +2691,8 @@
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -2662,38 +2705,15 @@
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc_node() instead.
- *
* Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
- return __vmalloc_node_flags(size, node,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node);
-/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_exec(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
@@ -2717,8 +2737,8 @@
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -3061,69 +3081,6 @@
}
EXPORT_SYMBOL(remap_vmalloc_range);
-/*
- * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
- * not to have one.
- *
- * The purpose of this function is to make sure the vmalloc area
- * mappings are identical in all page-tables in the system.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
-static int f(pte_t *pte, unsigned long addr, void *data)
-{
- pte_t ***p = data;
-
- if (p) {
- *(*p) = pte;
- (*p)++;
- }
- return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
- if (area == NULL)
- return NULL;
-
- /*
- * This ensures that page tables are constructed for this region
- * of kernel virtual address space and mapped into init_mm.
- */
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- size, f, ptes ? &ptes : NULL)) {
- free_vm_area(area);
- return NULL;
- }
-
- return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
@@ -3233,7 +3190,7 @@
struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
- unsigned long base, start, size, end, last_end;
+ unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
enum fit_type type;
@@ -3277,7 +3234,7 @@
goto err_free;
}
retry:
- spin_lock(&vmap_area_lock);
+ spin_lock(&free_vmap_area_lock);
/* start scanning - we scan from the top, begin with the last area */
area = term_area = last_area;
@@ -3302,7 +3259,7 @@
goto overflow;
/*
- * If required width exeeds current VA block, move
+ * If required width exceeds current VA block, move
* base downwards and then recheck.
*/
if (base + end > va->va_end) {
@@ -3359,29 +3316,52 @@
va = vas[area];
va->va_start = start;
va->va_end = start + size;
-
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
+
+ /* populate the kasan shadow space */
+ for (area = 0; area < nr_vms; area++) {
+ if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+ goto err_free_shadow;
+
+ kasan_unpoison_vmalloc((void *)vas[area]->va_start,
+ sizes[area]);
+ }
/* insert all vm's */
- for (area = 0; area < nr_vms; area++)
- setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
+ spin_lock(&vmap_area_lock);
+ for (area = 0; area < nr_vms; area++) {
+ insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+ setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
+ }
+ spin_unlock(&vmap_area_lock);
kfree(vas);
return vms;
recovery:
- /* Remove previously inserted areas. */
+ /*
+ * Remove previously allocated areas. There is no
+ * need in removing these areas from the busy tree,
+ * because they are inserted only on the final step
+ * and when pcpu_get_vm_areas() is success.
+ */
while (area--) {
- __free_vmap_area(vas[area]);
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
}
overflow:
- spin_unlock(&vmap_area_lock);
+ spin_unlock(&free_vmap_area_lock);
if (!purged) {
purge_vmap_area_lazy();
purged = true;
@@ -3411,6 +3391,29 @@
kfree(vas);
kfree(vms);
return NULL;
+
+err_free_shadow:
+ spin_lock(&free_vmap_area_lock);
+ /*
+ * We release all the vmalloc shadows, even the ones for regions that
+ * hadn't been successfully added. This relies on kasan_release_vmalloc
+ * being able to tolerate this case.
+ */
+ for (area = 0; area < nr_vms; area++) {
+ orig_start = vas[area]->va_start;
+ orig_end = vas[area]->va_end;
+ va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
+ vas[area] = NULL;
+ kfree(vms[area]);
+ }
+ spin_unlock(&free_vmap_area_lock);
+ kfree(vas);
+ kfree(vms);
+ return NULL;
}
/**
@@ -3432,9 +3435,12 @@
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmap_purge_lock)
__acquires(&vmap_area_lock)
{
+ mutex_lock(&vmap_purge_lock);
spin_lock(&vmap_area_lock);
+
return seq_list_start(&vmap_area_list, *pos);
}
@@ -3445,8 +3451,10 @@
static void s_stop(struct seq_file *m, void *p)
__releases(&vmap_area_lock)
+ __releases(&vmap_purge_lock)
{
spin_unlock(&vmap_area_lock);
+ mutex_unlock(&vmap_purge_lock);
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)