Blame - mm/mmap.c - hafnium/third_party/linux

blob: ba78f1f1b1bd170cf18232212e48e495d4894aa3 [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* mm/mmap.c
				4	*
				5	* Written by obz.
				6	*
				7	* Address space accounting code <alan@lxorguk.ukuu.org.uk>
				8	*/
				9
				10	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				11
				12	#include <linux/kernel.h>
				13	#include <linux/slab.h>
				14	#include <linux/backing-dev.h>
				15	#include <linux/mm.h>
				16	#include <linux/vmacache.h>
				17	#include <linux/shm.h>
				18	#include <linux/mman.h>
				19	#include <linux/pagemap.h>
				20	#include <linux/swap.h>
				21	#include <linux/syscalls.h>
				22	#include <linux/capability.h>
				23	#include <linux/init.h>
				24	#include <linux/file.h>
				25	#include <linux/fs.h>
				26	#include <linux/personality.h>
				27	#include <linux/security.h>
				28	#include <linux/hugetlb.h>
				29	#include <linux/shmem_fs.h>
				30	#include <linux/profile.h>
				31	#include <linux/export.h>
				32	#include <linux/mount.h>
				33	#include <linux/mempolicy.h>
				34	#include <linux/rmap.h>
				35	#include <linux/mmu_notifier.h>
				36	#include <linux/mmdebug.h>
				37	#include <linux/perf_event.h>
				38	#include <linux/audit.h>
				39	#include <linux/khugepaged.h>
				40	#include <linux/uprobes.h>
				41	#include <linux/rbtree_augmented.h>
				42	#include <linux/notifier.h>
				43	#include <linux/memory.h>
				44	#include <linux/printk.h>
				45	#include <linux/userfaultfd_k.h>
				46	#include <linux/moduleparam.h>
				47	#include <linux/pkeys.h>
				48	#include <linux/oom.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	49	#include <linux/sched/mm.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	50
				51	#include <linux/uaccess.h>
				52	#include <asm/cacheflush.h>
				53	#include <asm/tlb.h>
				54	#include <asm/mmu_context.h>
				55
				56	#include "internal.h"
				57
				58	#ifndef arch_mmap_check
				59	#define arch_mmap_check(addr, len, flags) (0)
				60	#endif
				61
				62	#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
				63	const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
				64	const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
				65	int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
				66	#endif
				67	#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
				68	const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
				69	const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
				70	int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
				71	#endif
				72
				73	static bool ignore_rlimit_data;
				74	core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
				75
				76	static void unmap_region(struct mm_struct *mm,
				77	struct vm_area_struct vma, struct vm_area_struct prev,
				78	unsigned long start, unsigned long end);
				79
				80	/* description of effects of mapping type and prot in current implementation.
				81	* this is due to the limited x86 page protection hardware. The expected
				82	* behavior is in parens:
				83	*
				84	* map_type prot
				85	* PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
				86	* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
				87	* w: (no) no w: (no) no w: (yes) yes w: (no) no
				88	* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
				89	*
				90	* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
				91	* w: (no) no w: (no) no w: (copy) copy w: (no) no
				92	* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	93	*/
				94	pgprot_t protection_map[16] __ro_after_init = {
				95	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
				96	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
				97	};
				98
				99	#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
				100	static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
				101	{
				102	return prot;
				103	}
				104	#endif
				105
				106	pgprot_t vm_get_page_prot(unsigned long vm_flags)
				107	{
				108	pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
				109	(VM_READ\|VM_WRITE\|VM_EXEC\|VM_SHARED)]) \|
				110	pgprot_val(arch_vm_get_page_prot(vm_flags)));
				111
				112	return arch_filter_pgprot(ret);
				113	}
				114	EXPORT_SYMBOL(vm_get_page_prot);
				115
				116	static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
				117	{
				118	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
				119	}
				120
				121	/* Update vma->vm_page_prot to reflect vma->vm_flags. */
				122	void vma_set_page_prot(struct vm_area_struct *vma)
				123	{
				124	unsigned long vm_flags = vma->vm_flags;
				125	pgprot_t vm_page_prot;
				126
				127	vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
				128	if (vma_wants_writenotify(vma, vm_page_prot)) {
				129	vm_flags &= ~VM_SHARED;
				130	vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
				131	}
				132	/* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
				133	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
				134	}
				135
				136	/*
				137	* Requires inode->i_mapping->i_mmap_rwsem
				138	*/
				139	static void __remove_shared_vm_struct(struct vm_area_struct *vma,
				140	struct file file, struct address_space mapping)
				141	{
				142	if (vma->vm_flags & VM_DENYWRITE)
				143	atomic_inc(&file_inode(file)->i_writecount);
				144	if (vma->vm_flags & VM_SHARED)
				145	mapping_unmap_writable(mapping);
				146
				147	flush_dcache_mmap_lock(mapping);
				148	vma_interval_tree_remove(vma, &mapping->i_mmap);
				149	flush_dcache_mmap_unlock(mapping);
				150	}
				151
				152	/*
				153	* Unlink a file-based vm structure from its interval tree, to hide
				154	* vma from rmap and vmtruncate before freeing its page tables.
				155	*/
				156	void unlink_file_vma(struct vm_area_struct *vma)
				157	{
				158	struct file *file = vma->vm_file;
				159
				160	if (file) {
				161	struct address_space *mapping = file->f_mapping;
				162	i_mmap_lock_write(mapping);
				163	__remove_shared_vm_struct(vma, file, mapping);
				164	i_mmap_unlock_write(mapping);
				165	}
				166	}
				167
				168	/*
				169	* Close a vm structure and free it, returning the next.
				170	*/
				171	static struct vm_area_struct remove_vma(struct vm_area_struct vma)
				172	{
				173	struct vm_area_struct *next = vma->vm_next;
				174
				175	might_sleep();
				176	if (vma->vm_ops && vma->vm_ops->close)
				177	vma->vm_ops->close(vma);
				178	if (vma->vm_file)
				179	fput(vma->vm_file);
				180	mpol_put(vma_policy(vma));
				181	vm_area_free(vma);
				182	return next;
				183	}
				184
				185	static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
				186	struct list_head *uf);
				187	SYSCALL_DEFINE1(brk, unsigned long, brk)
				188	{
				189	unsigned long retval;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	190	unsigned long newbrk, oldbrk, origbrk;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	191	struct mm_struct *mm = current->mm;
				192	struct vm_area_struct *next;
				193	unsigned long min_brk;
				194	bool populate;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	195	bool downgraded = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	196	LIST_HEAD(uf);
				197
				198	if (down_write_killable(&mm->mmap_sem))
				199	return -EINTR;
				200
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	201	origbrk = mm->brk;
				202
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	203	#ifdef CONFIG_COMPAT_BRK
				204	/*
				205	* CONFIG_COMPAT_BRK can still be overridden by setting
				206	* randomize_va_space to 2, which will still cause mm->start_brk
				207	* to be arbitrarily shifted
				208	*/
				209	if (current->brk_randomized)
				210	min_brk = mm->start_brk;
				211	else
				212	min_brk = mm->end_data;
				213	#else
				214	min_brk = mm->start_brk;
				215	#endif
				216	if (brk < min_brk)
				217	goto out;
				218
				219	/*
				220	* Check against rlimit here. If this check is done later after the test
				221	* of oldbrk with newbrk then it can escape the test and let the data
				222	* segment grow beyond its set limit the in case where the limit is
				223	* not page aligned -Ram Gupta
				224	*/
				225	if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
				226	mm->end_data, mm->start_data))
				227	goto out;
				228
				229	newbrk = PAGE_ALIGN(brk);
				230	oldbrk = PAGE_ALIGN(mm->brk);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	231	if (oldbrk == newbrk) {
				232	mm->brk = brk;
				233	goto success;
				234	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	235
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	236	/*
				237	* Always allow shrinking brk.
				238	* __do_munmap() may downgrade mmap_sem to read.
				239	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	240	if (brk <= mm->brk) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	241	int ret;
				242
				243	/*
				244	* mm->brk must to be protected by write mmap_sem so update it
				245	* before downgrading mmap_sem. When __do_munmap() fails,
				246	* mm->brk will be restored from origbrk.
				247	*/
				248	mm->brk = brk;
				249	ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
				250	if (ret < 0) {
				251	mm->brk = origbrk;
				252	goto out;
				253	} else if (ret == 1) {
				254	downgraded = true;
				255	}
				256	goto success;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	257	}
				258
				259	/* Check against existing mmap mappings. */
				260	next = find_vma(mm, oldbrk);
				261	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
				262	goto out;
				263
				264	/* Ok, looks good - let it rip. */
				265	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
				266	goto out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	267	mm->brk = brk;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	268
				269	success:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	270	populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	271	if (downgraded)
				272	up_read(&mm->mmap_sem);
				273	else
				274	up_write(&mm->mmap_sem);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	275	userfaultfd_unmap_complete(mm, &uf);
				276	if (populate)
				277	mm_populate(oldbrk, newbrk - oldbrk);
				278	return brk;
				279
				280	out:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	281	retval = origbrk;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	282	up_write(&mm->mmap_sem);
				283	return retval;
				284	}
				285
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	286	static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	287	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	288	unsigned long gap, prev_end;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	289
				290	/*
				291	* Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
				292	* allow two stack_guard_gaps between them here, and when choosing
				293	* an unmapped area; whereas when expanding we only require one.
				294	* That's a little inconsistent, but keeps the code here simpler.
				295	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	296	gap = vm_start_gap(vma);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	297	if (vma->vm_prev) {
				298	prev_end = vm_end_gap(vma->vm_prev);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	299	if (gap > prev_end)
				300	gap -= prev_end;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	301	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	302	gap = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	303	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	304	return gap;
				305	}
				306
				307	#ifdef CONFIG_DEBUG_VM_RB
				308	static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
				309	{
				310	unsigned long max = vma_compute_gap(vma), subtree_gap;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	311	if (vma->vm_rb.rb_left) {
				312	subtree_gap = rb_entry(vma->vm_rb.rb_left,
				313	struct vm_area_struct, vm_rb)->rb_subtree_gap;
				314	if (subtree_gap > max)
				315	max = subtree_gap;
				316	}
				317	if (vma->vm_rb.rb_right) {
				318	subtree_gap = rb_entry(vma->vm_rb.rb_right,
				319	struct vm_area_struct, vm_rb)->rb_subtree_gap;
				320	if (subtree_gap > max)
				321	max = subtree_gap;
				322	}
				323	return max;
				324	}
				325
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	326	static int browse_rb(struct mm_struct *mm)
				327	{
				328	struct rb_root *root = &mm->mm_rb;
				329	int i = 0, j, bug = 0;
				330	struct rb_node nd, pn = NULL;
				331	unsigned long prev = 0, pend = 0;
				332
				333	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
				334	struct vm_area_struct *vma;
				335	vma = rb_entry(nd, struct vm_area_struct, vm_rb);
				336	if (vma->vm_start < prev) {
				337	pr_emerg("vm_start %lx < prev %lx\n",
				338	vma->vm_start, prev);
				339	bug = 1;
				340	}
				341	if (vma->vm_start < pend) {
				342	pr_emerg("vm_start %lx < pend %lx\n",
				343	vma->vm_start, pend);
				344	bug = 1;
				345	}
				346	if (vma->vm_start > vma->vm_end) {
				347	pr_emerg("vm_start %lx > vm_end %lx\n",
				348	vma->vm_start, vma->vm_end);
				349	bug = 1;
				350	}
				351	spin_lock(&mm->page_table_lock);
				352	if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
				353	pr_emerg("free gap %lx, correct %lx\n",
				354	vma->rb_subtree_gap,
				355	vma_compute_subtree_gap(vma));
				356	bug = 1;
				357	}
				358	spin_unlock(&mm->page_table_lock);
				359	i++;
				360	pn = nd;
				361	prev = vma->vm_start;
				362	pend = vma->vm_end;
				363	}
				364	j = 0;
				365	for (nd = pn; nd; nd = rb_prev(nd))
				366	j++;
				367	if (i != j) {
				368	pr_emerg("backwards %d, forwards %d\n", j, i);
				369	bug = 1;
				370	}
				371	return bug ? -1 : i;
				372	}
				373
				374	static void validate_mm_rb(struct rb_root root, struct vm_area_struct ignore)
				375	{
				376	struct rb_node *nd;
				377
				378	for (nd = rb_first(root); nd; nd = rb_next(nd)) {
				379	struct vm_area_struct *vma;
				380	vma = rb_entry(nd, struct vm_area_struct, vm_rb);
				381	VM_BUG_ON_VMA(vma != ignore &&
				382	vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
				383	vma);
				384	}
				385	}
				386
				387	static void validate_mm(struct mm_struct *mm)
				388	{
				389	int bug = 0;
				390	int i = 0;
				391	unsigned long highest_address = 0;
				392	struct vm_area_struct *vma = mm->mmap;
				393
				394	while (vma) {
				395	struct anon_vma *anon_vma = vma->anon_vma;
				396	struct anon_vma_chain *avc;
				397
				398	if (anon_vma) {
				399	anon_vma_lock_read(anon_vma);
				400	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				401	anon_vma_interval_tree_verify(avc);
				402	anon_vma_unlock_read(anon_vma);
				403	}
				404
				405	highest_address = vm_end_gap(vma);
				406	vma = vma->vm_next;
				407	i++;
				408	}
				409	if (i != mm->map_count) {
				410	pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
				411	bug = 1;
				412	}
				413	if (highest_address != mm->highest_vm_end) {
				414	pr_emerg("mm->highest_vm_end %lx, found %lx\n",
				415	mm->highest_vm_end, highest_address);
				416	bug = 1;
				417	}
				418	i = browse_rb(mm);
				419	if (i != mm->map_count) {
				420	if (i != -1)
				421	pr_emerg("map_count %d rb %d\n", mm->map_count, i);
				422	bug = 1;
				423	}
				424	VM_BUG_ON_MM(bug, mm);
				425	}
				426	#else
				427	#define validate_mm_rb(root, ignore) do { } while (0)
				428	#define validate_mm(mm) do { } while (0)
				429	#endif
				430
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	431	RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
				432	struct vm_area_struct, vm_rb,
				433	unsigned long, rb_subtree_gap, vma_compute_gap)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	434
				435	/*
				436	* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
				437	* vma->vm_prev->vm_end values changed, without modifying the vma's position
				438	* in the rbtree.
				439	*/
				440	static void vma_gap_update(struct vm_area_struct *vma)
				441	{
				442	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	443	* As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
				444	* a callback function that does exactly what we want.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	445	*/
				446	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
				447	}
				448
				449	static inline void vma_rb_insert(struct vm_area_struct *vma,
				450	struct rb_root *root)
				451	{
				452	/* All rb_subtree_gap values must be consistent prior to insertion */
				453	validate_mm_rb(root, NULL);
				454
				455	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
				456	}
				457
				458	static void __vma_rb_erase(struct vm_area_struct vma, struct rb_root root)
				459	{
				460	/*
				461	* Note rb_erase_augmented is a fairly large inline function,
				462	* so make sure we instantiate it only once with our desired
				463	* augmented rbtree callbacks.
				464	*/
				465	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
				466	}
				467
				468	static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
				469	struct rb_root *root,
				470	struct vm_area_struct *ignore)
				471	{
				472	/*
				473	* All rb_subtree_gap values must be consistent prior to erase,
				474	* with the possible exception of the "next" vma being erased if
				475	* next->vm_start was reduced.
				476	*/
				477	validate_mm_rb(root, ignore);
				478
				479	__vma_rb_erase(vma, root);
				480	}
				481
				482	static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
				483	struct rb_root *root)
				484	{
				485	/*
				486	* All rb_subtree_gap values must be consistent prior to erase,
				487	* with the possible exception of the vma being erased.
				488	*/
				489	validate_mm_rb(root, vma);
				490
				491	__vma_rb_erase(vma, root);
				492	}
				493
				494	/*
				495	* vma has some anon_vma assigned, and is already inserted on that
				496	* anon_vma's interval trees.
				497	*
				498	* Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
				499	* vma must be removed from the anon_vma's interval trees using
				500	* anon_vma_interval_tree_pre_update_vma().
				501	*
				502	* After the update, the vma will be reinserted using
				503	* anon_vma_interval_tree_post_update_vma().
				504	*
				505	* The entire update must be protected by exclusive mmap_sem and by
				506	* the root anon_vma's mutex.
				507	*/
				508	static inline void
				509	anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
				510	{
				511	struct anon_vma_chain *avc;
				512
				513	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				514	anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
				515	}
				516
				517	static inline void
				518	anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
				519	{
				520	struct anon_vma_chain *avc;
				521
				522	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				523	anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
				524	}
				525
				526	static int find_vma_links(struct mm_struct *mm, unsigned long addr,
				527	unsigned long end, struct vm_area_struct **pprev,
				528	struct rb_node *rb_link, struct rb_node rb_parent)
				529	{
				530	struct rb_node *__rb_link, __rb_parent, *rb_prev;
				531
				532	__rb_link = &mm->mm_rb.rb_node;
				533	rb_prev = __rb_parent = NULL;
				534
				535	while (*__rb_link) {
				536	struct vm_area_struct *vma_tmp;
				537
				538	__rb_parent = *__rb_link;
				539	vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
				540
				541	if (vma_tmp->vm_end > addr) {
				542	/* Fail if an existing vma overlaps the area */
				543	if (vma_tmp->vm_start < end)
				544	return -ENOMEM;
				545	__rb_link = &__rb_parent->rb_left;
				546	} else {
				547	rb_prev = __rb_parent;
				548	__rb_link = &__rb_parent->rb_right;
				549	}
				550	}
				551
				552	*pprev = NULL;
				553	if (rb_prev)
				554	*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
				555	*rb_link = __rb_link;
				556	*rb_parent = __rb_parent;
				557	return 0;
				558	}
				559
				560	static unsigned long count_vma_pages_range(struct mm_struct *mm,
				561	unsigned long addr, unsigned long end)
				562	{
				563	unsigned long nr_pages = 0;
				564	struct vm_area_struct *vma;
				565
				566	/* Find first overlaping mapping */
				567	vma = find_vma_intersection(mm, addr, end);
				568	if (!vma)
				569	return 0;
				570
				571	nr_pages = (min(end, vma->vm_end) -
				572	max(addr, vma->vm_start)) >> PAGE_SHIFT;
				573
				574	/* Iterate over the rest of the overlaps */
				575	for (vma = vma->vm_next; vma; vma = vma->vm_next) {
				576	unsigned long overlap_len;
				577
				578	if (vma->vm_start > end)
				579	break;
				580
				581	overlap_len = min(end, vma->vm_end) - vma->vm_start;
				582	nr_pages += overlap_len >> PAGE_SHIFT;
				583	}
				584
				585	return nr_pages;
				586	}
				587
				588	void __vma_link_rb(struct mm_struct mm, struct vm_area_struct vma,
				589	struct rb_node *rb_link, struct rb_node rb_parent)
				590	{
				591	/* Update tracking information for the gap following the new vma. */
				592	if (vma->vm_next)
				593	vma_gap_update(vma->vm_next);
				594	else
				595	mm->highest_vm_end = vm_end_gap(vma);
				596
				597	/*
				598	* vma->vm_prev wasn't known when we followed the rbtree to find the
				599	* correct insertion point for that vma. As a result, we could not
				600	* update the vma vm_rb parents rb_subtree_gap values on the way down.
				601	* So, we first insert the vma with a zero rb_subtree_gap value
				602	* (to be consistent with what we did on the way down), and then
				603	* immediately update the gap to the correct value. Finally we
				604	* rebalance the rbtree after all augmented values have been set.
				605	*/
				606	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
				607	vma->rb_subtree_gap = 0;
				608	vma_gap_update(vma);
				609	vma_rb_insert(vma, &mm->mm_rb);
				610	}
				611
				612	static void __vma_link_file(struct vm_area_struct *vma)
				613	{
				614	struct file *file;
				615
				616	file = vma->vm_file;
				617	if (file) {
				618	struct address_space *mapping = file->f_mapping;
				619
				620	if (vma->vm_flags & VM_DENYWRITE)
				621	atomic_dec(&file_inode(file)->i_writecount);
				622	if (vma->vm_flags & VM_SHARED)
				623	atomic_inc(&mapping->i_mmap_writable);
				624
				625	flush_dcache_mmap_lock(mapping);
				626	vma_interval_tree_insert(vma, &mapping->i_mmap);
				627	flush_dcache_mmap_unlock(mapping);
				628	}
				629	}
				630
				631	static void
				632	__vma_link(struct mm_struct mm, struct vm_area_struct vma,
				633	struct vm_area_struct prev, struct rb_node *rb_link,
				634	struct rb_node *rb_parent)
				635	{
				636	__vma_link_list(mm, vma, prev, rb_parent);
				637	__vma_link_rb(mm, vma, rb_link, rb_parent);
				638	}
				639
				640	static void vma_link(struct mm_struct mm, struct vm_area_struct vma,
				641	struct vm_area_struct prev, struct rb_node *rb_link,
				642	struct rb_node *rb_parent)
				643	{
				644	struct address_space *mapping = NULL;
				645
				646	if (vma->vm_file) {
				647	mapping = vma->vm_file->f_mapping;
				648	i_mmap_lock_write(mapping);
				649	}
				650
				651	__vma_link(mm, vma, prev, rb_link, rb_parent);
				652	__vma_link_file(vma);
				653
				654	if (mapping)
				655	i_mmap_unlock_write(mapping);
				656
				657	mm->map_count++;
				658	validate_mm(mm);
				659	}
				660
				661	/*
				662	* Helper for vma_adjust() in the split_vma insert case: insert a vma into the
				663	* mm's list and rbtree. It has already been inserted into the interval tree.
				664	*/
				665	static void __insert_vm_struct(struct mm_struct mm, struct vm_area_struct vma)
				666	{
				667	struct vm_area_struct *prev;
				668	struct rb_node *rb_link, rb_parent;
				669
				670	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
				671	&prev, &rb_link, &rb_parent))
				672	BUG();
				673	__vma_link(mm, vma, prev, rb_link, rb_parent);
				674	mm->map_count++;
				675	}
				676
				677	static __always_inline void __vma_unlink_common(struct mm_struct *mm,
				678	struct vm_area_struct *vma,
				679	struct vm_area_struct *prev,
				680	bool has_prev,
				681	struct vm_area_struct *ignore)
				682	{
				683	struct vm_area_struct *next;
				684
				685	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
				686	next = vma->vm_next;
				687	if (has_prev)
				688	prev->vm_next = next;
				689	else {
				690	prev = vma->vm_prev;
				691	if (prev)
				692	prev->vm_next = next;
				693	else
				694	mm->mmap = next;
				695	}
				696	if (next)
				697	next->vm_prev = prev;
				698
				699	/* Kill the cache */
				700	vmacache_invalidate(mm);
				701	}
				702
				703	static inline void __vma_unlink_prev(struct mm_struct *mm,
				704	struct vm_area_struct *vma,
				705	struct vm_area_struct *prev)
				706	{
				707	__vma_unlink_common(mm, vma, prev, true, vma);
				708	}
				709
				710	/*
				711	* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
				712	* is already present in an i_mmap tree without adjusting the tree.
				713	* The following helper function should be used when such adjustments
				714	* are necessary. The "insert" vma (if any) is to be inserted
				715	* before we drop the necessary locks.
				716	*/
				717	int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
				718	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
				719	struct vm_area_struct *expand)
				720	{
				721	struct mm_struct *mm = vma->vm_mm;
				722	struct vm_area_struct next = vma->vm_next, orig_vma = vma;
				723	struct address_space *mapping = NULL;
				724	struct rb_root_cached *root = NULL;
				725	struct anon_vma *anon_vma = NULL;
				726	struct file *file = vma->vm_file;
				727	bool start_changed = false, end_changed = false;
				728	long adjust_next = 0;
				729	int remove_next = 0;
				730
				731	if (next && !insert) {
				732	struct vm_area_struct exporter = NULL, importer = NULL;
				733
				734	if (end >= next->vm_end) {
				735	/*
				736	* vma expands, overlapping all the next, and
				737	* perhaps the one after too (mprotect case 6).
				738	* The only other cases that gets here are
				739	* case 1, case 7 and case 8.
				740	*/
				741	if (next == expand) {
				742	/*
				743	* The only case where we don't expand "vma"
				744	* and we expand "next" instead is case 8.
				745	*/
				746	VM_WARN_ON(end != next->vm_end);
				747	/*
				748	* remove_next == 3 means we're
				749	* removing "vma" and that to do so we
				750	* swapped "vma" and "next".
				751	*/
				752	remove_next = 3;
				753	VM_WARN_ON(file != next->vm_file);
				754	swap(vma, next);
				755	} else {
				756	VM_WARN_ON(expand != vma);
				757	/*
				758	* case 1, 6, 7, remove_next == 2 is case 6,
				759	* remove_next == 1 is case 1 or 7.
				760	*/
				761	remove_next = 1 + (end > next->vm_end);
				762	VM_WARN_ON(remove_next == 2 &&
				763	end != next->vm_next->vm_end);
				764	VM_WARN_ON(remove_next == 1 &&
				765	end != next->vm_end);
				766	/* trim end to next, for case 6 first pass */
				767	end = next->vm_end;
				768	}
				769
				770	exporter = next;
				771	importer = vma;
				772
				773	/*
				774	* If next doesn't have anon_vma, import from vma after
				775	* next, if the vma overlaps with it.
				776	*/
				777	if (remove_next == 2 && !next->anon_vma)
				778	exporter = next->vm_next;
				779
				780	} else if (end > next->vm_start) {
				781	/*
				782	* vma expands, overlapping part of the next:
				783	* mprotect case 5 shifting the boundary up.
				784	*/
				785	adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
				786	exporter = next;
				787	importer = vma;
				788	VM_WARN_ON(expand != importer);
				789	} else if (end < vma->vm_end) {
				790	/*
				791	* vma shrinks, and !insert tells it's not
				792	* split_vma inserting another: so it must be
				793	* mprotect case 4 shifting the boundary down.
				794	*/
				795	adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
				796	exporter = vma;
				797	importer = next;
				798	VM_WARN_ON(expand != importer);
				799	}
				800
				801	/*
				802	* Easily overlooked: when mprotect shifts the boundary,
				803	* make sure the expanding vma has anon_vma set if the
				804	* shrinking vma had, to cover any anon pages imported.
				805	*/
				806	if (exporter && exporter->anon_vma && !importer->anon_vma) {
				807	int error;
				808
				809	importer->anon_vma = exporter->anon_vma;
				810	error = anon_vma_clone(importer, exporter);
				811	if (error)
				812	return error;
				813	}
				814	}
				815	again:
				816	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
				817
				818	if (file) {
				819	mapping = file->f_mapping;
				820	root = &mapping->i_mmap;
				821	uprobe_munmap(vma, vma->vm_start, vma->vm_end);
				822
				823	if (adjust_next)
				824	uprobe_munmap(next, next->vm_start, next->vm_end);
				825
				826	i_mmap_lock_write(mapping);
				827	if (insert) {
				828	/*
				829	* Put into interval tree now, so instantiated pages
				830	* are visible to arm/parisc __flush_dcache_page
				831	* throughout; but we cannot insert into address
				832	* space until vma start or end is updated.
				833	*/
				834	__vma_link_file(insert);
				835	}
				836	}
				837
				838	anon_vma = vma->anon_vma;
				839	if (!anon_vma && adjust_next)
				840	anon_vma = next->anon_vma;
				841	if (anon_vma) {
				842	VM_WARN_ON(adjust_next && next->anon_vma &&
				843	anon_vma != next->anon_vma);
				844	anon_vma_lock_write(anon_vma);
				845	anon_vma_interval_tree_pre_update_vma(vma);
				846	if (adjust_next)
				847	anon_vma_interval_tree_pre_update_vma(next);
				848	}
				849
				850	if (root) {
				851	flush_dcache_mmap_lock(mapping);
				852	vma_interval_tree_remove(vma, root);
				853	if (adjust_next)
				854	vma_interval_tree_remove(next, root);
				855	}
				856
				857	if (start != vma->vm_start) {
				858	vma->vm_start = start;
				859	start_changed = true;
				860	}
				861	if (end != vma->vm_end) {
				862	vma->vm_end = end;
				863	end_changed = true;
				864	}
				865	vma->vm_pgoff = pgoff;
				866	if (adjust_next) {
				867	next->vm_start += adjust_next << PAGE_SHIFT;
				868	next->vm_pgoff += adjust_next;
				869	}
				870
				871	if (root) {
				872	if (adjust_next)
				873	vma_interval_tree_insert(next, root);
				874	vma_interval_tree_insert(vma, root);
				875	flush_dcache_mmap_unlock(mapping);
				876	}
				877
				878	if (remove_next) {
				879	/*
				880	* vma_merge has merged next into vma, and needs
				881	* us to remove next before dropping the locks.
				882	*/
				883	if (remove_next != 3)
				884	__vma_unlink_prev(mm, next, vma);
				885	else
				886	/*
				887	* vma is not before next if they've been
				888	* swapped.
				889	*
				890	* pre-swap() next->vm_start was reduced so
				891	* tell validate_mm_rb to ignore pre-swap()
				892	* "next" (which is stored in post-swap()
				893	* "vma").
				894	*/
				895	__vma_unlink_common(mm, next, NULL, false, vma);
				896	if (file)
				897	__remove_shared_vm_struct(next, file, mapping);
				898	} else if (insert) {
				899	/*
				900	* split_vma has split insert from vma, and needs
				901	* us to insert it before dropping the locks
				902	* (it may either follow vma or precede it).
				903	*/
				904	__insert_vm_struct(mm, insert);
				905	} else {
				906	if (start_changed)
				907	vma_gap_update(vma);
				908	if (end_changed) {
				909	if (!next)
				910	mm->highest_vm_end = vm_end_gap(vma);
				911	else if (!adjust_next)
				912	vma_gap_update(next);
				913	}
				914	}
				915
				916	if (anon_vma) {
				917	anon_vma_interval_tree_post_update_vma(vma);
				918	if (adjust_next)
				919	anon_vma_interval_tree_post_update_vma(next);
				920	anon_vma_unlock_write(anon_vma);
				921	}
				922	if (mapping)
				923	i_mmap_unlock_write(mapping);
				924
				925	if (root) {
				926	uprobe_mmap(vma);
				927
				928	if (adjust_next)
				929	uprobe_mmap(next);
				930	}
				931
				932	if (remove_next) {
				933	if (file) {
				934	uprobe_munmap(next, next->vm_start, next->vm_end);
				935	fput(file);
				936	}
				937	if (next->anon_vma)
				938	anon_vma_merge(vma, next);
				939	mm->map_count--;
				940	mpol_put(vma_policy(next));
				941	vm_area_free(next);
				942	/*
				943	* In mprotect's case 6 (see comments on vma_merge),
				944	* we must remove another next too. It would clutter
				945	* up the code too much to do both in one go.
				946	*/
				947	if (remove_next != 3) {
				948	/*
				949	* If "next" was removed and vma->vm_end was
				950	* expanded (up) over it, in turn
				951	* "next->vm_prev->vm_end" changed and the
				952	* "vma->vm_next" gap must be updated.
				953	*/
				954	next = vma->vm_next;
				955	} else {
				956	/*
				957	* For the scope of the comment "next" and
				958	* "vma" considered pre-swap(): if "vma" was
				959	* removed, next->vm_start was expanded (down)
				960	* over it and the "next" gap must be updated.
				961	* Because of the swap() the post-swap() "vma"
				962	* actually points to pre-swap() "next"
				963	* (post-swap() "next" as opposed is now a
				964	* dangling pointer).
				965	*/
				966	next = vma;
				967	}
				968	if (remove_next == 2) {
				969	remove_next = 1;
				970	end = next->vm_end;
				971	goto again;
				972	}
				973	else if (next)
				974	vma_gap_update(next);
				975	else {
				976	/*
				977	* If remove_next == 2 we obviously can't
				978	* reach this path.
				979	*
				980	* If remove_next == 3 we can't reach this
				981	* path because pre-swap() next is always not
				982	* NULL. pre-swap() "next" is not being
				983	* removed and its next->vm_end is not altered
				984	* (and furthermore "end" already matches
				985	* next->vm_end in remove_next == 3).
				986	*
				987	* We reach this only in the remove_next == 1
				988	* case if the "next" vma that was removed was
				989	* the highest vma of the mm. However in such
				990	* case next->vm_end == "end" and the extended
				991	* "vma" has vma->vm_end == next->vm_end so
				992	* mm->highest_vm_end doesn't need any update
				993	* in remove_next == 1 case.
				994	*/
				995	VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
				996	}
				997	}
				998	if (insert && file)
				999	uprobe_mmap(insert);
				1000
				1001	validate_mm(mm);
				1002
				1003	return 0;
				1004	}
				1005
				1006	/*
				1007	* If the vma has a ->close operation then the driver probably needs to release
				1008	* per-vma resources, so we don't attempt to merge those.
				1009	*/
				1010	static inline int is_mergeable_vma(struct vm_area_struct *vma,
				1011	struct file *file, unsigned long vm_flags,
				1012	struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
				1013	{
				1014	/*
				1015	* VM_SOFTDIRTY should not prevent from VMA merging, if we
				1016	* match the flags but dirty bit -- the caller should mark
				1017	* merged VMA as dirty. If dirty bit won't be excluded from
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1018	* comparison, we increase pressure on the memory system forcing
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1019	* the kernel to generate new VMAs when old one could be
				1020	* extended instead.
				1021	*/
				1022	if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
				1023	return 0;
				1024	if (vma->vm_file != file)
				1025	return 0;
				1026	if (vma->vm_ops && vma->vm_ops->close)
				1027	return 0;
				1028	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
				1029	return 0;
				1030	return 1;
				1031	}
				1032
				1033	static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
				1034	struct anon_vma *anon_vma2,
				1035	struct vm_area_struct *vma)
				1036	{
				1037	/*
				1038	* The list_is_singular() test is to avoid merging VMA cloned from
				1039	* parents. This can improve scalability caused by anon_vma lock.
				1040	*/
				1041	if ((!anon_vma1 \|\| !anon_vma2) && (!vma \|\|
				1042	list_is_singular(&vma->anon_vma_chain)))
				1043	return 1;
				1044	return anon_vma1 == anon_vma2;
				1045	}
				1046
				1047	/*
				1048	* Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
				1049	* in front of (at a lower virtual address and file offset than) the vma.
				1050	*
				1051	* We cannot merge two vmas if they have differently assigned (non-NULL)
				1052	* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
				1053	*
				1054	* We don't check here for the merged mmap wrapping around the end of pagecache
				1055	* indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
				1056	* wrap, nor mmaps which cover the final page at index -1UL.
				1057	*/
				1058	static int
				1059	can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
				1060	struct anon_vma anon_vma, struct file file,
				1061	pgoff_t vm_pgoff,
				1062	struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
				1063	{
				1064	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
				1065	is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
				1066	if (vma->vm_pgoff == vm_pgoff)
				1067	return 1;
				1068	}
				1069	return 0;
				1070	}
				1071
				1072	/*
				1073	* Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
				1074	* beyond (at a higher virtual address and file offset than) the vma.
				1075	*
				1076	* We cannot merge two vmas if they have differently assigned (non-NULL)
				1077	* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
				1078	*/
				1079	static int
				1080	can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
				1081	struct anon_vma anon_vma, struct file file,
				1082	pgoff_t vm_pgoff,
				1083	struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
				1084	{
				1085	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
				1086	is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
				1087	pgoff_t vm_pglen;
				1088	vm_pglen = vma_pages(vma);
				1089	if (vma->vm_pgoff + vm_pglen == vm_pgoff)
				1090	return 1;
				1091	}
				1092	return 0;
				1093	}
				1094
				1095	/*
				1096	* Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
				1097	* whether that can be merged with its predecessor or its successor.
				1098	* Or both (it neatly fills a hole).
				1099	*
				1100	* In most cases - when called for mmap, brk or mremap - [addr,end) is
				1101	* certain not to be mapped by the time vma_merge is called; but when
				1102	* called for mprotect, it is certain to be already mapped (either at
				1103	* an offset within prev, or at the start of next), and the flags of
				1104	* this area are about to be changed to vm_flags - and the no-change
				1105	* case has already been eliminated.
				1106	*
				1107	* The following mprotect cases have to be considered, where AAAA is
				1108	* the area passed down from mprotect_fixup, never extending beyond one
				1109	* vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
				1110	*
				1111	* AAAA AAAA AAAA AAAA
				1112	* PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX
				1113	* cannot merge might become might become might become
				1114	* PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
				1115	* mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
				1116	* mremap move: PPPPXXXXXXXX 8
				1117	* AAAA
				1118	* PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
				1119	* might become case 1 below case 2 below case 3 below
				1120	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1121	* It is important for case 8 that the vma NNNN overlapping the
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1122	* region AAAA is never going to extended over XXXX. Instead XXXX must
				1123	* be extended in region AAAA and NNNN must be removed. This way in
				1124	* all cases where vma_merge succeeds, the moment vma_adjust drops the
				1125	* rmap_locks, the properties of the merged vma will be already
				1126	* correct for the whole merged range. Some of those properties like
				1127	* vm_page_prot/vm_flags may be accessed by rmap_walks and they must
				1128	* be correct for the whole merged range immediately after the
				1129	* rmap_locks are released. Otherwise if XXXX would be removed and
				1130	* NNNN would be extended over the XXXX range, remove_migration_ptes
				1131	* or other rmap walkers (if working on addresses beyond the "end"
				1132	* parameter) may establish ptes with the wrong permissions of NNNN
				1133	* instead of the right permissions of XXXX.
				1134	*/
				1135	struct vm_area_struct vma_merge(struct mm_struct mm,
				1136	struct vm_area_struct *prev, unsigned long addr,
				1137	unsigned long end, unsigned long vm_flags,
				1138	struct anon_vma anon_vma, struct file file,
				1139	pgoff_t pgoff, struct mempolicy *policy,
				1140	struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
				1141	{
				1142	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
				1143	struct vm_area_struct area, next;
				1144	int err;
				1145
				1146	/*
				1147	* We later require that vma->vm_flags == vm_flags,
				1148	* so this tests vma->vm_flags & VM_SPECIAL, too.
				1149	*/
				1150	if (vm_flags & VM_SPECIAL)
				1151	return NULL;
				1152
				1153	if (prev)
				1154	next = prev->vm_next;
				1155	else
				1156	next = mm->mmap;
				1157	area = next;
				1158	if (area && area->vm_end == end) /* cases 6, 7, 8 */
				1159	next = next->vm_next;
				1160
				1161	/* verify some invariant that must be enforced by the caller */
				1162	VM_WARN_ON(prev && addr <= prev->vm_start);
				1163	VM_WARN_ON(area && end > area->vm_end);
				1164	VM_WARN_ON(addr >= end);
				1165
				1166	/*
				1167	* Can it merge with the predecessor?
				1168	*/
				1169	if (prev && prev->vm_end == addr &&
				1170	mpol_equal(vma_policy(prev), policy) &&
				1171	can_vma_merge_after(prev, vm_flags,
				1172	anon_vma, file, pgoff,
				1173	vm_userfaultfd_ctx)) {
				1174	/*
				1175	* OK, it can. Can we now merge in the successor as well?
				1176	*/
				1177	if (next && end == next->vm_start &&
				1178	mpol_equal(policy, vma_policy(next)) &&
				1179	can_vma_merge_before(next, vm_flags,
				1180	anon_vma, file,
				1181	pgoff+pglen,
				1182	vm_userfaultfd_ctx) &&
				1183	is_mergeable_anon_vma(prev->anon_vma,
				1184	next->anon_vma, NULL)) {
				1185	/* cases 1, 6 */
				1186	err = __vma_adjust(prev, prev->vm_start,
				1187	next->vm_end, prev->vm_pgoff, NULL,
				1188	prev);
				1189	} else /* cases 2, 5, 7 */
				1190	err = __vma_adjust(prev, prev->vm_start,
				1191	end, prev->vm_pgoff, NULL, prev);
				1192	if (err)
				1193	return NULL;
				1194	khugepaged_enter_vma_merge(prev, vm_flags);
				1195	return prev;
				1196	}
				1197
				1198	/*
				1199	* Can this new request be merged in front of next?
				1200	*/
				1201	if (next && end == next->vm_start &&
				1202	mpol_equal(policy, vma_policy(next)) &&
				1203	can_vma_merge_before(next, vm_flags,
				1204	anon_vma, file, pgoff+pglen,
				1205	vm_userfaultfd_ctx)) {
				1206	if (prev && addr < prev->vm_end) /* case 4 */
				1207	err = __vma_adjust(prev, prev->vm_start,
				1208	addr, prev->vm_pgoff, NULL, next);
				1209	else { /* cases 3, 8 */
				1210	err = __vma_adjust(area, addr, next->vm_end,
				1211	next->vm_pgoff - pglen, NULL, next);
				1212	/*
				1213	* In case 3 area is already equal to next and
				1214	* this is a noop, but in case 8 "area" has
				1215	* been removed and next was expanded over it.
				1216	*/
				1217	area = next;
				1218	}
				1219	if (err)
				1220	return NULL;
				1221	khugepaged_enter_vma_merge(area, vm_flags);
				1222	return area;
				1223	}
				1224
				1225	return NULL;
				1226	}
				1227
				1228	/*
				1229	* Rough compatbility check to quickly see if it's even worth looking
				1230	* at sharing an anon_vma.
				1231	*
				1232	* They need to have the same vm_file, and the flags can only differ
				1233	* in things that mprotect may change.
				1234	*
				1235	* NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
				1236	* we can merge the two vma's. For example, we refuse to merge a vma if
				1237	* there is a vm_ops->close() function, because that indicates that the
				1238	* driver is doing some kind of reference counting. But that doesn't
				1239	* really matter for the anon_vma sharing case.
				1240	*/
				1241	static int anon_vma_compatible(struct vm_area_struct a, struct vm_area_struct b)
				1242	{
				1243	return a->vm_end == b->vm_start &&
				1244	mpol_equal(vma_policy(a), vma_policy(b)) &&
				1245	a->vm_file == b->vm_file &&
				1246	!((a->vm_flags ^ b->vm_flags) & ~(VM_READ\|VM_WRITE\|VM_EXEC\|VM_SOFTDIRTY)) &&
				1247	b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
				1248	}
				1249
				1250	/*
				1251	* Do some basic sanity checking to see if we can re-use the anon_vma
				1252	* from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
				1253	* the same as 'old', the other will be the new one that is trying
				1254	* to share the anon_vma.
				1255	*
				1256	* NOTE! This runs with mm_sem held for reading, so it is possible that
				1257	* the anon_vma of 'old' is concurrently in the process of being set up
				1258	* by another page fault trying to merge _that_. But that's ok: if it
				1259	* is being set up, that automatically means that it will be a singleton
				1260	* acceptable for merging, so we can do all of this optimistically. But
				1261	* we do that READ_ONCE() to make sure that we never re-load the pointer.
				1262	*
				1263	* IOW: that the "list_is_singular()" test on the anon_vma_chain only
				1264	* matters for the 'stable anon_vma' case (ie the thing we want to avoid
				1265	* is to return an anon_vma that is "complex" due to having gone through
				1266	* a fork).
				1267	*
				1268	* We also make sure that the two vma's are compatible (adjacent,
				1269	* and with the same memory policies). That's all stable, even with just
				1270	* a read lock on the mm_sem.
				1271	*/
				1272	static struct anon_vma reusable_anon_vma(struct vm_area_struct old, struct vm_area_struct a, struct vm_area_struct b)
				1273	{
				1274	if (anon_vma_compatible(a, b)) {
				1275	struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
				1276
				1277	if (anon_vma && list_is_singular(&old->anon_vma_chain))
				1278	return anon_vma;
				1279	}
				1280	return NULL;
				1281	}
				1282
				1283	/*
				1284	* find_mergeable_anon_vma is used by anon_vma_prepare, to check
				1285	* neighbouring vmas for a suitable anon_vma, before it goes off
				1286	* to allocate a new anon_vma. It checks because a repetitive
				1287	* sequence of mprotects and faults may otherwise lead to distinct
				1288	* anon_vmas being allocated, preventing vma merge in subsequent
				1289	* mprotect.
				1290	*/
				1291	struct anon_vma find_mergeable_anon_vma(struct vm_area_struct vma)
				1292	{
				1293	struct anon_vma *anon_vma;
				1294	struct vm_area_struct *near;
				1295
				1296	near = vma->vm_next;
				1297	if (!near)
				1298	goto try_prev;
				1299
				1300	anon_vma = reusable_anon_vma(near, vma, near);
				1301	if (anon_vma)
				1302	return anon_vma;
				1303	try_prev:
				1304	near = vma->vm_prev;
				1305	if (!near)
				1306	goto none;
				1307
				1308	anon_vma = reusable_anon_vma(near, near, vma);
				1309	if (anon_vma)
				1310	return anon_vma;
				1311	none:
				1312	/*
				1313	* There's no absolute need to look only at touching neighbours:
				1314	* we could search further afield for "compatible" anon_vmas.
				1315	* But it would probably just be a waste of time searching,
				1316	* or lead to too many vmas hanging off the same anon_vma.
				1317	* We're trying to allow mprotect remerging later on,
				1318	* not trying to minimize memory used for anon_vmas.
				1319	*/
				1320	return NULL;
				1321	}
				1322
				1323	/*
				1324	* If a hint addr is less than mmap_min_addr change hint to be as
				1325	* low as possible but still greater than mmap_min_addr
				1326	*/
				1327	static inline unsigned long round_hint_to_min(unsigned long hint)
				1328	{
				1329	hint &= PAGE_MASK;
				1330	if (((void *)hint != NULL) &&
				1331	(hint < mmap_min_addr))
				1332	return PAGE_ALIGN(mmap_min_addr);
				1333	return hint;
				1334	}
				1335
				1336	static inline int mlock_future_check(struct mm_struct *mm,
				1337	unsigned long flags,
				1338	unsigned long len)
				1339	{
				1340	unsigned long locked, lock_limit;
				1341
				1342	/* mlock MCL_FUTURE? */
				1343	if (flags & VM_LOCKED) {
				1344	locked = len >> PAGE_SHIFT;
				1345	locked += mm->locked_vm;
				1346	lock_limit = rlimit(RLIMIT_MEMLOCK);
				1347	lock_limit >>= PAGE_SHIFT;
				1348	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
				1349	return -EAGAIN;
				1350	}
				1351	return 0;
				1352	}
				1353
				1354	static inline u64 file_mmap_size_max(struct file file, struct inode inode)
				1355	{
				1356	if (S_ISREG(inode->i_mode))
				1357	return MAX_LFS_FILESIZE;
				1358
				1359	if (S_ISBLK(inode->i_mode))
				1360	return MAX_LFS_FILESIZE;
				1361
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1362	if (S_ISSOCK(inode->i_mode))
				1363	return MAX_LFS_FILESIZE;
				1364
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1365	/* Special "we do even unsigned file positions" case */
				1366	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
				1367	return 0;
				1368
				1369	/* Yes, random drivers might want more. But I'm tired of buggy drivers */
				1370	return ULONG_MAX;
				1371	}
				1372
				1373	static inline bool file_mmap_ok(struct file file, struct inode inode,
				1374	unsigned long pgoff, unsigned long len)
				1375	{
				1376	u64 maxsize = file_mmap_size_max(file, inode);
				1377
				1378	if (maxsize && len > maxsize)
				1379	return false;
				1380	maxsize -= len;
				1381	if (pgoff > maxsize >> PAGE_SHIFT)
				1382	return false;
				1383	return true;
				1384	}
				1385
				1386	/*
				1387	* The caller must hold down_write(&current->mm->mmap_sem).
				1388	*/
				1389	unsigned long do_mmap(struct file *file, unsigned long addr,
				1390	unsigned long len, unsigned long prot,
				1391	unsigned long flags, vm_flags_t vm_flags,
				1392	unsigned long pgoff, unsigned long *populate,
				1393	struct list_head *uf)
				1394	{
				1395	struct mm_struct *mm = current->mm;
				1396	int pkey = 0;
				1397
				1398	*populate = 0;
				1399
				1400	if (!len)
				1401	return -EINVAL;
				1402
				1403	/*
				1404	* Does the application expect PROT_READ to imply PROT_EXEC?
				1405	*
				1406	* (the exception is when the underlying filesystem is noexec
				1407	* mounted, in which case we dont add PROT_EXEC.)
				1408	*/
				1409	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
				1410	if (!(file && path_noexec(&file->f_path)))
				1411	prot \|= PROT_EXEC;
				1412
				1413	/* force arch specific MAP_FIXED handling in get_unmapped_area */
				1414	if (flags & MAP_FIXED_NOREPLACE)
				1415	flags \|= MAP_FIXED;
				1416
				1417	if (!(flags & MAP_FIXED))
				1418	addr = round_hint_to_min(addr);
				1419
				1420	/* Careful about overflows.. */
				1421	len = PAGE_ALIGN(len);
				1422	if (!len)
				1423	return -ENOMEM;
				1424
				1425	/* offset overflow? */
				1426	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
				1427	return -EOVERFLOW;
				1428
				1429	/* Too many mappings? */
				1430	if (mm->map_count > sysctl_max_map_count)
				1431	return -ENOMEM;
				1432
				1433	/* Obtain the address to map to. we verify (or select) it and ensure
				1434	* that it represents a valid section of the address space.
				1435	*/
				1436	addr = get_unmapped_area(file, addr, len, pgoff, flags);
				1437	if (offset_in_page(addr))
				1438	return addr;
				1439
				1440	if (flags & MAP_FIXED_NOREPLACE) {
				1441	struct vm_area_struct *vma = find_vma(mm, addr);
				1442
				1443	if (vma && vma->vm_start < addr + len)
				1444	return -EEXIST;
				1445	}
				1446
				1447	if (prot == PROT_EXEC) {
				1448	pkey = execute_only_pkey(mm);
				1449	if (pkey < 0)
				1450	pkey = 0;
				1451	}
				1452
				1453	/* Do simple checking here so the lower-level routines won't have
				1454	* to. we assume access permissions have been handled by the open
				1455	* of the memory object, so we don't do any here.
				1456	*/
				1457	vm_flags \|= calc_vm_prot_bits(prot, pkey) \| calc_vm_flag_bits(flags) \|
				1458	mm->def_flags \| VM_MAYREAD \| VM_MAYWRITE \| VM_MAYEXEC;
				1459
				1460	if (flags & MAP_LOCKED)
				1461	if (!can_do_mlock())
				1462	return -EPERM;
				1463
				1464	if (mlock_future_check(mm, vm_flags, len))
				1465	return -EAGAIN;
				1466
				1467	if (file) {
				1468	struct inode *inode = file_inode(file);
				1469	unsigned long flags_mask;
				1470
				1471	if (!file_mmap_ok(file, inode, pgoff, len))
				1472	return -EOVERFLOW;
				1473
				1474	flags_mask = LEGACY_MAP_MASK \| file->f_op->mmap_supported_flags;
				1475
				1476	switch (flags & MAP_TYPE) {
				1477	case MAP_SHARED:
				1478	/*
				1479	* Force use of MAP_SHARED_VALIDATE with non-legacy
				1480	* flags. E.g. MAP_SYNC is dangerous to use with
				1481	* MAP_SHARED as you don't know which consistency model
				1482	* you will get. We silently ignore unsupported flags
				1483	* with MAP_SHARED to preserve backward compatibility.
				1484	*/
				1485	flags &= LEGACY_MAP_MASK;
				1486	/* fall through */
				1487	case MAP_SHARED_VALIDATE:
				1488	if (flags & ~flags_mask)
				1489	return -EOPNOTSUPP;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1490	if (prot & PROT_WRITE) {
				1491	if (!(file->f_mode & FMODE_WRITE))
				1492	return -EACCES;
				1493	if (IS_SWAPFILE(file->f_mapping->host))
				1494	return -ETXTBSY;
				1495	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1496
				1497	/*
				1498	* Make sure we don't allow writing to an append-only
				1499	* file..
				1500	*/
				1501	if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				1502	return -EACCES;
				1503
				1504	/*
				1505	* Make sure there are no mandatory locks on the file.
				1506	*/
				1507	if (locks_verify_locked(file))
				1508	return -EAGAIN;
				1509
				1510	vm_flags \|= VM_SHARED \| VM_MAYSHARE;
				1511	if (!(file->f_mode & FMODE_WRITE))
				1512	vm_flags &= ~(VM_MAYWRITE \| VM_SHARED);
				1513
				1514	/* fall through */
				1515	case MAP_PRIVATE:
				1516	if (!(file->f_mode & FMODE_READ))
				1517	return -EACCES;
				1518	if (path_noexec(&file->f_path)) {
				1519	if (vm_flags & VM_EXEC)
				1520	return -EPERM;
				1521	vm_flags &= ~VM_MAYEXEC;
				1522	}
				1523
				1524	if (!file->f_op->mmap)
				1525	return -ENODEV;
				1526	if (vm_flags & (VM_GROWSDOWN\|VM_GROWSUP))
				1527	return -EINVAL;
				1528	break;
				1529
				1530	default:
				1531	return -EINVAL;
				1532	}
				1533	} else {
				1534	switch (flags & MAP_TYPE) {
				1535	case MAP_SHARED:
				1536	if (vm_flags & (VM_GROWSDOWN\|VM_GROWSUP))
				1537	return -EINVAL;
				1538	/*
				1539	* Ignore pgoff.
				1540	*/
				1541	pgoff = 0;
				1542	vm_flags \|= VM_SHARED \| VM_MAYSHARE;
				1543	break;
				1544	case MAP_PRIVATE:
				1545	/*
				1546	* Set pgoff according to addr for anon_vma.
				1547	*/
				1548	pgoff = addr >> PAGE_SHIFT;
				1549	break;
				1550	default:
				1551	return -EINVAL;
				1552	}
				1553	}
				1554
				1555	/*
				1556	* Set 'VM_NORESERVE' if we should not account for the
				1557	* memory use of this mapping.
				1558	*/
				1559	if (flags & MAP_NORESERVE) {
				1560	/* We honor MAP_NORESERVE if allowed to overcommit */
				1561	if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
				1562	vm_flags \|= VM_NORESERVE;
				1563
				1564	/* hugetlb applies strict overcommit unless MAP_NORESERVE */
				1565	if (file && is_file_hugepages(file))
				1566	vm_flags \|= VM_NORESERVE;
				1567	}
				1568
				1569	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
				1570	if (!IS_ERR_VALUE(addr) &&
				1571	((vm_flags & VM_LOCKED) \|\|
				1572	(flags & (MAP_POPULATE \| MAP_NONBLOCK)) == MAP_POPULATE))
				1573	*populate = len;
				1574	return addr;
				1575	}
				1576
				1577	unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
				1578	unsigned long prot, unsigned long flags,
				1579	unsigned long fd, unsigned long pgoff)
				1580	{
				1581	struct file *file = NULL;
				1582	unsigned long retval;
				1583
				1584	if (!(flags & MAP_ANONYMOUS)) {
				1585	audit_mmap_fd(fd, flags);
				1586	file = fget(fd);
				1587	if (!file)
				1588	return -EBADF;
				1589	if (is_file_hugepages(file))
				1590	len = ALIGN(len, huge_page_size(hstate_file(file)));
				1591	retval = -EINVAL;
				1592	if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
				1593	goto out_fput;
				1594	} else if (flags & MAP_HUGETLB) {
				1595	struct user_struct *user = NULL;
				1596	struct hstate *hs;
				1597
				1598	hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
				1599	if (!hs)
				1600	return -EINVAL;
				1601
				1602	len = ALIGN(len, huge_page_size(hs));
				1603	/*
				1604	* VM_NORESERVE is used because the reservations will be
				1605	* taken when vm_ops->mmap() is called
				1606	* A dummy user value is used because we are not locking
				1607	* memory so no accounting is necessary
				1608	*/
				1609	file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
				1610	VM_NORESERVE,
				1611	&user, HUGETLB_ANONHUGE_INODE,
				1612	(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
				1613	if (IS_ERR(file))
				1614	return PTR_ERR(file);
				1615	}
				1616
				1617	flags &= ~(MAP_EXECUTABLE \| MAP_DENYWRITE);
				1618
				1619	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
				1620	out_fput:
				1621	if (file)
				1622	fput(file);
				1623	return retval;
				1624	}
				1625
				1626	SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
				1627	unsigned long, prot, unsigned long, flags,
				1628	unsigned long, fd, unsigned long, pgoff)
				1629	{
				1630	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
				1631	}
				1632
				1633	#ifdef __ARCH_WANT_SYS_OLD_MMAP
				1634	struct mmap_arg_struct {
				1635	unsigned long addr;
				1636	unsigned long len;
				1637	unsigned long prot;
				1638	unsigned long flags;
				1639	unsigned long fd;
				1640	unsigned long offset;
				1641	};
				1642
				1643	SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
				1644	{
				1645	struct mmap_arg_struct a;
				1646
				1647	if (copy_from_user(&a, arg, sizeof(a)))
				1648	return -EFAULT;
				1649	if (offset_in_page(a.offset))
				1650	return -EINVAL;
				1651
				1652	return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
				1653	a.offset >> PAGE_SHIFT);
				1654	}
				1655	#endif /* __ARCH_WANT_SYS_OLD_MMAP */
				1656
				1657	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1658	* Some shared mappings will want the pages marked read-only
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1659	* to track write events. If so, we'll downgrade vm_page_prot
				1660	* to the private version (using protection_map[] without the
				1661	* VM_SHARED bit).
				1662	*/
				1663	int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
				1664	{
				1665	vm_flags_t vm_flags = vma->vm_flags;
				1666	const struct vm_operations_struct *vm_ops = vma->vm_ops;
				1667
				1668	/* If it was private or non-writable, the write bit is already clear */
				1669	if ((vm_flags & (VM_WRITE\|VM_SHARED)) != ((VM_WRITE\|VM_SHARED)))
				1670	return 0;
				1671
				1672	/* The backer wishes to know when pages are first written to? */
				1673	if (vm_ops && (vm_ops->page_mkwrite \|\| vm_ops->pfn_mkwrite))
				1674	return 1;
				1675
				1676	/* The open routine did something to the protections that pgprot_modify
				1677	* won't preserve? */
				1678	if (pgprot_val(vm_page_prot) !=
				1679	pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
				1680	return 0;
				1681
				1682	/* Do we need to track softdirty? */
				1683	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
				1684	return 1;
				1685
				1686	/* Specialty mapping? */
				1687	if (vm_flags & VM_PFNMAP)
				1688	return 0;
				1689
				1690	/* Can the mapping track the dirty pages? */
				1691	return vma->vm_file && vma->vm_file->f_mapping &&
				1692	mapping_cap_account_dirty(vma->vm_file->f_mapping);
				1693	}
				1694
				1695	/*
				1696	* We account for memory if it's a private writeable mapping,
				1697	* not hugepages and VM_NORESERVE wasn't set.
				1698	*/
				1699	static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
				1700	{
				1701	/*
				1702	* hugetlb has its own accounting separate from the core VM
				1703	* VM_HUGETLB may not be set yet so we cannot check for that flag.
				1704	*/
				1705	if (file && is_file_hugepages(file))
				1706	return 0;
				1707
				1708	return (vm_flags & (VM_NORESERVE \| VM_SHARED \| VM_WRITE)) == VM_WRITE;
				1709	}
				1710
				1711	unsigned long mmap_region(struct file *file, unsigned long addr,
				1712	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
				1713	struct list_head *uf)
				1714	{
				1715	struct mm_struct *mm = current->mm;
				1716	struct vm_area_struct vma, prev;
				1717	int error;
				1718	struct rb_node *rb_link, rb_parent;
				1719	unsigned long charged = 0;
				1720
				1721	/* Check against address space limit. */
				1722	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
				1723	unsigned long nr_pages;
				1724
				1725	/*
				1726	* MAP_FIXED may remove pages of mappings that intersects with
				1727	* requested mapping. Account for the pages it would unmap.
				1728	*/
				1729	nr_pages = count_vma_pages_range(mm, addr, addr + len);
				1730
				1731	if (!may_expand_vm(mm, vm_flags,
				1732	(len >> PAGE_SHIFT) - nr_pages))
				1733	return -ENOMEM;
				1734	}
				1735
				1736	/* Clear old maps */
				1737	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
				1738	&rb_parent)) {
				1739	if (do_munmap(mm, addr, len, uf))
				1740	return -ENOMEM;
				1741	}
				1742
				1743	/*
				1744	* Private writable mapping: check memory availability
				1745	*/
				1746	if (accountable_mapping(file, vm_flags)) {
				1747	charged = len >> PAGE_SHIFT;
				1748	if (security_vm_enough_memory_mm(mm, charged))
				1749	return -ENOMEM;
				1750	vm_flags \|= VM_ACCOUNT;
				1751	}
				1752
				1753	/*
				1754	* Can we just expand an old mapping?
				1755	*/
				1756	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
				1757	NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
				1758	if (vma)
				1759	goto out;
				1760
				1761	/*
				1762	* Determine the object being mapped and call the appropriate
				1763	* specific mapper. the address has already been validated, but
				1764	* not unmapped, but the maps are removed from the list.
				1765	*/
				1766	vma = vm_area_alloc(mm);
				1767	if (!vma) {
				1768	error = -ENOMEM;
				1769	goto unacct_error;
				1770	}
				1771
				1772	vma->vm_start = addr;
				1773	vma->vm_end = addr + len;
				1774	vma->vm_flags = vm_flags;
				1775	vma->vm_page_prot = vm_get_page_prot(vm_flags);
				1776	vma->vm_pgoff = pgoff;
				1777
				1778	if (file) {
				1779	if (vm_flags & VM_DENYWRITE) {
				1780	error = deny_write_access(file);
				1781	if (error)
				1782	goto free_vma;
				1783	}
				1784	if (vm_flags & VM_SHARED) {
				1785	error = mapping_map_writable(file->f_mapping);
				1786	if (error)
				1787	goto allow_write_and_free_vma;
				1788	}
				1789
				1790	/* ->mmap() can change vma->vm_file, but must guarantee that
				1791	* vma_link() below can deny write-access if VM_DENYWRITE is set
				1792	* and map writably if VM_SHARED is set. This usually means the
				1793	* new file must not have been exposed to user-space, yet.
				1794	*/
				1795	vma->vm_file = get_file(file);
				1796	error = call_mmap(file, vma);
				1797	if (error)
				1798	goto unmap_and_free_vma;
				1799
				1800	/* Can addr have changed??
				1801	*
				1802	* Answer: Yes, several device drivers can do it in their
				1803	* f_op->mmap method. -DaveM
				1804	* Bug: If addr is changed, prev, rb_link, rb_parent should
				1805	* be updated for vma_link()
				1806	*/
				1807	WARN_ON_ONCE(addr != vma->vm_start);
				1808
				1809	addr = vma->vm_start;
				1810	vm_flags = vma->vm_flags;
				1811	} else if (vm_flags & VM_SHARED) {
				1812	error = shmem_zero_setup(vma);
				1813	if (error)
				1814	goto free_vma;
				1815	} else {
				1816	vma_set_anonymous(vma);
				1817	}
				1818
				1819	vma_link(mm, vma, prev, rb_link, rb_parent);
				1820	/* Once vma denies write, undo our temporary denial count */
				1821	if (file) {
				1822	if (vm_flags & VM_SHARED)
				1823	mapping_unmap_writable(file->f_mapping);
				1824	if (vm_flags & VM_DENYWRITE)
				1825	allow_write_access(file);
				1826	}
				1827	file = vma->vm_file;
				1828	out:
				1829	perf_event_mmap(vma);
				1830
				1831	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
				1832	if (vm_flags & VM_LOCKED) {
				1833	if ((vm_flags & VM_SPECIAL) \|\| vma_is_dax(vma) \|\|
				1834	is_vm_hugetlb_page(vma) \|\|
				1835	vma == get_gate_vma(current->mm))
				1836	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
				1837	else
				1838	mm->locked_vm += (len >> PAGE_SHIFT);
				1839	}
				1840
				1841	if (file)
				1842	uprobe_mmap(vma);
				1843
				1844	/*
				1845	* New (or expanded) vma always get soft dirty status.
				1846	* Otherwise user-space soft-dirty page tracker won't
				1847	* be able to distinguish situation when vma area unmapped,
				1848	* then new mapped in-place (which must be aimed as
				1849	* a completely new data area).
				1850	*/
				1851	vma->vm_flags \|= VM_SOFTDIRTY;
				1852
				1853	vma_set_page_prot(vma);
				1854
				1855	return addr;
				1856
				1857	unmap_and_free_vma:
				1858	vma->vm_file = NULL;
				1859	fput(file);
				1860
				1861	/* Undo any partial mapping done by a device driver. */
				1862	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
				1863	charged = 0;
				1864	if (vm_flags & VM_SHARED)
				1865	mapping_unmap_writable(file->f_mapping);
				1866	allow_write_and_free_vma:
				1867	if (vm_flags & VM_DENYWRITE)
				1868	allow_write_access(file);
				1869	free_vma:
				1870	vm_area_free(vma);
				1871	unacct_error:
				1872	if (charged)
				1873	vm_unacct_memory(charged);
				1874	return error;
				1875	}
				1876
				1877	unsigned long unmapped_area(struct vm_unmapped_area_info *info)
				1878	{
				1879	/*
				1880	* We implement the search by looking for an rbtree node that
				1881	* immediately follows a suitable gap. That is,
				1882	* - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
				1883	* - gap_end = vma->vm_start >= info->low_limit + length;
				1884	* - gap_end - gap_start >= length
				1885	*/
				1886
				1887	struct mm_struct *mm = current->mm;
				1888	struct vm_area_struct *vma;
				1889	unsigned long length, low_limit, high_limit, gap_start, gap_end;
				1890
				1891	/* Adjust search length to account for worst case alignment overhead */
				1892	length = info->length + info->align_mask;
				1893	if (length < info->length)
				1894	return -ENOMEM;
				1895
				1896	/* Adjust search limits by the desired length */
				1897	if (info->high_limit < length)
				1898	return -ENOMEM;
				1899	high_limit = info->high_limit - length;
				1900
				1901	if (info->low_limit > high_limit)
				1902	return -ENOMEM;
				1903	low_limit = info->low_limit + length;
				1904
				1905	/* Check if rbtree root looks promising */
				1906	if (RB_EMPTY_ROOT(&mm->mm_rb))
				1907	goto check_highest;
				1908	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
				1909	if (vma->rb_subtree_gap < length)
				1910	goto check_highest;
				1911
				1912	while (true) {
				1913	/* Visit left subtree if it looks promising */
				1914	gap_end = vm_start_gap(vma);
				1915	if (gap_end >= low_limit && vma->vm_rb.rb_left) {
				1916	struct vm_area_struct *left =
				1917	rb_entry(vma->vm_rb.rb_left,
				1918	struct vm_area_struct, vm_rb);
				1919	if (left->rb_subtree_gap >= length) {
				1920	vma = left;
				1921	continue;
				1922	}
				1923	}
				1924
				1925	gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
				1926	check_current:
				1927	/* Check if current node has a suitable gap */
				1928	if (gap_start > high_limit)
				1929	return -ENOMEM;
				1930	if (gap_end >= low_limit &&
				1931	gap_end > gap_start && gap_end - gap_start >= length)
				1932	goto found;
				1933
				1934	/* Visit right subtree if it looks promising */
				1935	if (vma->vm_rb.rb_right) {
				1936	struct vm_area_struct *right =
				1937	rb_entry(vma->vm_rb.rb_right,
				1938	struct vm_area_struct, vm_rb);
				1939	if (right->rb_subtree_gap >= length) {
				1940	vma = right;
				1941	continue;
				1942	}
				1943	}
				1944
				1945	/* Go back up the rbtree to find next candidate node */
				1946	while (true) {
				1947	struct rb_node *prev = &vma->vm_rb;
				1948	if (!rb_parent(prev))
				1949	goto check_highest;
				1950	vma = rb_entry(rb_parent(prev),
				1951	struct vm_area_struct, vm_rb);
				1952	if (prev == vma->vm_rb.rb_left) {
				1953	gap_start = vm_end_gap(vma->vm_prev);
				1954	gap_end = vm_start_gap(vma);
				1955	goto check_current;
				1956	}
				1957	}
				1958	}
				1959
				1960	check_highest:
				1961	/* Check highest gap, which does not precede any rbtree node */
				1962	gap_start = mm->highest_vm_end;
				1963	gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
				1964	if (gap_start > high_limit)
				1965	return -ENOMEM;
				1966
				1967	found:
				1968	/* We found a suitable gap. Clip it with the original low_limit. */
				1969	if (gap_start < info->low_limit)
				1970	gap_start = info->low_limit;
				1971
				1972	/* Adjust gap address to the desired alignment */
				1973	gap_start += (info->align_offset - gap_start) & info->align_mask;
				1974
				1975	VM_BUG_ON(gap_start + info->length > info->high_limit);
				1976	VM_BUG_ON(gap_start + info->length > gap_end);
				1977	return gap_start;
				1978	}
				1979
				1980	unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
				1981	{
				1982	struct mm_struct *mm = current->mm;
				1983	struct vm_area_struct *vma;
				1984	unsigned long length, low_limit, high_limit, gap_start, gap_end;
				1985
				1986	/* Adjust search length to account for worst case alignment overhead */
				1987	length = info->length + info->align_mask;
				1988	if (length < info->length)
				1989	return -ENOMEM;
				1990
				1991	/*
				1992	* Adjust search limits by the desired length.
				1993	* See implementation comment at top of unmapped_area().
				1994	*/
				1995	gap_end = info->high_limit;
				1996	if (gap_end < length)
				1997	return -ENOMEM;
				1998	high_limit = gap_end - length;
				1999
				2000	if (info->low_limit > high_limit)
				2001	return -ENOMEM;
				2002	low_limit = info->low_limit + length;
				2003
				2004	/* Check highest gap, which does not precede any rbtree node */
				2005	gap_start = mm->highest_vm_end;
				2006	if (gap_start <= high_limit)
				2007	goto found_highest;
				2008
				2009	/* Check if rbtree root looks promising */
				2010	if (RB_EMPTY_ROOT(&mm->mm_rb))
				2011	return -ENOMEM;
				2012	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
				2013	if (vma->rb_subtree_gap < length)
				2014	return -ENOMEM;
				2015
				2016	while (true) {
				2017	/* Visit right subtree if it looks promising */
				2018	gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
				2019	if (gap_start <= high_limit && vma->vm_rb.rb_right) {
				2020	struct vm_area_struct *right =
				2021	rb_entry(vma->vm_rb.rb_right,
				2022	struct vm_area_struct, vm_rb);
				2023	if (right->rb_subtree_gap >= length) {
				2024	vma = right;
				2025	continue;
				2026	}
				2027	}
				2028
				2029	check_current:
				2030	/* Check if current node has a suitable gap */
				2031	gap_end = vm_start_gap(vma);
				2032	if (gap_end < low_limit)
				2033	return -ENOMEM;
				2034	if (gap_start <= high_limit &&
				2035	gap_end > gap_start && gap_end - gap_start >= length)
				2036	goto found;
				2037
				2038	/* Visit left subtree if it looks promising */
				2039	if (vma->vm_rb.rb_left) {
				2040	struct vm_area_struct *left =
				2041	rb_entry(vma->vm_rb.rb_left,
				2042	struct vm_area_struct, vm_rb);
				2043	if (left->rb_subtree_gap >= length) {
				2044	vma = left;
				2045	continue;
				2046	}
				2047	}
				2048
				2049	/* Go back up the rbtree to find next candidate node */
				2050	while (true) {
				2051	struct rb_node *prev = &vma->vm_rb;
				2052	if (!rb_parent(prev))
				2053	return -ENOMEM;
				2054	vma = rb_entry(rb_parent(prev),
				2055	struct vm_area_struct, vm_rb);
				2056	if (prev == vma->vm_rb.rb_right) {
				2057	gap_start = vma->vm_prev ?
				2058	vm_end_gap(vma->vm_prev) : 0;
				2059	goto check_current;
				2060	}
				2061	}
				2062	}
				2063
				2064	found:
				2065	/* We found a suitable gap. Clip it with the original high_limit. */
				2066	if (gap_end > info->high_limit)
				2067	gap_end = info->high_limit;
				2068
				2069	found_highest:
				2070	/* Compute highest gap address at the desired alignment */
				2071	gap_end -= info->length;
				2072	gap_end -= (gap_end - info->align_offset) & info->align_mask;
				2073
				2074	VM_BUG_ON(gap_end < info->low_limit);
				2075	VM_BUG_ON(gap_end < gap_start);
				2076	return gap_end;
				2077	}
				2078
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2079
				2080	#ifndef arch_get_mmap_end
				2081	#define arch_get_mmap_end(addr) (TASK_SIZE)
				2082	#endif
				2083
				2084	#ifndef arch_get_mmap_base
				2085	#define arch_get_mmap_base(addr, base) (base)
				2086	#endif
				2087
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2088	/* Get an address range which is currently unmapped.
				2089	* For shmat() with addr=0.
				2090	*
				2091	* Ugly calling convention alert:
				2092	* Return value with the low bits set means error value,
				2093	* ie
				2094	* if (ret & ~PAGE_MASK)
				2095	* error = ret;
				2096	*
				2097	* This function "knows" that -ENOMEM has the bits set.
				2098	*/
				2099	#ifndef HAVE_ARCH_UNMAPPED_AREA
				2100	unsigned long
				2101	arch_get_unmapped_area(struct file *filp, unsigned long addr,
				2102	unsigned long len, unsigned long pgoff, unsigned long flags)
				2103	{
				2104	struct mm_struct *mm = current->mm;
				2105	struct vm_area_struct vma, prev;
				2106	struct vm_unmapped_area_info info;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2107	const unsigned long mmap_end = arch_get_mmap_end(addr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2108
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2109	if (len > mmap_end - mmap_min_addr)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2110	return -ENOMEM;
				2111
				2112	if (flags & MAP_FIXED)
				2113	return addr;
				2114
				2115	if (addr) {
				2116	addr = PAGE_ALIGN(addr);
				2117	vma = find_vma_prev(mm, addr, &prev);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2118	if (mmap_end - len >= addr && addr >= mmap_min_addr &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2119	(!vma \|\| addr + len <= vm_start_gap(vma)) &&
				2120	(!prev \|\| addr >= vm_end_gap(prev)))
				2121	return addr;
				2122	}
				2123
				2124	info.flags = 0;
				2125	info.length = len;
				2126	info.low_limit = mm->mmap_base;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2127	info.high_limit = mmap_end;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2128	info.align_mask = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2129	info.align_offset = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2130	return vm_unmapped_area(&info);
				2131	}
				2132	#endif
				2133
				2134	/*
				2135	* This mmap-allocator allocates new areas top-down from below the
				2136	* stack's low limit (the base):
				2137	*/
				2138	#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
				2139	unsigned long
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2140	arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
				2141	unsigned long len, unsigned long pgoff,
				2142	unsigned long flags)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2143	{
				2144	struct vm_area_struct vma, prev;
				2145	struct mm_struct *mm = current->mm;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2146	struct vm_unmapped_area_info info;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2147	const unsigned long mmap_end = arch_get_mmap_end(addr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2148
				2149	/* requested length too big for entire address space */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2150	if (len > mmap_end - mmap_min_addr)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2151	return -ENOMEM;
				2152
				2153	if (flags & MAP_FIXED)
				2154	return addr;
				2155
				2156	/* requesting a specific address */
				2157	if (addr) {
				2158	addr = PAGE_ALIGN(addr);
				2159	vma = find_vma_prev(mm, addr, &prev);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2160	if (mmap_end - len >= addr && addr >= mmap_min_addr &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2161	(!vma \|\| addr + len <= vm_start_gap(vma)) &&
				2162	(!prev \|\| addr >= vm_end_gap(prev)))
				2163	return addr;
				2164	}
				2165
				2166	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
				2167	info.length = len;
				2168	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2169	info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2170	info.align_mask = 0;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2171	info.align_offset = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2172	addr = vm_unmapped_area(&info);
				2173
				2174	/*
				2175	* A failed mmap() very likely causes application failure,
				2176	* so fall back to the bottom-up function here. This scenario
				2177	* can happen with large stack limits and large mmap()
				2178	* allocations.
				2179	*/
				2180	if (offset_in_page(addr)) {
				2181	VM_BUG_ON(addr != -ENOMEM);
				2182	info.flags = 0;
				2183	info.low_limit = TASK_UNMAPPED_BASE;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2184	info.high_limit = mmap_end;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2185	addr = vm_unmapped_area(&info);
				2186	}
				2187
				2188	return addr;
				2189	}
				2190	#endif
				2191
				2192	unsigned long
				2193	get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
				2194	unsigned long pgoff, unsigned long flags)
				2195	{
				2196	unsigned long (get_area)(struct file , unsigned long,
				2197	unsigned long, unsigned long, unsigned long);
				2198
				2199	unsigned long error = arch_mmap_check(addr, len, flags);
				2200	if (error)
				2201	return error;
				2202
				2203	/* Careful about overflows.. */
				2204	if (len > TASK_SIZE)
				2205	return -ENOMEM;
				2206
				2207	get_area = current->mm->get_unmapped_area;
				2208	if (file) {
				2209	if (file->f_op->get_unmapped_area)
				2210	get_area = file->f_op->get_unmapped_area;
				2211	} else if (flags & MAP_SHARED) {
				2212	/*
				2213	* mmap_region() will call shmem_zero_setup() to create a file,
				2214	* so use shmem's get_unmapped_area in case it can be huge.
				2215	* do_mmap_pgoff() will clear pgoff, so match alignment.
				2216	*/
				2217	pgoff = 0;
				2218	get_area = shmem_get_unmapped_area;
				2219	}
				2220
				2221	addr = get_area(file, addr, len, pgoff, flags);
				2222	if (IS_ERR_VALUE(addr))
				2223	return addr;
				2224
				2225	if (addr > TASK_SIZE - len)
				2226	return -ENOMEM;
				2227	if (offset_in_page(addr))
				2228	return -EINVAL;
				2229
				2230	error = security_mmap_addr(addr);
				2231	return error ? error : addr;
				2232	}
				2233
				2234	EXPORT_SYMBOL(get_unmapped_area);
				2235
				2236	/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
				2237	struct vm_area_struct find_vma(struct mm_struct mm, unsigned long addr)
				2238	{
				2239	struct rb_node *rb_node;
				2240	struct vm_area_struct *vma;
				2241
				2242	/* Check the cache first. */
				2243	vma = vmacache_find(mm, addr);
				2244	if (likely(vma))
				2245	return vma;
				2246
				2247	rb_node = mm->mm_rb.rb_node;
				2248
				2249	while (rb_node) {
				2250	struct vm_area_struct *tmp;
				2251
				2252	tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
				2253
				2254	if (tmp->vm_end > addr) {
				2255	vma = tmp;
				2256	if (tmp->vm_start <= addr)
				2257	break;
				2258	rb_node = rb_node->rb_left;
				2259	} else
				2260	rb_node = rb_node->rb_right;
				2261	}
				2262
				2263	if (vma)
				2264	vmacache_update(addr, vma);
				2265	return vma;
				2266	}
				2267
				2268	EXPORT_SYMBOL(find_vma);
				2269
				2270	/*
				2271	* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
				2272	*/
				2273	struct vm_area_struct *
				2274	find_vma_prev(struct mm_struct *mm, unsigned long addr,
				2275	struct vm_area_struct **pprev)
				2276	{
				2277	struct vm_area_struct *vma;
				2278
				2279	vma = find_vma(mm, addr);
				2280	if (vma) {
				2281	*pprev = vma->vm_prev;
				2282	} else {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2283	struct rb_node *rb_node = rb_last(&mm->mm_rb);
				2284
				2285	*pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2286	}
				2287	return vma;
				2288	}
				2289
				2290	/*
				2291	* Verify that the stack growth is acceptable and
				2292	* update accounting. This is shared with both the
				2293	* grow-up and grow-down cases.
				2294	*/
				2295	static int acct_stack_growth(struct vm_area_struct *vma,
				2296	unsigned long size, unsigned long grow)
				2297	{
				2298	struct mm_struct *mm = vma->vm_mm;
				2299	unsigned long new_start;
				2300
				2301	/* address space limit tests */
				2302	if (!may_expand_vm(mm, vma->vm_flags, grow))
				2303	return -ENOMEM;
				2304
				2305	/* Stack limit test */
				2306	if (size > rlimit(RLIMIT_STACK))
				2307	return -ENOMEM;
				2308
				2309	/* mlock limit tests */
				2310	if (vma->vm_flags & VM_LOCKED) {
				2311	unsigned long locked;
				2312	unsigned long limit;
				2313	locked = mm->locked_vm + grow;
				2314	limit = rlimit(RLIMIT_MEMLOCK);
				2315	limit >>= PAGE_SHIFT;
				2316	if (locked > limit && !capable(CAP_IPC_LOCK))
				2317	return -ENOMEM;
				2318	}
				2319
				2320	/* Check to ensure the stack will not grow into a hugetlb-only region */
				2321	new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
				2322	vma->vm_end - size;
				2323	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
				2324	return -EFAULT;
				2325
				2326	/*
				2327	* Overcommit.. This must be the final test, as it will
				2328	* update security statistics.
				2329	*/
				2330	if (security_vm_enough_memory_mm(mm, grow))
				2331	return -ENOMEM;
				2332
				2333	return 0;
				2334	}
				2335
				2336	#if defined(CONFIG_STACK_GROWSUP) \|\| defined(CONFIG_IA64)
				2337	/*
				2338	* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
				2339	* vma is the last one with address > vma->vm_end. Have to extend vma.
				2340	*/
				2341	int expand_upwards(struct vm_area_struct *vma, unsigned long address)
				2342	{
				2343	struct mm_struct *mm = vma->vm_mm;
				2344	struct vm_area_struct *next;
				2345	unsigned long gap_addr;
				2346	int error = 0;
				2347
				2348	if (!(vma->vm_flags & VM_GROWSUP))
				2349	return -EFAULT;
				2350
				2351	/* Guard against exceeding limits of the address space. */
				2352	address &= PAGE_MASK;
				2353	if (address >= (TASK_SIZE & PAGE_MASK))
				2354	return -ENOMEM;
				2355	address += PAGE_SIZE;
				2356
				2357	/* Enforce stack_guard_gap */
				2358	gap_addr = address + stack_guard_gap;
				2359
				2360	/* Guard against overflow */
				2361	if (gap_addr < address \|\| gap_addr > TASK_SIZE)
				2362	gap_addr = TASK_SIZE;
				2363
				2364	next = vma->vm_next;
				2365	if (next && next->vm_start < gap_addr &&
				2366	(next->vm_flags & (VM_WRITE\|VM_READ\|VM_EXEC))) {
				2367	if (!(next->vm_flags & VM_GROWSUP))
				2368	return -ENOMEM;
				2369	/* Check that both stack segments have the same anon_vma? */
				2370	}
				2371
				2372	/* We must make sure the anon_vma is allocated. */
				2373	if (unlikely(anon_vma_prepare(vma)))
				2374	return -ENOMEM;
				2375
				2376	/*
				2377	* vma->vm_start/vm_end cannot change under us because the caller
				2378	* is required to hold the mmap_sem in read mode. We need the
				2379	* anon_vma lock to serialize against concurrent expand_stacks.
				2380	*/
				2381	anon_vma_lock_write(vma->anon_vma);
				2382
				2383	/* Somebody else might have raced and expanded it already */
				2384	if (address > vma->vm_end) {
				2385	unsigned long size, grow;
				2386
				2387	size = address - vma->vm_start;
				2388	grow = (address - vma->vm_end) >> PAGE_SHIFT;
				2389
				2390	error = -ENOMEM;
				2391	if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
				2392	error = acct_stack_growth(vma, size, grow);
				2393	if (!error) {
				2394	/*
				2395	* vma_gap_update() doesn't support concurrent
				2396	* updates, but we only hold a shared mmap_sem
				2397	* lock here, so we need to protect against
				2398	* concurrent vma expansions.
				2399	* anon_vma_lock_write() doesn't help here, as
				2400	* we don't guarantee that all growable vmas
				2401	* in a mm share the same root anon vma.
				2402	* So, we reuse mm->page_table_lock to guard
				2403	* against concurrent vma expansions.
				2404	*/
				2405	spin_lock(&mm->page_table_lock);
				2406	if (vma->vm_flags & VM_LOCKED)
				2407	mm->locked_vm += grow;
				2408	vm_stat_account(mm, vma->vm_flags, grow);
				2409	anon_vma_interval_tree_pre_update_vma(vma);
				2410	vma->vm_end = address;
				2411	anon_vma_interval_tree_post_update_vma(vma);
				2412	if (vma->vm_next)
				2413	vma_gap_update(vma->vm_next);
				2414	else
				2415	mm->highest_vm_end = vm_end_gap(vma);
				2416	spin_unlock(&mm->page_table_lock);
				2417
				2418	perf_event_mmap(vma);
				2419	}
				2420	}
				2421	}
				2422	anon_vma_unlock_write(vma->anon_vma);
				2423	khugepaged_enter_vma_merge(vma, vma->vm_flags);
				2424	validate_mm(mm);
				2425	return error;
				2426	}
				2427	#endif /* CONFIG_STACK_GROWSUP \|\| CONFIG_IA64 */
				2428
				2429	/*
				2430	* vma is the first one with address < vma->vm_start. Have to extend vma.
				2431	*/
				2432	int expand_downwards(struct vm_area_struct *vma,
				2433	unsigned long address)
				2434	{
				2435	struct mm_struct *mm = vma->vm_mm;
				2436	struct vm_area_struct *prev;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2437	int error = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2438
				2439	address &= PAGE_MASK;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2440	if (address < mmap_min_addr)
				2441	return -EPERM;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2442
				2443	/* Enforce stack_guard_gap */
				2444	prev = vma->vm_prev;
				2445	/* Check that both stack segments have the same anon_vma? */
				2446	if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
				2447	(prev->vm_flags & (VM_WRITE\|VM_READ\|VM_EXEC))) {
				2448	if (address - prev->vm_end < stack_guard_gap)
				2449	return -ENOMEM;
				2450	}
				2451
				2452	/* We must make sure the anon_vma is allocated. */
				2453	if (unlikely(anon_vma_prepare(vma)))
				2454	return -ENOMEM;
				2455
				2456	/*
				2457	* vma->vm_start/vm_end cannot change under us because the caller
				2458	* is required to hold the mmap_sem in read mode. We need the
				2459	* anon_vma lock to serialize against concurrent expand_stacks.
				2460	*/
				2461	anon_vma_lock_write(vma->anon_vma);
				2462
				2463	/* Somebody else might have raced and expanded it already */
				2464	if (address < vma->vm_start) {
				2465	unsigned long size, grow;
				2466
				2467	size = vma->vm_end - address;
				2468	grow = (vma->vm_start - address) >> PAGE_SHIFT;
				2469
				2470	error = -ENOMEM;
				2471	if (grow <= vma->vm_pgoff) {
				2472	error = acct_stack_growth(vma, size, grow);
				2473	if (!error) {
				2474	/*
				2475	* vma_gap_update() doesn't support concurrent
				2476	* updates, but we only hold a shared mmap_sem
				2477	* lock here, so we need to protect against
				2478	* concurrent vma expansions.
				2479	* anon_vma_lock_write() doesn't help here, as
				2480	* we don't guarantee that all growable vmas
				2481	* in a mm share the same root anon vma.
				2482	* So, we reuse mm->page_table_lock to guard
				2483	* against concurrent vma expansions.
				2484	*/
				2485	spin_lock(&mm->page_table_lock);
				2486	if (vma->vm_flags & VM_LOCKED)
				2487	mm->locked_vm += grow;
				2488	vm_stat_account(mm, vma->vm_flags, grow);
				2489	anon_vma_interval_tree_pre_update_vma(vma);
				2490	vma->vm_start = address;
				2491	vma->vm_pgoff -= grow;
				2492	anon_vma_interval_tree_post_update_vma(vma);
				2493	vma_gap_update(vma);
				2494	spin_unlock(&mm->page_table_lock);
				2495
				2496	perf_event_mmap(vma);
				2497	}
				2498	}
				2499	}
				2500	anon_vma_unlock_write(vma->anon_vma);
				2501	khugepaged_enter_vma_merge(vma, vma->vm_flags);
				2502	validate_mm(mm);
				2503	return error;
				2504	}
				2505
				2506	/* enforced gap between the expanding stack and other mappings. */
				2507	unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
				2508
				2509	static int __init cmdline_parse_stack_guard_gap(char *p)
				2510	{
				2511	unsigned long val;
				2512	char *endptr;
				2513
				2514	val = simple_strtoul(p, &endptr, 10);
				2515	if (!*endptr)
				2516	stack_guard_gap = val << PAGE_SHIFT;
				2517
				2518	return 0;
				2519	}
				2520	__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
				2521
				2522	#ifdef CONFIG_STACK_GROWSUP
				2523	int expand_stack(struct vm_area_struct *vma, unsigned long address)
				2524	{
				2525	return expand_upwards(vma, address);
				2526	}
				2527
				2528	struct vm_area_struct *
				2529	find_extend_vma(struct mm_struct *mm, unsigned long addr)
				2530	{
				2531	struct vm_area_struct vma, prev;
				2532
				2533	addr &= PAGE_MASK;
				2534	vma = find_vma_prev(mm, addr, &prev);
				2535	if (vma && (vma->vm_start <= addr))
				2536	return vma;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2537	/* don't alter vm_end if the coredump is running */
				2538	if (!prev \|\| !mmget_still_valid(mm) \|\| expand_stack(prev, addr))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2539	return NULL;
				2540	if (prev->vm_flags & VM_LOCKED)
				2541	populate_vma_page_range(prev, addr, prev->vm_end, NULL);
				2542	return prev;
				2543	}
				2544	#else
				2545	int expand_stack(struct vm_area_struct *vma, unsigned long address)
				2546	{
				2547	return expand_downwards(vma, address);
				2548	}
				2549
				2550	struct vm_area_struct *
				2551	find_extend_vma(struct mm_struct *mm, unsigned long addr)
				2552	{
				2553	struct vm_area_struct *vma;
				2554	unsigned long start;
				2555
				2556	addr &= PAGE_MASK;
				2557	vma = find_vma(mm, addr);
				2558	if (!vma)
				2559	return NULL;
				2560	if (vma->vm_start <= addr)
				2561	return vma;
				2562	if (!(vma->vm_flags & VM_GROWSDOWN))
				2563	return NULL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2564	/* don't alter vm_start if the coredump is running */
				2565	if (!mmget_still_valid(mm))
				2566	return NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2567	start = vma->vm_start;
				2568	if (expand_stack(vma, addr))
				2569	return NULL;
				2570	if (vma->vm_flags & VM_LOCKED)
				2571	populate_vma_page_range(vma, addr, start, NULL);
				2572	return vma;
				2573	}
				2574	#endif
				2575
				2576	EXPORT_SYMBOL_GPL(find_extend_vma);
				2577
				2578	/*
				2579	* Ok - we have the memory areas we should free on the vma list,
				2580	* so release them, and do the vma updates.
				2581	*
				2582	* Called with the mm semaphore held.
				2583	*/
				2584	static void remove_vma_list(struct mm_struct mm, struct vm_area_struct vma)
				2585	{
				2586	unsigned long nr_accounted = 0;
				2587
				2588	/* Update high watermark before we lower total_vm */
				2589	update_hiwater_vm(mm);
				2590	do {
				2591	long nrpages = vma_pages(vma);
				2592
				2593	if (vma->vm_flags & VM_ACCOUNT)
				2594	nr_accounted += nrpages;
				2595	vm_stat_account(mm, vma->vm_flags, -nrpages);
				2596	vma = remove_vma(vma);
				2597	} while (vma);
				2598	vm_unacct_memory(nr_accounted);
				2599	validate_mm(mm);
				2600	}
				2601
				2602	/*
				2603	* Get rid of page table information in the indicated region.
				2604	*
				2605	* Called with the mm semaphore held.
				2606	*/
				2607	static void unmap_region(struct mm_struct *mm,
				2608	struct vm_area_struct vma, struct vm_area_struct prev,
				2609	unsigned long start, unsigned long end)
				2610	{
				2611	struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
				2612	struct mmu_gather tlb;
				2613
				2614	lru_add_drain();
				2615	tlb_gather_mmu(&tlb, mm, start, end);
				2616	update_hiwater_rss(mm);
				2617	unmap_vmas(&tlb, vma, start, end);
				2618	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
				2619	next ? next->vm_start : USER_PGTABLES_CEILING);
				2620	tlb_finish_mmu(&tlb, start, end);
				2621	}
				2622
				2623	/*
				2624	* Create a list of vma's touched by the unmap, removing them from the mm's
				2625	* vma list as we go..
				2626	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2627	static bool
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2628	detach_vmas_to_be_unmapped(struct mm_struct mm, struct vm_area_struct vma,
				2629	struct vm_area_struct *prev, unsigned long end)
				2630	{
				2631	struct vm_area_struct **insertion_point;
				2632	struct vm_area_struct *tail_vma = NULL;
				2633
				2634	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
				2635	vma->vm_prev = NULL;
				2636	do {
				2637	vma_rb_erase(vma, &mm->mm_rb);
				2638	mm->map_count--;
				2639	tail_vma = vma;
				2640	vma = vma->vm_next;
				2641	} while (vma && vma->vm_start < end);
				2642	*insertion_point = vma;
				2643	if (vma) {
				2644	vma->vm_prev = prev;
				2645	vma_gap_update(vma);
				2646	} else
				2647	mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
				2648	tail_vma->vm_next = NULL;
				2649
				2650	/* Kill the cache */
				2651	vmacache_invalidate(mm);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2652
				2653	/*
				2654	* Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
				2655	* VM_GROWSUP VMA. Such VMAs can change their size under
				2656	* down_read(mmap_lock) and collide with the VMA we are about to unmap.
				2657	*/
				2658	if (vma && (vma->vm_flags & VM_GROWSDOWN))
				2659	return false;
				2660	if (prev && (prev->vm_flags & VM_GROWSUP))
				2661	return false;
				2662	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2663	}
				2664
				2665	/*
				2666	* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
				2667	* has already been checked or doesn't make sense to fail.
				2668	*/
				2669	int __split_vma(struct mm_struct mm, struct vm_area_struct vma,
				2670	unsigned long addr, int new_below)
				2671	{
				2672	struct vm_area_struct *new;
				2673	int err;
				2674
				2675	if (vma->vm_ops && vma->vm_ops->split) {
				2676	err = vma->vm_ops->split(vma, addr);
				2677	if (err)
				2678	return err;
				2679	}
				2680
				2681	new = vm_area_dup(vma);
				2682	if (!new)
				2683	return -ENOMEM;
				2684
				2685	if (new_below)
				2686	new->vm_end = addr;
				2687	else {
				2688	new->vm_start = addr;
				2689	new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
				2690	}
				2691
				2692	err = vma_dup_policy(vma, new);
				2693	if (err)
				2694	goto out_free_vma;
				2695
				2696	err = anon_vma_clone(new, vma);
				2697	if (err)
				2698	goto out_free_mpol;
				2699
				2700	if (new->vm_file)
				2701	get_file(new->vm_file);
				2702
				2703	if (new->vm_ops && new->vm_ops->open)
				2704	new->vm_ops->open(new);
				2705
				2706	if (new_below)
				2707	err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
				2708	((addr - new->vm_start) >> PAGE_SHIFT), new);
				2709	else
				2710	err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
				2711
				2712	/* Success. */
				2713	if (!err)
				2714	return 0;
				2715
				2716	/* Clean everything up if vma_adjust failed. */
				2717	if (new->vm_ops && new->vm_ops->close)
				2718	new->vm_ops->close(new);
				2719	if (new->vm_file)
				2720	fput(new->vm_file);
				2721	unlink_anon_vmas(new);
				2722	out_free_mpol:
				2723	mpol_put(vma_policy(new));
				2724	out_free_vma:
				2725	vm_area_free(new);
				2726	return err;
				2727	}
				2728
				2729	/*
				2730	* Split a vma into two pieces at address 'addr', a new vma is allocated
				2731	* either for the first part or the tail.
				2732	*/
				2733	int split_vma(struct mm_struct mm, struct vm_area_struct vma,
				2734	unsigned long addr, int new_below)
				2735	{
				2736	if (mm->map_count >= sysctl_max_map_count)
				2737	return -ENOMEM;
				2738
				2739	return __split_vma(mm, vma, addr, new_below);
				2740	}
				2741
				2742	/* Munmap is split into 2 main parts -- this part which finds
				2743	* what needs doing, and the areas themselves, which do the
				2744	* work. This now handles partial unmappings.
				2745	* Jeremy Fitzhardinge <jeremy@goop.org>
				2746	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2747	int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
				2748	struct list_head *uf, bool downgrade)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2749	{
				2750	unsigned long end;
				2751	struct vm_area_struct vma, prev, *last;
				2752
				2753	if ((offset_in_page(start)) \|\| start > TASK_SIZE \|\| len > TASK_SIZE-start)
				2754	return -EINVAL;
				2755
				2756	len = PAGE_ALIGN(len);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2757	end = start + len;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2758	if (len == 0)
				2759	return -EINVAL;
				2760
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2761	/*
				2762	* arch_unmap() might do unmaps itself. It must be called
				2763	* and finish any rbtree manipulation before this code
				2764	* runs and also starts to manipulate the rbtree.
				2765	*/
				2766	arch_unmap(mm, start, end);
				2767
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2768	/* Find the first overlapping VMA */
				2769	vma = find_vma(mm, start);
				2770	if (!vma)
				2771	return 0;
				2772	prev = vma->vm_prev;
				2773	/* we have start < vma->vm_end */
				2774
				2775	/* if it doesn't overlap, we have nothing.. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2776	if (vma->vm_start >= end)
				2777	return 0;
				2778
				2779	/*
				2780	* If we need to split any vma, do it now to save pain later.
				2781	*
				2782	* Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
				2783	* unmapped vm_area_struct will remain in use: so lower split_vma
				2784	* places tmp vma above, and higher split_vma places tmp vma below.
				2785	*/
				2786	if (start > vma->vm_start) {
				2787	int error;
				2788
				2789	/*
				2790	* Make sure that map_count on return from munmap() will
				2791	* not exceed its limit; but let map_count go just above
				2792	* its limit temporarily, to help free resources as expected.
				2793	*/
				2794	if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
				2795	return -ENOMEM;
				2796
				2797	error = __split_vma(mm, vma, start, 0);
				2798	if (error)
				2799	return error;
				2800	prev = vma;
				2801	}
				2802
				2803	/* Does it split the last one? */
				2804	last = find_vma(mm, end);
				2805	if (last && end > last->vm_start) {
				2806	int error = __split_vma(mm, last, end, 1);
				2807	if (error)
				2808	return error;
				2809	}
				2810	vma = prev ? prev->vm_next : mm->mmap;
				2811
				2812	if (unlikely(uf)) {
				2813	/*
				2814	* If userfaultfd_unmap_prep returns an error the vmas
				2815	* will remain splitted, but userland will get a
				2816	* highly unexpected error anyway. This is no
				2817	* different than the case where the first of the two
				2818	* __split_vma fails, but we don't undo the first
				2819	* split, despite we could. This is unlikely enough
				2820	* failure that it's not worth optimizing it for.
				2821	*/
				2822	int error = userfaultfd_unmap_prep(vma, start, end, uf);
				2823	if (error)
				2824	return error;
				2825	}
				2826
				2827	/*
				2828	* unlock any mlock()ed ranges before detaching vmas
				2829	*/
				2830	if (mm->locked_vm) {
				2831	struct vm_area_struct *tmp = vma;
				2832	while (tmp && tmp->vm_start < end) {
				2833	if (tmp->vm_flags & VM_LOCKED) {
				2834	mm->locked_vm -= vma_pages(tmp);
				2835	munlock_vma_pages_all(tmp);
				2836	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2837
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2838	tmp = tmp->vm_next;
				2839	}
				2840	}
				2841
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2842	/* Detach vmas from rbtree */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2843	if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
				2844	downgrade = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2845
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2846	if (downgrade)
				2847	downgrade_write(&mm->mmap_sem);
				2848
				2849	unmap_region(mm, vma, prev, start, end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2850
				2851	/* Fix up all other VM information */
				2852	remove_vma_list(mm, vma);
				2853
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2854	return downgrade ? 1 : 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2855	}
				2856
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2857	int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
				2858	struct list_head *uf)
				2859	{
				2860	return __do_munmap(mm, start, len, uf, false);
				2861	}
				2862
				2863	static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2864	{
				2865	int ret;
				2866	struct mm_struct *mm = current->mm;
				2867	LIST_HEAD(uf);
				2868
				2869	if (down_write_killable(&mm->mmap_sem))
				2870	return -EINTR;
				2871
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2872	ret = __do_munmap(mm, start, len, &uf, downgrade);
				2873	/*
				2874	* Returning 1 indicates mmap_sem is downgraded.
				2875	* But 1 is not legal return value of vm_munmap() and munmap(), reset
				2876	* it to 0 before return.
				2877	*/
				2878	if (ret == 1) {
				2879	up_read(&mm->mmap_sem);
				2880	ret = 0;
				2881	} else
				2882	up_write(&mm->mmap_sem);
				2883
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2884	userfaultfd_unmap_complete(mm, &uf);
				2885	return ret;
				2886	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2887
				2888	int vm_munmap(unsigned long start, size_t len)
				2889	{
				2890	return __vm_munmap(start, len, false);
				2891	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2892	EXPORT_SYMBOL(vm_munmap);
				2893
				2894	SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
				2895	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2896	addr = untagged_addr(addr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2897	profile_munmap(addr);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2898	return __vm_munmap(addr, len, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2899	}
				2900
				2901
				2902	/*
				2903	* Emulation of deprecated remap_file_pages() syscall.
				2904	*/
				2905	SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
				2906	unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
				2907	{
				2908
				2909	struct mm_struct *mm = current->mm;
				2910	struct vm_area_struct *vma;
				2911	unsigned long populate = 0;
				2912	unsigned long ret = -EINVAL;
				2913	struct file *file;
				2914
				2915	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
				2916	current->comm, current->pid);
				2917
				2918	if (prot)
				2919	return ret;
				2920	start = start & PAGE_MASK;
				2921	size = size & PAGE_MASK;
				2922
				2923	if (start + size <= start)
				2924	return ret;
				2925
				2926	/* Does pgoff wrap? */
				2927	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
				2928	return ret;
				2929
				2930	if (down_write_killable(&mm->mmap_sem))
				2931	return -EINTR;
				2932
				2933	vma = find_vma(mm, start);
				2934
				2935	if (!vma \|\| !(vma->vm_flags & VM_SHARED))
				2936	goto out;
				2937
				2938	if (start < vma->vm_start)
				2939	goto out;
				2940
				2941	if (start + size > vma->vm_end) {
				2942	struct vm_area_struct *next;
				2943
				2944	for (next = vma->vm_next; next; next = next->vm_next) {
				2945	/* hole between vmas ? */
				2946	if (next->vm_start != next->vm_prev->vm_end)
				2947	goto out;
				2948
				2949	if (next->vm_file != vma->vm_file)
				2950	goto out;
				2951
				2952	if (next->vm_flags != vma->vm_flags)
				2953	goto out;
				2954
				2955	if (start + size <= next->vm_end)
				2956	break;
				2957	}
				2958
				2959	if (!next)
				2960	goto out;
				2961	}
				2962
				2963	prot \|= vma->vm_flags & VM_READ ? PROT_READ : 0;
				2964	prot \|= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
				2965	prot \|= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
				2966
				2967	flags &= MAP_NONBLOCK;
				2968	flags \|= MAP_SHARED \| MAP_FIXED \| MAP_POPULATE;
				2969	if (vma->vm_flags & VM_LOCKED) {
				2970	struct vm_area_struct *tmp;
				2971	flags \|= MAP_LOCKED;
				2972
				2973	/* drop PG_Mlocked flag for over-mapped range */
				2974	for (tmp = vma; tmp->vm_start >= start + size;
				2975	tmp = tmp->vm_next) {
				2976	/*
				2977	* Split pmd and munlock page on the border
				2978	* of the range.
				2979	*/
				2980	vma_adjust_trans_huge(tmp, start, start + size, 0);
				2981
				2982	munlock_vma_pages_range(tmp,
				2983	max(tmp->vm_start, start),
				2984	min(tmp->vm_end, start + size));
				2985	}
				2986	}
				2987
				2988	file = get_file(vma->vm_file);
				2989	ret = do_mmap_pgoff(vma->vm_file, start, size,
				2990	prot, flags, pgoff, &populate, NULL);
				2991	fput(file);
				2992	out:
				2993	up_write(&mm->mmap_sem);
				2994	if (populate)
				2995	mm_populate(ret, populate);
				2996	if (!IS_ERR_VALUE(ret))
				2997	ret = 0;
				2998	return ret;
				2999	}
				3000
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3001	/*
				3002	* this is really a simplified "do_mmap". it only handles
				3003	* anonymous maps. eventually we may be able to do some
				3004	* brk-specific accounting here.
				3005	*/
				3006	static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
				3007	{
				3008	struct mm_struct *mm = current->mm;
				3009	struct vm_area_struct vma, prev;
				3010	struct rb_node *rb_link, rb_parent;
				3011	pgoff_t pgoff = addr >> PAGE_SHIFT;
				3012	int error;
				3013
				3014	/* Until we need other flags, refuse anything except VM_EXEC. */
				3015	if ((flags & (~VM_EXEC)) != 0)
				3016	return -EINVAL;
				3017	flags \|= VM_DATA_DEFAULT_FLAGS \| VM_ACCOUNT \| mm->def_flags;
				3018
				3019	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
				3020	if (offset_in_page(error))
				3021	return error;
				3022
				3023	error = mlock_future_check(mm, mm->def_flags, len);
				3024	if (error)
				3025	return error;
				3026
				3027	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3028	* Clear old maps. this also does some error checking for us
				3029	*/
				3030	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
				3031	&rb_parent)) {
				3032	if (do_munmap(mm, addr, len, uf))
				3033	return -ENOMEM;
				3034	}
				3035
				3036	/* Check against address space limits after clearing old maps... */
				3037	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
				3038	return -ENOMEM;
				3039
				3040	if (mm->map_count > sysctl_max_map_count)
				3041	return -ENOMEM;
				3042
				3043	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
				3044	return -ENOMEM;
				3045
				3046	/* Can we just expand an old private anonymous mapping? */
				3047	vma = vma_merge(mm, prev, addr, addr + len, flags,
				3048	NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
				3049	if (vma)
				3050	goto out;
				3051
				3052	/*
				3053	* create a vma struct for an anonymous mapping
				3054	*/
				3055	vma = vm_area_alloc(mm);
				3056	if (!vma) {
				3057	vm_unacct_memory(len >> PAGE_SHIFT);
				3058	return -ENOMEM;
				3059	}
				3060
				3061	vma_set_anonymous(vma);
				3062	vma->vm_start = addr;
				3063	vma->vm_end = addr + len;
				3064	vma->vm_pgoff = pgoff;
				3065	vma->vm_flags = flags;
				3066	vma->vm_page_prot = vm_get_page_prot(flags);
				3067	vma_link(mm, vma, prev, rb_link, rb_parent);
				3068	out:
				3069	perf_event_mmap(vma);
				3070	mm->total_vm += len >> PAGE_SHIFT;
				3071	mm->data_vm += len >> PAGE_SHIFT;
				3072	if (flags & VM_LOCKED)
				3073	mm->locked_vm += (len >> PAGE_SHIFT);
				3074	vma->vm_flags \|= VM_SOFTDIRTY;
				3075	return 0;
				3076	}
				3077
				3078	int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
				3079	{
				3080	struct mm_struct *mm = current->mm;
				3081	unsigned long len;
				3082	int ret;
				3083	bool populate;
				3084	LIST_HEAD(uf);
				3085
				3086	len = PAGE_ALIGN(request);
				3087	if (len < request)
				3088	return -ENOMEM;
				3089	if (!len)
				3090	return 0;
				3091
				3092	if (down_write_killable(&mm->mmap_sem))
				3093	return -EINTR;
				3094
				3095	ret = do_brk_flags(addr, len, flags, &uf);
				3096	populate = ((mm->def_flags & VM_LOCKED) != 0);
				3097	up_write(&mm->mmap_sem);
				3098	userfaultfd_unmap_complete(mm, &uf);
				3099	if (populate && !ret)
				3100	mm_populate(addr, len);
				3101	return ret;
				3102	}
				3103	EXPORT_SYMBOL(vm_brk_flags);
				3104
				3105	int vm_brk(unsigned long addr, unsigned long len)
				3106	{
				3107	return vm_brk_flags(addr, len, 0);
				3108	}
				3109	EXPORT_SYMBOL(vm_brk);
				3110
				3111	/* Release all mmaps. */
				3112	void exit_mmap(struct mm_struct *mm)
				3113	{
				3114	struct mmu_gather tlb;
				3115	struct vm_area_struct *vma;
				3116	unsigned long nr_accounted = 0;
				3117
				3118	/* mm's last user has gone, and its about to be pulled down */
				3119	mmu_notifier_release(mm);
				3120
				3121	if (unlikely(mm_is_oom_victim(mm))) {
				3122	/*
				3123	* Manually reap the mm to free as much memory as possible.
				3124	* Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
				3125	* this mm from further consideration. Taking mm->mmap_sem for
				3126	* write after setting MMF_OOM_SKIP will guarantee that the oom
				3127	* reaper will not run on this mm again after mmap_sem is
				3128	* dropped.
				3129	*
				3130	* Nothing can be holding mm->mmap_sem here and the above call
				3131	* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
				3132	* __oom_reap_task_mm() will not block.
				3133	*
				3134	* This needs to be done before calling munlock_vma_pages_all(),
				3135	* which clears VM_LOCKED, otherwise the oom reaper cannot
				3136	* reliably test it.
				3137	*/
				3138	(void)__oom_reap_task_mm(mm);
				3139
				3140	set_bit(MMF_OOM_SKIP, &mm->flags);
				3141	down_write(&mm->mmap_sem);
				3142	up_write(&mm->mmap_sem);
				3143	}
				3144
				3145	if (mm->locked_vm) {
				3146	vma = mm->mmap;
				3147	while (vma) {
				3148	if (vma->vm_flags & VM_LOCKED)
				3149	munlock_vma_pages_all(vma);
				3150	vma = vma->vm_next;
				3151	}
				3152	}
				3153
				3154	arch_exit_mmap(mm);
				3155
				3156	vma = mm->mmap;
				3157	if (!vma) /* Can happen if dup_mmap() received an OOM */
				3158	return;
				3159
				3160	lru_add_drain();
				3161	flush_cache_mm(mm);
				3162	tlb_gather_mmu(&tlb, mm, 0, -1);
				3163	/* update_hiwater_rss(mm) here? but nobody should be looking */
				3164	/* Use -1 here to ensure all VMAs in the mm are unmapped */
				3165	unmap_vmas(&tlb, vma, 0, -1);
				3166	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
				3167	tlb_finish_mmu(&tlb, 0, -1);
				3168
				3169	/*
				3170	* Walk the list again, actually closing and freeing it,
				3171	* with preemption enabled, without holding any MM locks.
				3172	*/
				3173	while (vma) {
				3174	if (vma->vm_flags & VM_ACCOUNT)
				3175	nr_accounted += vma_pages(vma);
				3176	vma = remove_vma(vma);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3177	cond_resched();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3178	}
				3179	vm_unacct_memory(nr_accounted);
				3180	}
				3181
				3182	/* Insert vm structure into process list sorted by address
				3183	* and into the inode's i_mmap tree. If vm_file is non-NULL
				3184	* then i_mmap_rwsem is taken here.
				3185	*/
				3186	int insert_vm_struct(struct mm_struct mm, struct vm_area_struct vma)
				3187	{
				3188	struct vm_area_struct *prev;
				3189	struct rb_node *rb_link, rb_parent;
				3190
				3191	if (find_vma_links(mm, vma->vm_start, vma->vm_end,
				3192	&prev, &rb_link, &rb_parent))
				3193	return -ENOMEM;
				3194	if ((vma->vm_flags & VM_ACCOUNT) &&
				3195	security_vm_enough_memory_mm(mm, vma_pages(vma)))
				3196	return -ENOMEM;
				3197
				3198	/*
				3199	* The vm_pgoff of a purely anonymous vma should be irrelevant
				3200	* until its first write fault, when page's anon_vma and index
				3201	* are set. But now set the vm_pgoff it will almost certainly
				3202	* end up with (unless mremap moves it elsewhere before that
				3203	* first wfault), so /proc/pid/maps tells a consistent story.
				3204	*
				3205	* By setting it to reflect the virtual start address of the
				3206	* vma, merges and splits can happen in a seamless way, just
				3207	* using the existing file pgoff checks and manipulations.
				3208	* Similarly in do_mmap_pgoff and in do_brk.
				3209	*/
				3210	if (vma_is_anonymous(vma)) {
				3211	BUG_ON(vma->anon_vma);
				3212	vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
				3213	}
				3214
				3215	vma_link(mm, vma, prev, rb_link, rb_parent);
				3216	return 0;
				3217	}
				3218
				3219	/*
				3220	* Copy the vma structure to a new location in the same mm,
				3221	* prior to moving page table entries, to effect an mremap move.
				3222	*/
				3223	struct vm_area_struct copy_vma(struct vm_area_struct *vmap,
				3224	unsigned long addr, unsigned long len, pgoff_t pgoff,
				3225	bool *need_rmap_locks)
				3226	{
				3227	struct vm_area_struct vma = vmap;
				3228	unsigned long vma_start = vma->vm_start;
				3229	struct mm_struct *mm = vma->vm_mm;
				3230	struct vm_area_struct new_vma, prev;
				3231	struct rb_node *rb_link, rb_parent;
				3232	bool faulted_in_anon_vma = true;
				3233
				3234	/*
				3235	* If anonymous vma has not yet been faulted, update new pgoff
				3236	* to match new location, to increase its chance of merging.
				3237	*/
				3238	if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
				3239	pgoff = addr >> PAGE_SHIFT;
				3240	faulted_in_anon_vma = false;
				3241	}
				3242
				3243	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
				3244	return NULL; /* should never get here */
				3245	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
				3246	vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
				3247	vma->vm_userfaultfd_ctx);
				3248	if (new_vma) {
				3249	/*
				3250	* Source vma may have been merged into new_vma
				3251	*/
				3252	if (unlikely(vma_start >= new_vma->vm_start &&
				3253	vma_start < new_vma->vm_end)) {
				3254	/*
				3255	* The only way we can get a vma_merge with
				3256	* self during an mremap is if the vma hasn't
				3257	* been faulted in yet and we were allowed to
				3258	* reset the dst vma->vm_pgoff to the
				3259	* destination address of the mremap to allow
				3260	* the merge to happen. mremap must change the
				3261	* vm_pgoff linearity between src and dst vmas
				3262	* (in turn preventing a vma_merge) to be
				3263	* safe. It is only safe to keep the vm_pgoff
				3264	* linear if there are no pages mapped yet.
				3265	*/
				3266	VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
				3267	*vmap = vma = new_vma;
				3268	}
				3269	*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
				3270	} else {
				3271	new_vma = vm_area_dup(vma);
				3272	if (!new_vma)
				3273	goto out;
				3274	new_vma->vm_start = addr;
				3275	new_vma->vm_end = addr + len;
				3276	new_vma->vm_pgoff = pgoff;
				3277	if (vma_dup_policy(vma, new_vma))
				3278	goto out_free_vma;
				3279	if (anon_vma_clone(new_vma, vma))
				3280	goto out_free_mempol;
				3281	if (new_vma->vm_file)
				3282	get_file(new_vma->vm_file);
				3283	if (new_vma->vm_ops && new_vma->vm_ops->open)
				3284	new_vma->vm_ops->open(new_vma);
				3285	vma_link(mm, new_vma, prev, rb_link, rb_parent);
				3286	*need_rmap_locks = false;
				3287	}
				3288	return new_vma;
				3289
				3290	out_free_mempol:
				3291	mpol_put(vma_policy(new_vma));
				3292	out_free_vma:
				3293	vm_area_free(new_vma);
				3294	out:
				3295	return NULL;
				3296	}
				3297
				3298	/*
				3299	* Return true if the calling process may expand its vm space by the passed
				3300	* number of pages
				3301	*/
				3302	bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
				3303	{
				3304	if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
				3305	return false;
				3306
				3307	if (is_data_mapping(flags) &&
				3308	mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
				3309	/* Workaround for Valgrind */
				3310	if (rlimit(RLIMIT_DATA) == 0 &&
				3311	mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
				3312	return true;
				3313
				3314	pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
				3315	current->comm, current->pid,
				3316	(mm->data_vm + npages) << PAGE_SHIFT,
				3317	rlimit(RLIMIT_DATA),
				3318	ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
				3319
				3320	if (!ignore_rlimit_data)
				3321	return false;
				3322	}
				3323
				3324	return true;
				3325	}
				3326
				3327	void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
				3328	{
				3329	mm->total_vm += npages;
				3330
				3331	if (is_exec_mapping(flags))
				3332	mm->exec_vm += npages;
				3333	else if (is_stack_mapping(flags))
				3334	mm->stack_vm += npages;
				3335	else if (is_data_mapping(flags))
				3336	mm->data_vm += npages;
				3337	}
				3338
				3339	static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
				3340
				3341	/*
				3342	* Having a close hook prevents vma merging regardless of flags.
				3343	*/
				3344	static void special_mapping_close(struct vm_area_struct *vma)
				3345	{
				3346	}
				3347
				3348	static const char special_mapping_name(struct vm_area_struct vma)
				3349	{
				3350	return ((struct vm_special_mapping *)vma->vm_private_data)->name;
				3351	}
				3352
				3353	static int special_mapping_mremap(struct vm_area_struct *new_vma)
				3354	{
				3355	struct vm_special_mapping *sm = new_vma->vm_private_data;
				3356
				3357	if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
				3358	return -EFAULT;
				3359
				3360	if (sm->mremap)
				3361	return sm->mremap(sm, new_vma);
				3362
				3363	return 0;
				3364	}
				3365
				3366	static const struct vm_operations_struct special_mapping_vmops = {
				3367	.close = special_mapping_close,
				3368	.fault = special_mapping_fault,
				3369	.mremap = special_mapping_mremap,
				3370	.name = special_mapping_name,
				3371	};
				3372
				3373	static const struct vm_operations_struct legacy_special_mapping_vmops = {
				3374	.close = special_mapping_close,
				3375	.fault = special_mapping_fault,
				3376	};
				3377
				3378	static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
				3379	{
				3380	struct vm_area_struct *vma = vmf->vma;
				3381	pgoff_t pgoff;
				3382	struct page **pages;
				3383
				3384	if (vma->vm_ops == &legacy_special_mapping_vmops) {
				3385	pages = vma->vm_private_data;
				3386	} else {
				3387	struct vm_special_mapping *sm = vma->vm_private_data;
				3388
				3389	if (sm->fault)
				3390	return sm->fault(sm, vmf->vma, vmf);
				3391
				3392	pages = sm->pages;
				3393	}
				3394
				3395	for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
				3396	pgoff--;
				3397
				3398	if (*pages) {
				3399	struct page page = pages;
				3400	get_page(page);
				3401	vmf->page = page;
				3402	return 0;
				3403	}
				3404
				3405	return VM_FAULT_SIGBUS;
				3406	}
				3407
				3408	static struct vm_area_struct *__install_special_mapping(
				3409	struct mm_struct *mm,
				3410	unsigned long addr, unsigned long len,
				3411	unsigned long vm_flags, void *priv,
				3412	const struct vm_operations_struct *ops)
				3413	{
				3414	int ret;
				3415	struct vm_area_struct *vma;
				3416
				3417	vma = vm_area_alloc(mm);
				3418	if (unlikely(vma == NULL))
				3419	return ERR_PTR(-ENOMEM);
				3420
				3421	vma->vm_start = addr;
				3422	vma->vm_end = addr + len;
				3423
				3424	vma->vm_flags = vm_flags \| mm->def_flags \| VM_DONTEXPAND \| VM_SOFTDIRTY;
				3425	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
				3426
				3427	vma->vm_ops = ops;
				3428	vma->vm_private_data = priv;
				3429
				3430	ret = insert_vm_struct(mm, vma);
				3431	if (ret)
				3432	goto out;
				3433
				3434	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
				3435
				3436	perf_event_mmap(vma);
				3437
				3438	return vma;
				3439
				3440	out:
				3441	vm_area_free(vma);
				3442	return ERR_PTR(ret);
				3443	}
				3444
				3445	bool vma_is_special_mapping(const struct vm_area_struct *vma,
				3446	const struct vm_special_mapping *sm)
				3447	{
				3448	return vma->vm_private_data == sm &&
				3449	(vma->vm_ops == &special_mapping_vmops \|\|
				3450	vma->vm_ops == &legacy_special_mapping_vmops);
				3451	}
				3452
				3453	/*
				3454	* Called with mm->mmap_sem held for writing.
				3455	* Insert a new vma covering the given region, with the given flags.
				3456	* Its pages are supplied by the given array of struct page *.
				3457	* The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
				3458	* The region past the last page supplied will always produce SIGBUS.
				3459	* The array pointer and the pages it points to are assumed to stay alive
				3460	* for as long as this mapping might exist.
				3461	*/
				3462	struct vm_area_struct *_install_special_mapping(
				3463	struct mm_struct *mm,
				3464	unsigned long addr, unsigned long len,
				3465	unsigned long vm_flags, const struct vm_special_mapping *spec)
				3466	{
				3467	return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
				3468	&special_mapping_vmops);
				3469	}
				3470
				3471	int install_special_mapping(struct mm_struct *mm,
				3472	unsigned long addr, unsigned long len,
				3473	unsigned long vm_flags, struct page **pages)
				3474	{
				3475	struct vm_area_struct *vma = __install_special_mapping(
				3476	mm, addr, len, vm_flags, (void *)pages,
				3477	&legacy_special_mapping_vmops);
				3478
				3479	return PTR_ERR_OR_ZERO(vma);
				3480	}
				3481
				3482	static DEFINE_MUTEX(mm_all_locks_mutex);
				3483
				3484	static void vm_lock_anon_vma(struct mm_struct mm, struct anon_vma anon_vma)
				3485	{
				3486	if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
				3487	/*
				3488	* The LSB of head.next can't change from under us
				3489	* because we hold the mm_all_locks_mutex.
				3490	*/
				3491	down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
				3492	/*
				3493	* We can safely modify head.next after taking the
				3494	* anon_vma->root->rwsem. If some other vma in this mm shares
				3495	* the same anon_vma we won't take it again.
				3496	*
				3497	* No need of atomic instructions here, head.next
				3498	* can't change from under us thanks to the
				3499	* anon_vma->root->rwsem.
				3500	*/
				3501	if (__test_and_set_bit(0, (unsigned long *)
				3502	&anon_vma->root->rb_root.rb_root.rb_node))
				3503	BUG();
				3504	}
				3505	}
				3506
				3507	static void vm_lock_mapping(struct mm_struct mm, struct address_space mapping)
				3508	{
				3509	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
				3510	/*
				3511	* AS_MM_ALL_LOCKS can't change from under us because
				3512	* we hold the mm_all_locks_mutex.
				3513	*
				3514	* Operations on ->flags have to be atomic because
				3515	* even if AS_MM_ALL_LOCKS is stable thanks to the
				3516	* mm_all_locks_mutex, there may be other cpus
				3517	* changing other bitflags in parallel to us.
				3518	*/
				3519	if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
				3520	BUG();
				3521	down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
				3522	}
				3523	}
				3524
				3525	/*
				3526	* This operation locks against the VM for all pte/vma/mm related
				3527	* operations that could ever happen on a certain mm. This includes
				3528	* vmtruncate, try_to_unmap, and all page faults.
				3529	*
				3530	* The caller must take the mmap_sem in write mode before calling
				3531	* mm_take_all_locks(). The caller isn't allowed to release the
				3532	* mmap_sem until mm_drop_all_locks() returns.
				3533	*
				3534	* mmap_sem in write mode is required in order to block all operations
				3535	* that could modify pagetables and free pages without need of
				3536	* altering the vma layout. It's also needed in write mode to avoid new
				3537	* anon_vmas to be associated with existing vmas.
				3538	*
				3539	* A single task can't take more than one mm_take_all_locks() in a row
				3540	* or it would deadlock.
				3541	*
				3542	* The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
				3543	* mapping->flags prevent taking the same lock twice when more than one
				3544	* vma in this mm is backed by the same anon_vma or address_space.
				3545	*
				3546	* We take locks in the following order, according to the comment at the
				3547	* beginning of mm/rmap.c:
				3548	* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
				3549	* hugetlb mapping);
				3550	* - all i_mmap_rwsem locks;
				3551	* - all anon_vma->rwsem.
				3552	*
				3553	* We can take all locks within these types randomly because the VM code
				3554	* doesn't nest them and we are protected from parallel mm_take_all_locks()
				3555	* by mm_all_locks_mutex.
				3556	*
				3557	* mm_take_all_locks() and mm_drop_all_locks() are expensive operations
				3558	* that may have to take thousands of locks.
				3559	*
				3560	* mm_take_all_locks() can fail if it's interrupted by signals.
				3561	*/
				3562	int mm_take_all_locks(struct mm_struct *mm)
				3563	{
				3564	struct vm_area_struct *vma;
				3565	struct anon_vma_chain *avc;
				3566
				3567	BUG_ON(down_read_trylock(&mm->mmap_sem));
				3568
				3569	mutex_lock(&mm_all_locks_mutex);
				3570
				3571	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				3572	if (signal_pending(current))
				3573	goto out_unlock;
				3574	if (vma->vm_file && vma->vm_file->f_mapping &&
				3575	is_vm_hugetlb_page(vma))
				3576	vm_lock_mapping(mm, vma->vm_file->f_mapping);
				3577	}
				3578
				3579	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				3580	if (signal_pending(current))
				3581	goto out_unlock;
				3582	if (vma->vm_file && vma->vm_file->f_mapping &&
				3583	!is_vm_hugetlb_page(vma))
				3584	vm_lock_mapping(mm, vma->vm_file->f_mapping);
				3585	}
				3586
				3587	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				3588	if (signal_pending(current))
				3589	goto out_unlock;
				3590	if (vma->anon_vma)
				3591	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				3592	vm_lock_anon_vma(mm, avc->anon_vma);
				3593	}
				3594
				3595	return 0;
				3596
				3597	out_unlock:
				3598	mm_drop_all_locks(mm);
				3599	return -EINTR;
				3600	}
				3601
				3602	static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
				3603	{
				3604	if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
				3605	/*
				3606	* The LSB of head.next can't change to 0 from under
				3607	* us because we hold the mm_all_locks_mutex.
				3608	*
				3609	* We must however clear the bitflag before unlocking
				3610	* the vma so the users using the anon_vma->rb_root will
				3611	* never see our bitflag.
				3612	*
				3613	* No need of atomic instructions here, head.next
				3614	* can't change from under us until we release the
				3615	* anon_vma->root->rwsem.
				3616	*/
				3617	if (!__test_and_clear_bit(0, (unsigned long *)
				3618	&anon_vma->root->rb_root.rb_root.rb_node))
				3619	BUG();
				3620	anon_vma_unlock_write(anon_vma);
				3621	}
				3622	}
				3623
				3624	static void vm_unlock_mapping(struct address_space *mapping)
				3625	{
				3626	if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
				3627	/*
				3628	* AS_MM_ALL_LOCKS can't change to 0 from under us
				3629	* because we hold the mm_all_locks_mutex.
				3630	*/
				3631	i_mmap_unlock_write(mapping);
				3632	if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
				3633	&mapping->flags))
				3634	BUG();
				3635	}
				3636	}
				3637
				3638	/*
				3639	* The mmap_sem cannot be released by the caller until
				3640	* mm_drop_all_locks() returns.
				3641	*/
				3642	void mm_drop_all_locks(struct mm_struct *mm)
				3643	{
				3644	struct vm_area_struct *vma;
				3645	struct anon_vma_chain *avc;
				3646
				3647	BUG_ON(down_read_trylock(&mm->mmap_sem));
				3648	BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
				3649
				3650	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				3651	if (vma->anon_vma)
				3652	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
				3653	vm_unlock_anon_vma(avc->anon_vma);
				3654	if (vma->vm_file && vma->vm_file->f_mapping)
				3655	vm_unlock_mapping(vma->vm_file->f_mapping);
				3656	}
				3657
				3658	mutex_unlock(&mm_all_locks_mutex);
				3659	}
				3660
				3661	/*
				3662	* initialise the percpu counter for VM
				3663	*/
				3664	void __init mmap_init(void)
				3665	{
				3666	int ret;
				3667
				3668	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
				3669	VM_BUG_ON(ret);
				3670	}
				3671
				3672	/*
				3673	* Initialise sysctl_user_reserve_kbytes.
				3674	*
				3675	* This is intended to prevent a user from starting a single memory hogging
				3676	* process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
				3677	* mode.
				3678	*
				3679	* The default value is min(3% of free memory, 128MB)
				3680	* 128MB is enough to recover with sshd/login, bash, and top/kill.
				3681	*/
				3682	static int init_user_reserve(void)
				3683	{
				3684	unsigned long free_kbytes;
				3685
				3686	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
				3687
				3688	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
				3689	return 0;
				3690	}
				3691	subsys_initcall(init_user_reserve);
				3692
				3693	/*
				3694	* Initialise sysctl_admin_reserve_kbytes.
				3695	*
				3696	* The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
				3697	* to log in and kill a memory hogging process.
				3698	*
				3699	* Systems with more than 256MB will reserve 8MB, enough to recover
				3700	* with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
				3701	* only reserve 3% of free pages by default.
				3702	*/
				3703	static int init_admin_reserve(void)
				3704	{
				3705	unsigned long free_kbytes;
				3706
				3707	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
				3708
				3709	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
				3710	return 0;
				3711	}
				3712	subsys_initcall(init_admin_reserve);
				3713
				3714	/*
				3715	* Reinititalise user and admin reserves if memory is added or removed.
				3716	*
				3717	* The default user reserve max is 128MB, and the default max for the
				3718	* admin reserve is 8MB. These are usually, but not always, enough to
				3719	* enable recovery from a memory hogging process using login/sshd, a shell,
				3720	* and tools like top. It may make sense to increase or even disable the
				3721	* reserve depending on the existence of swap or variations in the recovery
				3722	* tools. So, the admin may have changed them.
				3723	*
				3724	* If memory is added and the reserves have been eliminated or increased above
				3725	* the default max, then we'll trust the admin.
				3726	*
				3727	* If memory is removed and there isn't enough free memory, then we
				3728	* need to reset the reserves.
				3729	*
				3730	* Otherwise keep the reserve set by the admin.
				3731	*/
				3732	static int reserve_mem_notifier(struct notifier_block *nb,
				3733	unsigned long action, void *data)
				3734	{
				3735	unsigned long tmp, free_kbytes;
				3736
				3737	switch (action) {
				3738	case MEM_ONLINE:
				3739	/* Default max is 128MB. Leave alone if modified by operator. */
				3740	tmp = sysctl_user_reserve_kbytes;
				3741	if (0 < tmp && tmp < (1UL << 17))
				3742	init_user_reserve();
				3743
				3744	/* Default max is 8MB. Leave alone if modified by operator. */
				3745	tmp = sysctl_admin_reserve_kbytes;
				3746	if (0 < tmp && tmp < (1UL << 13))
				3747	init_admin_reserve();
				3748
				3749	break;
				3750	case MEM_OFFLINE:
				3751	free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
				3752
				3753	if (sysctl_user_reserve_kbytes > free_kbytes) {
				3754	init_user_reserve();
				3755	pr_info("vm.user_reserve_kbytes reset to %lu\n",
				3756	sysctl_user_reserve_kbytes);
				3757	}
				3758
				3759	if (sysctl_admin_reserve_kbytes > free_kbytes) {
				3760	init_admin_reserve();
				3761	pr_info("vm.admin_reserve_kbytes reset to %lu\n",
				3762	sysctl_admin_reserve_kbytes);
				3763	}
				3764	break;
				3765	default:
				3766	break;
				3767	}
				3768	return NOTIFY_OK;
				3769	}
				3770
				3771	static struct notifier_block reserve_mem_nb = {
				3772	.notifier_call = reserve_mem_notifier,
				3773	};
				3774
				3775	static int __meminit init_reserve_notifier(void)
				3776	{
				3777	if (register_hotmemory_notifier(&reserve_mem_nb))
				3778	pr_err("Failed registering memory add/remove notifier for admin reserve\n");
				3779
				3780	return 0;
				3781	}
				3782	subsys_initcall(init_reserve_notifier);