Blame - mm/madvise.c - hafnium/third_party/linux.git

blob: 24abc79f8914e86836381dce3e0d406ad034f127 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/mm/madvise.c
				4	*
				5	* Copyright (C) 1999 Linus Torvalds
				6	* Copyright (C) 2002 Christoph Hellwig
				7	*/
				8
				9	#include <linux/mman.h>
				10	#include <linux/pagemap.h>
				11	#include <linux/syscalls.h>
				12	#include <linux/mempolicy.h>
				13	#include <linux/page-isolation.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	14	#include <linux/page_idle.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	15	#include <linux/userfaultfd_k.h>
				16	#include <linux/hugetlb.h>
				17	#include <linux/falloc.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	18	#include <linux/fadvise.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	19	#include <linux/sched.h>
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	20	#include <linux/sched/mm.h>
				21	#include <linux/uio.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	22	#include <linux/ksm.h>
				23	#include <linux/fs.h>
				24	#include <linux/file.h>
				25	#include <linux/blkdev.h>
				26	#include <linux/backing-dev.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	27	#include <linux/pagewalk.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	28	#include <linux/swap.h>
				29	#include <linux/swapops.h>
				30	#include <linux/shmem_fs.h>
				31	#include <linux/mmu_notifier.h>
				32
				33	#include <asm/tlb.h>
				34
				35	#include "internal.h"
				36
/* Private state handed to the MADV_COLD / MADV_PAGEOUT page-table walk. */
struct madvise_walk_private {
	struct mmu_gather *tlb;	/* TLB gather state used by the walk callback */
	bool pageout;		/* true: isolate and reclaim; false: only deactivate */
};
				41
/*
 * Tell the caller which flavour of mmap_lock a madvise behaviour needs.
 *
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 *
 * Returns 0 when the read lock is sufficient, 1 when the write lock is
 * required.
 */
static int madvise_need_mmap_write(int behavior)
{
	/* be safe, default to 1. list exceptions explicitly */
	int need_write = 1;

	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
		need_write = 0;
		break;
	default:
		break;
	}

	return need_write;
}
				62
				63	/*
				64	* We can potentially split a vm area into separate
				65	* areas, each area with its own behavior.
				66	*/
				67	static long madvise_behavior(struct vm_area_struct *vma,
				68	struct vm_area_struct **prev,
				69	unsigned long start, unsigned long end, int behavior)
				70	{
				71	struct mm_struct *mm = vma->vm_mm;
				72	int error = 0;
				73	pgoff_t pgoff;
				74	unsigned long new_flags = vma->vm_flags;
				75
				76	switch (behavior) {
				77	case MADV_NORMAL:
				78	new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
				79	break;
				80	case MADV_SEQUENTIAL:
				81	new_flags = (new_flags & ~VM_RAND_READ) \| VM_SEQ_READ;
				82	break;
				83	case MADV_RANDOM:
				84	new_flags = (new_flags & ~VM_SEQ_READ) \| VM_RAND_READ;
				85	break;
				86	case MADV_DONTFORK:
				87	new_flags \|= VM_DONTCOPY;
				88	break;
				89	case MADV_DOFORK:
				90	if (vma->vm_flags & VM_IO) {
				91	error = -EINVAL;
				92	goto out;
				93	}
				94	new_flags &= ~VM_DONTCOPY;
				95	break;
				96	case MADV_WIPEONFORK:
				97	/* MADV_WIPEONFORK is only supported on anonymous memory. */
				98	if (vma->vm_file \|\| vma->vm_flags & VM_SHARED) {
				99	error = -EINVAL;
				100	goto out;
				101	}
				102	new_flags \|= VM_WIPEONFORK;
				103	break;
				104	case MADV_KEEPONFORK:
				105	new_flags &= ~VM_WIPEONFORK;
				106	break;
				107	case MADV_DONTDUMP:
				108	new_flags \|= VM_DONTDUMP;
				109	break;
				110	case MADV_DODUMP:
				111	if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
				112	error = -EINVAL;
				113	goto out;
				114	}
				115	new_flags &= ~VM_DONTDUMP;
				116	break;
				117	case MADV_MERGEABLE:
				118	case MADV_UNMERGEABLE:
				119	error = ksm_madvise(vma, start, end, behavior, &new_flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	120	if (error)
				121	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	122	break;
				123	case MADV_HUGEPAGE:
				124	case MADV_NOHUGEPAGE:
				125	error = hugepage_madvise(vma, &new_flags, behavior);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	126	if (error)
				127	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	128	break;
				129	}
				130
				131	if (new_flags == vma->vm_flags) {
				132	*prev = vma;
				133	goto out;
				134	}
				135
				136	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
				137	prev = vma_merge(mm, prev, start, end, new_flags, vma->anon_vma,
				138	vma->vm_file, pgoff, vma_policy(vma),
				139	vma->vm_userfaultfd_ctx);
				140	if (*prev) {
				141	vma = *prev;
				142	goto success;
				143	}
				144
				145	*prev = vma;
				146
				147	if (start != vma->vm_start) {
				148	if (unlikely(mm->map_count >= sysctl_max_map_count)) {
				149	error = -ENOMEM;
				150	goto out;
				151	}
				152	error = __split_vma(mm, vma, start, 1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	153	if (error)
				154	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	155	}
				156
				157	if (end != vma->vm_end) {
				158	if (unlikely(mm->map_count >= sysctl_max_map_count)) {
				159	error = -ENOMEM;
				160	goto out;
				161	}
				162	error = __split_vma(mm, vma, end, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	163	if (error)
				164	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	165	}
				166
				167	success:
				168	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	169	* vm_flags is protected by the mmap_lock held in write mode.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	170	*/
				171	vma->vm_flags = new_flags;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	172
				173	out_convert_errno:
				174	/*
				175	* madvise() returns EAGAIN if kernel resources, such as
				176	* slab, are temporarily unavailable.
				177	*/
				178	if (error == -ENOMEM)
				179	error = -EAGAIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	180	out:
				181	return error;
				182	}
				183
				184	#ifdef CONFIG_SWAP
				185	static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				186	unsigned long end, struct mm_walk *walk)
				187	{
				188	pte_t *orig_pte;
				189	struct vm_area_struct *vma = walk->private;
				190	unsigned long index;
				191
				192	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
				193	return 0;
				194
				195	for (index = start; index != end; index += PAGE_SIZE) {
				196	pte_t pte;
				197	swp_entry_t entry;
				198	struct page *page;
				199	spinlock_t *ptl;
				200
				201	orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
				202	pte = *(orig_pte + ((index - start) / PAGE_SIZE));
				203	pte_unmap_unlock(orig_pte, ptl);
				204
				205	if (pte_present(pte) \|\| pte_none(pte))
				206	continue;
				207	entry = pte_to_swp_entry(pte);
				208	if (unlikely(non_swap_entry(entry)))
				209	continue;
				210
				211	page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
				212	vma, index, false);
				213	if (page)
				214	put_page(page);
				215	}
				216
				217	return 0;
				218	}
				219
/* Page-walk callbacks used by madvise_willneed() for vmas without a file. */
static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	223
/*
 * Walk the page-cache entries of @mapping that back [start, end) of @vma
 * and call read_swap_cache_async() for every value (swap) entry found.
 */
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		/* Entries that are not value entries are skipped. */
		if (!xa_is_value(page))
			continue;
		/*
		 * Pause the iteration and leave the RCU read section around
		 * the swap read; it is re-entered before the next iteration.
		 * NOTE(review): presumably because the read may sleep - confirm.
		 */
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
							NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
				253	#endif /* CONFIG_SWAP */
				254
				255	/*
				256	* Schedule all required I/O operations. Do not wait for completion.
				257	*/
				258	static long madvise_willneed(struct vm_area_struct *vma,
				259	struct vm_area_struct **prev,
				260	unsigned long start, unsigned long end)
				261	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	262	struct mm_struct *mm = vma->vm_mm;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	263	struct file *file = vma->vm_file;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	264	loff_t offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	265
				266	*prev = vma;
				267	#ifdef CONFIG_SWAP
				268	if (!file) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	269	walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
				270	lru_add_drain(); /* Push any new pages onto the LRU now */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	271	return 0;
				272	}
				273
				274	if (shmem_mapping(file->f_mapping)) {
				275	force_shm_swapin_readahead(vma, start, end,
				276	file->f_mapping);
				277	return 0;
				278	}
				279	#else
				280	if (!file)
				281	return -EBADF;
				282	#endif
				283
				284	if (IS_DAX(file_inode(file))) {
				285	/* no bad return value, but ignore advice */
				286	return 0;
				287	}
				288
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	289	/*
				290	* Filesystem's fadvise may need to take various locks. We need to
				291	* explicitly grab a reference because the vma (and hence the
				292	* vma's reference to the file) can go away as soon as we drop
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	293	* mmap_lock.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	294	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	295	prev = NULL; / tell sys_madvise we drop mmap_lock */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	296	get_file(file);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	297	offset = (loff_t)(start - vma->vm_start)
				298	+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	299	mmap_read_unlock(mm);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	300	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
				301	fput(file);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	302	mmap_read_lock(mm);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	303	return 0;
				304	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	305
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	306	static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				307	unsigned long addr, unsigned long end,
				308	struct mm_walk *walk)
				309	{
				310	struct madvise_walk_private *private = walk->private;
				311	struct mmu_gather *tlb = private->tlb;
				312	bool pageout = private->pageout;
				313	struct mm_struct *mm = tlb->mm;
				314	struct vm_area_struct *vma = walk->vma;
				315	pte_t orig_pte, pte, ptent;
				316	spinlock_t *ptl;
				317	struct page *page = NULL;
				318	LIST_HEAD(page_list);
				319
				320	if (fatal_signal_pending(current))
				321	return -EINTR;
				322
				323	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				324	if (pmd_trans_huge(*pmd)) {
				325	pmd_t orig_pmd;
				326	unsigned long next = pmd_addr_end(addr, end);
				327
				328	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
				329	ptl = pmd_trans_huge_lock(pmd, vma);
				330	if (!ptl)
				331	return 0;
				332
				333	orig_pmd = *pmd;
				334	if (is_huge_zero_pmd(orig_pmd))
				335	goto huge_unlock;
				336
				337	if (unlikely(!pmd_present(orig_pmd))) {
				338	VM_BUG_ON(thp_migration_supported() &&
				339	!is_pmd_migration_entry(orig_pmd));
				340	goto huge_unlock;
				341	}
				342
				343	page = pmd_page(orig_pmd);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	344
				345	/* Do not interfere with other mappings of this page */
				346	if (page_mapcount(page) != 1)
				347	goto huge_unlock;
				348
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	349	if (next - addr != HPAGE_PMD_SIZE) {
				350	int err;
				351
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	352	get_page(page);
				353	spin_unlock(ptl);
				354	lock_page(page);
				355	err = split_huge_page(page);
				356	unlock_page(page);
				357	put_page(page);
				358	if (!err)
				359	goto regular_page;
				360	return 0;
				361	}
				362
				363	if (pmd_young(orig_pmd)) {
				364	pmdp_invalidate(vma, addr, pmd);
				365	orig_pmd = pmd_mkold(orig_pmd);
				366
				367	set_pmd_at(mm, addr, pmd, orig_pmd);
				368	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
				369	}
				370
				371	ClearPageReferenced(page);
				372	test_and_clear_page_young(page);
				373	if (pageout) {
				374	if (!isolate_lru_page(page)) {
				375	if (PageUnevictable(page))
				376	putback_lru_page(page);
				377	else
				378	list_add(&page->lru, &page_list);
				379	}
				380	} else
				381	deactivate_page(page);
				382	huge_unlock:
				383	spin_unlock(ptl);
				384	if (pageout)
				385	reclaim_pages(&page_list);
				386	return 0;
				387	}
				388
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	389	regular_page:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	390	if (pmd_trans_unstable(pmd))
				391	return 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	392	#endif
				393	tlb_change_page_size(tlb, PAGE_SIZE);
				394	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
				395	flush_tlb_batched_pending(mm);
				396	arch_enter_lazy_mmu_mode();
				397	for (; addr < end; pte++, addr += PAGE_SIZE) {
				398	ptent = *pte;
				399
				400	if (pte_none(ptent))
				401	continue;
				402
				403	if (!pte_present(ptent))
				404	continue;
				405
				406	page = vm_normal_page(vma, addr, ptent);
				407	if (!page)
				408	continue;
				409
				410	/*
				411	* Creating a THP page is expensive so split it only if we
				412	* are sure it's worth. Split it if we are only owner.
				413	*/
				414	if (PageTransCompound(page)) {
				415	if (page_mapcount(page) != 1)
				416	break;
				417	get_page(page);
				418	if (!trylock_page(page)) {
				419	put_page(page);
				420	break;
				421	}
				422	pte_unmap_unlock(orig_pte, ptl);
				423	if (split_huge_page(page)) {
				424	unlock_page(page);
				425	put_page(page);
				426	pte_offset_map_lock(mm, pmd, addr, &ptl);
				427	break;
				428	}
				429	unlock_page(page);
				430	put_page(page);
				431	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				432	pte--;
				433	addr -= PAGE_SIZE;
				434	continue;
				435	}
				436
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	437	/* Do not interfere with other mappings of this page */
				438	if (page_mapcount(page) != 1)
				439	continue;
				440
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	441	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				442
				443	if (pte_young(ptent)) {
				444	ptent = ptep_get_and_clear_full(mm, addr, pte,
				445	tlb->fullmm);
				446	ptent = pte_mkold(ptent);
				447	set_pte_at(mm, addr, pte, ptent);
				448	tlb_remove_tlb_entry(tlb, pte, addr);
				449	}
				450
				451	/*
				452	* We are deactivating a page for accelerating reclaiming.
				453	* VM couldn't reclaim the page unless we clear PG_young.
				454	* As a side effect, it makes confuse idle-page tracking
				455	* because they will miss recent referenced history.
				456	*/
				457	ClearPageReferenced(page);
				458	test_and_clear_page_young(page);
				459	if (pageout) {
				460	if (!isolate_lru_page(page)) {
				461	if (PageUnevictable(page))
				462	putback_lru_page(page);
				463	else
				464	list_add(&page->lru, &page_list);
				465	}
				466	} else
				467	deactivate_page(page);
				468	}
				469
				470	arch_leave_lazy_mmu_mode();
				471	pte_unmap_unlock(orig_pte, ptl);
				472	if (pageout)
				473	reclaim_pages(&page_list);
				474	cond_resched();
				475
				476	return 0;
				477	}
				478
/* Page-walk callbacks shared by the MADV_COLD and MADV_PAGEOUT range walks. */
static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};
				482
				483	static void madvise_cold_page_range(struct mmu_gather *tlb,
				484	struct vm_area_struct *vma,
				485	unsigned long addr, unsigned long end)
				486	{
				487	struct madvise_walk_private walk_private = {
				488	.pageout = false,
				489	.tlb = tlb,
				490	};
				491
				492	tlb_start_vma(tlb, vma);
				493	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
				494	tlb_end_vma(tlb, vma);
				495	}
				496
				497	static long madvise_cold(struct vm_area_struct *vma,
				498	struct vm_area_struct **prev,
				499	unsigned long start_addr, unsigned long end_addr)
				500	{
				501	struct mm_struct *mm = vma->vm_mm;
				502	struct mmu_gather tlb;
				503
				504	*prev = vma;
				505	if (!can_madv_lru_vma(vma))
				506	return -EINVAL;
				507
				508	lru_add_drain();
				509	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
				510	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
				511	tlb_finish_mmu(&tlb, start_addr, end_addr);
				512
				513	return 0;
				514	}
				515
				516	static void madvise_pageout_page_range(struct mmu_gather *tlb,
				517	struct vm_area_struct *vma,
				518	unsigned long addr, unsigned long end)
				519	{
				520	struct madvise_walk_private walk_private = {
				521	.pageout = true,
				522	.tlb = tlb,
				523	};
				524
				525	tlb_start_vma(tlb, vma);
				526	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
				527	tlb_end_vma(tlb, vma);
				528	}
				529
				530	static inline bool can_do_pageout(struct vm_area_struct *vma)
				531	{
				532	if (vma_is_anonymous(vma))
				533	return true;
				534	if (!vma->vm_file)
				535	return false;
				536	/*
				537	* paging out pagecache only for non-anonymous mappings that correspond
				538	* to the files the calling process could (if tried) open for writing;
				539	* otherwise we'd be including shared non-exclusive mappings, which
				540	* opens a side channel.
				541	*/
				542	return inode_owner_or_capable(file_inode(vma->vm_file)) \|\|
				543	inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
				544	}
				545
				546	static long madvise_pageout(struct vm_area_struct *vma,
				547	struct vm_area_struct **prev,
				548	unsigned long start_addr, unsigned long end_addr)
				549	{
				550	struct mm_struct *mm = vma->vm_mm;
				551	struct mmu_gather tlb;
				552
				553	*prev = vma;
				554	if (!can_madv_lru_vma(vma))
				555	return -EINVAL;
				556
				557	if (!can_do_pageout(vma))
				558	return 0;
				559
				560	lru_add_drain();
				561	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
				562	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
				563	tlb_finish_mmu(&tlb, start_addr, end_addr);
				564
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	565	return 0;
				566	}
				567
				568	static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				569	unsigned long end, struct mm_walk *walk)
				570
				571	{
				572	struct mmu_gather *tlb = walk->private;
				573	struct mm_struct *mm = tlb->mm;
				574	struct vm_area_struct *vma = walk->vma;
				575	spinlock_t *ptl;
				576	pte_t orig_pte, pte, ptent;
				577	struct page *page;
				578	int nr_swap = 0;
				579	unsigned long next;
				580
				581	next = pmd_addr_end(addr, end);
				582	if (pmd_trans_huge(*pmd))
				583	if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
				584	goto next;
				585
				586	if (pmd_trans_unstable(pmd))
				587	return 0;
				588
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	589	tlb_change_page_size(tlb, PAGE_SIZE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	590	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				591	flush_tlb_batched_pending(mm);
				592	arch_enter_lazy_mmu_mode();
				593	for (; addr != end; pte++, addr += PAGE_SIZE) {
				594	ptent = *pte;
				595
				596	if (pte_none(ptent))
				597	continue;
				598	/*
				599	* If the pte has swp_entry, just clear page table to
				600	* prevent swap-in which is more expensive rather than
				601	* (page allocation + zeroing).
				602	*/
				603	if (!pte_present(ptent)) {
				604	swp_entry_t entry;
				605
				606	entry = pte_to_swp_entry(ptent);
				607	if (non_swap_entry(entry))
				608	continue;
				609	nr_swap--;
				610	free_swap_and_cache(entry);
				611	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
				612	continue;
				613	}
				614
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	615	page = vm_normal_page(vma, addr, ptent);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	616	if (!page)
				617	continue;
				618
				619	/*
				620	* If pmd isn't transhuge but the page is THP and
				621	* is owned by only this process, split it and
				622	* deactivate all pages.
				623	*/
				624	if (PageTransCompound(page)) {
				625	if (page_mapcount(page) != 1)
				626	goto out;
				627	get_page(page);
				628	if (!trylock_page(page)) {
				629	put_page(page);
				630	goto out;
				631	}
				632	pte_unmap_unlock(orig_pte, ptl);
				633	if (split_huge_page(page)) {
				634	unlock_page(page);
				635	put_page(page);
				636	pte_offset_map_lock(mm, pmd, addr, &ptl);
				637	goto out;
				638	}
				639	unlock_page(page);
				640	put_page(page);
				641	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				642	pte--;
				643	addr -= PAGE_SIZE;
				644	continue;
				645	}
				646
				647	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				648
				649	if (PageSwapCache(page) \|\| PageDirty(page)) {
				650	if (!trylock_page(page))
				651	continue;
				652	/*
				653	* If page is shared with others, we couldn't clear
				654	* PG_dirty of the page.
				655	*/
				656	if (page_mapcount(page) != 1) {
				657	unlock_page(page);
				658	continue;
				659	}
				660
				661	if (PageSwapCache(page) && !try_to_free_swap(page)) {
				662	unlock_page(page);
				663	continue;
				664	}
				665
				666	ClearPageDirty(page);
				667	unlock_page(page);
				668	}
				669
				670	if (pte_young(ptent) \|\| pte_dirty(ptent)) {
				671	/*
				672	* Some of architecture(ex, PPC) don't update TLB
				673	* with set_pte_at and tlb_remove_tlb_entry so for
				674	* the portability, remap the pte with old\|clean
				675	* after pte clearing.
				676	*/
				677	ptent = ptep_get_and_clear_full(mm, addr, pte,
				678	tlb->fullmm);
				679
				680	ptent = pte_mkold(ptent);
				681	ptent = pte_mkclean(ptent);
				682	set_pte_at(mm, addr, pte, ptent);
				683	tlb_remove_tlb_entry(tlb, pte, addr);
				684	}
				685	mark_page_lazyfree(page);
				686	}
				687	out:
				688	if (nr_swap) {
				689	if (current->mm == mm)
				690	sync_mm_rss(mm);
				691
				692	add_mm_counter(mm, MM_SWAPENTS, nr_swap);
				693	}
				694	arch_leave_lazy_mmu_mode();
				695	pte_unmap_unlock(orig_pte, ptl);
				696	cond_resched();
				697	next:
				698	return 0;
				699	}
				700
/* Page-walk callbacks used by madvise_free_single_vma(). */
static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	704
				705	static int madvise_free_single_vma(struct vm_area_struct *vma,
				706	unsigned long start_addr, unsigned long end_addr)
				707	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	708	struct mm_struct *mm = vma->vm_mm;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	709	struct mmu_notifier_range range;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	710	struct mmu_gather tlb;
				711
				712	/* MADV_FREE works for only anon vma at the moment */
				713	if (!vma_is_anonymous(vma))
				714	return -EINVAL;
				715
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	716	range.start = max(vma->vm_start, start_addr);
				717	if (range.start >= vma->vm_end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	718	return -EINVAL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	719	range.end = min(vma->vm_end, end_addr);
				720	if (range.end <= vma->vm_start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	721	return -EINVAL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	722	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				723	range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	724
				725	lru_add_drain();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	726	tlb_gather_mmu(&tlb, mm, range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	727	update_hiwater_rss(mm);
				728
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	729	mmu_notifier_invalidate_range_start(&range);
				730	tlb_start_vma(&tlb, vma);
				731	walk_page_range(vma->vm_mm, range.start, range.end,
				732	&madvise_free_walk_ops, &tlb);
				733	tlb_end_vma(&tlb, vma);
				734	mmu_notifier_invalidate_range_end(&range);
				735	tlb_finish_mmu(&tlb, range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	736
				737	return 0;
				738	}
				739
				740	/*
				741	* Application no longer needs these pages. If the pages are dirty,
				742	* it's OK to just throw them away. The app will be more careful about
				743	* data it wants to keep. Be sure to free swap resources too. The
				744	* zap_page_range call sets things up for shrink_active_list to actually free
				745	* these pages later if no one else has touched them in the meantime,
				746	* although we could add these pages to a global reuse list for
				747	* shrink_active_list to pick up before reclaiming other pages.
				748	*
				749	* NB: This interface discards data rather than pushes it out to swap,
				750	* as some implementations do. This has performance implications for
				751	* applications like large transactional databases which want to discard
				752	* pages in anonymous maps after committing to backing store the data
				753	* that was kept in them. There is no reason to write this data out to
				754	* the swap area if the application is discarding it.
				755	*
				756	* An interface that causes the system to free clean pages and flush
				757	* dirty pages is already available as msync(MS_INVALIDATE).
				758	*/
				759	static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
				760	unsigned long start, unsigned long end)
				761	{
				762	zap_page_range(vma, start, end - start);
				763	return 0;
				764	}
				765
				766	static long madvise_dontneed_free(struct vm_area_struct *vma,
				767	struct vm_area_struct **prev,
				768	unsigned long start, unsigned long end,
				769	int behavior)
				770	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	771	struct mm_struct *mm = vma->vm_mm;
				772
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	773	*prev = vma;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	774	if (!can_madv_lru_vma(vma))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	775	return -EINVAL;
				776
				777	if (!userfaultfd_remove(vma, start, end)) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	778	prev = NULL; / mmap_lock has been dropped, prev is stale */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	779
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	780	mmap_read_lock(mm);
				781	vma = find_vma(mm, start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	782	if (!vma)
				783	return -ENOMEM;
				784	if (start < vma->vm_start) {
				785	/*
				786	* This "vma" under revalidation is the one
				787	* with the lowest vma->vm_start where start
				788	* is also < vma->vm_end. If start <
				789	* vma->vm_start it means an hole materialized
				790	* in the user address space within the
				791	* virtual range passed to MADV_DONTNEED
				792	* or MADV_FREE.
				793	*/
				794	return -ENOMEM;
				795	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	796	if (!can_madv_lru_vma(vma))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	797	return -EINVAL;
				798	if (end > vma->vm_end) {
				799	/*
				800	* Don't fail if end > vma->vm_end. If the old
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	801	* vma was splitted while the mmap_lock was
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	802	* released the effect of the concurrent
				803	* operation may not cause madvise() to
				804	* have an undefined result. There may be an
				805	* adjacent next vma that we'll walk
				806	* next. userfaultfd_remove() will generate an
				807	* UFFD_EVENT_REMOVE repetition on the
				808	* end-vma->vm_end range, but the manager can
				809	* handle a repetition fine.
				810	*/
				811	end = vma->vm_end;
				812	}
				813	VM_WARN_ON(start >= end);
				814	}
				815
				816	if (behavior == MADV_DONTNEED)
				817	return madvise_dontneed_single_vma(vma, start, end);
				818	else if (behavior == MADV_FREE)
				819	return madvise_free_single_vma(vma, start, end);
				820	else
				821	return -EINVAL;
				822	}
				823
				824	/*
				825	* Application wants to free up the pages and associated backing store.
				826	* This is effectively punching a hole into the middle of a file.
				827	*/
				828	static long madvise_remove(struct vm_area_struct *vma,
				829	struct vm_area_struct **prev,
				830	unsigned long start, unsigned long end)
				831	{
				832	loff_t offset;
				833	int error;
				834	struct file *f;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	835	struct mm_struct *mm = vma->vm_mm;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	836
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	837	prev = NULL; / tell sys_madvise we drop mmap_lock */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	838
				839	if (vma->vm_flags & VM_LOCKED)
				840	return -EINVAL;
				841
				842	f = vma->vm_file;
				843
				844	if (!f || !f->f_mapping || !f->f_mapping->host) {
				845	return -EINVAL;
				846	}
				847
				848	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
				849	return -EACCES;
				850
				851	offset = (loff_t)(start - vma->vm_start)
				852	+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
				853
				854	/*
				855	* Filesystem's fallocate may need to take i_mutex. We need to
				856	* explicitly grab a reference because the vma (and hence the
				857	* vma's reference to the file) can go away as soon as we drop
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	858	* mmap_lock.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	859	*/
				860	get_file(f);
				861	if (userfaultfd_remove(vma, start, end)) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	862	/* mmap_lock was not released by userfaultfd_remove() */
				863	mmap_read_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	864	}
				865	error = vfs_fallocate(f,
				866	FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				867	offset, end - start);
				868	fput(f);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	869	mmap_read_lock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	870	return error;
				871	}
				872
				873	#ifdef CONFIG_MEMORY_FAILURE
				874	/*
				875	* Error injection support for memory error handling.
				876	*/
				877	static int madvise_inject_error(int behavior,
				878	unsigned long start, unsigned long end)
				879	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	880	struct zone *zone;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	881	unsigned long size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	882
				883	if (!capable(CAP_SYS_ADMIN))
				884	return -EPERM;
				885
				886
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	887	for (; start < end; start += size) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	888	unsigned long pfn;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	889	struct page *page;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	890	int ret;
				891
				892	ret = get_user_pages_fast(start, 1, 0, &page);
				893	if (ret != 1)
				894	return ret;
				895	pfn = page_to_pfn(page);
				896
				897	/*
				898	* When soft offlining hugepages, after migrating the page
				899	* we dissolve it, therefore in the second loop "page" will
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	900	* no longer be a compound page.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	901	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	902	size = page_size(compound_head(page));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	903
				904	if (behavior == MADV_SOFT_OFFLINE) {
				905	pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	906	pfn, start);
				907	ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
				908	} else {
				909	pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				910	pfn, start);
				911	ret = memory_failure(pfn, MF_COUNT_INCREASED);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	912	}
				913
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	914	if (ret)
				915	return ret;
				916	}
				917
				918	/* Ensure that all poisoned pages are removed from per-cpu lists */
				919	for_each_populated_zone(zone)
				920	drain_all_pages(zone);
				921
				922	return 0;
				923	}
				924	#endif
				925
				926	static long
				927	madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
				928	unsigned long start, unsigned long end, int behavior)
				929	{
				930	switch (behavior) {
				931	case MADV_REMOVE:
				932	return madvise_remove(vma, prev, start, end);
				933	case MADV_WILLNEED:
				934	return madvise_willneed(vma, prev, start, end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	935	case MADV_COLD:
				936	return madvise_cold(vma, prev, start, end);
				937	case MADV_PAGEOUT:
				938	return madvise_pageout(vma, prev, start, end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	939	case MADV_FREE:
				940	case MADV_DONTNEED:
				941	return madvise_dontneed_free(vma, prev, start, end, behavior);
				942	default:
				943	return madvise_behavior(vma, prev, start, end, behavior);
				944	}
				945	}
				946
				947	static bool
				948	madvise_behavior_valid(int behavior)
				949	{
				950	switch (behavior) {
				951	case MADV_DOFORK:
				952	case MADV_DONTFORK:
				953	case MADV_NORMAL:
				954	case MADV_SEQUENTIAL:
				955	case MADV_RANDOM:
				956	case MADV_REMOVE:
				957	case MADV_WILLNEED:
				958	case MADV_DONTNEED:
				959	case MADV_FREE:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	960	case MADV_COLD:
				961	case MADV_PAGEOUT:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	962	#ifdef CONFIG_KSM
				963	case MADV_MERGEABLE:
				964	case MADV_UNMERGEABLE:
				965	#endif
				966	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				967	case MADV_HUGEPAGE:
				968	case MADV_NOHUGEPAGE:
				969	#endif
				970	case MADV_DONTDUMP:
				971	case MADV_DODUMP:
				972	case MADV_WIPEONFORK:
				973	case MADV_KEEPONFORK:
				974	#ifdef CONFIG_MEMORY_FAILURE
				975	case MADV_SOFT_OFFLINE:
				976	case MADV_HWPOISON:
				977	#endif
				978	return true;
				979
				980	default:
				981	return false;
				982	}
				983	}
				984
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	985	static bool
				986	process_madvise_behavior_valid(int behavior)
				987	{
				988	switch (behavior) {
				989	case MADV_COLD:
				990	case MADV_PAGEOUT:
				991	return true;
				992	default:
				993	return false;
				994	}
				995	}
				996
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	997	/*
				998	* The madvise(2) system call.
				999	*
				1000	* Applications can use madvise() to advise the kernel how it should
				1001	* handle paging I/O in this VM area. The idea is to help the kernel
				1002	* use appropriate read-ahead and caching techniques. The information
				1003	* provided is advisory only, and can be safely disregarded by the
				1004	* kernel without affecting the correct operation of the application.
				1005	*
				1006	* behavior values:
				1007	* MADV_NORMAL - the default behavior is to read clusters. This
				1008	* results in some read-ahead and read-behind.
				1009	* MADV_RANDOM - the system should read the minimum amount of data
				1010	* on any access, since it is unlikely that the appli-
				1011	* cation will need more than what it asks for.
				1012	* MADV_SEQUENTIAL - pages in the given range will probably be accessed
				1013	* once, so they can be aggressively read ahead, and
				1014	* can be freed soon after they are accessed.
				1015	* MADV_WILLNEED - the application is notifying the system to read
				1016	* some pages ahead.
				1017	* MADV_DONTNEED - the application is finished with the given range,
				1018	* so the kernel can free resources associated with it.
				1019	* MADV_FREE - the application marks pages in the given range as lazy free,
				1020	* where actual purges are postponed until memory pressure happens.
				1021	* MADV_REMOVE - the application wants to free up the given range of
				1022	* pages and associated backing store.
				1023	* MADV_DONTFORK - omit this area from child's address space when forking:
				1024	* typically, to avoid COWing pages pinned by get_user_pages().
				1025	* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
				1026	* MADV_WIPEONFORK - present the child process with zero-filled memory in this
				1027	* range after a fork.
				1028	* MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
				1029	* MADV_HWPOISON - trigger memory error handler as if the given memory range
				1030	* were corrupted by unrecoverable hardware memory failure.
				1031	* MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
				1032	* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
				1033	* this area with pages of identical content from other such areas.
				1034	* MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
				1035	* MADV_HUGEPAGE - the application wants to back the given range by transparent
				1036	* huge pages in the future. Existing pages might be coalesced and
				1037	* new pages might be allocated as THP.
				1038	* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
				1039	* transparent huge pages so the existing pages will not be
				1040	* coalesced into THP and new pages will not be allocated as THP.
				1041	* MADV_DONTDUMP - the application wants to prevent pages in the given range
				1042	* from being included in its core dump.
				1043	* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1044	* MADV_COLD - the application is not expected to use this memory soon,
				1045	* deactivate pages in this range so that they can be reclaimed
				1046	* easily if memory pressure happens.
				1047	* MADV_PAGEOUT - the application is not expected to use this memory soon,
				1048	* page out the pages in this range immediately.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1049	*
				1050	* return values:
				1051	* zero - success
				1052	* -EINVAL - start + len < 0, start is not page-aligned,
				1053	* "behavior" is not a valid value, or application
				1054	* is attempting to release locked or shared pages,
				1055	* or the specified address range includes file, Huge TLB,
				1056	* MAP_SHARED or VM_PFNMAP range.
				1057	* -ENOMEM - addresses in the specified range are not currently
				1058	* mapped, or are outside the AS of the process.
				1059	* -EIO - an I/O error occurred while paging in data.
				1060	* -EBADF - map exists, but area maps something that isn't a file.
				1061	* -EAGAIN - a kernel resource was temporarily unavailable.
				1062	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1063	int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1064	{
				1065	unsigned long end, tmp;
				1066	struct vm_area_struct *vma, *prev;
				1067	int unmapped_error = 0;
				1068	int error = -EINVAL;
				1069	int write;
				1070	size_t len;
				1071	struct blk_plug plug;
				1072
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1073	start = untagged_addr(start);
				1074
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1075	if (!madvise_behavior_valid(behavior))
				1076	return error;
				1077
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1078	if (!PAGE_ALIGNED(start))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1079	return error;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1080	len = PAGE_ALIGN(len_in);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1081
				1082	/* Check to see whether len was rounded up from a small negative value to zero */
				1083	if (len_in && !len)
				1084	return error;
				1085
				1086	end = start + len;
				1087	if (end < start)
				1088	return error;
				1089
				1090	error = 0;
				1091	if (end == start)
				1092	return error;
				1093
				1094	#ifdef CONFIG_MEMORY_FAILURE
				1095	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
				1096	return madvise_inject_error(behavior, start, start + len_in);
				1097	#endif
				1098
				1099	write = madvise_need_mmap_write(behavior);
				1100	if (write) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1101	if (mmap_write_lock_killable(mm))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1102	return -EINTR;
				1103	} else {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1104	mmap_read_lock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1105	}
				1106
				1107	/*
				1108	* If the interval [start,end) covers some unmapped address
				1109	* ranges, just ignore them, but return -ENOMEM at the end.
				1110	* - different from the way of handling in mlock etc.
				1111	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1112	vma = find_vma_prev(mm, start, &prev);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1113	if (vma && start > vma->vm_start)
				1114	prev = vma;
				1115
				1116	blk_start_plug(&plug);
				1117	for (;;) {
				1118	/* Still start < end. */
				1119	error = -ENOMEM;
				1120	if (!vma)
				1121	goto out;
				1122
				1123	/* Here start < (end|vma->vm_end). */
				1124	if (start < vma->vm_start) {
				1125	unmapped_error = -ENOMEM;
				1126	start = vma->vm_start;
				1127	if (start >= end)
				1128	goto out;
				1129	}
				1130
				1131	/* Here vma->vm_start <= start < (end|vma->vm_end) */
				1132	tmp = vma->vm_end;
				1133	if (end < tmp)
				1134	tmp = end;
				1135
				1136	/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
				1137	error = madvise_vma(vma, &prev, start, tmp, behavior);
				1138	if (error)
				1139	goto out;
				1140	start = tmp;
				1141	if (prev && start < prev->vm_end)
				1142	start = prev->vm_end;
				1143	error = unmapped_error;
				1144	if (start >= end)
				1145	goto out;
				1146	if (prev)
				1147	vma = prev->vm_next;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1148	else /* madvise_remove dropped mmap_lock */
				1149	vma = find_vma(mm, start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1150	}
				1151	out:
				1152	blk_finish_plug(&plug);
				1153	if (write)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1154	mmap_write_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1155	else
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1156	mmap_read_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1157
				1158	return error;
				1159	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1160
				1161	SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
				1162	{
				1163	return do_madvise(current->mm, start, len_in, behavior);
				1164	}
				1165
				1166	SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
				1167	size_t, vlen, int, behavior, unsigned int, flags)
				1168	{
				1169	ssize_t ret;
				1170	struct iovec iovstack[UIO_FASTIOV], iovec;
				1171	struct iovec *iov = iovstack;
				1172	struct iov_iter iter;
				1173	struct pid *pid;
				1174	struct task_struct *task;
				1175	struct mm_struct *mm;
				1176	size_t total_len;
				1177	unsigned int f_flags;
				1178
				1179	if (flags != 0) {
				1180	ret = -EINVAL;
				1181	goto out;
				1182	}
				1183
				1184	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
				1185	if (ret < 0)
				1186	goto out;
				1187
				1188	pid = pidfd_get_pid(pidfd, &f_flags);
				1189	if (IS_ERR(pid)) {
				1190	ret = PTR_ERR(pid);
				1191	goto free_iov;
				1192	}
				1193
				1194	task = get_pid_task(pid, PIDTYPE_PID);
				1195	if (!task) {
				1196	ret = -ESRCH;
				1197	goto put_pid;
				1198	}
				1199
				1200	if (!process_madvise_behavior_valid(behavior)) {
				1201	ret = -EINVAL;
				1202	goto release_task;
				1203	}
				1204
				1205	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
				1206	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
				1207	if (IS_ERR_OR_NULL(mm)) {
				1208	ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
				1209	goto release_task;
				1210	}
				1211
				1212	/*
				1213	* Require CAP_SYS_NICE for influencing process performance. Note that
				1214	* only non-destructive hints are currently supported.
				1215	*/
				1216	if (!capable(CAP_SYS_NICE)) {
				1217	ret = -EPERM;
				1218	goto release_mm;
				1219	}
				1220
				1221	total_len = iov_iter_count(&iter);
				1222
				1223	while (iov_iter_count(&iter)) {
				1224	iovec = iov_iter_iovec(&iter);
				1225	ret = do_madvise(mm, (unsigned long)iovec.iov_base,
				1226	iovec.iov_len, behavior);
				1227	if (ret < 0)
				1228	break;
				1229	iov_iter_advance(&iter, iovec.iov_len);
				1230	}
				1231
				1232	if (ret == 0)
				1233	ret = total_len - iov_iter_count(&iter);
				1234
				1235	release_mm:
				1236	mmput(mm);
				1237	release_task:
				1238	put_task_struct(task);
				1239	put_pid:
				1240	put_pid(pid);
				1241	free_iov:
				1242	kfree(iov);
				1243	out:
				1244	return ret;
				1245	}