Blame - mm/madvise.c - hafnium/third_party/linux

blob: f71fc88f0b33137617628ae8cd8ab5180fcca2b3 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/mm/madvise.c
				4	*
				5	* Copyright (C) 1999 Linus Torvalds
				6	* Copyright (C) 2002 Christoph Hellwig
				7	*/
				8
				9	#include <linux/mman.h>
				10	#include <linux/pagemap.h>
				11	#include <linux/syscalls.h>
				12	#include <linux/mempolicy.h>
				13	#include <linux/page-isolation.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	14	#include <linux/page_idle.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	15	#include <linux/userfaultfd_k.h>
				16	#include <linux/hugetlb.h>
				17	#include <linux/falloc.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	18	#include <linux/fadvise.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	19	#include <linux/sched.h>
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	20	#include <linux/sched/mm.h>
				21	#include <linux/uio.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	22	#include <linux/ksm.h>
				23	#include <linux/fs.h>
				24	#include <linux/file.h>
				25	#include <linux/blkdev.h>
				26	#include <linux/backing-dev.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	27	#include <linux/pagewalk.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	28	#include <linux/swap.h>
				29	#include <linux/swapops.h>
				30	#include <linux/shmem_fs.h>
				31	#include <linux/mmu_notifier.h>
				32
				33	#include <asm/tlb.h>
				34
				35	#include "internal.h"
				36
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	37	struct madvise_walk_private {
				38	struct mmu_gather *tlb;
				39	bool pageout;
				40	};
				41
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	42	/*
				43	* Any behaviour which results in changes to the vma->vm_flags needs to
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	44	* take mmap_lock for writing. Others, which simply traverse vmas, need
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	45	* to only take it for reading.
				46	*/
				47	static int madvise_need_mmap_write(int behavior)
				48	{
				49	switch (behavior) {
				50	case MADV_REMOVE:
				51	case MADV_WILLNEED:
				52	case MADV_DONTNEED:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	53	case MADV_COLD:
				54	case MADV_PAGEOUT:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	55	case MADV_FREE:
				56	return 0;
				57	default:
				58	/* be safe, default to 1. list exceptions explicitly */
				59	return 1;
				60	}
				61	}
				62
				63	/*
				64	* We can potentially split a vm area into separate
				65	* areas, each area with its own behavior.
				66	*/
				67	static long madvise_behavior(struct vm_area_struct *vma,
				68	struct vm_area_struct **prev,
				69	unsigned long start, unsigned long end, int behavior)
				70	{
				71	struct mm_struct *mm = vma->vm_mm;
				72	int error = 0;
				73	pgoff_t pgoff;
				74	unsigned long new_flags = vma->vm_flags;
				75
				76	switch (behavior) {
				77	case MADV_NORMAL:
				78	new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
				79	break;
				80	case MADV_SEQUENTIAL:
				81	new_flags = (new_flags & ~VM_RAND_READ) \| VM_SEQ_READ;
				82	break;
				83	case MADV_RANDOM:
				84	new_flags = (new_flags & ~VM_SEQ_READ) \| VM_RAND_READ;
				85	break;
				86	case MADV_DONTFORK:
				87	new_flags \|= VM_DONTCOPY;
				88	break;
				89	case MADV_DOFORK:
				90	if (vma->vm_flags & VM_IO) {
				91	error = -EINVAL;
				92	goto out;
				93	}
				94	new_flags &= ~VM_DONTCOPY;
				95	break;
				96	case MADV_WIPEONFORK:
				97	/* MADV_WIPEONFORK is only supported on anonymous memory. */
				98	if (vma->vm_file \|\| vma->vm_flags & VM_SHARED) {
				99	error = -EINVAL;
				100	goto out;
				101	}
				102	new_flags \|= VM_WIPEONFORK;
				103	break;
				104	case MADV_KEEPONFORK:
				105	new_flags &= ~VM_WIPEONFORK;
				106	break;
				107	case MADV_DONTDUMP:
				108	new_flags \|= VM_DONTDUMP;
				109	break;
				110	case MADV_DODUMP:
				111	if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
				112	error = -EINVAL;
				113	goto out;
				114	}
				115	new_flags &= ~VM_DONTDUMP;
				116	break;
				117	case MADV_MERGEABLE:
				118	case MADV_UNMERGEABLE:
				119	error = ksm_madvise(vma, start, end, behavior, &new_flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	120	if (error)
				121	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	122	break;
				123	case MADV_HUGEPAGE:
				124	case MADV_NOHUGEPAGE:
				125	error = hugepage_madvise(vma, &new_flags, behavior);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	126	if (error)
				127	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	128	break;
				129	}
				130
				131	if (new_flags == vma->vm_flags) {
				132	*prev = vma;
				133	goto out;
				134	}
				135
				136	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
				137	prev = vma_merge(mm, prev, start, end, new_flags, vma->anon_vma,
				138	vma->vm_file, pgoff, vma_policy(vma),
				139	vma->vm_userfaultfd_ctx);
				140	if (*prev) {
				141	vma = *prev;
				142	goto success;
				143	}
				144
				145	*prev = vma;
				146
				147	if (start != vma->vm_start) {
				148	if (unlikely(mm->map_count >= sysctl_max_map_count)) {
				149	error = -ENOMEM;
				150	goto out;
				151	}
				152	error = __split_vma(mm, vma, start, 1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	153	if (error)
				154	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	155	}
				156
				157	if (end != vma->vm_end) {
				158	if (unlikely(mm->map_count >= sysctl_max_map_count)) {
				159	error = -ENOMEM;
				160	goto out;
				161	}
				162	error = __split_vma(mm, vma, end, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	163	if (error)
				164	goto out_convert_errno;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	165	}
				166
				167	success:
				168	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	169	* vm_flags is protected by the mmap_lock held in write mode.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	170	*/
				171	vma->vm_flags = new_flags;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	172
				173	out_convert_errno:
				174	/*
				175	* madvise() returns EAGAIN if kernel resources, such as
				176	* slab, are temporarily unavailable.
				177	*/
				178	if (error == -ENOMEM)
				179	error = -EAGAIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	180	out:
				181	return error;
				182	}
				183
				184	#ifdef CONFIG_SWAP
				185	static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				186	unsigned long end, struct mm_walk *walk)
				187	{
				188	pte_t *orig_pte;
				189	struct vm_area_struct *vma = walk->private;
				190	unsigned long index;
				191
				192	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
				193	return 0;
				194
				195	for (index = start; index != end; index += PAGE_SIZE) {
				196	pte_t pte;
				197	swp_entry_t entry;
				198	struct page *page;
				199	spinlock_t *ptl;
				200
				201	orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
				202	pte = *(orig_pte + ((index - start) / PAGE_SIZE));
				203	pte_unmap_unlock(orig_pte, ptl);
				204
				205	if (pte_present(pte) \|\| pte_none(pte))
				206	continue;
				207	entry = pte_to_swp_entry(pte);
				208	if (unlikely(non_swap_entry(entry)))
				209	continue;
				210
				211	page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
				212	vma, index, false);
				213	if (page)
				214	put_page(page);
				215	}
				216
				217	return 0;
				218	}
				219
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	220	static const struct mm_walk_ops swapin_walk_ops = {
				221	.pmd_entry = swapin_walk_pmd_entry,
				222	};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	223
				224	static void force_shm_swapin_readahead(struct vm_area_struct *vma,
				225	unsigned long start, unsigned long end,
				226	struct address_space *mapping)
				227	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	228	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
				229	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	230	struct page *page;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	231
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	232	rcu_read_lock();
				233	xas_for_each(&xas, page, end_index) {
				234	swp_entry_t swap;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	235
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	236	if (!xa_is_value(page))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	237	continue;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	238	xas_pause(&xas);
				239	rcu_read_unlock();
				240
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	241	swap = radix_to_swp_entry(page);
				242	page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
				243	NULL, 0, false);
				244	if (page)
				245	put_page(page);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	246
				247	rcu_read_lock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	248	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	249	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	250
				251	lru_add_drain(); /* Push any new pages onto the LRU now */
				252	}
				253	#endif /* CONFIG_SWAP */
				254
				255	/*
				256	* Schedule all required I/O operations. Do not wait for completion.
				257	*/
				258	static long madvise_willneed(struct vm_area_struct *vma,
				259	struct vm_area_struct **prev,
				260	unsigned long start, unsigned long end)
				261	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	262	struct mm_struct *mm = vma->vm_mm;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	263	struct file *file = vma->vm_file;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	264	loff_t offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	265
				266	*prev = vma;
				267	#ifdef CONFIG_SWAP
				268	if (!file) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	269	walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
				270	lru_add_drain(); /* Push any new pages onto the LRU now */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	271	return 0;
				272	}
				273
				274	if (shmem_mapping(file->f_mapping)) {
				275	force_shm_swapin_readahead(vma, start, end,
				276	file->f_mapping);
				277	return 0;
				278	}
				279	#else
				280	if (!file)
				281	return -EBADF;
				282	#endif
				283
				284	if (IS_DAX(file_inode(file))) {
				285	/* no bad return value, but ignore advice */
				286	return 0;
				287	}
				288
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	289	/*
				290	* Filesystem's fadvise may need to take various locks. We need to
				291	* explicitly grab a reference because the vma (and hence the
				292	* vma's reference to the file) can go away as soon as we drop
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	293	* mmap_lock.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	294	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	295	prev = NULL; / tell sys_madvise we drop mmap_lock */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	296	get_file(file);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	297	offset = (loff_t)(start - vma->vm_start)
				298	+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	299	mmap_read_unlock(mm);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	300	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
				301	fput(file);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	302	mmap_read_lock(mm);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	303	return 0;
				304	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	305
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	306	static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				307	unsigned long addr, unsigned long end,
				308	struct mm_walk *walk)
				309	{
				310	struct madvise_walk_private *private = walk->private;
				311	struct mmu_gather *tlb = private->tlb;
				312	bool pageout = private->pageout;
				313	struct mm_struct *mm = tlb->mm;
				314	struct vm_area_struct *vma = walk->vma;
				315	pte_t orig_pte, pte, ptent;
				316	spinlock_t *ptl;
				317	struct page *page = NULL;
				318	LIST_HEAD(page_list);
				319
				320	if (fatal_signal_pending(current))
				321	return -EINTR;
				322
				323	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				324	if (pmd_trans_huge(*pmd)) {
				325	pmd_t orig_pmd;
				326	unsigned long next = pmd_addr_end(addr, end);
				327
				328	tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
				329	ptl = pmd_trans_huge_lock(pmd, vma);
				330	if (!ptl)
				331	return 0;
				332
				333	orig_pmd = *pmd;
				334	if (is_huge_zero_pmd(orig_pmd))
				335	goto huge_unlock;
				336
				337	if (unlikely(!pmd_present(orig_pmd))) {
				338	VM_BUG_ON(thp_migration_supported() &&
				339	!is_pmd_migration_entry(orig_pmd));
				340	goto huge_unlock;
				341	}
				342
				343	page = pmd_page(orig_pmd);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	344
				345	/* Do not interfere with other mappings of this page */
				346	if (page_mapcount(page) != 1)
				347	goto huge_unlock;
				348
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	349	if (next - addr != HPAGE_PMD_SIZE) {
				350	int err;
				351
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	352	get_page(page);
				353	spin_unlock(ptl);
				354	lock_page(page);
				355	err = split_huge_page(page);
				356	unlock_page(page);
				357	put_page(page);
				358	if (!err)
				359	goto regular_page;
				360	return 0;
				361	}
				362
				363	if (pmd_young(orig_pmd)) {
				364	pmdp_invalidate(vma, addr, pmd);
				365	orig_pmd = pmd_mkold(orig_pmd);
				366
				367	set_pmd_at(mm, addr, pmd, orig_pmd);
				368	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
				369	}
				370
				371	ClearPageReferenced(page);
				372	test_and_clear_page_young(page);
				373	if (pageout) {
				374	if (!isolate_lru_page(page)) {
				375	if (PageUnevictable(page))
				376	putback_lru_page(page);
				377	else
				378	list_add(&page->lru, &page_list);
				379	}
				380	} else
				381	deactivate_page(page);
				382	huge_unlock:
				383	spin_unlock(ptl);
				384	if (pageout)
				385	reclaim_pages(&page_list);
				386	return 0;
				387	}
				388
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	389	regular_page:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	390	if (pmd_trans_unstable(pmd))
				391	return 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	392	#endif
				393	tlb_change_page_size(tlb, PAGE_SIZE);
				394	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
				395	flush_tlb_batched_pending(mm);
				396	arch_enter_lazy_mmu_mode();
				397	for (; addr < end; pte++, addr += PAGE_SIZE) {
				398	ptent = *pte;
				399
				400	if (pte_none(ptent))
				401	continue;
				402
				403	if (!pte_present(ptent))
				404	continue;
				405
				406	page = vm_normal_page(vma, addr, ptent);
				407	if (!page)
				408	continue;
				409
				410	/*
				411	* Creating a THP page is expensive so split it only if we
				412	* are sure it's worth. Split it if we are only owner.
				413	*/
				414	if (PageTransCompound(page)) {
				415	if (page_mapcount(page) != 1)
				416	break;
				417	get_page(page);
				418	if (!trylock_page(page)) {
				419	put_page(page);
				420	break;
				421	}
				422	pte_unmap_unlock(orig_pte, ptl);
				423	if (split_huge_page(page)) {
				424	unlock_page(page);
				425	put_page(page);
				426	pte_offset_map_lock(mm, pmd, addr, &ptl);
				427	break;
				428	}
				429	unlock_page(page);
				430	put_page(page);
				431	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				432	pte--;
				433	addr -= PAGE_SIZE;
				434	continue;
				435	}
				436
Olivier Deprez	92d4c21	2022-12-06 15:05:30 +0100	[diff] [blame^]	437	/*
				438	* Do not interfere with other mappings of this page and
				439	* non-LRU page.
				440	*/
				441	if (!PageLRU(page) \|\| page_mapcount(page) != 1)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	442	continue;
				443
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	444	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				445
				446	if (pte_young(ptent)) {
				447	ptent = ptep_get_and_clear_full(mm, addr, pte,
				448	tlb->fullmm);
				449	ptent = pte_mkold(ptent);
				450	set_pte_at(mm, addr, pte, ptent);
				451	tlb_remove_tlb_entry(tlb, pte, addr);
				452	}
				453
				454	/*
				455	* We are deactivating a page for accelerating reclaiming.
				456	* VM couldn't reclaim the page unless we clear PG_young.
				457	* As a side effect, it makes confuse idle-page tracking
				458	* because they will miss recent referenced history.
				459	*/
				460	ClearPageReferenced(page);
				461	test_and_clear_page_young(page);
				462	if (pageout) {
				463	if (!isolate_lru_page(page)) {
				464	if (PageUnevictable(page))
				465	putback_lru_page(page);
				466	else
				467	list_add(&page->lru, &page_list);
				468	}
				469	} else
				470	deactivate_page(page);
				471	}
				472
				473	arch_leave_lazy_mmu_mode();
				474	pte_unmap_unlock(orig_pte, ptl);
				475	if (pageout)
				476	reclaim_pages(&page_list);
				477	cond_resched();
				478
				479	return 0;
				480	}
				481
				482	static const struct mm_walk_ops cold_walk_ops = {
				483	.pmd_entry = madvise_cold_or_pageout_pte_range,
				484	};
				485
				486	static void madvise_cold_page_range(struct mmu_gather *tlb,
				487	struct vm_area_struct *vma,
				488	unsigned long addr, unsigned long end)
				489	{
				490	struct madvise_walk_private walk_private = {
				491	.pageout = false,
				492	.tlb = tlb,
				493	};
				494
				495	tlb_start_vma(tlb, vma);
				496	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
				497	tlb_end_vma(tlb, vma);
				498	}
				499
				500	static long madvise_cold(struct vm_area_struct *vma,
				501	struct vm_area_struct **prev,
				502	unsigned long start_addr, unsigned long end_addr)
				503	{
				504	struct mm_struct *mm = vma->vm_mm;
				505	struct mmu_gather tlb;
				506
				507	*prev = vma;
				508	if (!can_madv_lru_vma(vma))
				509	return -EINVAL;
				510
				511	lru_add_drain();
				512	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
				513	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
				514	tlb_finish_mmu(&tlb, start_addr, end_addr);
				515
				516	return 0;
				517	}
				518
				519	static void madvise_pageout_page_range(struct mmu_gather *tlb,
				520	struct vm_area_struct *vma,
				521	unsigned long addr, unsigned long end)
				522	{
				523	struct madvise_walk_private walk_private = {
				524	.pageout = true,
				525	.tlb = tlb,
				526	};
				527
				528	tlb_start_vma(tlb, vma);
				529	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
				530	tlb_end_vma(tlb, vma);
				531	}
				532
				533	static inline bool can_do_pageout(struct vm_area_struct *vma)
				534	{
				535	if (vma_is_anonymous(vma))
				536	return true;
				537	if (!vma->vm_file)
				538	return false;
				539	/*
				540	* paging out pagecache only for non-anonymous mappings that correspond
				541	* to the files the calling process could (if tried) open for writing;
				542	* otherwise we'd be including shared non-exclusive mappings, which
				543	* opens a side channel.
				544	*/
				545	return inode_owner_or_capable(file_inode(vma->vm_file)) \|\|
				546	inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
				547	}
				548
				549	static long madvise_pageout(struct vm_area_struct *vma,
				550	struct vm_area_struct **prev,
				551	unsigned long start_addr, unsigned long end_addr)
				552	{
				553	struct mm_struct *mm = vma->vm_mm;
				554	struct mmu_gather tlb;
				555
				556	*prev = vma;
				557	if (!can_madv_lru_vma(vma))
				558	return -EINVAL;
				559
				560	if (!can_do_pageout(vma))
				561	return 0;
				562
				563	lru_add_drain();
				564	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
				565	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
				566	tlb_finish_mmu(&tlb, start_addr, end_addr);
				567
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	568	return 0;
				569	}
				570
				571	static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				572	unsigned long end, struct mm_walk *walk)
				573
				574	{
				575	struct mmu_gather *tlb = walk->private;
				576	struct mm_struct *mm = tlb->mm;
				577	struct vm_area_struct *vma = walk->vma;
				578	spinlock_t *ptl;
				579	pte_t orig_pte, pte, ptent;
				580	struct page *page;
				581	int nr_swap = 0;
				582	unsigned long next;
				583
				584	next = pmd_addr_end(addr, end);
				585	if (pmd_trans_huge(*pmd))
				586	if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
				587	goto next;
				588
				589	if (pmd_trans_unstable(pmd))
				590	return 0;
				591
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	592	tlb_change_page_size(tlb, PAGE_SIZE);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	593	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				594	flush_tlb_batched_pending(mm);
				595	arch_enter_lazy_mmu_mode();
				596	for (; addr != end; pte++, addr += PAGE_SIZE) {
				597	ptent = *pte;
				598
				599	if (pte_none(ptent))
				600	continue;
				601	/*
				602	* If the pte has swp_entry, just clear page table to
				603	* prevent swap-in which is more expensive rather than
				604	* (page allocation + zeroing).
				605	*/
				606	if (!pte_present(ptent)) {
				607	swp_entry_t entry;
				608
				609	entry = pte_to_swp_entry(ptent);
				610	if (non_swap_entry(entry))
				611	continue;
				612	nr_swap--;
				613	free_swap_and_cache(entry);
				614	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
				615	continue;
				616	}
				617
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	618	page = vm_normal_page(vma, addr, ptent);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	619	if (!page)
				620	continue;
				621
				622	/*
				623	* If pmd isn't transhuge but the page is THP and
				624	* is owned by only this process, split it and
				625	* deactivate all pages.
				626	*/
				627	if (PageTransCompound(page)) {
				628	if (page_mapcount(page) != 1)
				629	goto out;
				630	get_page(page);
				631	if (!trylock_page(page)) {
				632	put_page(page);
				633	goto out;
				634	}
				635	pte_unmap_unlock(orig_pte, ptl);
				636	if (split_huge_page(page)) {
				637	unlock_page(page);
				638	put_page(page);
				639	pte_offset_map_lock(mm, pmd, addr, &ptl);
				640	goto out;
				641	}
				642	unlock_page(page);
				643	put_page(page);
				644	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
				645	pte--;
				646	addr -= PAGE_SIZE;
				647	continue;
				648	}
				649
				650	VM_BUG_ON_PAGE(PageTransCompound(page), page);
				651
				652	if (PageSwapCache(page) \|\| PageDirty(page)) {
				653	if (!trylock_page(page))
				654	continue;
				655	/*
				656	* If page is shared with others, we couldn't clear
				657	* PG_dirty of the page.
				658	*/
				659	if (page_mapcount(page) != 1) {
				660	unlock_page(page);
				661	continue;
				662	}
				663
				664	if (PageSwapCache(page) && !try_to_free_swap(page)) {
				665	unlock_page(page);
				666	continue;
				667	}
				668
				669	ClearPageDirty(page);
				670	unlock_page(page);
				671	}
				672
				673	if (pte_young(ptent) \|\| pte_dirty(ptent)) {
				674	/*
				675	* Some of architecture(ex, PPC) don't update TLB
				676	* with set_pte_at and tlb_remove_tlb_entry so for
				677	* the portability, remap the pte with old\|clean
				678	* after pte clearing.
				679	*/
				680	ptent = ptep_get_and_clear_full(mm, addr, pte,
				681	tlb->fullmm);
				682
				683	ptent = pte_mkold(ptent);
				684	ptent = pte_mkclean(ptent);
				685	set_pte_at(mm, addr, pte, ptent);
				686	tlb_remove_tlb_entry(tlb, pte, addr);
				687	}
				688	mark_page_lazyfree(page);
				689	}
				690	out:
				691	if (nr_swap) {
				692	if (current->mm == mm)
				693	sync_mm_rss(mm);
				694
				695	add_mm_counter(mm, MM_SWAPENTS, nr_swap);
				696	}
				697	arch_leave_lazy_mmu_mode();
				698	pte_unmap_unlock(orig_pte, ptl);
				699	cond_resched();
				700	next:
				701	return 0;
				702	}
				703
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	704	static const struct mm_walk_ops madvise_free_walk_ops = {
				705	.pmd_entry = madvise_free_pte_range,
				706	};
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	707
				708	static int madvise_free_single_vma(struct vm_area_struct *vma,
				709	unsigned long start_addr, unsigned long end_addr)
				710	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	711	struct mm_struct *mm = vma->vm_mm;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	712	struct mmu_notifier_range range;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	713	struct mmu_gather tlb;
				714
				715	/* MADV_FREE works for only anon vma at the moment */
				716	if (!vma_is_anonymous(vma))
				717	return -EINVAL;
				718
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	719	range.start = max(vma->vm_start, start_addr);
				720	if (range.start >= vma->vm_end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	721	return -EINVAL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	722	range.end = min(vma->vm_end, end_addr);
				723	if (range.end <= vma->vm_start)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	724	return -EINVAL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	725	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				726	range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	727
				728	lru_add_drain();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	729	tlb_gather_mmu(&tlb, mm, range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	730	update_hiwater_rss(mm);
				731
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	732	mmu_notifier_invalidate_range_start(&range);
				733	tlb_start_vma(&tlb, vma);
				734	walk_page_range(vma->vm_mm, range.start, range.end,
				735	&madvise_free_walk_ops, &tlb);
				736	tlb_end_vma(&tlb, vma);
				737	mmu_notifier_invalidate_range_end(&range);
				738	tlb_finish_mmu(&tlb, range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	739
				740	return 0;
				741	}
				742
				743	/*
				744	* Application no longer needs these pages. If the pages are dirty,
				745	* it's OK to just throw them away. The app will be more careful about
				746	* data it wants to keep. Be sure to free swap resources too. The
				747	* zap_page_range call sets things up for shrink_active_list to actually free
				748	* these pages later if no one else has touched them in the meantime,
				749	* although we could add these pages to a global reuse list for
				750	* shrink_active_list to pick up before reclaiming other pages.
				751	*
				752	* NB: This interface discards data rather than pushes it out to swap,
				753	* as some implementations do. This has performance implications for
				754	* applications like large transactional databases which want to discard
				755	* pages in anonymous maps after committing to backing store the data
				756	* that was kept in them. There is no reason to write this data out to
				757	* the swap area if the application is discarding it.
				758	*
				759	* An interface that causes the system to free clean pages and flush
				760	* dirty pages is already available as msync(MS_INVALIDATE).
				761	*/
				762	static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
				763	unsigned long start, unsigned long end)
				764	{
				765	zap_page_range(vma, start, end - start);
				766	return 0;
				767	}
				768
				769	static long madvise_dontneed_free(struct vm_area_struct *vma,
				770	struct vm_area_struct **prev,
				771	unsigned long start, unsigned long end,
				772	int behavior)
				773	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	774	struct mm_struct *mm = vma->vm_mm;
				775
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	776	*prev = vma;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	777	if (!can_madv_lru_vma(vma))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	778	return -EINVAL;
				779
				780	if (!userfaultfd_remove(vma, start, end)) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	781	prev = NULL; / mmap_lock has been dropped, prev is stale */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	782
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	783	mmap_read_lock(mm);
				784	vma = find_vma(mm, start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	785	if (!vma)
				786	return -ENOMEM;
				787	if (start < vma->vm_start) {
				788	/*
				789	* This "vma" under revalidation is the one
				790	* with the lowest vma->vm_start where start
				791	* is also < vma->vm_end. If start <
				792	* vma->vm_start it means an hole materialized
				793	* in the user address space within the
				794	* virtual range passed to MADV_DONTNEED
				795	* or MADV_FREE.
				796	*/
				797	return -ENOMEM;
				798	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	799	if (!can_madv_lru_vma(vma))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	800	return -EINVAL;
				801	if (end > vma->vm_end) {
				802	/*
				803	* Don't fail if end > vma->vm_end. If the old
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	804	* vma was splitted while the mmap_lock was
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	805	* released the effect of the concurrent
				806	* operation may not cause madvise() to
				807	* have an undefined result. There may be an
				808	* adjacent next vma that we'll walk
				809	* next. userfaultfd_remove() will generate an
				810	* UFFD_EVENT_REMOVE repetition on the
				811	* end-vma->vm_end range, but the manager can
				812	* handle a repetition fine.
				813	*/
				814	end = vma->vm_end;
				815	}
				816	VM_WARN_ON(start >= end);
				817	}
				818
				819	if (behavior == MADV_DONTNEED)
				820	return madvise_dontneed_single_vma(vma, start, end);
				821	else if (behavior == MADV_FREE)
				822	return madvise_free_single_vma(vma, start, end);
				823	else
				824	return -EINVAL;
				825	}
				826
				827	/*
				828	* Application wants to free up the pages and associated backing store.
				829	* This is effectively punching a hole into the middle of a file.
				830	*/
				831	static long madvise_remove(struct vm_area_struct *vma,
				832	struct vm_area_struct **prev,
				833	unsigned long start, unsigned long end)
				834	{
				835	loff_t offset;
				836	int error;
				837	struct file *f;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	838	struct mm_struct *mm = vma->vm_mm;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	839
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	840	prev = NULL; / tell sys_madvise we drop mmap_lock */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	841
				842	if (vma->vm_flags & VM_LOCKED)
				843	return -EINVAL;
				844
				845	f = vma->vm_file;
				846
				847	if (!f \|\| !f->f_mapping \|\| !f->f_mapping->host) {
				848	return -EINVAL;
				849	}
				850
				851	if ((vma->vm_flags & (VM_SHARED\|VM_WRITE)) != (VM_SHARED\|VM_WRITE))
				852	return -EACCES;
				853
				854	offset = (loff_t)(start - vma->vm_start)
				855	+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
				856
				857	/*
				858	* Filesystem's fallocate may need to take i_mutex. We need to
				859	* explicitly grab a reference because the vma (and hence the
				860	* vma's reference to the file) can go away as soon as we drop
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	861	* mmap_lock.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	862	*/
				863	get_file(f);
				864	if (userfaultfd_remove(vma, start, end)) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	865	/* mmap_lock was not released by userfaultfd_remove() */
				866	mmap_read_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	867	}
				868	error = vfs_fallocate(f,
				869	FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_KEEP_SIZE,
				870	offset, end - start);
				871	fput(f);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	872	mmap_read_lock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	873	return error;
				874	}
				875
				876	#ifdef CONFIG_MEMORY_FAILURE
				877	/*
				878	* Error injection support for memory error handling.
				879	*/
				880	static int madvise_inject_error(int behavior,
				881	unsigned long start, unsigned long end)
				882	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	883	struct zone *zone;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	884	unsigned long size;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	885
				886	if (!capable(CAP_SYS_ADMIN))
				887	return -EPERM;
				888
				889
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	890	for (; start < end; start += size) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	891	unsigned long pfn;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	892	struct page *page;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	893	int ret;
				894
				895	ret = get_user_pages_fast(start, 1, 0, &page);
				896	if (ret != 1)
				897	return ret;
				898	pfn = page_to_pfn(page);
				899
				900	/*
				901	* When soft offlining hugepages, after migrating the page
				902	* we dissolve it, therefore in the second loop "page" will
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	903	* no longer be a compound page.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	904	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	905	size = page_size(compound_head(page));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	906
				907	if (behavior == MADV_SOFT_OFFLINE) {
				908	pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	909	pfn, start);
				910	ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
				911	} else {
				912	pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				913	pfn, start);
				914	ret = memory_failure(pfn, MF_COUNT_INCREASED);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	915	}
				916
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	917	if (ret)
				918	return ret;
				919	}
				920
				921	/* Ensure that all poisoned pages are removed from per-cpu lists */
				922	for_each_populated_zone(zone)
				923	drain_all_pages(zone);
				924
				925	return 0;
				926	}
				927	#endif
				928
				929	static long
				930	madvise_vma(struct vm_area_struct vma, struct vm_area_struct *prev,
				931	unsigned long start, unsigned long end, int behavior)
				932	{
				933	switch (behavior) {
				934	case MADV_REMOVE:
				935	return madvise_remove(vma, prev, start, end);
				936	case MADV_WILLNEED:
				937	return madvise_willneed(vma, prev, start, end);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	938	case MADV_COLD:
				939	return madvise_cold(vma, prev, start, end);
				940	case MADV_PAGEOUT:
				941	return madvise_pageout(vma, prev, start, end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	942	case MADV_FREE:
				943	case MADV_DONTNEED:
				944	return madvise_dontneed_free(vma, prev, start, end, behavior);
				945	default:
				946	return madvise_behavior(vma, prev, start, end, behavior);
				947	}
				948	}
				949
				950	static bool
				951	madvise_behavior_valid(int behavior)
				952	{
				953	switch (behavior) {
				954	case MADV_DOFORK:
				955	case MADV_DONTFORK:
				956	case MADV_NORMAL:
				957	case MADV_SEQUENTIAL:
				958	case MADV_RANDOM:
				959	case MADV_REMOVE:
				960	case MADV_WILLNEED:
				961	case MADV_DONTNEED:
				962	case MADV_FREE:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	963	case MADV_COLD:
				964	case MADV_PAGEOUT:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	965	#ifdef CONFIG_KSM
				966	case MADV_MERGEABLE:
				967	case MADV_UNMERGEABLE:
				968	#endif
				969	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				970	case MADV_HUGEPAGE:
				971	case MADV_NOHUGEPAGE:
				972	#endif
				973	case MADV_DONTDUMP:
				974	case MADV_DODUMP:
				975	case MADV_WIPEONFORK:
				976	case MADV_KEEPONFORK:
				977	#ifdef CONFIG_MEMORY_FAILURE
				978	case MADV_SOFT_OFFLINE:
				979	case MADV_HWPOISON:
				980	#endif
				981	return true;
				982
				983	default:
				984	return false;
				985	}
				986	}
				987
/*
 * Report whether @behavior may be requested through process_madvise():
 * only MADV_COLD and MADV_PAGEOUT are accepted.
 */
static bool
process_madvise_behavior_valid(int behavior)
{
	return behavior == MADV_COLD || behavior == MADV_PAGEOUT;
}
				999
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1000	/*
				1001	* The madvise(2) system call.
				1002	*
				1003	* Applications can use madvise() to advise the kernel how it should
				1004	* handle paging I/O in this VM area. The idea is to help the kernel
				1005	* use appropriate read-ahead and caching techniques. The information
				1006	* provided is advisory only, and can be safely disregarded by the
				1007	* kernel without affecting the correct operation of the application.
				1008	*
				1009	* behavior values:
				1010	* MADV_NORMAL - the default behavior is to read clusters. This
				1011	* results in some read-ahead and read-behind.
				1012	* MADV_RANDOM - the system should read the minimum amount of data
				1013	* on any access, since it is unlikely that the appli-
				1014	* cation will need more than what it asks for.
				1015	* MADV_SEQUENTIAL - pages in the given range will probably be accessed
				1016	* once, so they can be aggressively read ahead, and
				1017	* can be freed soon after they are accessed.
				1018	* MADV_WILLNEED - the application is notifying the system to read
				1019	* some pages ahead.
				1020	* MADV_DONTNEED - the application is finished with the given range,
				1021	* so the kernel can free resources associated with it.
				1022	* MADV_FREE - the application marks pages in the given range as lazy free,
				1023	* where actual purges are postponed until memory pressure happens.
				1024	* MADV_REMOVE - the application wants to free up the given range of
				1025	* pages and associated backing store.
				1026	* MADV_DONTFORK - omit this area from child's address space when forking:
				1027	* typically, to avoid COWing pages pinned by get_user_pages().
				1028	* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
				1029	* MADV_WIPEONFORK - present the child process with zero-filled memory in this
				1030	* range after a fork.
				1031	* MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
				1032	* MADV_HWPOISON - trigger memory error handler as if the given memory range
				1033	* were corrupted by unrecoverable hardware memory failure.
				1034	* MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
				1035	* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
				1036	* this area with pages of identical content from other such areas.
				1037	* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
				1038	* MADV_HUGEPAGE - the application wants to back the given range by transparent
				1039	* huge pages in the future. Existing pages might be coalesced and
				1040	* new pages might be allocated as THP.
				1041	* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
				1042	* transparent huge pages so the existing pages will not be
				1043	* coalesced into THP and new pages will not be allocated as THP.
				1044	* MADV_DONTDUMP - the application wants to prevent pages in the given range
				1045	* from being included in its core dump.
				1046	* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1047	* MADV_COLD - the application is not expected to use this memory soon,
				1048	* deactivate pages in this range so that they can be reclaimed
				1049	* easily if memory pressure happens.
				1050	* MADV_PAGEOUT - the application is not expected to use this memory soon,
				1051	* page out the pages in this range immediately.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1052	*
				1053	* return values:
				1054	* zero - success
				1055	* -EINVAL - start + len < 0, start is not page-aligned,
				1056	* "behavior" is not a valid value, or application
				1057	* is attempting to release locked or shared pages,
				1058	* or the specified address range includes file, Huge TLB,
				1059	* MAP_SHARED or VM_PFNMAP range.
				1060	* -ENOMEM - addresses in the specified range are not currently
				1061	* mapped, or are outside the AS of the process.
				1062	* -EIO - an I/O error occurred while paging in data.
				1063	* -EBADF - map exists, but area maps something that isn't a file.
				1064	* -EAGAIN - a kernel resource was temporarily unavailable.
				1065	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1066	int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1067	{
				1068	unsigned long end, tmp;
				1069	struct vm_area_struct vma, prev;
				1070	int unmapped_error = 0;
				1071	int error = -EINVAL;
				1072	int write;
				1073	size_t len;
				1074	struct blk_plug plug;
				1075
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1076	start = untagged_addr(start);
				1077
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1078	if (!madvise_behavior_valid(behavior))
				1079	return error;
				1080
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1081	if (!PAGE_ALIGNED(start))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1082	return error;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1083	len = PAGE_ALIGN(len_in);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1084
				1085	/* Check to see whether len was rounded up from small -ve to zero */
				1086	if (len_in && !len)
				1087	return error;
				1088
				1089	end = start + len;
				1090	if (end < start)
				1091	return error;
				1092
				1093	error = 0;
				1094	if (end == start)
				1095	return error;
				1096
				1097	#ifdef CONFIG_MEMORY_FAILURE
				1098	if (behavior == MADV_HWPOISON \|\| behavior == MADV_SOFT_OFFLINE)
				1099	return madvise_inject_error(behavior, start, start + len_in);
				1100	#endif
				1101
				1102	write = madvise_need_mmap_write(behavior);
				1103	if (write) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1104	if (mmap_write_lock_killable(mm))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1105	return -EINTR;
				1106	} else {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1107	mmap_read_lock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1108	}
				1109
				1110	/*
				1111	* If the interval [start,end) covers some unmapped address
				1112	* ranges, just ignore them, but return -ENOMEM at the end.
				1113	* - different from the way of handling in mlock etc.
				1114	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1115	vma = find_vma_prev(mm, start, &prev);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1116	if (vma && start > vma->vm_start)
				1117	prev = vma;
				1118
				1119	blk_start_plug(&plug);
				1120	for (;;) {
				1121	/* Still start < end. */
				1122	error = -ENOMEM;
				1123	if (!vma)
				1124	goto out;
				1125
				1126	/* Here start < (end\|vma->vm_end). */
				1127	if (start < vma->vm_start) {
				1128	unmapped_error = -ENOMEM;
				1129	start = vma->vm_start;
				1130	if (start >= end)
				1131	goto out;
				1132	}
				1133
				1134	/* Here vma->vm_start <= start < (end\|vma->vm_end) */
				1135	tmp = vma->vm_end;
				1136	if (end < tmp)
				1137	tmp = end;
				1138
				1139	/* Here vma->vm_start <= start < tmp <= (end\|vma->vm_end). */
				1140	error = madvise_vma(vma, &prev, start, tmp, behavior);
				1141	if (error)
				1142	goto out;
				1143	start = tmp;
				1144	if (prev && start < prev->vm_end)
				1145	start = prev->vm_end;
				1146	error = unmapped_error;
				1147	if (start >= end)
				1148	goto out;
				1149	if (prev)
				1150	vma = prev->vm_next;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1151	else /* madvise_remove dropped mmap_lock */
				1152	vma = find_vma(mm, start);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1153	}
				1154	out:
				1155	blk_finish_plug(&plug);
				1156	if (write)
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1157	mmap_write_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1158	else
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1159	mmap_read_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1160
				1161	return error;
				1162	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1163
				1164	SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
				1165	{
				1166	return do_madvise(current->mm, start, len_in, behavior);
				1167	}
				1168
				1169	SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
				1170	size_t, vlen, int, behavior, unsigned int, flags)
				1171	{
				1172	ssize_t ret;
				1173	struct iovec iovstack[UIO_FASTIOV], iovec;
				1174	struct iovec *iov = iovstack;
				1175	struct iov_iter iter;
				1176	struct pid *pid;
				1177	struct task_struct *task;
				1178	struct mm_struct *mm;
				1179	size_t total_len;
				1180	unsigned int f_flags;
				1181
				1182	if (flags != 0) {
				1183	ret = -EINVAL;
				1184	goto out;
				1185	}
				1186
				1187	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
				1188	if (ret < 0)
				1189	goto out;
				1190
				1191	pid = pidfd_get_pid(pidfd, &f_flags);
				1192	if (IS_ERR(pid)) {
				1193	ret = PTR_ERR(pid);
				1194	goto free_iov;
				1195	}
				1196
				1197	task = get_pid_task(pid, PIDTYPE_PID);
				1198	if (!task) {
				1199	ret = -ESRCH;
				1200	goto put_pid;
				1201	}
				1202
				1203	if (!process_madvise_behavior_valid(behavior)) {
				1204	ret = -EINVAL;
				1205	goto release_task;
				1206	}
				1207
				1208	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
				1209	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
				1210	if (IS_ERR_OR_NULL(mm)) {
				1211	ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
				1212	goto release_task;
				1213	}
				1214
				1215	/*
				1216	* Require CAP_SYS_NICE for influencing process performance. Note that
				1217	* only non-destructive hints are currently supported.
				1218	*/
				1219	if (!capable(CAP_SYS_NICE)) {
				1220	ret = -EPERM;
				1221	goto release_mm;
				1222	}
				1223
				1224	total_len = iov_iter_count(&iter);
				1225
				1226	while (iov_iter_count(&iter)) {
				1227	iovec = iov_iter_iovec(&iter);
				1228	ret = do_madvise(mm, (unsigned long)iovec.iov_base,
				1229	iovec.iov_len, behavior);
				1230	if (ret < 0)
				1231	break;
				1232	iov_iter_advance(&iter, iovec.iov_len);
				1233	}
				1234
Olivier Deprez	92d4c21	2022-12-06 15:05:30 +0100	[diff] [blame^]	1235	ret = (total_len - iov_iter_count(&iter)) ? : ret;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame]	1236
				1237	release_mm:
				1238	mmput(mm);
				1239	release_task:
				1240	put_task_struct(task);
				1241	put_pid:
				1242	put_pid(pid);
				1243	free_iov:
				1244	kfree(iov);
				1245	out:
				1246	return ret;
				1247	}