Blame - fs/xfs/xfs_buf.c - hafnium/third_party/linux.git

blob: 0abba171aa89b6c948fc795201d2eee9734d6e06 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (c) 2000-2006 Silicon Graphics, Inc.
				4	* All Rights Reserved.
				5	*/
				6	#include "xfs.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7	#include <linux/backing-dev.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	9	#include "xfs_shared.h"
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10	#include "xfs_format.h"
				11	#include "xfs_log_format.h"
				12	#include "xfs_trans_resv.h"
				13	#include "xfs_sb.h"
				14	#include "xfs_mount.h"
				15	#include "xfs_trace.h"
				16	#include "xfs_log.h"
				17	#include "xfs_errortag.h"
				18	#include "xfs_error.h"
				19
				20	static kmem_zone_t *xfs_buf_zone;
				21
				22	#define xb_to_gfp(flags) \
				23	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) \| __GFP_NOWARN)
				24
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	25	/*
				26	* Locking orders
				27	*
				28	* xfs_buf_ioacct_inc:
				29	* xfs_buf_ioacct_dec:
				30	* b_sema (caller holds)
				31	* b_lock
				32	*
				33	* xfs_buf_stale:
				34	* b_sema (caller holds)
				35	* b_lock
				36	* lru_lock
				37	*
				38	* xfs_buf_rele:
				39	* b_lock
				40	* pag_buf_lock
				41	* lru_lock
				42	*
				43	* xfs_buftarg_wait_rele
				44	* lru_lock
				45	* b_lock (trylock due to inversion)
				46	*
				47	* xfs_buftarg_isolate
				48	* lru_lock
				49	* b_lock (trylock due to inversion)
				50	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	51
				52	static inline int
				53	xfs_buf_is_vmapped(
				54	struct xfs_buf *bp)
				55	{
				56	/*
				57	* Return true if the buffer is vmapped.
				58	*
				59	* b_addr is null if the buffer is not mapped, but the code is clever
				60	* enough to know it doesn't have to map a single page, so the check has
				61	* to be both for b_addr and bp->b_page_count > 1.
				62	*/
				63	return bp->b_addr && bp->b_page_count > 1;
				64	}
				65
				66	static inline int
				67	xfs_buf_vmap_len(
				68	struct xfs_buf *bp)
				69	{
				70	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
				71	}
				72
				73	/*
				74	* Bump the I/O in flight count on the buftarg if we haven't yet done so for
				75	* this buffer. The count is incremented once per buffer (per hold cycle)
				76	* because the corresponding decrement is deferred to buffer release. Buffers
				77	* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
				78	* tracking adds unnecessary overhead. This is used for sychronization purposes
				79	* with unmount (see xfs_wait_buftarg()), so all we really need is a count of
				80	* in-flight buffers.
				81	*
				82	* Buffers that are never released (e.g., superblock, iclog buffers) must set
				83	* the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
				84	* never reaches zero and unmount hangs indefinitely.
				85	*/
				86	static inline void
				87	xfs_buf_ioacct_inc(
				88	struct xfs_buf *bp)
				89	{
				90	if (bp->b_flags & XBF_NO_IOACCT)
				91	return;
				92
				93	ASSERT(bp->b_flags & XBF_ASYNC);
				94	spin_lock(&bp->b_lock);
				95	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
				96	bp->b_state \|= XFS_BSTATE_IN_FLIGHT;
				97	percpu_counter_inc(&bp->b_target->bt_io_count);
				98	}
				99	spin_unlock(&bp->b_lock);
				100	}
				101
				102	/*
				103	* Clear the in-flight state on a buffer about to be released to the LRU or
				104	* freed and unaccount from the buftarg.
				105	*/
				106	static inline void
				107	__xfs_buf_ioacct_dec(
				108	struct xfs_buf *bp)
				109	{
				110	lockdep_assert_held(&bp->b_lock);
				111
				112	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
				113	bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
				114	percpu_counter_dec(&bp->b_target->bt_io_count);
				115	}
				116	}
				117
				118	static inline void
				119	xfs_buf_ioacct_dec(
				120	struct xfs_buf *bp)
				121	{
				122	spin_lock(&bp->b_lock);
				123	__xfs_buf_ioacct_dec(bp);
				124	spin_unlock(&bp->b_lock);
				125	}
				126
				127	/*
				128	* When we mark a buffer stale, we remove the buffer from the LRU and clear the
				129	* b_lru_ref count so that the buffer is freed immediately when the buffer
				130	* reference count falls to zero. If the buffer is already on the LRU, we need
				131	* to remove the reference that LRU holds on the buffer.
				132	*
				133	* This prevents build-up of stale buffers on the LRU.
				134	*/
				135	void
				136	xfs_buf_stale(
				137	struct xfs_buf *bp)
				138	{
				139	ASSERT(xfs_buf_islocked(bp));
				140
				141	bp->b_flags \|= XBF_STALE;
				142
				143	/*
				144	* Clear the delwri status so that a delwri queue walker will not
				145	* flush this buffer to disk now that it is stale. The delwri queue has
				146	* a reference to the buffer, so this is safe to do.
				147	*/
				148	bp->b_flags &= ~_XBF_DELWRI_Q;
				149
				150	/*
				151	* Once the buffer is marked stale and unlocked, a subsequent lookup
				152	* could reset b_flags. There is no guarantee that the buffer is
				153	* unaccounted (released to LRU) before that occurs. Drop in-flight
				154	* status now to preserve accounting consistency.
				155	*/
				156	spin_lock(&bp->b_lock);
				157	__xfs_buf_ioacct_dec(bp);
				158
				159	atomic_set(&bp->b_lru_ref, 0);
				160	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
				161	(list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
				162	atomic_dec(&bp->b_hold);
				163
				164	ASSERT(atomic_read(&bp->b_hold) >= 1);
				165	spin_unlock(&bp->b_lock);
				166	}
				167
				168	static int
				169	xfs_buf_get_maps(
				170	struct xfs_buf *bp,
				171	int map_count)
				172	{
				173	ASSERT(bp->b_maps == NULL);
				174	bp->b_map_count = map_count;
				175
				176	if (map_count == 1) {
				177	bp->b_maps = &bp->__b_map;
				178	return 0;
				179	}
				180
				181	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				182	KM_NOFS);
				183	if (!bp->b_maps)
				184	return -ENOMEM;
				185	return 0;
				186	}
				187
				188	/*
				189	* Frees b_pages if it was allocated.
				190	*/
				191	static void
				192	xfs_buf_free_maps(
				193	struct xfs_buf *bp)
				194	{
				195	if (bp->b_maps != &bp->__b_map) {
				196	kmem_free(bp->b_maps);
				197	bp->b_maps = NULL;
				198	}
				199	}
				200
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	201	static struct xfs_buf *
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	202	_xfs_buf_alloc(
				203	struct xfs_buftarg *target,
				204	struct xfs_buf_map *map,
				205	int nmaps,
				206	xfs_buf_flags_t flags)
				207	{
				208	struct xfs_buf *bp;
				209	int error;
				210	int i;
				211
				212	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
				213	if (unlikely(!bp))
				214	return NULL;
				215
				216	/*
				217	* We don't want certain flags to appear in b_flags unless they are
				218	* specifically set by later operations on the buffer.
				219	*/
				220	flags &= ~(XBF_UNMAPPED \| XBF_TRYLOCK \| XBF_ASYNC \| XBF_READ_AHEAD);
				221
				222	atomic_set(&bp->b_hold, 1);
				223	atomic_set(&bp->b_lru_ref, 1);
				224	init_completion(&bp->b_iowait);
				225	INIT_LIST_HEAD(&bp->b_lru);
				226	INIT_LIST_HEAD(&bp->b_list);
				227	INIT_LIST_HEAD(&bp->b_li_list);
				228	sema_init(&bp->b_sema, 0); /* held, no waiters */
				229	spin_lock_init(&bp->b_lock);
				230	bp->b_target = target;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	231	bp->b_mount = target->bt_mount;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	232	bp->b_flags = flags;
				233
				234	/*
				235	* Set length and io_length to the same value initially.
				236	* I/O routines should use io_length, which will be the same in
				237	* most cases but may be reset (e.g. XFS recovery).
				238	*/
				239	error = xfs_buf_get_maps(bp, nmaps);
				240	if (error) {
				241	kmem_zone_free(xfs_buf_zone, bp);
				242	return NULL;
				243	}
				244
				245	bp->b_bn = map[0].bm_bn;
				246	bp->b_length = 0;
				247	for (i = 0; i < nmaps; i++) {
				248	bp->b_maps[i].bm_bn = map[i].bm_bn;
				249	bp->b_maps[i].bm_len = map[i].bm_len;
				250	bp->b_length += map[i].bm_len;
				251	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	252
				253	atomic_set(&bp->b_pin_count, 0);
				254	init_waitqueue_head(&bp->b_waiters);
				255
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	256	XFS_STATS_INC(bp->b_mount, xb_create);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	257	trace_xfs_buf_init(bp, _RET_IP_);
				258
				259	return bp;
				260	}
				261
				262	/*
				263	* Allocate a page array capable of holding a specified number
				264	* of pages, and point the page buf at it.
				265	*/
				266	STATIC int
				267	_xfs_buf_get_pages(
				268	xfs_buf_t *bp,
				269	int page_count)
				270	{
				271	/* Make sure that we have a page list */
				272	if (bp->b_pages == NULL) {
				273	bp->b_page_count = page_count;
				274	if (page_count <= XB_PAGES) {
				275	bp->b_pages = bp->b_page_array;
				276	} else {
				277	bp->b_pages = kmem_alloc(sizeof(struct page )
				278	page_count, KM_NOFS);
				279	if (bp->b_pages == NULL)
				280	return -ENOMEM;
				281	}
				282	memset(bp->b_pages, 0, sizeof(struct page ) page_count);
				283	}
				284	return 0;
				285	}
				286
				287	/*
				288	* Frees b_pages if it was allocated.
				289	*/
				290	STATIC void
				291	_xfs_buf_free_pages(
				292	xfs_buf_t *bp)
				293	{
				294	if (bp->b_pages != bp->b_page_array) {
				295	kmem_free(bp->b_pages);
				296	bp->b_pages = NULL;
				297	}
				298	}
				299
				300	/*
				301	* Releases the specified buffer.
				302	*
				303	* The modification state of any associated pages is left unchanged.
				304	* The buffer must not be on any hash - use xfs_buf_rele instead for
				305	* hashed and refcounted buffers
				306	*/
				307	void
				308	xfs_buf_free(
				309	xfs_buf_t *bp)
				310	{
				311	trace_xfs_buf_free(bp, _RET_IP_);
				312
				313	ASSERT(list_empty(&bp->b_lru));
				314
				315	if (bp->b_flags & _XBF_PAGES) {
				316	uint i;
				317
				318	if (xfs_buf_is_vmapped(bp))
				319	vm_unmap_ram(bp->b_addr - bp->b_offset,
				320	bp->b_page_count);
				321
				322	for (i = 0; i < bp->b_page_count; i++) {
				323	struct page *page = bp->b_pages[i];
				324
				325	__free_page(page);
				326	}
				327	} else if (bp->b_flags & _XBF_KMEM)
				328	kmem_free(bp->b_addr);
				329	_xfs_buf_free_pages(bp);
				330	xfs_buf_free_maps(bp);
				331	kmem_zone_free(xfs_buf_zone, bp);
				332	}
				333
				334	/*
				335	* Allocates all the pages for buffer in question and builds it's page list.
				336	*/
				337	STATIC int
				338	xfs_buf_allocate_memory(
				339	xfs_buf_t *bp,
				340	uint flags)
				341	{
				342	size_t size;
				343	size_t nbytes, offset;
				344	gfp_t gfp_mask = xb_to_gfp(flags);
				345	unsigned short page_count, i;
				346	xfs_off_t start, end;
				347	int error;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	348	xfs_km_flags_t kmflag_mask = 0;
				349
				350	/*
				351	* assure zeroed buffer for non-read cases.
				352	*/
				353	if (!(flags & XBF_READ)) {
				354	kmflag_mask \|= KM_ZERO;
				355	gfp_mask \|= __GFP_ZERO;
				356	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	357
				358	/*
				359	* for buffers that are contained within a single page, just allocate
				360	* the memory from the heap - there's no need for the complexity of
				361	* page arrays to keep allocation down to order 0.
				362	*/
				363	size = BBTOB(bp->b_length);
				364	if (size < PAGE_SIZE) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	365	int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
				366	bp->b_addr = kmem_alloc_io(size, align_mask,
				367	KM_NOFS \| kmflag_mask);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	368	if (!bp->b_addr) {
				369	/* low memory - use alloc_page loop instead */
				370	goto use_alloc_page;
				371	}
				372
				373	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
				374	((unsigned long)bp->b_addr & PAGE_MASK)) {
				375	/* b_addr spans two pages - use alloc_page instead */
				376	kmem_free(bp->b_addr);
				377	bp->b_addr = NULL;
				378	goto use_alloc_page;
				379	}
				380	bp->b_offset = offset_in_page(bp->b_addr);
				381	bp->b_pages = bp->b_page_array;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	382	bp->b_pages[0] = kmem_to_page(bp->b_addr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	383	bp->b_page_count = 1;
				384	bp->b_flags \|= _XBF_KMEM;
				385	return 0;
				386	}
				387
				388	use_alloc_page:
				389	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
				390	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
				391	>> PAGE_SHIFT;
				392	page_count = end - start;
				393	error = _xfs_buf_get_pages(bp, page_count);
				394	if (unlikely(error))
				395	return error;
				396
				397	offset = bp->b_offset;
				398	bp->b_flags \|= _XBF_PAGES;
				399
				400	for (i = 0; i < bp->b_page_count; i++) {
				401	struct page *page;
				402	uint retries = 0;
				403	retry:
				404	page = alloc_page(gfp_mask);
				405	if (unlikely(page == NULL)) {
				406	if (flags & XBF_READ_AHEAD) {
				407	bp->b_page_count = i;
				408	error = -ENOMEM;
				409	goto out_free_pages;
				410	}
				411
				412	/*
				413	* This could deadlock.
				414	*
				415	* But until all the XFS lowlevel code is revamped to
				416	* handle buffer allocation failures we can't do much.
				417	*/
				418	if (!(++retries % 100))
				419	xfs_err(NULL,
				420	"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
				421	current->comm, current->pid,
				422	__func__, gfp_mask);
				423
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	424	XFS_STATS_INC(bp->b_mount, xb_page_retries);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	425	congestion_wait(BLK_RW_ASYNC, HZ/50);
				426	goto retry;
				427	}
				428
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	429	XFS_STATS_INC(bp->b_mount, xb_page_found);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	430
				431	nbytes = min_t(size_t, size, PAGE_SIZE - offset);
				432	size -= nbytes;
				433	bp->b_pages[i] = page;
				434	offset = 0;
				435	}
				436	return 0;
				437
				438	out_free_pages:
				439	for (i = 0; i < bp->b_page_count; i++)
				440	__free_page(bp->b_pages[i]);
				441	bp->b_flags &= ~_XBF_PAGES;
				442	return error;
				443	}
				444
				445	/*
				446	* Map buffer into kernel address-space if necessary.
				447	*/
				448	STATIC int
				449	_xfs_buf_map_pages(
				450	xfs_buf_t *bp,
				451	uint flags)
				452	{
				453	ASSERT(bp->b_flags & _XBF_PAGES);
				454	if (bp->b_page_count == 1) {
				455	/* A single page buffer is always mappable */
				456	bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
				457	} else if (flags & XBF_UNMAPPED) {
				458	bp->b_addr = NULL;
				459	} else {
				460	int retried = 0;
				461	unsigned nofs_flag;
				462
				463	/*
				464	* vm_map_ram() will allocate auxillary structures (e.g.
				465	* pagetables) with GFP_KERNEL, yet we are likely to be under
				466	* GFP_NOFS context here. Hence we need to tell memory reclaim
				467	* that we are in such a context via PF_MEMALLOC_NOFS to prevent
				468	* memory reclaim re-entering the filesystem here and
				469	* potentially deadlocking.
				470	*/
				471	nofs_flag = memalloc_nofs_save();
				472	do {
				473	bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
				474	-1, PAGE_KERNEL);
				475	if (bp->b_addr)
				476	break;
				477	vm_unmap_aliases();
				478	} while (retried++ <= 1);
				479	memalloc_nofs_restore(nofs_flag);
				480
				481	if (!bp->b_addr)
				482	return -ENOMEM;
				483	bp->b_addr += bp->b_offset;
				484	}
				485
				486	return 0;
				487	}
				488
				489	/*
				490	* Finding and Reading Buffers
				491	*/
				492	static int
				493	_xfs_buf_obj_cmp(
				494	struct rhashtable_compare_arg *arg,
				495	const void *obj)
				496	{
				497	const struct xfs_buf_map *map = arg->key;
				498	const struct xfs_buf *bp = obj;
				499
				500	/*
				501	* The key hashing in the lookup path depends on the key being the
				502	* first element of the compare_arg, make sure to assert this.
				503	*/
				504	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
				505
				506	if (bp->b_bn != map->bm_bn)
				507	return 1;
				508
				509	if (unlikely(bp->b_length != map->bm_len)) {
				510	/*
				511	* found a block number match. If the range doesn't
				512	* match, the only way this is allowed is if the buffer
				513	* in the cache is stale and the transaction that made
				514	* it stale has not yet committed. i.e. we are
				515	* reallocating a busy extent. Skip this buffer and
				516	* continue searching for an exact match.
				517	*/
				518	ASSERT(bp->b_flags & XBF_STALE);
				519	return 1;
				520	}
				521	return 0;
				522	}
				523
				524	static const struct rhashtable_params xfs_buf_hash_params = {
				525	.min_size = 32, /* empty AGs have minimal footprint */
				526	.nelem_hint = 16,
				527	.key_len = sizeof(xfs_daddr_t),
				528	.key_offset = offsetof(struct xfs_buf, b_bn),
				529	.head_offset = offsetof(struct xfs_buf, b_rhash_head),
				530	.automatic_shrinking = true,
				531	.obj_cmpfn = _xfs_buf_obj_cmp,
				532	};
				533
				534	int
				535	xfs_buf_hash_init(
				536	struct xfs_perag *pag)
				537	{
				538	spin_lock_init(&pag->pag_buf_lock);
				539	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
				540	}
				541
				542	void
				543	xfs_buf_hash_destroy(
				544	struct xfs_perag *pag)
				545	{
				546	rhashtable_destroy(&pag->pag_buf_hash);
				547	}
				548
				549	/*
				550	* Look up a buffer in the buffer cache and return it referenced and locked
				551	* in @found_bp.
				552	*
				553	* If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
				554	* cache.
				555	*
				556	* If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
				557	* -EAGAIN if we fail to lock it.
				558	*
				559	* Return values are:
				560	* -EFSCORRUPTED if have been supplied with an invalid address
				561	* -EAGAIN on trylock failure
				562	* -ENOENT if we fail to find a match and @new_bp was NULL
				563	* 0, with @found_bp:
				564	* - @new_bp if we inserted it into the cache
				565	* - the buffer we found and locked.
				566	*/
				567	static int
				568	xfs_buf_find(
				569	struct xfs_buftarg *btp,
				570	struct xfs_buf_map *map,
				571	int nmaps,
				572	xfs_buf_flags_t flags,
				573	struct xfs_buf *new_bp,
				574	struct xfs_buf **found_bp)
				575	{
				576	struct xfs_perag *pag;
				577	xfs_buf_t *bp;
				578	struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
				579	xfs_daddr_t eofs;
				580	int i;
				581
				582	*found_bp = NULL;
				583
				584	for (i = 0; i < nmaps; i++)
				585	cmap.bm_len += map[i].bm_len;
				586
				587	/* Check for IOs smaller than the sector size / not sector aligned */
				588	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
				589	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
				590
				591	/*
				592	* Corrupted block numbers can get through to here, unfortunately, so we
				593	* have to check that the buffer falls within the filesystem bounds.
				594	*/
				595	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
				596	if (cmap.bm_bn < 0 \|\| cmap.bm_bn >= eofs) {
				597	xfs_alert(btp->bt_mount,
				598	"%s: daddr 0x%llx out of range, EOFS 0x%llx",
				599	__func__, cmap.bm_bn, eofs);
				600	WARN_ON(1);
				601	return -EFSCORRUPTED;
				602	}
				603
				604	pag = xfs_perag_get(btp->bt_mount,
				605	xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
				606
				607	spin_lock(&pag->pag_buf_lock);
				608	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
				609	xfs_buf_hash_params);
				610	if (bp) {
				611	atomic_inc(&bp->b_hold);
				612	goto found;
				613	}
				614
				615	/* No match found */
				616	if (!new_bp) {
				617	XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
				618	spin_unlock(&pag->pag_buf_lock);
				619	xfs_perag_put(pag);
				620	return -ENOENT;
				621	}
				622
				623	/* the buffer keeps the perag reference until it is freed */
				624	new_bp->b_pag = pag;
				625	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
				626	xfs_buf_hash_params);
				627	spin_unlock(&pag->pag_buf_lock);
				628	*found_bp = new_bp;
				629	return 0;
				630
				631	found:
				632	spin_unlock(&pag->pag_buf_lock);
				633	xfs_perag_put(pag);
				634
				635	if (!xfs_buf_trylock(bp)) {
				636	if (flags & XBF_TRYLOCK) {
				637	xfs_buf_rele(bp);
				638	XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
				639	return -EAGAIN;
				640	}
				641	xfs_buf_lock(bp);
				642	XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
				643	}
				644
				645	/*
				646	* if the buffer is stale, clear all the external state associated with
				647	* it. We need to keep flags such as how we allocated the buffer memory
				648	* intact here.
				649	*/
				650	if (bp->b_flags & XBF_STALE) {
				651	ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
				652	ASSERT(bp->b_iodone == NULL);
				653	bp->b_flags &= _XBF_KMEM \| _XBF_PAGES;
				654	bp->b_ops = NULL;
				655	}
				656
				657	trace_xfs_buf_find(bp, flags, _RET_IP_);
				658	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
				659	*found_bp = bp;
				660	return 0;
				661	}
				662
				663	struct xfs_buf *
				664	xfs_buf_incore(
				665	struct xfs_buftarg *target,
				666	xfs_daddr_t blkno,
				667	size_t numblks,
				668	xfs_buf_flags_t flags)
				669	{
				670	struct xfs_buf *bp;
				671	int error;
				672	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
				673
				674	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
				675	if (error)
				676	return NULL;
				677	return bp;
				678	}
				679
				680	/*
				681	* Assembles a buffer covering the specified range. The code is optimised for
				682	* cache hits, as metadata intensive workloads will see 3 orders of magnitude
				683	* more hits than misses.
				684	*/
				685	struct xfs_buf *
				686	xfs_buf_get_map(
				687	struct xfs_buftarg *target,
				688	struct xfs_buf_map *map,
				689	int nmaps,
				690	xfs_buf_flags_t flags)
				691	{
				692	struct xfs_buf *bp;
				693	struct xfs_buf *new_bp;
				694	int error = 0;
				695
				696	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
				697
				698	switch (error) {
				699	case 0:
				700	/* cache hit */
				701	goto found;
				702	case -EAGAIN:
				703	/* cache hit, trylock failure, caller handles failure */
				704	ASSERT(flags & XBF_TRYLOCK);
				705	return NULL;
				706	case -ENOENT:
				707	/* cache miss, go for insert */
				708	break;
				709	case -EFSCORRUPTED:
				710	default:
				711	/*
				712	* None of the higher layers understand failure types
				713	* yet, so return NULL to signal a fatal lookup error.
				714	*/
				715	return NULL;
				716	}
				717
				718	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
				719	if (unlikely(!new_bp))
				720	return NULL;
				721
				722	error = xfs_buf_allocate_memory(new_bp, flags);
				723	if (error) {
				724	xfs_buf_free(new_bp);
				725	return NULL;
				726	}
				727
				728	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
				729	if (error) {
				730	xfs_buf_free(new_bp);
				731	return NULL;
				732	}
				733
				734	if (bp != new_bp)
				735	xfs_buf_free(new_bp);
				736
				737	found:
				738	if (!bp->b_addr) {
				739	error = _xfs_buf_map_pages(bp, flags);
				740	if (unlikely(error)) {
				741	xfs_warn(target->bt_mount,
				742	"%s: failed to map pagesn", __func__);
				743	xfs_buf_relse(bp);
				744	return NULL;
				745	}
				746	}
				747
				748	/*
				749	* Clear b_error if this is a lookup from a caller that doesn't expect
				750	* valid data to be found in the buffer.
				751	*/
				752	if (!(flags & XBF_READ))
				753	xfs_buf_ioerror(bp, 0);
				754
				755	XFS_STATS_INC(target->bt_mount, xb_get);
				756	trace_xfs_buf_get(bp, flags, _RET_IP_);
				757	return bp;
				758	}
				759
				760	STATIC int
				761	_xfs_buf_read(
				762	xfs_buf_t *bp,
				763	xfs_buf_flags_t flags)
				764	{
				765	ASSERT(!(flags & XBF_WRITE));
				766	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
				767
				768	bp->b_flags &= ~(XBF_WRITE \| XBF_ASYNC \| XBF_READ_AHEAD);
				769	bp->b_flags \|= flags & (XBF_READ \| XBF_ASYNC \| XBF_READ_AHEAD);
				770
				771	return xfs_buf_submit(bp);
				772	}
				773
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	774	/*
				775	* Reverify a buffer found in cache without an attached ->b_ops.
				776	*
				777	* If the caller passed an ops structure and the buffer doesn't have ops
				778	* assigned, set the ops and use it to verify the contents. If verification
				779	* fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
				780	* already in XBF_DONE state on entry.
				781	*
				782	* Under normal operations, every in-core buffer is verified on read I/O
				783	* completion. There are two scenarios that can lead to in-core buffers without
				784	* an assigned ->b_ops. The first is during log recovery of buffers on a V4
				785	* filesystem, though these buffers are purged at the end of recovery. The
				786	* other is online repair, which intentionally reads with a NULL buffer ops to
				787	* run several verifiers across an in-core buffer in order to establish buffer
				788	* type. If repair can't establish that, the buffer will be left in memory
				789	* with NULL buffer ops.
				790	*/
				791	int
				792	xfs_buf_reverify(
				793	struct xfs_buf *bp,
				794	const struct xfs_buf_ops *ops)
				795	{
				796	ASSERT(bp->b_flags & XBF_DONE);
				797	ASSERT(bp->b_error == 0);
				798
				799	if (!ops \|\| bp->b_ops)
				800	return 0;
				801
				802	bp->b_ops = ops;
				803	bp->b_ops->verify_read(bp);
				804	if (bp->b_error)
				805	bp->b_flags &= ~XBF_DONE;
				806	return bp->b_error;
				807	}
				808
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	809	xfs_buf_t *
				810	xfs_buf_read_map(
				811	struct xfs_buftarg *target,
				812	struct xfs_buf_map *map,
				813	int nmaps,
				814	xfs_buf_flags_t flags,
				815	const struct xfs_buf_ops *ops)
				816	{
				817	struct xfs_buf *bp;
				818
				819	flags \|= XBF_READ;
				820
				821	bp = xfs_buf_get_map(target, map, nmaps, flags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	822	if (!bp)
				823	return NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	824
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	825	trace_xfs_buf_read(bp, flags, _RET_IP_);
				826
				827	if (!(bp->b_flags & XBF_DONE)) {
				828	XFS_STATS_INC(target->bt_mount, xb_get_read);
				829	bp->b_ops = ops;
				830	_xfs_buf_read(bp, flags);
				831	return bp;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	832	}
				833
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	834	xfs_buf_reverify(bp, ops);
				835
				836	if (flags & XBF_ASYNC) {
				837	/*
				838	* Read ahead call which is already satisfied,
				839	* drop the buffer
				840	*/
				841	xfs_buf_relse(bp);
				842	return NULL;
				843	}
				844
				845	/* We do not want read in the flags */
				846	bp->b_flags &= ~XBF_READ;
				847	ASSERT(bp->b_ops != NULL \|\| ops == NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	848	return bp;
				849	}
				850
				851	/*
				852	* If we are not low on memory then do the readahead in a deadlock
				853	* safe manner.
				854	*/
				855	void
				856	xfs_buf_readahead_map(
				857	struct xfs_buftarg *target,
				858	struct xfs_buf_map *map,
				859	int nmaps,
				860	const struct xfs_buf_ops *ops)
				861	{
				862	if (bdi_read_congested(target->bt_bdev->bd_bdi))
				863	return;
				864
				865	xfs_buf_read_map(target, map, nmaps,
				866	XBF_TRYLOCK\|XBF_ASYNC\|XBF_READ_AHEAD, ops);
				867	}
				868
				869	/*
				870	* Read an uncached buffer from disk. Allocates and returns a locked
				871	* buffer containing the disk contents or nothing.
				872	*/
				873	int
				874	xfs_buf_read_uncached(
				875	struct xfs_buftarg *target,
				876	xfs_daddr_t daddr,
				877	size_t numblks,
				878	int flags,
				879	struct xfs_buf **bpp,
				880	const struct xfs_buf_ops *ops)
				881	{
				882	struct xfs_buf *bp;
				883
				884	*bpp = NULL;
				885
				886	bp = xfs_buf_get_uncached(target, numblks, flags);
				887	if (!bp)
				888	return -ENOMEM;
				889
				890	/* set up the buffer for a read IO */
				891	ASSERT(bp->b_map_count == 1);
				892	bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
				893	bp->b_maps[0].bm_bn = daddr;
				894	bp->b_flags \|= XBF_READ;
				895	bp->b_ops = ops;
				896
				897	xfs_buf_submit(bp);
				898	if (bp->b_error) {
				899	int error = bp->b_error;
				900	xfs_buf_relse(bp);
				901	return error;
				902	}
				903
				904	*bpp = bp;
				905	return 0;
				906	}
				907
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	908	xfs_buf_t *
				909	xfs_buf_get_uncached(
				910	struct xfs_buftarg *target,
				911	size_t numblks,
				912	int flags)
				913	{
				914	unsigned long page_count;
				915	int error, i;
				916	struct xfs_buf *bp;
				917	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
				918
				919	/* flags might contain irrelevant bits, pass only what we care about */
				920	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
				921	if (unlikely(bp == NULL))
				922	goto fail;
				923
				924	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
				925	error = _xfs_buf_get_pages(bp, page_count);
				926	if (error)
				927	goto fail_free_buf;
				928
				929	for (i = 0; i < page_count; i++) {
				930	bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
				931	if (!bp->b_pages[i])
				932	goto fail_free_mem;
				933	}
				934	bp->b_flags \|= _XBF_PAGES;
				935
				936	error = _xfs_buf_map_pages(bp, 0);
				937	if (unlikely(error)) {
				938	xfs_warn(target->bt_mount,
				939	"%s: failed to map pages", __func__);
				940	goto fail_free_mem;
				941	}
				942
				943	trace_xfs_buf_get_uncached(bp, _RET_IP_);
				944	return bp;
				945
				946	fail_free_mem:
				947	while (--i >= 0)
				948	__free_page(bp->b_pages[i]);
				949	_xfs_buf_free_pages(bp);
				950	fail_free_buf:
				951	xfs_buf_free_maps(bp);
				952	kmem_zone_free(xfs_buf_zone, bp);
				953	fail:
				954	return NULL;
				955	}
				956
				957	/*
				958	* Increment reference count on buffer, to hold the buffer concurrently
				959	* with another thread which may release (free) the buffer asynchronously.
				960	* Must hold the buffer already to call this function.
				961	*/
				962	void
				963	xfs_buf_hold(
				964	xfs_buf_t *bp)
				965	{
				966	trace_xfs_buf_hold(bp, _RET_IP_);
				967	atomic_inc(&bp->b_hold);
				968	}
				969
				970	/*
				971	* Release a hold on the specified buffer. If the hold count is 1, the buffer is
				972	* placed on LRU or freed (depending on b_lru_ref).
				973	*/
				974	void
				975	xfs_buf_rele(
				976	xfs_buf_t *bp)
				977	{
				978	struct xfs_perag *pag = bp->b_pag;
				979	bool release;
				980	bool freebuf = false;
				981
				982	trace_xfs_buf_rele(bp, _RET_IP_);
				983
				984	if (!pag) {
				985	ASSERT(list_empty(&bp->b_lru));
				986	if (atomic_dec_and_test(&bp->b_hold)) {
				987	xfs_buf_ioacct_dec(bp);
				988	xfs_buf_free(bp);
				989	}
				990	return;
				991	}
				992
				993	ASSERT(atomic_read(&bp->b_hold) > 0);
				994
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	995	/*
				996	* We grab the b_lock here first to serialise racing xfs_buf_rele()
				997	* calls. The pag_buf_lock being taken on the last reference only
				998	* serialises against racing lookups in xfs_buf_find(). IOWs, the second
				999	* to last reference we drop here is not serialised against the last
				1000	* reference until we take bp->b_lock. Hence if we don't grab b_lock
				1001	* first, the last "release" reference can win the race to the lock and
				1002	* free the buffer before the second-to-last reference is processed,
				1003	* leading to a use-after-free scenario.
				1004	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1005	spin_lock(&bp->b_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1006	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1007	if (!release) {
				1008	/*
				1009	* Drop the in-flight state if the buffer is already on the LRU
				1010	* and it holds the only reference. This is racy because we
				1011	* haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
				1012	* ensures the decrement occurs only once per-buf.
				1013	*/
				1014	if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
				1015	__xfs_buf_ioacct_dec(bp);
				1016	goto out_unlock;
				1017	}
				1018
				1019	/* the last reference has been dropped ... */
				1020	__xfs_buf_ioacct_dec(bp);
				1021	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
				1022	/*
				1023	* If the buffer is added to the LRU take a new reference to the
				1024	* buffer for the LRU and clear the (now stale) dispose list
				1025	* state flag
				1026	*/
				1027	if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
				1028	bp->b_state &= ~XFS_BSTATE_DISPOSE;
				1029	atomic_inc(&bp->b_hold);
				1030	}
				1031	spin_unlock(&pag->pag_buf_lock);
				1032	} else {
				1033	/*
				1034	* most of the time buffers will already be removed from the
				1035	* LRU, so optimise that case by checking for the
				1036	* XFS_BSTATE_DISPOSE flag indicating the last list the buffer
				1037	* was on was the disposal list
				1038	*/
				1039	if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
				1040	list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
				1041	} else {
				1042	ASSERT(list_empty(&bp->b_lru));
				1043	}
				1044
				1045	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
				1046	rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				1047	xfs_buf_hash_params);
				1048	spin_unlock(&pag->pag_buf_lock);
				1049	xfs_perag_put(pag);
				1050	freebuf = true;
				1051	}
				1052
				1053	out_unlock:
				1054	spin_unlock(&bp->b_lock);
				1055
				1056	if (freebuf)
				1057	xfs_buf_free(bp);
				1058	}
				1059
				1060
				1061	/*
				1062	* Lock a buffer object, if it is not already locked.
				1063	*
				1064	* If we come across a stale, pinned, locked buffer, we know that we are
				1065	* being asked to lock a buffer that has been reallocated. Because it is
				1066	* pinned, we know that the log has not been pushed to disk and hence it
				1067	* will still be locked. Rather than continuing to have trylock attempts
				1068	* fail until someone else pushes the log, push it ourselves before
				1069	* returning. This means that the xfsaild will not get stuck trying
				1070	* to push on stale inode buffers.
				1071	*/
				1072	int
				1073	xfs_buf_trylock(
				1074	struct xfs_buf *bp)
				1075	{
				1076	int locked;
				1077
				1078	locked = down_trylock(&bp->b_sema) == 0;
				1079	if (locked)
				1080	trace_xfs_buf_trylock(bp, _RET_IP_);
				1081	else
				1082	trace_xfs_buf_trylock_fail(bp, _RET_IP_);
				1083	return locked;
				1084	}
				1085
				1086	/*
				1087	* Lock a buffer object.
				1088	*
				1089	* If we come across a stale, pinned, locked buffer, we know that we
				1090	* are being asked to lock a buffer that has been reallocated. Because
				1091	* it is pinned, we know that the log has not been pushed to disk and
				1092	* hence it will still be locked. Rather than sleeping until someone
				1093	* else pushes the log, push it ourselves before trying to get the lock.
				1094	*/
				1095	void
				1096	xfs_buf_lock(
				1097	struct xfs_buf *bp)
				1098	{
				1099	trace_xfs_buf_lock(bp, _RET_IP_);
				1100
				1101	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1102	xfs_log_force(bp->b_mount, 0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1103	down(&bp->b_sema);
				1104
				1105	trace_xfs_buf_lock_done(bp, _RET_IP_);
				1106	}
				1107
				1108	void
				1109	xfs_buf_unlock(
				1110	struct xfs_buf *bp)
				1111	{
				1112	ASSERT(xfs_buf_islocked(bp));
				1113
				1114	up(&bp->b_sema);
				1115	trace_xfs_buf_unlock(bp, _RET_IP_);
				1116	}
				1117
				1118	STATIC void
				1119	xfs_buf_wait_unpin(
				1120	xfs_buf_t *bp)
				1121	{
				1122	DECLARE_WAITQUEUE (wait, current);
				1123
				1124	if (atomic_read(&bp->b_pin_count) == 0)
				1125	return;
				1126
				1127	add_wait_queue(&bp->b_waiters, &wait);
				1128	for (;;) {
				1129	set_current_state(TASK_UNINTERRUPTIBLE);
				1130	if (atomic_read(&bp->b_pin_count) == 0)
				1131	break;
				1132	io_schedule();
				1133	}
				1134	remove_wait_queue(&bp->b_waiters, &wait);
				1135	set_current_state(TASK_RUNNING);
				1136	}
				1137
				1138	/*
				1139	* Buffer Utility Routines
				1140	*/
				1141
				1142	void
				1143	xfs_buf_ioend(
				1144	struct xfs_buf *bp)
				1145	{
				1146	bool read = bp->b_flags & XBF_READ;
				1147
				1148	trace_xfs_buf_iodone(bp, _RET_IP_);
				1149
				1150	bp->b_flags &= ~(XBF_READ \| XBF_WRITE \| XBF_READ_AHEAD);
				1151
				1152	/*
				1153	* Pull in IO completion errors now. We are guaranteed to be running
				1154	* single threaded, so we don't need the lock to read b_io_error.
				1155	*/
				1156	if (!bp->b_error && bp->b_io_error)
				1157	xfs_buf_ioerror(bp, bp->b_io_error);
				1158
				1159	/* Only validate buffers that were read without errors */
				1160	if (read && !bp->b_error && bp->b_ops) {
				1161	ASSERT(!bp->b_iodone);
				1162	bp->b_ops->verify_read(bp);
				1163	}
				1164
				1165	if (!bp->b_error)
				1166	bp->b_flags \|= XBF_DONE;
				1167
				1168	if (bp->b_iodone)
				1169	(*(bp->b_iodone))(bp);
				1170	else if (bp->b_flags & XBF_ASYNC)
				1171	xfs_buf_relse(bp);
				1172	else
				1173	complete(&bp->b_iowait);
				1174	}
				1175
				1176	static void
				1177	xfs_buf_ioend_work(
				1178	struct work_struct *work)
				1179	{
				1180	struct xfs_buf *bp =
				1181	container_of(work, xfs_buf_t, b_ioend_work);
				1182
				1183	xfs_buf_ioend(bp);
				1184	}
				1185
				1186	static void
				1187	xfs_buf_ioend_async(
				1188	struct xfs_buf *bp)
				1189	{
				1190	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1191	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1192	}
				1193
				1194	void
				1195	__xfs_buf_ioerror(
				1196	xfs_buf_t *bp,
				1197	int error,
				1198	xfs_failaddr_t failaddr)
				1199	{
				1200	ASSERT(error <= 0 && error >= -1000);
				1201	bp->b_error = error;
				1202	trace_xfs_buf_ioerror(bp, error, failaddr);
				1203	}
				1204
				1205	void
				1206	xfs_buf_ioerror_alert(
				1207	struct xfs_buf *bp,
				1208	const char *func)
				1209	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1210	xfs_alert(bp->b_mount,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1211	"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
				1212	func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
				1213	-bp->b_error);
				1214	}
				1215
				1216	int
				1217	xfs_bwrite(
				1218	struct xfs_buf *bp)
				1219	{
				1220	int error;
				1221
				1222	ASSERT(xfs_buf_islocked(bp));
				1223
				1224	bp->b_flags \|= XBF_WRITE;
				1225	bp->b_flags &= ~(XBF_ASYNC \| XBF_READ \| _XBF_DELWRI_Q \|
				1226	XBF_WRITE_FAIL \| XBF_DONE);
				1227
				1228	error = xfs_buf_submit(bp);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1229	if (error)
				1230	xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1231	return error;
				1232	}
				1233
				1234	static void
				1235	xfs_buf_bio_end_io(
				1236	struct bio *bio)
				1237	{
				1238	struct xfs_buf bp = (struct xfs_buf )bio->bi_private;
				1239
				1240	/*
				1241	* don't overwrite existing errors - otherwise we can lose errors on
				1242	* buffers that require multiple bios to complete.
				1243	*/
				1244	if (bio->bi_status) {
				1245	int error = blk_status_to_errno(bio->bi_status);
				1246
				1247	cmpxchg(&bp->b_io_error, 0, error);
				1248	}
				1249
				1250	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
				1251	invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
				1252
				1253	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
				1254	xfs_buf_ioend_async(bp);
				1255	bio_put(bio);
				1256	}
				1257
				1258	static void
				1259	xfs_buf_ioapply_map(
				1260	struct xfs_buf *bp,
				1261	int map,
				1262	int *buf_offset,
				1263	int *count,
				1264	int op,
				1265	int op_flags)
				1266	{
				1267	int page_index;
				1268	int total_nr_pages = bp->b_page_count;
				1269	int nr_pages;
				1270	struct bio *bio;
				1271	sector_t sector = bp->b_maps[map].bm_bn;
				1272	int size;
				1273	int offset;
				1274
				1275	/* skip the pages in the buffer before the start offset */
				1276	page_index = 0;
				1277	offset = *buf_offset;
				1278	while (offset >= PAGE_SIZE) {
				1279	page_index++;
				1280	offset -= PAGE_SIZE;
				1281	}
				1282
				1283	/*
				1284	* Limit the IO size to the length of the current vector, and update the
				1285	* remaining IO count for the next time around.
				1286	*/
				1287	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
				1288	*count -= size;
				1289	*buf_offset += size;
				1290
				1291	next_chunk:
				1292	atomic_inc(&bp->b_io_remaining);
				1293	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
				1294
				1295	bio = bio_alloc(GFP_NOIO, nr_pages);
				1296	bio_set_dev(bio, bp->b_target->bt_bdev);
				1297	bio->bi_iter.bi_sector = sector;
				1298	bio->bi_end_io = xfs_buf_bio_end_io;
				1299	bio->bi_private = bp;
				1300	bio_set_op_attrs(bio, op, op_flags);
				1301
				1302	for (; size && nr_pages; nr_pages--, page_index++) {
				1303	int rbytes, nbytes = PAGE_SIZE - offset;
				1304
				1305	if (nbytes > size)
				1306	nbytes = size;
				1307
				1308	rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				1309	offset);
				1310	if (rbytes < nbytes)
				1311	break;
				1312
				1313	offset = 0;
				1314	sector += BTOBB(nbytes);
				1315	size -= nbytes;
				1316	total_nr_pages--;
				1317	}
				1318
				1319	if (likely(bio->bi_iter.bi_size)) {
				1320	if (xfs_buf_is_vmapped(bp)) {
				1321	flush_kernel_vmap_range(bp->b_addr,
				1322	xfs_buf_vmap_len(bp));
				1323	}
				1324	submit_bio(bio);
				1325	if (size)
				1326	goto next_chunk;
				1327	} else {
				1328	/*
				1329	* This is guaranteed not to be the last io reference count
				1330	* because the caller (xfs_buf_submit) holds a count itself.
				1331	*/
				1332	atomic_dec(&bp->b_io_remaining);
				1333	xfs_buf_ioerror(bp, -EIO);
				1334	bio_put(bio);
				1335	}
				1336
				1337	}
				1338
				1339	STATIC void
				1340	_xfs_buf_ioapply(
				1341	struct xfs_buf *bp)
				1342	{
				1343	struct blk_plug plug;
				1344	int op;
				1345	int op_flags = 0;
				1346	int offset;
				1347	int size;
				1348	int i;
				1349
				1350	/*
				1351	* Make sure we capture only current IO errors rather than stale errors
				1352	* left over from previous use of the buffer (e.g. failed readahead).
				1353	*/
				1354	bp->b_error = 0;
				1355
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1356	if (bp->b_flags & XBF_WRITE) {
				1357	op = REQ_OP_WRITE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1358
				1359	/*
				1360	* Run the write verifier callback function if it exists. If
				1361	* this function fails it will mark the buffer with an error and
				1362	* the IO should not be dispatched.
				1363	*/
				1364	if (bp->b_ops) {
				1365	bp->b_ops->verify_write(bp);
				1366	if (bp->b_error) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1367	xfs_force_shutdown(bp->b_mount,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1368	SHUTDOWN_CORRUPT_INCORE);
				1369	return;
				1370	}
				1371	} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1372	struct xfs_mount *mp = bp->b_mount;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1373
				1374	/*
				1375	* non-crc filesystems don't attach verifiers during
				1376	* log recovery, so don't warn for such filesystems.
				1377	*/
				1378	if (xfs_sb_version_hascrc(&mp->m_sb)) {
				1379	xfs_warn(mp,
				1380	"%s: no buf ops on daddr 0x%llx len %d",
				1381	__func__, bp->b_bn, bp->b_length);
				1382	xfs_hex_dump(bp->b_addr,
				1383	XFS_CORRUPTION_DUMP_LEN);
				1384	dump_stack();
				1385	}
				1386	}
				1387	} else if (bp->b_flags & XBF_READ_AHEAD) {
				1388	op = REQ_OP_READ;
				1389	op_flags = REQ_RAHEAD;
				1390	} else {
				1391	op = REQ_OP_READ;
				1392	}
				1393
				1394	/* we only use the buffer cache for meta-data */
				1395	op_flags \|= REQ_META;
				1396
				1397	/*
				1398	* Walk all the vectors issuing IO on them. Set up the initial offset
				1399	* into the buffer and the desired IO size before we start -
				1400	* _xfs_buf_ioapply_vec() will modify them appropriately for each
				1401	* subsequent call.
				1402	*/
				1403	offset = bp->b_offset;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1404	size = BBTOB(bp->b_length);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1405	blk_start_plug(&plug);
				1406	for (i = 0; i < bp->b_map_count; i++) {
				1407	xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
				1408	if (bp->b_error)
				1409	break;
				1410	if (size <= 0)
				1411	break; /* all done */
				1412	}
				1413	blk_finish_plug(&plug);
				1414	}
				1415
				1416	/*
				1417	* Wait for I/O completion of a sync buffer and return the I/O error code.
				1418	*/
				1419	static int
				1420	xfs_buf_iowait(
				1421	struct xfs_buf *bp)
				1422	{
				1423	ASSERT(!(bp->b_flags & XBF_ASYNC));
				1424
				1425	trace_xfs_buf_iowait(bp, _RET_IP_);
				1426	wait_for_completion(&bp->b_iowait);
				1427	trace_xfs_buf_iowait_done(bp, _RET_IP_);
				1428
				1429	return bp->b_error;
				1430	}
				1431
				1432	/*
				1433	* Buffer I/O submission path, read or write. Asynchronous submission transfers
				1434	* the buffer lock ownership and the current reference to the IO. It is not
				1435	* safe to reference the buffer after a call to this function unless the caller
				1436	* holds an additional reference itself.
				1437	*/
				1438	int
				1439	__xfs_buf_submit(
				1440	struct xfs_buf *bp,
				1441	bool wait)
				1442	{
				1443	int error = 0;
				1444
				1445	trace_xfs_buf_submit(bp, _RET_IP_);
				1446
				1447	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
				1448
				1449	/* on shutdown we stale and complete the buffer immediately */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1450	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1451	xfs_buf_ioerror(bp, -EIO);
				1452	bp->b_flags &= ~XBF_DONE;
				1453	xfs_buf_stale(bp);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1454	xfs_buf_ioend(bp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1455	return -EIO;
				1456	}
				1457
				1458	/*
				1459	* Grab a reference so the buffer does not go away underneath us. For
				1460	* async buffers, I/O completion drops the callers reference, which
				1461	* could occur before submission returns.
				1462	*/
				1463	xfs_buf_hold(bp);
				1464
				1465	if (bp->b_flags & XBF_WRITE)
				1466	xfs_buf_wait_unpin(bp);
				1467
				1468	/* clear the internal error state to avoid spurious errors */
				1469	bp->b_io_error = 0;
				1470
				1471	/*
				1472	* Set the count to 1 initially, this will stop an I/O completion
				1473	* callout which happens before we have started all the I/O from calling
				1474	* xfs_buf_ioend too early.
				1475	*/
				1476	atomic_set(&bp->b_io_remaining, 1);
				1477	if (bp->b_flags & XBF_ASYNC)
				1478	xfs_buf_ioacct_inc(bp);
				1479	_xfs_buf_ioapply(bp);
				1480
				1481	/*
				1482	* If _xfs_buf_ioapply failed, we can get back here with only the IO
				1483	* reference we took above. If we drop it to zero, run completion so
				1484	* that we don't return to the caller with completion still pending.
				1485	*/
				1486	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
				1487	if (bp->b_error \|\| !(bp->b_flags & XBF_ASYNC))
				1488	xfs_buf_ioend(bp);
				1489	else
				1490	xfs_buf_ioend_async(bp);
				1491	}
				1492
				1493	if (wait)
				1494	error = xfs_buf_iowait(bp);
				1495
				1496	/*
				1497	* Release the hold that keeps the buffer referenced for the entire
				1498	* I/O. Note that if the buffer is async, it is not safe to reference
				1499	* after this release.
				1500	*/
				1501	xfs_buf_rele(bp);
				1502	return error;
				1503	}
				1504
				1505	void *
				1506	xfs_buf_offset(
				1507	struct xfs_buf *bp,
				1508	size_t offset)
				1509	{
				1510	struct page *page;
				1511
				1512	if (bp->b_addr)
				1513	return bp->b_addr + offset;
				1514
				1515	offset += bp->b_offset;
				1516	page = bp->b_pages[offset >> PAGE_SHIFT];
				1517	return page_address(page) + (offset & (PAGE_SIZE-1));
				1518	}
				1519
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1520	void
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1521	xfs_buf_zero(
				1522	struct xfs_buf *bp,
				1523	size_t boff,
				1524	size_t bsize)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1525	{
				1526	size_t bend;
				1527
				1528	bend = boff + bsize;
				1529	while (boff < bend) {
				1530	struct page *page;
				1531	int page_index, page_offset, csize;
				1532
				1533	page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
				1534	page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
				1535	page = bp->b_pages[page_index];
				1536	csize = min_t(size_t, PAGE_SIZE - page_offset,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1537	BBTOB(bp->b_length) - boff);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1538
				1539	ASSERT((csize + page_offset) <= PAGE_SIZE);
				1540
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1541	memset(page_address(page) + page_offset, 0, csize);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1542
				1543	boff += csize;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1544	}
				1545	}
				1546
				1547	/*
				1548	* Handling of buffer targets (buftargs).
				1549	*/
				1550
				1551	/*
				1552	* Wait for any bufs with callbacks that have been submitted but have not yet
				1553	* returned. These buffers will have an elevated hold count, so wait on those
				1554	* while freeing all the buffers only held by the LRU.
				1555	*/
				1556	static enum lru_status
				1557	xfs_buftarg_wait_rele(
				1558	struct list_head *item,
				1559	struct list_lru_one *lru,
				1560	spinlock_t *lru_lock,
				1561	void *arg)
				1562
				1563	{
				1564	struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
				1565	struct list_head *dispose = arg;
				1566
				1567	if (atomic_read(&bp->b_hold) > 1) {
				1568	/* need to wait, so skip it this pass */
				1569	trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
				1570	return LRU_SKIP;
				1571	}
				1572	if (!spin_trylock(&bp->b_lock))
				1573	return LRU_SKIP;
				1574
				1575	/*
				1576	* clear the LRU reference count so the buffer doesn't get
				1577	* ignored in xfs_buf_rele().
				1578	*/
				1579	atomic_set(&bp->b_lru_ref, 0);
				1580	bp->b_state \|= XFS_BSTATE_DISPOSE;
				1581	list_lru_isolate_move(lru, item, dispose);
				1582	spin_unlock(&bp->b_lock);
				1583	return LRU_REMOVED;
				1584	}
				1585
				1586	void
				1587	xfs_wait_buftarg(
				1588	struct xfs_buftarg *btp)
				1589	{
				1590	LIST_HEAD(dispose);
				1591	int loop = 0;
				1592
				1593	/*
				1594	* First wait on the buftarg I/O count for all in-flight buffers to be
				1595	* released. This is critical as new buffers do not make the LRU until
				1596	* they are released.
				1597	*
				1598	* Next, flush the buffer workqueue to ensure all completion processing
				1599	* has finished. Just waiting on buffer locks is not sufficient for
				1600	* async IO as the reference count held over IO is not released until
				1601	* after the buffer lock is dropped. Hence we need to ensure here that
				1602	* all reference counts have been dropped before we start walking the
				1603	* LRU list.
				1604	*/
				1605	while (percpu_counter_sum(&btp->bt_io_count))
				1606	delay(100);
				1607	flush_workqueue(btp->bt_mount->m_buf_workqueue);
				1608
				1609	/* loop until there is nothing left on the lru list. */
				1610	while (list_lru_count(&btp->bt_lru)) {
				1611	list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
				1612	&dispose, LONG_MAX);
				1613
				1614	while (!list_empty(&dispose)) {
				1615	struct xfs_buf *bp;
				1616	bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
				1617	list_del_init(&bp->b_lru);
				1618	if (bp->b_flags & XBF_WRITE_FAIL) {
				1619	xfs_alert(btp->bt_mount,
				1620	"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
				1621	(long long)bp->b_bn);
				1622	xfs_alert(btp->bt_mount,
				1623	"Please run xfs_repair to determine the extent of the problem.");
				1624	}
				1625	xfs_buf_rele(bp);
				1626	}
				1627	if (loop++ != 0)
				1628	delay(100);
				1629	}
				1630	}
				1631
				1632	static enum lru_status
				1633	xfs_buftarg_isolate(
				1634	struct list_head *item,
				1635	struct list_lru_one *lru,
				1636	spinlock_t *lru_lock,
				1637	void *arg)
				1638	{
				1639	struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
				1640	struct list_head *dispose = arg;
				1641
				1642	/*
				1643	* we are inverting the lru lock/bp->b_lock here, so use a trylock.
				1644	* If we fail to get the lock, just skip it.
				1645	*/
				1646	if (!spin_trylock(&bp->b_lock))
				1647	return LRU_SKIP;
				1648	/*
				1649	* Decrement the b_lru_ref count unless the value is already
				1650	* zero. If the value is already zero, we need to reclaim the
				1651	* buffer, otherwise it gets another trip through the LRU.
				1652	*/
				1653	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
				1654	spin_unlock(&bp->b_lock);
				1655	return LRU_ROTATE;
				1656	}
				1657
				1658	bp->b_state \|= XFS_BSTATE_DISPOSE;
				1659	list_lru_isolate_move(lru, item, dispose);
				1660	spin_unlock(&bp->b_lock);
				1661	return LRU_REMOVED;
				1662	}
				1663
				1664	static unsigned long
				1665	xfs_buftarg_shrink_scan(
				1666	struct shrinker *shrink,
				1667	struct shrink_control *sc)
				1668	{
				1669	struct xfs_buftarg *btp = container_of(shrink,
				1670	struct xfs_buftarg, bt_shrinker);
				1671	LIST_HEAD(dispose);
				1672	unsigned long freed;
				1673
				1674	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				1675	xfs_buftarg_isolate, &dispose);
				1676
				1677	while (!list_empty(&dispose)) {
				1678	struct xfs_buf *bp;
				1679	bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
				1680	list_del_init(&bp->b_lru);
				1681	xfs_buf_rele(bp);
				1682	}
				1683
				1684	return freed;
				1685	}
				1686
				1687	static unsigned long
				1688	xfs_buftarg_shrink_count(
				1689	struct shrinker *shrink,
				1690	struct shrink_control *sc)
				1691	{
				1692	struct xfs_buftarg *btp = container_of(shrink,
				1693	struct xfs_buftarg, bt_shrinker);
				1694	return list_lru_shrink_count(&btp->bt_lru, sc);
				1695	}
				1696
				1697	void
				1698	xfs_free_buftarg(
				1699	struct xfs_buftarg *btp)
				1700	{
				1701	unregister_shrinker(&btp->bt_shrinker);
				1702	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
				1703	percpu_counter_destroy(&btp->bt_io_count);
				1704	list_lru_destroy(&btp->bt_lru);
				1705
				1706	xfs_blkdev_issue_flush(btp);
				1707
				1708	kmem_free(btp);
				1709	}
				1710
				1711	int
				1712	xfs_setsize_buftarg(
				1713	xfs_buftarg_t *btp,
				1714	unsigned int sectorsize)
				1715	{
				1716	/* Set up metadata sector size info */
				1717	btp->bt_meta_sectorsize = sectorsize;
				1718	btp->bt_meta_sectormask = sectorsize - 1;
				1719
				1720	if (set_blocksize(btp->bt_bdev, sectorsize)) {
				1721	xfs_warn(btp->bt_mount,
				1722	"Cannot set_blocksize to %u on device %pg",
				1723	sectorsize, btp->bt_bdev);
				1724	return -EINVAL;
				1725	}
				1726
				1727	/* Set up device logical sector size mask */
				1728	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
				1729	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
				1730
				1731	return 0;
				1732	}
				1733
				1734	/*
				1735	* When allocating the initial buffer target we have not yet
				1736	* read in the superblock, so don't know what sized sectors
				1737	* are being used at this early stage. Play safe.
				1738	*/
				1739	STATIC int
				1740	xfs_setsize_buftarg_early(
				1741	xfs_buftarg_t *btp,
				1742	struct block_device *bdev)
				1743	{
				1744	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
				1745	}
				1746
				1747	xfs_buftarg_t *
				1748	xfs_alloc_buftarg(
				1749	struct xfs_mount *mp,
				1750	struct block_device *bdev,
				1751	struct dax_device *dax_dev)
				1752	{
				1753	xfs_buftarg_t *btp;
				1754
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1755	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1756
				1757	btp->bt_mount = mp;
				1758	btp->bt_dev = bdev->bd_dev;
				1759	btp->bt_bdev = bdev;
				1760	btp->bt_daxdev = dax_dev;
				1761
				1762	if (xfs_setsize_buftarg_early(btp, bdev))
				1763	goto error_free;
				1764
				1765	if (list_lru_init(&btp->bt_lru))
				1766	goto error_free;
				1767
				1768	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
				1769	goto error_lru;
				1770
				1771	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
				1772	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
				1773	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
				1774	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
				1775	if (register_shrinker(&btp->bt_shrinker))
				1776	goto error_pcpu;
				1777	return btp;
				1778
				1779	error_pcpu:
				1780	percpu_counter_destroy(&btp->bt_io_count);
				1781	error_lru:
				1782	list_lru_destroy(&btp->bt_lru);
				1783	error_free:
				1784	kmem_free(btp);
				1785	return NULL;
				1786	}
				1787
				1788	/*
				1789	* Cancel a delayed write list.
				1790	*
				1791	* Remove each buffer from the list, clear the delwri queue flag and drop the
				1792	* associated buffer reference.
				1793	*/
				1794	void
				1795	xfs_buf_delwri_cancel(
				1796	struct list_head *list)
				1797	{
				1798	struct xfs_buf *bp;
				1799
				1800	while (!list_empty(list)) {
				1801	bp = list_first_entry(list, struct xfs_buf, b_list);
				1802
				1803	xfs_buf_lock(bp);
				1804	bp->b_flags &= ~_XBF_DELWRI_Q;
				1805	list_del_init(&bp->b_list);
				1806	xfs_buf_relse(bp);
				1807	}
				1808	}
				1809
				1810	/*
				1811	* Add a buffer to the delayed write list.
				1812	*
				1813	* This queues a buffer for writeout if it hasn't already been. Note that
				1814	* neither this routine nor the buffer list submission functions perform
				1815	* any internal synchronization. It is expected that the lists are thread-local
				1816	* to the callers.
				1817	*
				1818	* Returns true if we queued up the buffer, or false if it already had
				1819	* been on the buffer list.
				1820	*/
				1821	bool
				1822	xfs_buf_delwri_queue(
				1823	struct xfs_buf *bp,
				1824	struct list_head *list)
				1825	{
				1826	ASSERT(xfs_buf_islocked(bp));
				1827	ASSERT(!(bp->b_flags & XBF_READ));
				1828
				1829	/*
				1830	* If the buffer is already marked delwri it already is queued up
				1831	* by someone else for imediate writeout. Just ignore it in that
				1832	* case.
				1833	*/
				1834	if (bp->b_flags & _XBF_DELWRI_Q) {
				1835	trace_xfs_buf_delwri_queued(bp, _RET_IP_);
				1836	return false;
				1837	}
				1838
				1839	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
				1840
				1841	/*
				1842	* If a buffer gets written out synchronously or marked stale while it
				1843	* is on a delwri list we lazily remove it. To do this, the other party
				1844	* clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
				1845	* It remains referenced and on the list. In a rare corner case it
				1846	* might get readded to a delwri list after the synchronous writeout, in
				1847	* which case we need just need to re-add the flag here.
				1848	*/
				1849	bp->b_flags \|= _XBF_DELWRI_Q;
				1850	if (list_empty(&bp->b_list)) {
				1851	atomic_inc(&bp->b_hold);
				1852	list_add_tail(&bp->b_list, list);
				1853	}
				1854
				1855	return true;
				1856	}
				1857
				1858	/*
				1859	* Compare function is more complex than it needs to be because
				1860	* the return value is only 32 bits and we are doing comparisons
				1861	* on 64 bit values
				1862	*/
				1863	static int
				1864	xfs_buf_cmp(
				1865	void *priv,
				1866	struct list_head *a,
				1867	struct list_head *b)
				1868	{
				1869	struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
				1870	struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
				1871	xfs_daddr_t diff;
				1872
				1873	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
				1874	if (diff < 0)
				1875	return -1;
				1876	if (diff > 0)
				1877	return 1;
				1878	return 0;
				1879	}
				1880
				1881	/*
				1882	* Submit buffers for write. If wait_list is specified, the buffers are
				1883	* submitted using sync I/O and placed on the wait list such that the caller can
				1884	* iowait each buffer. Otherwise async I/O is used and the buffers are released
				1885	* at I/O completion time. In either case, buffers remain locked until I/O
				1886	* completes and the buffer is released from the queue.
				1887	*/
				1888	static int
				1889	xfs_buf_delwri_submit_buffers(
				1890	struct list_head *buffer_list,
				1891	struct list_head *wait_list)
				1892	{
				1893	struct xfs_buf bp, n;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1894	int pinned = 0;
				1895	struct blk_plug plug;
				1896
				1897	list_sort(NULL, buffer_list, xfs_buf_cmp);
				1898
				1899	blk_start_plug(&plug);
				1900	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
				1901	if (!wait_list) {
				1902	if (xfs_buf_ispinned(bp)) {
				1903	pinned++;
				1904	continue;
				1905	}
				1906	if (!xfs_buf_trylock(bp))
				1907	continue;
				1908	} else {
				1909	xfs_buf_lock(bp);
				1910	}
				1911
				1912	/*
				1913	* Someone else might have written the buffer synchronously or
				1914	* marked it stale in the meantime. In that case only the
				1915	* _XBF_DELWRI_Q flag got cleared, and we have to drop the
				1916	* reference and remove it from the list here.
				1917	*/
				1918	if (!(bp->b_flags & _XBF_DELWRI_Q)) {
				1919	list_del_init(&bp->b_list);
				1920	xfs_buf_relse(bp);
				1921	continue;
				1922	}
				1923
				1924	trace_xfs_buf_delwri_split(bp, _RET_IP_);
				1925
				1926	/*
				1927	* If we have a wait list, each buffer (and associated delwri
				1928	* queue reference) transfers to it and is submitted
				1929	* synchronously. Otherwise, drop the buffer from the delwri
				1930	* queue and submit async.
				1931	*/
				1932	bp->b_flags &= ~(_XBF_DELWRI_Q \| XBF_WRITE_FAIL);
				1933	bp->b_flags \|= XBF_WRITE;
				1934	if (wait_list) {
				1935	bp->b_flags &= ~XBF_ASYNC;
				1936	list_move_tail(&bp->b_list, wait_list);
				1937	} else {
				1938	bp->b_flags \|= XBF_ASYNC;
				1939	list_del_init(&bp->b_list);
				1940	}
				1941	__xfs_buf_submit(bp, false);
				1942	}
				1943	blk_finish_plug(&plug);
				1944
				1945	return pinned;
				1946	}
				1947
				1948	/*
				1949	* Write out a buffer list asynchronously.
				1950	*
				1951	* This will take the @buffer_list, write all non-locked and non-pinned buffers
				1952	* out and not wait for I/O completion on any of the buffers. This interface
				1953	* is only safely useable for callers that can track I/O completion by higher
				1954	* level means, e.g. AIL pushing as the @buffer_list is consumed in this
				1955	* function.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1956	*
				1957	* Note: this function will skip buffers it would block on, and in doing so
				1958	* leaves them on @buffer_list so they can be retried on a later pass. As such,
				1959	* it is up to the caller to ensure that the buffer list is fully submitted or
				1960	* cancelled appropriately when they are finished with the list. Failure to
				1961	* cancel or resubmit the list until it is empty will result in leaked buffers
				1962	* at unmount time.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1963	*/
				1964	int
				1965	xfs_buf_delwri_submit_nowait(
				1966	struct list_head *buffer_list)
				1967	{
				1968	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
				1969	}
				1970
				1971	/*
				1972	* Write out a buffer list synchronously.
				1973	*
				1974	* This will take the @buffer_list, write all buffers out and wait for I/O
				1975	* completion on all of the buffers. @buffer_list is consumed by the function,
				1976	* so callers must have some other way of tracking buffers if they require such
				1977	* functionality.
				1978	*/
				1979	int
				1980	xfs_buf_delwri_submit(
				1981	struct list_head *buffer_list)
				1982	{
				1983	LIST_HEAD (wait_list);
				1984	int error = 0, error2;
				1985	struct xfs_buf *bp;
				1986
				1987	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
				1988
				1989	/* Wait for IO to complete. */
				1990	while (!list_empty(&wait_list)) {
				1991	bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
				1992
				1993	list_del_init(&bp->b_list);
				1994
				1995	/*
				1996	* Wait on the locked buffer, check for errors and unlock and
				1997	* release the delwri queue reference.
				1998	*/
				1999	error2 = xfs_buf_iowait(bp);
				2000	xfs_buf_relse(bp);
				2001	if (!error)
				2002	error = error2;
				2003	}
				2004
				2005	return error;
				2006	}
				2007
				2008	/*
				2009	* Push a single buffer on a delwri queue.
				2010	*
				2011	* The purpose of this function is to submit a single buffer of a delwri queue
				2012	* and return with the buffer still on the original queue. The waiting delwri
				2013	* buffer submission infrastructure guarantees transfer of the delwri queue
				2014	* buffer reference to a temporary wait list. We reuse this infrastructure to
				2015	* transfer the buffer back to the original queue.
				2016	*
				2017	* Note the buffer transitions from the queued state, to the submitted and wait
				2018	* listed state and back to the queued state during this call. The buffer
				2019	* locking and queue management logic between _delwri_pushbuf() and
				2020	* _delwri_queue() guarantee that the buffer cannot be queued to another list
				2021	* before returning.
				2022	*/
				2023	int
				2024	xfs_buf_delwri_pushbuf(
				2025	struct xfs_buf *bp,
				2026	struct list_head *buffer_list)
				2027	{
				2028	LIST_HEAD (submit_list);
				2029	int error;
				2030
				2031	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
				2032
				2033	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
				2034
				2035	/*
				2036	* Isolate the buffer to a new local list so we can submit it for I/O
				2037	* independently from the rest of the original list.
				2038	*/
				2039	xfs_buf_lock(bp);
				2040	list_move(&bp->b_list, &submit_list);
				2041	xfs_buf_unlock(bp);
				2042
				2043	/*
				2044	* Delwri submission clears the DELWRI_Q buffer flag and returns with
				2045	* the buffer on the wait list with the original reference. Rather than
				2046	* bounce the buffer from a local wait list back to the original list
				2047	* after I/O completion, reuse the original list as the wait list.
				2048	*/
				2049	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
				2050
				2051	/*
				2052	* The buffer is now locked, under I/O and wait listed on the original
				2053	* delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
				2054	* return with the buffer unlocked and on the original queue.
				2055	*/
				2056	error = xfs_buf_iowait(bp);
				2057	bp->b_flags \|= _XBF_DELWRI_Q;
				2058	xfs_buf_unlock(bp);
				2059
				2060	return error;
				2061	}
				2062
				2063	int __init
				2064	xfs_buf_init(void)
				2065	{
				2066	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
				2067	KM_ZONE_HWALIGN, NULL);
				2068	if (!xfs_buf_zone)
				2069	goto out;
				2070
				2071	return 0;
				2072
				2073	out:
				2074	return -ENOMEM;
				2075	}
				2076
				2077	void
				2078	xfs_buf_terminate(void)
				2079	{
				2080	kmem_zone_destroy(xfs_buf_zone);
				2081	}
				2082
				2083	void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
				2084	{
				2085	/*
				2086	* Set the lru reference count to 0 based on the error injection tag.
				2087	* This allows userspace to disrupt buffer caching for debug/testing
				2088	* purposes.
				2089	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2090	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2091	lru_ref = 0;
				2092
				2093	atomic_set(&bp->b_lru_ref, lru_ref);
				2094	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	2095
				2096	/*
				2097	* Verify an on-disk magic value against the magic value specified in the
				2098	* verifier structure. The verifier magic is in disk byte order so the caller is
				2099	* expected to pass the value directly from disk.
				2100	*/
				2101	bool
				2102	xfs_verify_magic(
				2103	struct xfs_buf *bp,
				2104	__be32 dmagic)
				2105	{
				2106	struct xfs_mount *mp = bp->b_mount;
				2107	int idx;
				2108
				2109	idx = xfs_sb_version_hascrc(&mp->m_sb);
				2110	if (WARN_ON(!bp->b_ops \|\| !bp->b_ops->magic[idx]))
				2111	return false;
				2112	return dmagic == bp->b_ops->magic[idx];
				2113	}
				2114	/*
				2115	* Verify an on-disk magic value against the magic value specified in the
				2116	* verifier structure. The verifier magic is in disk byte order so the caller is
				2117	* expected to pass the value directly from disk.
				2118	*/
				2119	bool
				2120	xfs_verify_magic16(
				2121	struct xfs_buf *bp,
				2122	__be16 dmagic)
				2123	{
				2124	struct xfs_mount *mp = bp->b_mount;
				2125	int idx;
				2126
				2127	idx = xfs_sb_version_hascrc(&mp->m_sb);
				2128	if (WARN_ON(!bp->b_ops \|\| !bp->b_ops->magic16[idx]))
				2129	return false;
				2130	return dmagic == bp->b_ops->magic16[idx];
				2131	}