// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"

kmem_zone_t	*xfs_log_ticket_zone;

/* Local miscellaneous function prototypes */
STATIC struct xlog *
xlog_alloc_log(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*log_target,
	xfs_daddr_t		blk_offset,
	int			num_bblks);
STATIC int
xlog_space_left(
	struct xlog		*log,
	atomic64_t		*head);
STATIC void
xlog_dealloc_log(
	struct xlog		*log);

/* local state machine functions */
STATIC void xlog_state_done_syncing(
	struct xlog_in_core	*iclog);
STATIC int
xlog_state_get_iclog_space(
	struct xlog		*log,
	int			len,
	struct xlog_in_core	**iclog,
	struct xlog_ticket	*ticket,
	int			*continued_write,
	int			*logoffsetp);
STATIC void
xlog_state_switch_iclogs(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			eventual_size);
STATIC void
xlog_grant_push_ail(
	struct xlog		*log,
	int			need_bytes);
STATIC void
xlog_sync(
	struct xlog		*log,
	struct xlog_in_core	*iclog);
#if defined(DEBUG)
STATIC void
xlog_verify_dest_ptr(
	struct xlog		*log,
	void			*ptr);
STATIC void
xlog_verify_grant_tail(
	struct xlog		*log);
STATIC void
xlog_verify_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			count);
STATIC void
xlog_verify_tail_lsn(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	xfs_lsn_t		tail_lsn);
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b,c)
#endif

STATIC int
xlog_iclogs_empty(
	struct xlog		*log);
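
/*
 * Grant heads pack a 32 bit log cycle count and a 32 bit byte count into a
 * single atomic64_t so both can be sampled and updated together without a
 * lock (see xlog_assign_grant_head_val() and xlog_crack_grant_head_val() in
 * xfs_log_priv.h). The two helpers below move a grant head backwards or
 * forwards by a byte count, wrapping the byte count and stepping the cycle
 * when the movement crosses the physical end of the log. Illustrative
 * example, assuming a 64MB (67108864 byte) log:
 *
 *	head = { cycle = 3, space = 100 }
 *	xlog_grant_sub_space(log, head, 512):
 *		space = 100 - 512 = -412, so wrap backwards:
 *		space = -412 + 67108864 = 67108452, cycle = 2
 *	xlog_grant_add_space(log, head, 1024) on { 2, 67108452 }:
 *		l_logsize - space = 412 <= 1024, so wrap forwards:
 *		space = 1024 - 412 = 612, cycle = 3
 *
 * The atomic64_cmpxchg() loops simply retry if another CPU moved the head
 * between the read and the update.
 */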
static void
xlog_grant_sub_space(
	struct xlog		*log,
	atomic64_t		*head,
	int			bytes)
{
	int64_t	head_val = atomic64_read(head);
	int64_t	new, old;

	do {
		int	cycle, space;

		xlog_crack_grant_head_val(head_val, &cycle, &space);

		space -= bytes;
		if (space < 0) {
			space += log->l_logsize;
			cycle--;
		}

		old = head_val;
		new = xlog_assign_grant_head_val(cycle, space);
		head_val = atomic64_cmpxchg(head, old, new);
	} while (head_val != old);
}

static void
xlog_grant_add_space(
	struct xlog		*log,
	atomic64_t		*head,
	int			bytes)
{
	int64_t	head_val = atomic64_read(head);
	int64_t	new, old;

	do {
		int	tmp;
		int	cycle, space;

		xlog_crack_grant_head_val(head_val, &cycle, &space);

		tmp = log->l_logsize - space;
		if (tmp > bytes)
			space += bytes;
		else {
			space = bytes - tmp;
			cycle++;
		}

		old = head_val;
		new = xlog_assign_grant_head_val(cycle, space);
		head_val = atomic64_cmpxchg(head, old, new);
	} while (head_val != old);
}

STATIC void
xlog_grant_head_init(
	struct xlog_grant_head	*head)
{
	xlog_assign_grant_head(&head->grant, 1, 0);
	INIT_LIST_HEAD(&head->waiters);
	spin_lock_init(&head->lock);
}

STATIC void
xlog_grant_head_wake_all(
	struct xlog_grant_head	*head)
{
	struct xlog_ticket	*tic;

	spin_lock(&head->lock);
	list_for_each_entry(tic, &head->waiters, t_queue)
		wake_up_process(tic->t_task);
	spin_unlock(&head->lock);
}
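
/*
 * Return the number of grant head bytes this ticket requires. Only permanent
 * tickets ever regrant against the write head, and a regrant needs just one
 * unit of space; a new reservation against the reserve head must cover every
 * transaction the ticket may roll through, hence unit_res * t_cnt for
 * permanent tickets.
 */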
static inline int
xlog_ticket_reservation(
	struct xlog		*log,
	struct xlog_grant_head	*head,
	struct xlog_ticket	*tic)
{
	if (head == &log->l_write_head) {
		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
		return tic->t_unit_res;
	} else {
		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
			return tic->t_unit_res * tic->t_cnt;
		else
			return tic->t_unit_res;
	}
}

STATIC bool
xlog_grant_head_wake(
	struct xlog		*log,
	struct xlog_grant_head	*head,
	int			*free_bytes)
{
	struct xlog_ticket	*tic;
	int			need_bytes;
	bool			woken_task = false;

	list_for_each_entry(tic, &head->waiters, t_queue) {

		/*
		 * There is a chance that the size of the CIL checkpoints in
		 * progress at the last AIL push target calculation resulted in
		 * limiting the target to the log head (l_last_sync_lsn) at the
		 * time. This may not reflect where the log head is now as the
		 * CIL checkpoints may have completed.
		 *
		 * Hence when we are woken here, it may be the head of the log
		 * that has moved rather than the tail. As the tail didn't
		 * move, there still won't be space available for the
		 * reservation we require. However, if the AIL has already
		 * pushed to the target defined by the old log head location, we
		 * will hang here waiting for something else to update the AIL
		 * push target.
		 *
		 * Therefore, if there isn't space to wake the first waiter on
		 * the grant head, we need to push the AIL again to ensure the
		 * target reflects both the current log tail and log head
		 * position before we wait for the tail to move again.
		 */

		need_bytes = xlog_ticket_reservation(log, head, tic);
		if (*free_bytes < need_bytes) {
			if (!woken_task)
				xlog_grant_push_ail(log, need_bytes);
			return false;
		}

		*free_bytes -= need_bytes;
		trace_xfs_log_grant_wake_up(log, tic);
		wake_up_process(tic->t_task);
		woken_task = true;
	}

	return true;
}

STATIC int
xlog_grant_head_wait(
	struct xlog		*log,
	struct xlog_grant_head	*head,
	struct xlog_ticket	*tic,
	int			need_bytes) __releases(&head->lock)
					    __acquires(&head->lock)
{
	list_add_tail(&tic->t_queue, &head->waiters);

	do {
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
		xlog_grant_push_ail(log, need_bytes);

		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_unlock(&head->lock);

		XFS_STATS_INC(log->l_mp, xs_sleep_logspace);

		trace_xfs_log_grant_sleep(log, tic);
		schedule();
		trace_xfs_log_grant_wake(log, tic);

		spin_lock(&head->lock);
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
	} while (xlog_space_left(log, &head->grant) < need_bytes);

	list_del_init(&tic->t_queue);
	return 0;
shutdown:
	list_del_init(&tic->t_queue);
	return -EIO;
}

/*
 * Atomically get the log space required for a log ticket.
 *
 * Once a ticket gets put onto head->waiters, it will only return after the
 * needed reservation is satisfied.
 *
 * This function is structured so that it has a lock free fast path. This is
 * necessary because every new transaction reservation will come through this
 * path. Hence any lock will be globally hot if we take it unconditionally on
 * every pass.
 *
 * As tickets are only ever moved on and off head->waiters under head->lock, we
 * only need to take that lock if we are going to add the ticket to the queue
 * and sleep. We can avoid taking the lock if the ticket was never added to
 * head->waiters because the t_queue list head will be empty and we hold the
 * only reference to it so it can safely be checked unlocked.
 */
STATIC int
xlog_grant_head_check(
	struct xlog		*log,
	struct xlog_grant_head	*head,
	struct xlog_ticket	*tic,
	int			*need_bytes)
{
	int			free_bytes;
	int			error = 0;

	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));

	/*
	 * If there are other waiters on the queue then give them a chance at
	 * logspace before us. Wake up the first waiters; if we do not wake
	 * up all the waiters then go to sleep waiting for more free space,
	 * otherwise try to get some space for this transaction.
	 */
	*need_bytes = xlog_ticket_reservation(log, head, tic);
	free_bytes = xlog_space_left(log, &head->grant);
	if (!list_empty_careful(&head->waiters)) {
		spin_lock(&head->lock);
		if (!xlog_grant_head_wake(log, head, &free_bytes) ||
		    free_bytes < *need_bytes) {
			error = xlog_grant_head_wait(log, head, tic,
						     *need_bytes);
		}
		spin_unlock(&head->lock);
	} else if (free_bytes < *need_bytes) {
		spin_lock(&head->lock);
		error = xlog_grant_head_wait(log, head, tic, *need_bytes);
		spin_unlock(&head->lock);
	}

	return error;
}

static void
xlog_tic_reset_res(xlog_ticket_t *tic)
{
	tic->t_res_num = 0;
	tic->t_res_arr_sum = 0;
	tic->t_res_num_ophdrs = 0;
}

static void
xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
{
	if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
		/* add to overflow and start again */
		tic->t_res_o_flow += tic->t_res_arr_sum;
		tic->t_res_num = 0;
		tic->t_res_arr_sum = 0;
	}

	tic->t_res_arr[tic->t_res_num].r_len = len;
	tic->t_res_arr[tic->t_res_num].r_type = type;
	tic->t_res_arr_sum += len;
	tic->t_res_num++;
}
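
/*
 * The region array above is purely diagnostic: it records the length and type
 * of each region written under a ticket so reservation overruns can be
 * reported. Once XLOG_TIC_LEN_MAX entries are used, the running sum is folded
 * into t_res_o_flow and the array recycled, so the total space consumed is
 * always t_res_o_flow + t_res_arr_sum even though the oldest region records
 * are discarded.
 */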

/*
 * Replenish the byte reservation required by moving the grant write head.
 */
int
xfs_log_regrant(
	struct xfs_mount	*mp,
	struct xlog_ticket	*tic)
{
	struct xlog		*log = mp->m_log;
	int			need_bytes;
	int			error = 0;

	if (XLOG_FORCED_SHUTDOWN(log))
		return -EIO;

	XFS_STATS_INC(mp, xs_try_logspace);

	/*
	 * This is a new transaction on the ticket, so we need to change the
	 * transaction ID so that the next transaction has a different TID in
	 * the log. Just add one to the existing tid so that we can see chains
	 * of rolling transactions in the log easily.
	 */
	tic->t_tid++;

	xlog_grant_push_ail(log, tic->t_unit_res);

	tic->t_curr_res = tic->t_unit_res;
	xlog_tic_reset_res(tic);

	if (tic->t_cnt > 0)
		return 0;

	trace_xfs_log_regrant(log, tic);

	error = xlog_grant_head_check(log, &log->l_write_head, tic,
				      &need_bytes);
	if (error)
		goto out_error;

	xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
	trace_xfs_log_regrant_exit(log, tic);
	xlog_verify_grant_tail(log);
	return 0;

out_error:
	/*
	 * If we are failing, make sure the ticket doesn't have any current
	 * reservations. We don't want to add this back when the ticket/
	 * transaction gets cancelled.
	 */
	tic->t_curr_res = 0;
	tic->t_cnt = 0;	/* ungrant will give back unit_res * t_cnt. */
	return error;
}
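
/*
 * Illustrative caller pattern for xfs_log_regrant() - a sketch of how the
 * transaction code rolls a permanent ticket instead of taking a brand new
 * reservation (not the verbatim implementation):
 *
 *	if (tp->t_ticket)
 *		error = xfs_log_regrant(mp, tp->t_ticket);
 *	else
 *		error = xfs_log_reserve(mp, resp->tr_logres,
 *				resp->tr_logcount, &tp->t_ticket,
 *				XFS_TRANSACTION, permanent);
 */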

/*
 * Reserve log space and return a ticket corresponding to the reservation.
 *
 * Each reservation is going to reserve extra space for a log record header.
 * When writes happen to the on-disk log, we don't subtract the length of the
 * log record header from any reservation. By wasting space in each
 * reservation, we prevent over-allocation problems.
 */
int
xfs_log_reserve(
	struct xfs_mount	*mp,
	int			unit_bytes,
	int			cnt,
	struct xlog_ticket	**ticp,
	uint8_t			client,
	bool			permanent)
{
	struct xlog		*log = mp->m_log;
	struct xlog_ticket	*tic;
	int			need_bytes;
	int			error = 0;

	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);

	if (XLOG_FORCED_SHUTDOWN(log))
		return -EIO;

	XFS_STATS_INC(mp, xs_try_logspace);

	ASSERT(*ticp == NULL);
	tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
	*ticp = tic;

	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
					    : tic->t_unit_res);

	trace_xfs_log_reserve(log, tic);

	error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
				      &need_bytes);
	if (error)
		goto out_error;

	xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
	xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
	trace_xfs_log_reserve_exit(log, tic);
	xlog_verify_grant_tail(log);
	return 0;

out_error:
	/*
	 * If we are failing, make sure the ticket doesn't have any current
	 * reservations. We don't want to add this back when the ticket/
	 * transaction gets cancelled.
	 */
	tic->t_curr_res = 0;
	tic->t_cnt = 0;	/* ungrant will give back unit_res * t_cnt. */
	return error;
}
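
/*
 * Usage sketch: xlog_unmount_write() below is the simplest consumer in this
 * file. It takes a small one-shot reservation, writes a single record against
 * the ticket, and then hands the unused grant space back:
 *
 *	error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
 *	...
 *	xfs_log_ticket_ungrant(log, tic);
 */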

static bool
__xlog_state_release_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog)
{
	lockdep_assert_held(&log->l_icloglock);

	if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
		/* update tail before writing to iclog */
		xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);

		iclog->ic_state = XLOG_STATE_SYNCING;
		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
		xlog_verify_tail_lsn(log, iclog, tail_lsn);
		/* cycle incremented when incrementing curr_block */
		return true;
	}

	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
	return false;
}

/*
 * Flush the iclog to disk if this is the last reference to the given iclog
 * and it is in the WANT_SYNC state.
 */
static int
xlog_state_release_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog)
{
	lockdep_assert_held(&log->l_icloglock);

	if (iclog->ic_state == XLOG_STATE_IOERROR)
		return -EIO;

	if (atomic_dec_and_test(&iclog->ic_refcnt) &&
	    __xlog_state_release_iclog(log, iclog)) {
		spin_unlock(&log->l_icloglock);
		xlog_sync(log, iclog);
		spin_lock(&log->l_icloglock);
	}

	return 0;
}
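
/*
 * Note the locking pattern above: __xlog_state_release_iclog() decides under
 * l_icloglock whether the iclog needs writing, but the actual submission in
 * xlog_sync() runs with the lock dropped because it issues I/O. Callers of
 * xlog_state_release_iclog() therefore cannot assume the iclog state is
 * unchanged across the call even though the lock is held again on return.
 */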

void
xfs_log_release_iclog(
	struct xlog_in_core	*iclog)
{
	struct xlog		*log = iclog->ic_log;
	bool			sync = false;

	if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
		if (iclog->ic_state != XLOG_STATE_IOERROR)
			sync = __xlog_state_release_iclog(log, iclog);
		spin_unlock(&log->l_icloglock);
	}

	if (sync)
		xlog_sync(log, iclog);
}

/*
 * Mount a log filesystem
 *
 * mp		- ubiquitous xfs mount point structure
 * log_target	- buftarg of on-disk log device
 * blk_offset	- Start block # where block size is 512 bytes (BBSIZE)
 * num_bblocks	- Number of BBSIZE blocks in on-disk log
 *
 * Return error or zero.
 */
int
xfs_log_mount(
	xfs_mount_t	*mp,
	xfs_buftarg_t	*log_target,
	xfs_daddr_t	blk_offset,
	int		num_bblks)
{
	bool		fatal = xfs_sb_version_hascrc(&mp->m_sb);
	int		error = 0;
	int		min_logfsbs;

	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
		xfs_notice(mp, "Mounting V%d Filesystem",
			   XFS_SB_VERSION_NUM(&mp->m_sb));
	} else {
		xfs_notice(mp,
"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
			   XFS_SB_VERSION_NUM(&mp->m_sb));
		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
	}

	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
	if (IS_ERR(mp->m_log)) {
		error = PTR_ERR(mp->m_log);
		goto out;
	}

	/*
	 * Validate the given log space and drop a critical message via syslog
	 * if the log size is too small, as that would lead to unexpected
	 * situations in the transaction log space reservation stage.
	 *
	 * Note: we can't just reject the mount if the validation fails. This
	 * would mean that people would have to downgrade their kernel just to
	 * remedy the situation as there is no way to grow the log (short of
	 * black magic surgery with xfs_db).
	 *
	 * We can, however, reject mounts for CRC format filesystems, as the
	 * mkfs binary being used to make the filesystem should never create a
	 * filesystem with a log that is too small.
	 */
	min_logfsbs = xfs_log_calc_minimum_size(mp);

	if (mp->m_sb.sb_logblocks < min_logfsbs) {
		xfs_warn(mp,
		"Log size %d blocks too small, minimum size is %d blocks",
			 mp->m_sb.sb_logblocks, min_logfsbs);
		error = -EINVAL;
	} else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
		xfs_warn(mp,
		"Log size %d blocks too large, maximum size is %lld blocks",
			 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
		error = -EINVAL;
	} else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
		xfs_warn(mp,
		"log size %lld bytes too large, maximum size is %lld bytes",
			 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
			 XFS_MAX_LOG_BYTES);
		error = -EINVAL;
	} else if (mp->m_sb.sb_logsunit > 1 &&
		   mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) {
		xfs_warn(mp,
		"log stripe unit %u bytes must be a multiple of block size",
			 mp->m_sb.sb_logsunit);
		error = -EINVAL;
		fatal = true;
	}
	if (error) {
		/*
		 * Log check errors are always fatal on v5; or whenever bad
		 * metadata leads to a crash.
		 */
		if (fatal) {
			xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
			ASSERT(0);
			goto out_free_log;
		}
		xfs_crit(mp, "Log size out of supported range.");
		xfs_crit(mp,
"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
	}

	/*
	 * Initialize the AIL now we have a log.
	 */
	error = xfs_trans_ail_init(mp);
	if (error) {
		xfs_warn(mp, "AIL initialisation failed: error %d", error);
		goto out_free_log;
	}
	mp->m_log->l_ailp = mp->m_ail;

	/*
	 * skip log recovery on a norecovery mount.  pretend it all
	 * just worked.
	 */
	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
		int	readonly = (mp->m_flags & XFS_MOUNT_RDONLY);

		if (readonly)
			mp->m_flags &= ~XFS_MOUNT_RDONLY;

		error = xlog_recover(mp->m_log);

		if (readonly)
			mp->m_flags |= XFS_MOUNT_RDONLY;
		if (error) {
			xfs_warn(mp, "log mount/recovery failed: error %d",
				 error);
			xlog_recover_cancel(mp->m_log);
			goto out_destroy_ail;
		}
	}

	error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
			       "log");
	if (error)
		goto out_destroy_ail;

	/* Normal transactions can now occur */
	mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;

	/*
	 * Now the log has been fully initialised and we know where our
	 * space grant counters are, we can initialise the permanent ticket
	 * needed for delayed logging to work.
	 */
	xlog_cil_init_post_recovery(mp->m_log);

	return 0;

out_destroy_ail:
	xfs_trans_ail_destroy(mp);
out_free_log:
	xlog_dealloc_log(mp->m_log);
out:
	return error;
}

/*
 * Finish the recovery of the file system.  This is separate from the
 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
 * in the root and real-time bitmap inodes between calling xfs_log_mount() and
 * here.
 *
 * If we finish recovery successfully, start the background log work. If we are
 * not doing recovery, then we have a RO filesystem and we don't need to start
 * it.
 */
int
xfs_log_mount_finish(
	struct xfs_mount	*mp)
{
	int	error = 0;
	bool	readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
	bool	recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED;

	if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
		return 0;
	} else if (readonly) {
		/* Allow unlinked processing to proceed */
		mp->m_flags &= ~XFS_MOUNT_RDONLY;
	}

	/*
	 * During the second phase of log recovery, we need iget and
	 * iput to behave like they do for an active filesystem.
	 * xfs_fs_drop_inode needs to be able to prevent the deletion
	 * of inodes before we're done replaying log items on those
	 * inodes.  Turn it off immediately after recovery finishes
	 * so that we don't leak the quota inodes if subsequent mount
	 * activities fail.
	 *
	 * We let all inodes involved in redo item processing end up on
	 * the LRU instead of being evicted immediately so that if we do
	 * something to an unlinked inode, the irele won't cause
	 * premature truncation and freeing of the inode, which results
	 * in log recovery failure.  We have to evict the unreferenced
	 * lru inodes after clearing SB_ACTIVE because we don't
	 * otherwise clean up the lru if there's a subsequent failure in
	 * xfs_mountfs, which leads to us leaking the inodes if nothing
	 * else (e.g. quotacheck) references the inodes before the
	 * mount failure occurs.
	 */
	mp->m_super->s_flags |= SB_ACTIVE;
	error = xlog_recover_finish(mp->m_log);
	if (!error)
		xfs_log_work_queue(mp);
	mp->m_super->s_flags &= ~SB_ACTIVE;
	evict_inodes(mp->m_super);

	/*
	 * Drain the buffer LRU after log recovery. This is required for v4
	 * filesystems to avoid leaving around buffers with NULL verifier ops,
	 * but we do it unconditionally to make sure we're always in a clean
	 * cache state after mount.
	 *
	 * Don't push in the error case because the AIL may have pending intents
	 * that aren't removed until recovery is cancelled.
	 */
	if (!error && recovered) {
		xfs_log_force(mp, XFS_LOG_SYNC);
		xfs_ail_push_all_sync(mp->m_ail);
	}
	xfs_wait_buftarg(mp->m_ddev_targp);

	if (readonly)
		mp->m_flags |= XFS_MOUNT_RDONLY;

	return error;
}

/*
 * The mount has failed. Cancel the recovery if it hasn't completed and destroy
 * the log.
 */
void
xfs_log_mount_cancel(
	struct xfs_mount	*mp)
{
	xlog_recover_cancel(mp->m_log);
	xfs_log_unmount(mp);
}

/*
 * Wait for the iclog to be written to disk, or return an error if the log
 * has been shut down.
 */
static int
xlog_wait_on_iclog(
	struct xlog_in_core	*iclog)
		__releases(iclog->ic_log->l_icloglock)
{
	struct xlog	*log = iclog->ic_log;

	if (!XLOG_FORCED_SHUTDOWN(log) &&
	    iclog->ic_state != XLOG_STATE_ACTIVE &&
	    iclog->ic_state != XLOG_STATE_DIRTY) {
		XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
		xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
	} else {
		spin_unlock(&log->l_icloglock);
	}

	if (XLOG_FORCED_SHUTDOWN(log))
		return -EIO;
	return 0;
}
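
/*
 * Whichever branch xlog_wait_on_iclog() takes, it returns with l_icloglock
 * dropped: xlog_wait() releases the lock before sleeping and the else branch
 * unlocks explicitly. Callers must retake the lock if they need to examine
 * the iclog again afterwards.
 */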

/*
 * Write out an unmount record using the ticket provided. We have to account for
 * the data space used in the unmount ticket as this write is not done from a
 * transaction context that has already done the accounting for us.
 */
static int
xlog_write_unmount_record(
	struct xlog		*log,
	struct xlog_ticket	*ticket,
	xfs_lsn_t		*lsn,
	uint			flags)
{
	struct xfs_unmount_log_format ulf = {
		.magic = XLOG_UNMOUNT_TYPE,
	};
	struct xfs_log_iovec reg = {
		.i_addr = &ulf,
		.i_len = sizeof(ulf),
		.i_type = XLOG_REG_TYPE_UNMOUNT,
	};
	struct xfs_log_vec vec = {
		.lv_niovecs = 1,
		.lv_iovecp = &reg,
	};

	/* account for space used by record data */
	ticket->t_curr_res -= sizeof(ulf);
	return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
}

/*
 * Mark the filesystem clean by writing an unmount record to the head of the
 * log.
 */
static void
xlog_unmount_write(
	struct xlog		*log)
{
	struct xfs_mount	*mp = log->l_mp;
	struct xlog_in_core	*iclog;
	struct xlog_ticket	*tic = NULL;
	xfs_lsn_t		lsn;
	uint			flags = XLOG_UNMOUNT_TRANS;
	int			error;

	error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
	if (error)
		goto out_err;

	error = xlog_write_unmount_record(log, tic, &lsn, flags);
	/*
	 * At this point, we're umounting anyway, so there's no point in
	 * transitioning log state to IOERROR. Just continue...
	 */
out_err:
	if (error)
		xfs_alert(mp, "%s: unmount record failed", __func__);

	spin_lock(&log->l_icloglock);
	iclog = log->l_iclog;
	atomic_inc(&iclog->ic_refcnt);
	if (iclog->ic_state == XLOG_STATE_ACTIVE)
		xlog_state_switch_iclogs(log, iclog, 0);
	else
		ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
		       iclog->ic_state == XLOG_STATE_IOERROR);
	error = xlog_state_release_iclog(log, iclog);
	xlog_wait_on_iclog(iclog);

	if (tic) {
		trace_xfs_log_umount_write(log, tic);
		xfs_log_ticket_ungrant(log, tic);
	}
}

static void
xfs_log_unmount_verify_iclog(
	struct xlog		*log)
{
	struct xlog_in_core	*iclog = log->l_iclog;

	do {
		ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
		ASSERT(iclog->ic_offset == 0);
	} while ((iclog = iclog->ic_next) != log->l_iclog);
}

/*
 * Unmount record used to have a string "Unmount filesystem--" in the
 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
 * We just write the magic number now since that particular field isn't
 * currently architecture converted and "Unmount" is a bit foo.
 * As far as I know, there weren't any dependencies on the old behaviour.
 */
static void
xfs_log_unmount_write(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;

	/*
	 * Don't write out unmount record on norecovery mounts or ro devices.
	 * Or, if we are doing a forced umount (typically because of IO errors).
	 */
	if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
	    xfs_readonly_buftarg(log->l_targ)) {
		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
		return;
	}

	xfs_log_force(mp, XFS_LOG_SYNC);

	if (XLOG_FORCED_SHUTDOWN(log))
		return;

	/*
	 * If we think the summary counters are bad, avoid writing the unmount
	 * record to force log recovery at next mount, after which the summary
	 * counters will be recalculated.  Refer to xlog_check_unmount_rec for
	 * more details.
	 */
	if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
			XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
		xfs_alert(mp, "%s: will fix summary counters at next mount",
				__func__);
		return;
	}

	xfs_log_unmount_verify_iclog(log);
	xlog_unmount_write(log);
}

/*
 * Empty the log for unmount/freeze.
 *
 * To do this, we first need to shut down the background log work so it is not
 * trying to cover the log as we clean up. We then need to unpin all objects in
 * the log so we can then flush them out. Once they have completed their IO and
 * run the callbacks removing themselves from the AIL, we can write the unmount
 * record.
 */
void
xfs_log_quiesce(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_log->l_work);
	xfs_log_force(mp, XFS_LOG_SYNC);

	/*
	 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
	 * will push it, xfs_wait_buftarg() will not wait for it. Further,
	 * xfs_buf_iowait() cannot be used because it was pushed with the
	 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
	 * the IO to complete.
	 */
	xfs_ail_push_all_sync(mp->m_ail);
	xfs_wait_buftarg(mp->m_ddev_targp);
	xfs_buf_lock(mp->m_sb_bp);
	xfs_buf_unlock(mp->m_sb_bp);

	xfs_log_unmount_write(mp);
}

/*
 * Shut down and release the AIL and Log.
 *
 * During unmount, we need to ensure we flush all the dirty metadata objects
 * from the AIL so that the log is empty before we write the unmount record to
 * the log. Once this is done, we can tear down the AIL and the log.
 */
void
xfs_log_unmount(
	struct xfs_mount	*mp)
{
	xfs_log_quiesce(mp);

	xfs_trans_ail_destroy(mp);

	xfs_sysfs_del(&mp->m_log->l_kobj);

	xlog_dealloc_log(mp->m_log);
}

void
xfs_log_item_init(
	struct xfs_mount	*mp,
	struct xfs_log_item	*item,
	int			type,
	const struct xfs_item_ops *ops)
{
	item->li_mountp = mp;
	item->li_ailp = mp->m_ail;
	item->li_type = type;
	item->li_ops = ops;
	item->li_lv = NULL;

	INIT_LIST_HEAD(&item->li_ail);
	INIT_LIST_HEAD(&item->li_cil);
	INIT_LIST_HEAD(&item->li_bio_list);
	INIT_LIST_HEAD(&item->li_trans);
}

/*
 * Wake up processes waiting for log space after we have moved the log tail.
 */
void
xfs_log_space_wake(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;
	int			free_bytes;

	if (XLOG_FORCED_SHUTDOWN(log))
		return;

	if (!list_empty_careful(&log->l_write_head.waiters)) {
		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));

		spin_lock(&log->l_write_head.lock);
		free_bytes = xlog_space_left(log, &log->l_write_head.grant);
		xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
		spin_unlock(&log->l_write_head.lock);
	}

	if (!list_empty_careful(&log->l_reserve_head.waiters)) {
		ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));

		spin_lock(&log->l_reserve_head.lock);
		free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
		xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
		spin_unlock(&log->l_reserve_head.lock);
	}
}

/*
 * Determine if we have a transaction that has gone to disk that needs to be
 * covered. To begin the transition to the idle state firstly the log needs to
 * be idle. That means the CIL, the AIL and the iclogs need to be empty before
 * we start attempting to cover the log.
 *
 * Only if we are then in a state where covering is needed, the caller is
 * informed that dummy transactions are required to move the log into the idle
 * state.
 *
 * If there are any items in the AIL or CIL, then we do not want to attempt to
 * cover the log as we may be in a situation where there isn't log space
 * available to run a dummy transaction and this can lead to deadlocks when the
 * tail of the log is pinned by an item that is modified in the CIL. Hence
 * there's no point in running a dummy transaction at this point because we
 * can't start trying to idle the log until both the CIL and AIL are empty.
 */
static int
xfs_log_need_covered(xfs_mount_t *mp)
{
	struct xlog	*log = mp->m_log;
	int		needed = 0;

	if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
		return 0;

	if (!xlog_cil_empty(log))
		return 0;

	spin_lock(&log->l_icloglock);
	switch (log->l_covered_state) {
	case XLOG_STATE_COVER_DONE:
	case XLOG_STATE_COVER_DONE2:
	case XLOG_STATE_COVER_IDLE:
		break;
	case XLOG_STATE_COVER_NEED:
	case XLOG_STATE_COVER_NEED2:
		if (xfs_ail_min_lsn(log->l_ailp))
			break;
		if (!xlog_iclogs_empty(log))
			break;

		needed = 1;
		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
			log->l_covered_state = XLOG_STATE_COVER_DONE;
		else
			log->l_covered_state = XLOG_STATE_COVER_DONE2;
		break;
	default:
		needed = 1;
		break;
	}
	spin_unlock(&log->l_icloglock);
	return needed;
}
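
/*
 * Rough sketch of the covering state machine driven above: the log is only
 * considered covered once two dummy records have made it to disk, so that the
 * tail LSN stamped in the final record points at dummy transactions rather
 * than at metadata that would still need replaying:
 *
 *	COVER_NEED  -> COVER_DONE   (first dummy record on disk)
 *	COVER_DONE  -> COVER_NEED2  (its iclog cycles back to ACTIVE)
 *	COVER_NEED2 -> COVER_DONE2  (second dummy record on disk)
 *	COVER_DONE2 -> COVER_IDLE   (log is idle)
 */
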
/*
 * We may be holding the log iclog lock upon entering this routine.
 */
xfs_lsn_t
xlog_assign_tail_lsn_locked(
	struct xfs_mount	*mp)
{
	struct xlog		*log = mp->m_log;
	struct xfs_log_item	*lip;
	xfs_lsn_t		tail_lsn;

	assert_spin_locked(&mp->m_ail->ail_lock);

	/*
	 * To make sure we always have a valid LSN for the log tail we keep
	 * track of the last LSN which was committed in log->l_last_sync_lsn,
	 * and use that when the AIL was empty.
	 */
	lip = xfs_ail_min(mp->m_ail);
	if (lip)
		tail_lsn = lip->li_lsn;
	else
		tail_lsn = atomic64_read(&log->l_last_sync_lsn);
	trace_xfs_log_assign_tail_lsn(log, tail_lsn);
	atomic64_set(&log->l_tail_lsn, tail_lsn);
	return tail_lsn;
}

xfs_lsn_t
xlog_assign_tail_lsn(
	struct xfs_mount	*mp)
{
	xfs_lsn_t		tail_lsn;

	spin_lock(&mp->m_ail->ail_lock);
	tail_lsn = xlog_assign_tail_lsn_locked(mp);
	spin_unlock(&mp->m_ail->ail_lock);

	return tail_lsn;
}

/*
 * Return the space in the log between the tail and the head.  The head
 * is passed in the cycle/bytes formal parms.  In the special case where
 * the reserve head has wrapped past the tail, this calculation is no
 * longer valid.  In this case, just return 0 which means there is no space
 * in the log.  This works for all places where this function is called
 * with the reserve head.  Of course, if the write head were to ever
 * wrap the tail, we should blow up.  Rather than catch this case here,
 * we depend on other ASSERTions in other parts of the code.   XXXmiken
 *
 * This code also handles the case where the reservation head is behind
 * the tail.  The details of this case are described below, but the end
 * result is that we return the size of the log as the amount of space left.
 */
STATIC int
xlog_space_left(
	struct xlog	*log,
	atomic64_t	*head)
{
	int		free_bytes;
	int		tail_bytes;
	int		tail_cycle;
	int		head_cycle;
	int		head_bytes;

	xlog_crack_grant_head(head, &head_cycle, &head_bytes);
	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes);
	tail_bytes = BBTOB(tail_bytes);
	if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
		free_bytes = log->l_logsize - (head_bytes - tail_bytes);
	else if (tail_cycle + 1 < head_cycle)
		return 0;
	else if (tail_cycle < head_cycle) {
		ASSERT(tail_cycle == (head_cycle - 1));
		free_bytes = tail_bytes - head_bytes;
	} else {
		/*
		 * The reservation head is behind the tail.
		 * In this case we just want to return the size of the
		 * log as the amount of space left.
		 */
		xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
		xfs_alert(log->l_mp,
			  "  tail_cycle = %d, tail_bytes = %d",
			  tail_cycle, tail_bytes);
		xfs_alert(log->l_mp,
			  "  GH   cycle = %d, GH   bytes = %d",
			  head_cycle, head_bytes);
		ASSERT(0);
		free_bytes = log->l_logsize;
	}
	return free_bytes;
}
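
/*
 * Worked example for the two common xlog_space_left() cases, assuming a
 * 16MB (16777216 byte) log:
 *
 *	same cycle:	tail = { cycle 7, byte 4096 },
 *			head = { cycle 7, byte 8192 }
 *		free = 16777216 - (8192 - 4096) = 16773120 bytes
 *
 *	head one cycle ahead:	tail = { 7, 8192 }, head = { 8, 4096 }
 *		the head has wrapped but not yet caught the tail, so only
 *		the gap between them is free: 8192 - 4096 = 4096 bytes
 */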

static void
xlog_ioend_work(
	struct work_struct	*work)
{
	struct xlog_in_core	*iclog =
		container_of(work, struct xlog_in_core, ic_end_io_work);
	struct xlog		*log = iclog->ic_log;
	int			error;

	error = blk_status_to_errno(iclog->ic_bio.bi_status);
#ifdef DEBUG
	/* treat writes with injected CRC errors as failed */
	if (iclog->ic_fail_crc)
		error = -EIO;
#endif

	/*
	 * Race to shutdown the filesystem if we see an error.
	 */
	if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
		xfs_alert(log->l_mp, "log I/O error %d", error);
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
	}

	xlog_state_done_syncing(iclog);
	bio_uninit(&iclog->ic_bio);

	/*
	 * Drop the lock to signal that we are done. Nothing references the
	 * iclog after this, so an unmount waiting on this lock can now tear it
	 * down safely. As such, it is unsafe to reference the iclog after the
	 * unlock as we could race with it being freed.
	 */
	up(&iclog->ic_sema);
}

/*
 * Return size of each in-core log record buffer.
 *
 * All machines get 8 x 32kB buffers by default, unless tuned otherwise.
 *
 * If the filesystem blocksize is too large, we may need to choose a
 * larger size since the directory code currently logs entire blocks.
 */
STATIC void
xlog_get_iclog_buffer_size(
	struct xfs_mount	*mp,
	struct xlog		*log)
{
	if (mp->m_logbufs <= 0)
		mp->m_logbufs = XLOG_MAX_ICLOGS;
	if (mp->m_logbsize <= 0)
		mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;

	log->l_iclog_bufs = mp->m_logbufs;
	log->l_iclog_size = mp->m_logbsize;

	/*
	 * # headers = size / 32k - one header holds cycles from 32k of data.
	 */
	log->l_iclog_heads =
		DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
	log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
}
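
/*
 * Worked example of the defaults above: with logbufs = 8 (XLOG_MAX_ICLOGS)
 * and logbsize = 32k (XLOG_BIG_RECORD_BSIZE), each iclog needs
 * DIV_ROUND_UP(32768, 32768) = 1 header, so l_iclog_hsize is
 * 1 << BBSHIFT = 512 bytes of each 32k buffer. A tuned 256k logbsize would
 * need 8 headers, i.e. 4k of header space per buffer.
 */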

void
xfs_log_work_queue(
	struct xfs_mount	*mp)
{
	queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
			   msecs_to_jiffies(xfs_syncd_centisecs * 10));
}

/*
 * Every sync period we need to unpin all items in the AIL and push them to
 * disk. If there is nothing dirty, then we might need to cover the log to
 * indicate that the filesystem is idle.
 */
static void
xfs_log_worker(
	struct work_struct	*work)
{
	struct xlog		*log = container_of(to_delayed_work(work),
						struct xlog, l_work);
	struct xfs_mount	*mp = log->l_mp;

	/* dgc: errors ignored - not fatal and nowhere to report them */
	if (xfs_log_need_covered(mp)) {
		/*
		 * Dump a transaction into the log that contains no real change.
		 * This is needed to stamp the current tail LSN into the log
		 * during the covering operation.
		 *
		 * We cannot use an inode here for this - that will push dirty
		 * state back up into the VFS and then periodic inode flushing
		 * will prevent log covering from making progress. Hence we
		 * synchronously log the superblock instead to ensure the
		 * superblock is immediately unpinned and can be written back.
		 */
		xfs_sync_sb(mp, true);
	} else
		xfs_log_force(mp, 0);

	/* start pushing all the metadata that is currently dirty */
	xfs_ail_push_all(mp->m_ail);

	/* queue us up again */
	xfs_log_work_queue(mp);
}

/*
 * This routine initializes some of the log structure for a given mount point.
 * Its primary purpose is to fill in enough, so recovery can occur.  However,
 * some other stuff may be filled in too.
 */
STATIC struct xlog *
xlog_alloc_log(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*log_target,
	xfs_daddr_t		blk_offset,
	int			num_bblks)
{
	struct xlog		*log;
	xlog_rec_header_t	*head;
	xlog_in_core_t		**iclogp;
	xlog_in_core_t		*iclog, *prev_iclog = NULL;
	int			i;
	int			error = -ENOMEM;
	uint			log2_size = 0;

	log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
	if (!log) {
		xfs_warn(mp, "Log allocation failed: No memory!");
		goto out;
	}

	log->l_mp = mp;
	log->l_targ = log_target;
	log->l_logsize = BBTOB(num_bblks);
	log->l_logBBstart = blk_offset;
	log->l_logBBsize = num_bblks;
	log->l_covered_state = XLOG_STATE_COVER_IDLE;
	log->l_flags |= XLOG_ACTIVE_RECOVERY;
	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);

	log->l_prev_block = -1;
	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
	xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
	xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
	log->l_curr_cycle = 1;	    /* 0 is bad since this is initial value */

	xlog_grant_head_init(&log->l_reserve_head);
	xlog_grant_head_init(&log->l_write_head);

	error = -EFSCORRUPTED;
	if (xfs_sb_version_hassector(&mp->m_sb)) {
		log2_size = mp->m_sb.sb_logsectlog;
		if (log2_size < BBSHIFT) {
			xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
				 log2_size, BBSHIFT);
			goto out_free_log;
		}

		log2_size -= BBSHIFT;
		if (log2_size > mp->m_sectbb_log) {
			xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
				 log2_size, mp->m_sectbb_log);
			goto out_free_log;
		}

		/* for larger sector sizes, must have v2 or external log */
		if (log2_size && log->l_logBBstart > 0 &&
		    !xfs_sb_version_haslogv2(&mp->m_sb)) {
			xfs_warn(mp,
		"log sector size (0x%x) invalid for configuration.",
				 log2_size);
			goto out_free_log;
		}
	}
	log->l_sectBBsize = 1 << log2_size;

	xlog_get_iclog_buffer_size(mp, log);

	spin_lock_init(&log->l_icloglock);
	init_waitqueue_head(&log->l_flush_wait);

	iclogp = &log->l_iclog;
	/*
	 * The amount of memory to allocate for the iclog structure is
	 * rather funky due to the way the structure is defined.  It is
	 * done this way so that we can use different sizes for machines
	 * with different amounts of memory.  See the definition of
	 * xlog_in_core_t in xfs_log_priv.h for details.
	 */
	ASSERT(log->l_iclog_size >= 4096);
	for (i = 0; i < log->l_iclog_bufs; i++) {
		int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
		size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
				sizeof(struct bio_vec);

		iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
		if (!iclog)
			goto out_free_iclog;

		*iclogp = iclog;
		iclog->ic_prev = prev_iclog;
		prev_iclog = iclog;

		iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
						KM_MAYFAIL | KM_ZERO);
		if (!iclog->ic_data)
			goto out_free_iclog;
#ifdef DEBUG
		log->l_iclog_bak[i] = &iclog->ic_header;
#endif
		head = &iclog->ic_header;
		memset(head, 0, sizeof(xlog_rec_header_t));
		head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
		head->h_version = cpu_to_be32(
			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
		head->h_size = cpu_to_be32(log->l_iclog_size);
		/* new fields */
		head->h_fmt = cpu_to_be32(XLOG_FMT);
		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));

		iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
		iclog->ic_state = XLOG_STATE_ACTIVE;
		iclog->ic_log = log;
		atomic_set(&iclog->ic_refcnt, 0);
		spin_lock_init(&iclog->ic_callback_lock);
		INIT_LIST_HEAD(&iclog->ic_callbacks);
		iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;

		init_waitqueue_head(&iclog->ic_force_wait);
		init_waitqueue_head(&iclog->ic_write_wait);
		INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
		sema_init(&iclog->ic_sema, 1);

		iclogp = &iclog->ic_next;
	}
	*iclogp = log->l_iclog;			/* complete ring */
	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */

	log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
			WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
			mp->m_super->s_id);
	if (!log->l_ioend_workqueue)
		goto out_free_iclog;

	error = xlog_cil_init(log);
	if (error)
		goto out_destroy_workqueue;
	return log;

out_destroy_workqueue:
	destroy_workqueue(log->l_ioend_workqueue);
out_free_iclog:
	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
		prev_iclog = iclog->ic_next;
		kmem_free(iclog->ic_data);
		kmem_free(iclog);
		if (prev_iclog == log->l_iclog)
			break;
	}
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 1439 | out_free_log: |
| 1440 | kmem_free(log); |
| 1441 | out: |
| 1442 | return ERR_PTR(error); |
| 1443 | } /* xlog_alloc_log */ |

/*
 * Write out the commit record of a transaction associated with the given
 * ticket to close off a running log write. Return the lsn of the commit
 * record.
 */
int
xlog_commit_record(
	struct xlog		*log,
	struct xlog_ticket	*ticket,
	struct xlog_in_core	**iclog,
	xfs_lsn_t		*lsn)
{
	struct xfs_log_iovec reg = {
		.i_addr = NULL,
		.i_len = 0,
		.i_type = XLOG_REG_TYPE_COMMIT,
	};
	struct xfs_log_vec vec = {
		.lv_niovecs = 1,
		.lv_iovecp = &reg,
	};
	int	error;

	if (XLOG_FORCED_SHUTDOWN(log))
		return -EIO;

	error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
			   false);
	if (error)
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
	return error;
}

/*
 * Compute the LSN that we'd need to push the log tail towards in order to have
 * (a) enough on-disk log space to log the number of bytes specified, (b) at
 * least 25% of the log space free, and (c) at least 256 blocks free.  If the
 * log free space already meets all three thresholds, this function returns
 * NULLCOMMITLSN.
 */
xfs_lsn_t
xlog_grant_push_threshold(
	struct xlog	*log,
	int		need_bytes)
{
	xfs_lsn_t	threshold_lsn = 0;
	xfs_lsn_t	last_sync_lsn;
	int		free_blocks;
	int		free_bytes;
	int		threshold_block;
	int		threshold_cycle;
	int		free_threshold;

	ASSERT(BTOBB(need_bytes) < log->l_logBBsize);

	free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
	free_blocks = BTOBBT(free_bytes);

	/*
	 * Set the threshold for the minimum number of free blocks in the
	 * log to the maximum of what the caller needs, one quarter of the
	 * log, and 256 blocks.
	 */
	free_threshold = BTOBB(need_bytes);
	free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
	free_threshold = max(free_threshold, 256);
	if (free_blocks >= free_threshold)
		return NULLCOMMITLSN;

	xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
						&threshold_block);
	threshold_block += free_threshold;
	if (threshold_block >= log->l_logBBsize) {
		threshold_block -= log->l_logBBsize;
		threshold_cycle += 1;
	}
	threshold_lsn = xlog_assign_lsn(threshold_cycle,
					threshold_block);
	/*
	 * Don't pass in an lsn greater than the lsn of the last
	 * log record known to be on disk. Use a snapshot of the last sync lsn
	 * so that it doesn't change between the compare and the set.
	 */
	last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
	if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
		threshold_lsn = last_sync_lsn;

	return threshold_lsn;
}
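
/*
 * Worked example (hypothetical numbers): for a log of 32768 basic blocks
 * with need_bytes = 64KiB, the threshold is max(BTOBB(65536), 32768 >> 2,
 * 256) = max(128, 8192, 256) = 8192 blocks. With only 1000 blocks free,
 * the push target is the current tail LSN advanced by 8192 blocks
 * (wrapping into the next cycle if needed), capped at l_last_sync_lsn.
 */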

/*
 * Push the tail of the log if we need to do so to maintain the free log space
 * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
 * policy which pushes on an lsn which is further along in the log once we
 * reach the high water mark. In this manner, we would be creating a low water
 * mark.
 */
STATIC void
xlog_grant_push_ail(
	struct xlog	*log,
	int		need_bytes)
{
	xfs_lsn_t	threshold_lsn;

	threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
	if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
		return;

	/*
	 * Get the transaction layer to kick the dirty buffers out to
	 * disk asynchronously. No point in trying to do this if
	 * the filesystem is shutting down.
	 */
	xfs_ail_push(log->l_ailp, threshold_lsn);
}

/*
 * Stamp cycle number in every block
 */
STATIC void
xlog_pack_data(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			roundoff)
{
	int			i, j, k;
	int			size = iclog->ic_offset + roundoff;
	__be32			cycle_lsn;
	char			*dp;

	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);

	dp = iclog->ic_datap;
	for (i = 0; i < BTOBB(size); i++) {
		if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
			break;
		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
		*(__be32 *)dp = cycle_lsn;
		dp += BBSIZE;
	}

	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		xlog_in_core_2_t *xhdr = iclog->ic_data;

		for ( ; i < BTOBB(size); i++) {
			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
			*(__be32 *)dp = cycle_lsn;
			dp += BBSIZE;
		}

		for (i = 1; i < log->l_iclog_heads; i++)
			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
	}
}
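
/*
 * Illustration (assuming the usual 32KiB XLOG_HEADER_CYCLE_SIZE and
 * 512-byte BBSIZE, so each header covers 64 basic blocks): for an iclog
 * whose LSN is in cycle 7, the first __be32 of every 512-byte block is
 * saved into h_cycle_data[] (or xh_cycle_data[] of the v2 extended
 * headers) and overwritten with 7. Recovery can then tell a fully
 * written block (cycle matches) from a torn one, and restores the saved
 * words afterwards.
 */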

/*
 * Calculate the checksum for a log buffer.
 *
 * This is a little more complicated than it should be because the various
 * headers and the actual data are non-contiguous.
 */
__le32
xlog_cksum(
	struct xlog		*log,
	struct xlog_rec_header	*rhead,
	char			*dp,
	int			size)
{
	uint32_t		crc;

	/* first generate the crc for the record header ... */
	crc = xfs_start_cksum_update((char *)rhead,
			      sizeof(struct xlog_rec_header),
			      offsetof(struct xlog_rec_header, h_crc));

	/* ... then for additional cycle data for v2 logs ... */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
		int		i;
		int		xheads;

		xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);

		for (i = 1; i < xheads; i++) {
			crc = crc32c(crc, &xhdr[i].hic_xheader,
				     sizeof(struct xlog_rec_ext_header));
		}
	}

	/* ... and finally for the payload */
	crc = crc32c(crc, dp, size);

	return xfs_end_cksum(crc);
}
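
/*
 * Verification sketch (illustrative only, not a separate kernel API): a
 * reader recomputes the CRC the same way and compares it against the
 * stored value, which mirrors what log recovery does when validating
 * records:
 *
 *	__le32 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 *	if (crc != rhead->h_crc)
 *		// possible torn or corrupt write
 */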

static void
xlog_bio_end_io(
	struct bio		*bio)
{
	struct xlog_in_core	*iclog = bio->bi_private;

	queue_work(iclog->ic_log->l_ioend_workqueue,
		   &iclog->ic_end_io_work);
}

static int
xlog_map_iclog_data(
	struct bio		*bio,
	void			*data,
	size_t			count)
{
	do {
		struct page	*page = kmem_to_page(data);
		unsigned int	off = offset_in_page(data);
		size_t		len = min_t(size_t, count, PAGE_SIZE - off);

		if (bio_add_page(bio, page, len, off) != len)
			return -EIO;

		data += len;
		count -= len;
	} while (count);

	return 0;
}

STATIC void
xlog_write_iclog(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	uint64_t		bno,
	unsigned int		count,
	bool			need_flush)
{
	ASSERT(bno < log->l_logBBsize);

	/*
	 * We lock the iclogbufs here so that we can serialise against I/O
	 * completion during unmount.  We might be processing a shutdown
	 * triggered during unmount, and that can occur asynchronously to the
	 * unmount thread, and hence we need to ensure that completes before
	 * tearing down the iclogbufs.  Hence we need to hold the buffer lock
	 * across the log IO to achieve that.
	 */
	down(&iclog->ic_sema);
	if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
		/*
		 * It would seem logical to return EIO here, but we rely on
		 * the log state machine to propagate I/O errors instead of
		 * doing it here.  We kick off the state machine and unlock
		 * the buffer manually, the code needs to be kept in sync
		 * with the I/O completion path.
		 */
		xlog_state_done_syncing(iclog);
		up(&iclog->ic_sema);
		return;
	}

	bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
	bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
	iclog->ic_bio.bi_private = iclog;

	/*
	 * We use REQ_SYNC | REQ_IDLE here to tell the block layer that there
	 * are more IOs coming immediately after this one. This prevents the
	 * block layer writeback throttle from throttling log writes behind
	 * background metadata writeback and causing priority inversions.
	 */
	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
				REQ_IDLE | REQ_FUA;
	if (need_flush)
		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;

	if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
		return;
	}
	if (is_vmalloc_addr(iclog->ic_data))
		flush_kernel_vmap_range(iclog->ic_data, count);

	/*
	 * If this log buffer would straddle the end of the log we will have
	 * to split it up into two bios, so that we can continue at the start.
	 */
	if (bno + BTOBB(count) > log->l_logBBsize) {
		struct bio *split;

		split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
				  GFP_NOIO, &fs_bio_set);
		bio_chain(split, &iclog->ic_bio);
		submit_bio(split);

		/* restart at logical offset zero for the remainder */
		iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
	}

	submit_bio(&iclog->ic_bio);
}

/*
 * We need to bump cycle number for the part of the iclog that is
 * written to the start of the log. Watch out for the header magic
 * number case, though.
 */
static void
xlog_split_iclog(
	struct xlog		*log,
	void			*data,
	uint64_t		bno,
	unsigned int		count)
{
	unsigned int		split_offset = BBTOB(log->l_logBBsize - bno);
	unsigned int		i;

	for (i = split_offset; i < count; i += BBSIZE) {
		uint32_t cycle = get_unaligned_be32(data + i);

		if (++cycle == XLOG_HEADER_MAGIC_NUM)
			cycle++;
		put_unaligned_be32(cycle, data + i);
	}
}
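
/*
 * Worked example (hypothetical numbers): if the iclog starts at
 * bno = l_logBBsize - 2, only 1024 bytes fit before the physical end of
 * the log. Every 512-byte block from byte offset 1024 onwards lands at
 * the start of the log, where the next cycle begins, so its stamped
 * cycle value is incremented by one. The increment skips
 * XLOG_HEADER_MAGIC_NUM so a stamped data block cannot be mistaken for
 * a record header during recovery.
 */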

static int
xlog_calc_iclog_size(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	uint32_t		*roundoff)
{
	uint32_t		count_init, count;
	bool			use_lsunit;

	use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
			log->l_mp->m_sb.sb_logsunit > 1;

	/* Add for LR header */
	count_init = log->l_iclog_hsize + iclog->ic_offset;

	/* Round out the log write size */
	if (use_lsunit) {
		/* we have a v2 stripe unit to use */
		count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
	} else {
		count = BBTOB(BTOBB(count_init));
	}

	ASSERT(count >= count_init);
	*roundoff = count - count_init;

	if (use_lsunit)
		ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
	else
		ASSERT(*roundoff < BBTOB(1));
	return count;
}
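
/*
 * Worked example (hypothetical numbers): with a v2 log and a 64KiB log
 * stripe unit, an iclog carrying a 512-byte header plus 37376 bytes of
 * data has count_init = 37888, which rounds up to count = 65536 and
 * *roundoff = 27648. Without a stripe unit the write is only rounded up
 * to the next 512-byte basic block, so *roundoff stays below BBTOB(1).
 */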

/*
 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
 * fashion.  Previously, we should have moved the current iclog
 * ptr in the log to point to the next available iclog.  This allows further
 * writes to continue while this code syncs out an iclog ready to go.
 * Before an in-core log can be written out, the data section must be scanned
 * to save away the 1st word of each BBSIZE block into the header.  We replace
 * it with the current cycle count.  Each BBSIZE block is tagged with the
 * cycle count because there is an implicit assumption that drives will
 * guarantee that entire 512 byte blocks get written at once.  In other words,
 * we can't have part of a 512 byte block written and part not written.  By
 * tagging each block, we will know which blocks are valid when recovering
 * after an unclean shutdown.
 *
 * This routine is single threaded on the iclog.  No other thread can be in
 * this routine with the same iclog.  Changing contents of iclog can therefore
 * be done without grabbing the state machine lock.  Updating the global
 * log will require grabbing the lock though.
 *
 * The entire log manager uses a logical block numbering scheme.  Only
 * xlog_write_iclog knows about the fact that the log may not start with
 * block zero on a given device.
 */
STATIC void
xlog_sync(
	struct xlog		*log,
	struct xlog_in_core	*iclog)
{
	unsigned int		count;		/* byte count of bwrite */
	unsigned int		roundoff;	/* roundoff to BB or stripe */
	uint64_t		bno;
	unsigned int		size;
	bool			need_flush = true, split = false;

	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);

	count = xlog_calc_iclog_size(log, iclog, &roundoff);

	/* move grant heads by roundoff in sync */
	xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
	xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);

	/* put cycle number in every block */
	xlog_pack_data(log, iclog, roundoff);

	/* real byte length */
	size = iclog->ic_offset;
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
		size += roundoff;
	iclog->ic_header.h_len = cpu_to_be32(size);

	XFS_STATS_INC(log->l_mp, xs_log_writes);
	XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));

	bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));

	/* Do we need to split this write into 2 parts? */
	if (bno + BTOBB(count) > log->l_logBBsize) {
		xlog_split_iclog(log, &iclog->ic_header, bno, count);
		split = true;
	}

	/* calculate the checksum */
	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
					    iclog->ic_datap, size);
	/*
	 * Intentionally corrupt the log record CRC based on the error injection
	 * frequency, if defined. This facilitates testing log recovery in the
	 * event of torn writes. Hence, set the IOABORT state to abort the log
	 * write on I/O completion and shutdown the fs. The subsequent mount
	 * detects the bad CRC and attempts to recover.
	 */
#ifdef DEBUG
	if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
		iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
		iclog->ic_fail_crc = true;
		xfs_warn(log->l_mp,
	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
			 be64_to_cpu(iclog->ic_header.h_lsn));
	}
#endif

	/*
	 * Flush the data device before flushing the log to make sure all meta
	 * data written back from the AIL actually made it to disk before
	 * stamping the new log tail LSN into the log buffer.  For an external
	 * log we need to issue the flush explicitly, and unfortunately
	 * synchronously here; for an internal log we can simply use the block
	 * layer state machine for preflushes.
	 */
	if (log->l_targ != log->l_mp->m_ddev_targp || split) {
		xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
		need_flush = false;
	}

	xlog_verify_iclog(log, iclog, count);
	xlog_write_iclog(log, iclog, bno, count, need_flush);
}

/*
 * Deallocate a log structure
 */
STATIC void
xlog_dealloc_log(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog, *next_iclog;
	int		i;

	xlog_cil_destroy(log);

	/*
	 * Cycle all the iclogbuf locks to make sure all log IO completion
	 * is done before we tear down these buffers.
	 */
	iclog = log->l_iclog;
	for (i = 0; i < log->l_iclog_bufs; i++) {
		down(&iclog->ic_sema);
		up(&iclog->ic_sema);
		iclog = iclog->ic_next;
	}

	iclog = log->l_iclog;
	for (i = 0; i < log->l_iclog_bufs; i++) {
		next_iclog = iclog->ic_next;
		kmem_free(iclog->ic_data);
		kmem_free(iclog);
		iclog = next_iclog;
	}

	log->l_mp->m_log = NULL;
	destroy_workqueue(log->l_ioend_workqueue);
	kmem_free(log);
}

/*
 * Update counters atomically now that memcpy is done.
 */
static inline void
xlog_state_finish_copy(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	int			record_cnt,
	int			copy_bytes)
{
	lockdep_assert_held(&log->l_icloglock);

	be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
	iclog->ic_offset += copy_bytes;
}

/*
 * Print out info relating to regions written which consume
 * the reservation.
 */
void
xlog_print_tic_res(
	struct xfs_mount	*mp,
	struct xlog_ticket	*ticket)
{
	uint i;
	uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);

	/* match with XLOG_REG_TYPE_* in xfs_log.h */
#define REG_TYPE_STR(type, str)	[XLOG_REG_TYPE_##type] = str
	static char *res_type_str[] = {
	    REG_TYPE_STR(BFORMAT, "bformat"),
	    REG_TYPE_STR(BCHUNK, "bchunk"),
	    REG_TYPE_STR(EFI_FORMAT, "efi_format"),
	    REG_TYPE_STR(EFD_FORMAT, "efd_format"),
	    REG_TYPE_STR(IFORMAT, "iformat"),
	    REG_TYPE_STR(ICORE, "icore"),
	    REG_TYPE_STR(IEXT, "iext"),
	    REG_TYPE_STR(IBROOT, "ibroot"),
	    REG_TYPE_STR(ILOCAL, "ilocal"),
	    REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
	    REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
	    REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
	    REG_TYPE_STR(QFORMAT, "qformat"),
	    REG_TYPE_STR(DQUOT, "dquot"),
	    REG_TYPE_STR(QUOTAOFF, "quotaoff"),
	    REG_TYPE_STR(LRHEADER, "LR header"),
	    REG_TYPE_STR(UNMOUNT, "unmount"),
	    REG_TYPE_STR(COMMIT, "commit"),
	    REG_TYPE_STR(TRANSHDR, "trans header"),
	    REG_TYPE_STR(ICREATE, "inode create"),
	    REG_TYPE_STR(RUI_FORMAT, "rui_format"),
	    REG_TYPE_STR(RUD_FORMAT, "rud_format"),
	    REG_TYPE_STR(CUI_FORMAT, "cui_format"),
	    REG_TYPE_STR(CUD_FORMAT, "cud_format"),
	    REG_TYPE_STR(BUI_FORMAT, "bui_format"),
	    REG_TYPE_STR(BUD_FORMAT, "bud_format"),
	};
	BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
#undef REG_TYPE_STR

	xfs_warn(mp, "ticket reservation summary:");
	xfs_warn(mp, "  unit res    = %d bytes", ticket->t_unit_res);
	xfs_warn(mp, "  current res = %d bytes", ticket->t_curr_res);
	xfs_warn(mp, "  total reg   = %u bytes (o/flow = %u bytes)",
		 ticket->t_res_arr_sum, ticket->t_res_o_flow);
	xfs_warn(mp, "  ophdrs      = %u (ophdr space = %u bytes)",
		 ticket->t_res_num_ophdrs, ophdr_spc);
	xfs_warn(mp, "  ophdr + reg = %u bytes",
		 ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
	xfs_warn(mp, "  num regions = %u", ticket->t_res_num);

	for (i = 0; i < ticket->t_res_num; i++) {
		uint r_type = ticket->t_res_arr[i].r_type;
		xfs_warn(mp, "region[%u]: %s - %u bytes", i,
			 (r_type <= 0 || r_type > XLOG_REG_TYPE_MAX ?
			  "bad-rtype" : res_type_str[r_type]),
			 ticket->t_res_arr[i].r_len);
	}
}

/*
 * Print a summary of the transaction.
 */
void
xlog_print_trans(
	struct xfs_trans	*tp)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_log_item	*lip;

	/* dump core transaction and ticket info */
	xfs_warn(mp, "transaction summary:");
	xfs_warn(mp, "  log res   = %d", tp->t_log_res);
	xfs_warn(mp, "  log count = %d", tp->t_log_count);
	xfs_warn(mp, "  flags     = 0x%x", tp->t_flags);

	xlog_print_tic_res(mp, tp->t_ticket);

	/* dump each log item */
	list_for_each_entry(lip, &tp->t_items, li_trans) {
		struct xfs_log_vec	*lv = lip->li_lv;
		struct xfs_log_iovec	*vec;
		int			i;

		xfs_warn(mp, "log item: ");
		xfs_warn(mp, "  type    = 0x%x", lip->li_type);
		xfs_warn(mp, "  flags   = 0x%lx", lip->li_flags);
		if (!lv)
			continue;
		xfs_warn(mp, "  niovecs = %d", lv->lv_niovecs);
		xfs_warn(mp, "  size    = %d", lv->lv_size);
		xfs_warn(mp, "  bytes   = %d", lv->lv_bytes);
		xfs_warn(mp, "  buf len = %d", lv->lv_buf_len);

		/* dump each iovec for the log item */
		vec = lv->lv_iovecp;
		for (i = 0; i < lv->lv_niovecs; i++) {
			int dumplen = min(vec->i_len, 32);

			xfs_warn(mp, "  iovec[%d]", i);
			xfs_warn(mp, "    type = 0x%x", vec->i_type);
			xfs_warn(mp, "    len  = %d", vec->i_len);
			xfs_warn(mp, "    first %d bytes of iovec[%d]:", dumplen, i);
			xfs_hex_dump(vec->i_addr, dumplen);

			vec++;
		}
	}
}

/*
 * Calculate the potential space needed by the log vector. We may need a start
 * record, and each region gets its own struct xlog_op_header and may need to
 * be double word aligned.
 */
static int
xlog_write_calc_vec_length(
	struct xlog_ticket	*ticket,
	struct xfs_log_vec	*log_vector,
	bool			need_start_rec)
{
	struct xfs_log_vec	*lv;
	int			headers = need_start_rec ? 1 : 0;
	int			len = 0;
	int			i;

	for (lv = log_vector; lv; lv = lv->lv_next) {
		/* we don't write ordered log vectors */
		if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
			continue;

		headers += lv->lv_niovecs;

		for (i = 0; i < lv->lv_niovecs; i++) {
			struct xfs_log_iovec	*vecp = &lv->lv_iovecp[i];

			len += vecp->i_len;
			xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
		}
	}

	ticket->t_res_num_ophdrs += headers;
	len += headers * sizeof(struct xlog_op_header);

	return len;
}
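
/*
 * Worked example (hypothetical numbers): a chain of two log vectors with
 * two and one regions of 128, 256 and 512 bytes respectively, written
 * with need_start_rec == true, needs 1 + 2 + 1 = 4 op headers, so the
 * length is 128 + 256 + 512 + 4 * sizeof(struct xlog_op_header) bytes.
 */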

static void
xlog_write_start_rec(
	struct xlog_op_header	*ophdr,
	struct xlog_ticket	*ticket)
{
	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
	ophdr->oh_clientid = ticket->t_clientid;
	ophdr->oh_len = 0;
	ophdr->oh_flags = XLOG_START_TRANS;
	ophdr->oh_res2 = 0;
}

static xlog_op_header_t *
xlog_write_setup_ophdr(
	struct xlog		*log,
	struct xlog_op_header	*ophdr,
	struct xlog_ticket	*ticket,
	uint			flags)
{
	ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
	ophdr->oh_clientid = ticket->t_clientid;
	ophdr->oh_res2 = 0;

	/* are we copying a commit or unmount record? */
	ophdr->oh_flags = flags;

	/*
	 * We've seen logs corrupted with bad transaction client ids.  This
	 * makes sure that XFS doesn't generate them.  Turn this into an EIO
	 * and shut down the filesystem.
	 */
	switch (ophdr->oh_clientid) {
	case XFS_TRANSACTION:
	case XFS_VOLUME:
	case XFS_LOG:
		break;
	default:
		xfs_warn(log->l_mp,
			"Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
			ophdr->oh_clientid, ticket);
		return NULL;
	}

	return ophdr;
}

/*
 * Set up the parameters of the region copy into the log. This has
 * to handle region write split across multiple log buffers - this
 * state is kept external to this function so that this code can
 * be written in an obvious, self documenting manner.
 */
static int
xlog_write_setup_copy(
	struct xlog_ticket	*ticket,
	struct xlog_op_header	*ophdr,
	int			space_available,
	int			space_required,
	int			*copy_off,
	int			*copy_len,
	int			*last_was_partial_copy,
	int			*bytes_consumed)
{
	int			still_to_copy;

	still_to_copy = space_required - *bytes_consumed;
	*copy_off = *bytes_consumed;

	if (still_to_copy <= space_available) {
		/* write of region completes here */
		*copy_len = still_to_copy;
		ophdr->oh_len = cpu_to_be32(*copy_len);
		if (*last_was_partial_copy)
			ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
		*last_was_partial_copy = 0;
		*bytes_consumed = 0;
		return 0;
	}

	/* partial write of region, needs extra log op header reservation */
	*copy_len = space_available;
	ophdr->oh_len = cpu_to_be32(*copy_len);
	ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
	if (*last_was_partial_copy)
		ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
	*bytes_consumed += *copy_len;
	(*last_was_partial_copy)++;

	/* account for new log op header */
	ticket->t_curr_res -= sizeof(struct xlog_op_header);
	ticket->t_res_num_ophdrs++;

	return sizeof(struct xlog_op_header);
}
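
/*
 * Worked example (hypothetical numbers): copying a 12KiB region into an
 * iclog with only 8KiB of space left, the first call returns a partial
 * copy of 8KiB, marks the op header XLOG_CONTINUE_TRANS and charges one
 * extra op header to the ticket. The follow-up call in the next iclog
 * copies the remaining 4KiB and marks that op header
 * XLOG_END_TRANS | XLOG_WAS_CONT_TRANS.
 */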

static int
xlog_write_copy_finish(
	struct xlog		*log,
	struct xlog_in_core	*iclog,
	uint			flags,
	int			*record_cnt,
	int			*data_cnt,
	int			*partial_copy,
	int			*partial_copy_len,
	int			log_offset,
	struct xlog_in_core	**commit_iclog)
{
	int			error;

	if (*partial_copy) {
		/*
		 * This iclog has already been marked WANT_SYNC by
		 * xlog_state_get_iclog_space.
		 */
		spin_lock(&log->l_icloglock);
		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
		*record_cnt = 0;
		*data_cnt = 0;
		goto release_iclog;
	}

	*partial_copy = 0;
	*partial_copy_len = 0;

	if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
		/* no more space in this iclog - push it. */
		spin_lock(&log->l_icloglock);
		xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
		*record_cnt = 0;
		*data_cnt = 0;

		if (iclog->ic_state == XLOG_STATE_ACTIVE)
			xlog_state_switch_iclogs(log, iclog, 0);
		else
			ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
			       iclog->ic_state == XLOG_STATE_IOERROR);
		if (!commit_iclog)
			goto release_iclog;
		spin_unlock(&log->l_icloglock);
		ASSERT(flags & XLOG_COMMIT_TRANS);
		*commit_iclog = iclog;
	}

	return 0;

release_iclog:
	error = xlog_state_release_iclog(log, iclog);
	spin_unlock(&log->l_icloglock);
	return error;
}

/*
 * Write some region out to in-core log
 *
 * This will be called when writing externally provided regions or when
 * writing out a commit record for a given transaction.
 *
 * General algorithm:
 *	1. Find total length of this write.  This may include adding to the
 *		lengths passed in.
 *	2. Check whether we violate the ticket's reservation.
 *	3. While writing to this iclog
 *	    A. Reserve as much space in this iclog as we can get
 *	    B. If this is first write, save away start lsn
 *	    C. While writing this region:
 *		1. If first write of transaction, write start record
 *		2. Write log operation header (header per region)
 *		3. Find out if we can fit entire region into this iclog
 *		4. Potentially, verify destination memcpy ptr
 *		5. Memcpy (partial) region
 *		6. If partial copy, release iclog; otherwise, continue
 *			copying more regions into current iclog
 *	4. Mark want sync bit (in simulation mode)
 *	5. Release iclog for potential flush to on-disk log.
 *
 * ERRORS:
 * 1.	Panic if reservation is overrun.  This should never happen since
 *	reservation amounts are generated internal to the filesystem.
 * NOTES:
 * 1. Tickets are single threaded data structures.
 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
 *	syncing routine.  When a single log_write region needs to span
 *	multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
 *	on all log operation writes which don't contain the end of the
 *	region.  The XLOG_END_TRANS bit is used for the in-core log
 *	operation which contains the end of the continued log_write region.
 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
 *	we don't really know exactly how much space will be used.  As a result,
 *	we don't update ic_offset until the end when we know exactly how many
 *	bytes have been written out.
 */
int
xlog_write(
	struct xlog		*log,
	struct xfs_log_vec	*log_vector,
	struct xlog_ticket	*ticket,
	xfs_lsn_t		*start_lsn,
	struct xlog_in_core	**commit_iclog,
	uint			flags,
	bool			need_start_rec)
{
	struct xlog_in_core	*iclog = NULL;
	struct xfs_log_vec	*lv = log_vector;
	struct xfs_log_iovec	*vecp = lv->lv_iovecp;
	int			index = 0;
	int			len;
	int			partial_copy = 0;
	int			partial_copy_len = 0;
	int			contwr = 0;
	int			record_cnt = 0;
	int			data_cnt = 0;
	int			error = 0;

	/*
	 * If this is a commit or unmount transaction, we don't need a start
	 * record to be written.  We do, however, have to account for the
	 * commit or unmount header that gets written. Hence we always have
	 * to account for an extra xlog_op_header here.
	 */
	ticket->t_curr_res -= sizeof(struct xlog_op_header);
	if (ticket->t_curr_res < 0) {
		xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
		     "ctx ticket reservation ran out. Need to up reservation");
		xlog_print_tic_res(log->l_mp, ticket);
		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
	}

	len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
	*start_lsn = 0;
	while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
		void		*ptr;
		int		log_offset;

		error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
						   &contwr, &log_offset);
		if (error)
			return error;

		ASSERT(log_offset <= iclog->ic_size - 1);
		ptr = iclog->ic_datap + log_offset;

		/* start_lsn is the first lsn written to. That's all we need. */
		if (!*start_lsn)
			*start_lsn = be64_to_cpu(iclog->ic_header.h_lsn);

		/*
		 * This loop writes out as many regions as can fit in the amount
		 * of space which was allocated by xlog_state_get_iclog_space().
		 */
		while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
			struct xfs_log_iovec	*reg;
			struct xlog_op_header	*ophdr;
			int			copy_len;
			int			copy_off;
			bool			ordered = false;

			/* ordered log vectors have no regions to write */
			if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
				ASSERT(lv->lv_niovecs == 0);
				ordered = true;
				goto next_lv;
			}

			reg = &vecp[index];
			ASSERT(reg->i_len % sizeof(int32_t) == 0);
			ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);

			/*
			 * Before we start formatting log vectors, we need to
			 * write a start record. Only do this for the first
			 * iclog we write to.
			 */
			if (need_start_rec) {
				xlog_write_start_rec(ptr, ticket);
				xlog_write_adv_cnt(&ptr, &len, &log_offset,
						sizeof(struct xlog_op_header));
			}

			ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
			if (!ophdr)
				return -EIO;

			xlog_write_adv_cnt(&ptr, &len, &log_offset,
					   sizeof(struct xlog_op_header));

			len += xlog_write_setup_copy(ticket, ophdr,
						     iclog->ic_size-log_offset,
						     reg->i_len,
						     &copy_off, &copy_len,
						     &partial_copy,
						     &partial_copy_len);
			xlog_verify_dest_ptr(log, ptr);

			/*
			 * Copy region.
			 *
			 * Unmount records just log an opheader, so can have
			 * empty payloads with no data region to copy. Hence we
			 * only copy the payload if the vector says it has data
			 * to copy.
			 */
			ASSERT(copy_len >= 0);
			if (copy_len > 0) {
				memcpy(ptr, reg->i_addr + copy_off, copy_len);
				xlog_write_adv_cnt(&ptr, &len, &log_offset,
						   copy_len);
			}
			copy_len += sizeof(struct xlog_op_header);
			record_cnt++;
			if (need_start_rec) {
				copy_len += sizeof(struct xlog_op_header);
				record_cnt++;
				need_start_rec = false;
			}
			data_cnt += contwr ? copy_len : 0;

			error = xlog_write_copy_finish(log, iclog, flags,
						       &record_cnt, &data_cnt,
						       &partial_copy,
						       &partial_copy_len,
						       log_offset,
						       commit_iclog);
			if (error)
				return error;

			/*
			 * if we had a partial copy, we need to get more iclog
			 * space but we don't want to increment the region
			 * index because there is still more in this region to
			 * write.
			 *
			 * If we completed writing this region, and we flushed
			 * the iclog (indicated by resetting of the record
			 * count), then we also need to get more log space. If
			 * this was the last record, though, we are done and
			 * can just return.
			 */
			if (partial_copy)
				break;

			if (++index == lv->lv_niovecs) {
next_lv:
				lv = lv->lv_next;
				index = 0;
				if (lv)
					vecp = lv->lv_iovecp;
			}
			if (record_cnt == 0 && !ordered) {
				if (!lv)
					return 0;
				break;
			}
		}
	}

	ASSERT(len == 0);

	spin_lock(&log->l_icloglock);
	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
	if (commit_iclog) {
		ASSERT(flags & XLOG_COMMIT_TRANS);
		*commit_iclog = iclog;
	} else {
		error = xlog_state_release_iclog(log, iclog);
	}
	spin_unlock(&log->l_icloglock);

	return error;
}

static void
xlog_state_activate_iclog(
	struct xlog_in_core	*iclog,
	int			*iclogs_changed)
{
	ASSERT(list_empty_careful(&iclog->ic_callbacks));

	/*
	 * If the number of ops in this iclog indicate it just contains the
	 * dummy transaction, we can change state into IDLE (the second time
	 * around). Otherwise we should change the state into NEED a dummy.
	 * We don't need to cover the dummy.
	 */
	if (*iclogs_changed == 0 &&
	    iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
		*iclogs_changed = 1;
	} else {
		/*
		 * We have two dirty iclogs so start over.  This could also be
		 * num of ops indicating this is not the dummy going out.
		 */
		*iclogs_changed = 2;
	}

	iclog->ic_state = XLOG_STATE_ACTIVE;
	iclog->ic_offset = 0;
	iclog->ic_header.h_num_logops = 0;
	memset(iclog->ic_header.h_cycle_data, 0,
		sizeof(iclog->ic_header.h_cycle_data));
	iclog->ic_header.h_lsn = 0;
}

/*
 * Loop through all iclogs and mark all iclogs currently marked DIRTY as
 * ACTIVE after iclog I/O has completed.
 */
static void
xlog_state_activate_iclogs(
	struct xlog		*log,
	int			*iclogs_changed)
{
	struct xlog_in_core	*iclog = log->l_iclog;

	do {
		if (iclog->ic_state == XLOG_STATE_DIRTY)
			xlog_state_activate_iclog(iclog, iclogs_changed);
		/*
		 * The ordering of marking iclogs ACTIVE must be maintained, so
		 * an iclog doesn't become ACTIVE beyond one that is SYNCING.
		 */
		else if (iclog->ic_state != XLOG_STATE_ACTIVE)
			break;
	} while ((iclog = iclog->ic_next) != log->l_iclog);
}

static int
xlog_covered_state(
	int			prev_state,
	int			iclogs_changed)
{
	/*
	 * We usually go to NEED. But we go to NEED2 if the change indicates
	 * we are done writing the dummy record.  If we are done with the
	 * second dummy record (DONE2), then we go to IDLE.
	 */
	switch (prev_state) {
	case XLOG_STATE_COVER_IDLE:
	case XLOG_STATE_COVER_NEED:
	case XLOG_STATE_COVER_NEED2:
		break;
	case XLOG_STATE_COVER_DONE:
		if (iclogs_changed == 1)
			return XLOG_STATE_COVER_NEED2;
		break;
	case XLOG_STATE_COVER_DONE2:
		if (iclogs_changed == 1)
			return XLOG_STATE_COVER_IDLE;
		break;
	default:
		ASSERT(0);
	}

	return XLOG_STATE_COVER_NEED;
}
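
/*
 * Covering state transitions, summarized from the switch above
 * (iclogs_changed == 1 means only the dummy record was written):
 *
 *	DONE,  1 changed iclog	-> NEED2
 *	DONE2, 1 changed iclog	-> IDLE
 *	anything else		-> NEED
 */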

STATIC void
xlog_state_clean_iclog(
	struct xlog		*log,
	struct xlog_in_core	*dirty_iclog)
{
	int			iclogs_changed = 0;

	dirty_iclog->ic_state = XLOG_STATE_DIRTY;

	xlog_state_activate_iclogs(log, &iclogs_changed);
	wake_up_all(&dirty_iclog->ic_force_wait);

	if (iclogs_changed) {
		log->l_covered_state = xlog_covered_state(log->l_covered_state,
				iclogs_changed);
	}
}
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2582 | |
| 2583 | STATIC xfs_lsn_t |
| 2584 | xlog_get_lowest_lsn( |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2585 | struct xlog *log) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2586 | { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2587 | struct xlog_in_core *iclog = log->l_iclog; |
| 2588 | xfs_lsn_t lowest_lsn = 0, lsn; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2589 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2590 | do { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2591 | if (iclog->ic_state == XLOG_STATE_ACTIVE || |
| 2592 | iclog->ic_state == XLOG_STATE_DIRTY) |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2593 | continue; |
| 2594 | |
| 2595 | lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
| 2596 | if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2597 | lowest_lsn = lsn; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2598 | } while ((iclog = iclog->ic_next) != log->l_iclog); |
| 2599 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2600 | return lowest_lsn; |
| 2601 | } |
| 2602 | |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2603 | /* |
| 2604 | * Completion of an iclog IO does not imply that a transaction has completed, as |
| 2605 | * transactions can be large enough to span many iclogs. We cannot change the |
| 2606 | * tail of the log halfway through a transaction as this may be the only |
| 2607 | * transaction in the log and moving the tail to point to the middle of it |
| 2608 | * will prevent recovery from finding the start of the transaction. Hence we |
| 2609 | * should only update the last_sync_lsn if this iclog contains transaction |
| 2610 | * completion callbacks on it. |
| 2611 | * |
| 2612 | * We have to do this before we drop the icloglock to ensure we are the only one |
| 2613 | * that can update it. |
| 2614 | * |
| 2615 | * If we are moving the last_sync_lsn forwards, we also need to ensure we kick |
| 2616 | * the reservation grant head pushing. This is due to the fact that the push |
| 2617 | * target is bound by the current last_sync_lsn value. Hence if we have a large |
| 2618 | * amount of log space bound up in this committing transaction then the |
| 2619 | * last_sync_lsn value may be the limiting factor preventing tail pushing from |
| 2620 | * freeing space in the log. Hence once we've updated the last_sync_lsn we |
| 2621 | * should push the AIL to ensure the push target (and hence the grant head) is |
| 2622 | * no longer bound by the old log head location and can move forwards and make |
| 2623 | * progress again. |
| 2624 | */ |
| 2625 | static void |
| 2626 | xlog_state_set_callback( |
| 2627 | struct xlog *log, |
| 2628 | struct xlog_in_core *iclog, |
| 2629 | xfs_lsn_t header_lsn) |
| 2630 | { |
| 2631 | iclog->ic_state = XLOG_STATE_CALLBACK; |
| 2632 | |
| 2633 | ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), |
| 2634 | header_lsn) <= 0); |
| 2635 | |
| 2636 | if (list_empty_careful(&iclog->ic_callbacks)) |
| 2637 | return; |
| 2638 | |
| 2639 | atomic64_set(&log->l_last_sync_lsn, header_lsn); |
| 2640 | xlog_grant_push_ail(log, 0); |
| 2641 | } |
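| | |
| | /* |
| | * Worked example of the rule above, with made-up LSNs: a transaction |
| | * spans two iclogs whose headers carry LSNs 100 and 110, and its commit |
| | * record (and thus its committed callbacks) lands in the second. When |
| | * the iclog at LSN 100 completes, ic_callbacks is empty, so we return |
| | * before touching l_last_sync_lsn; publishing 100 could let the tail |
| | * move past the start of the still-incomplete transaction, which would |
| | * break recovery. Only when the iclog at LSN 110 completes is |
| | * l_last_sync_lsn advanced and the AIL pushed. |
| | */ |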
| 2642 | |
| 2643 | /* |
| 2644 | * Return true if we need to stop processing, false to continue to the next |
| 2645 | * iclog. The caller will need to run callbacks if the iclog is returned in the |
| 2646 | * XLOG_STATE_CALLBACK state. |
| 2647 | */ |
| 2648 | static bool |
| 2649 | xlog_state_iodone_process_iclog( |
| 2650 | struct xlog *log, |
| 2651 | struct xlog_in_core *iclog, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2652 | bool *ioerror) |
| 2653 | { |
| 2654 | xfs_lsn_t lowest_lsn; |
| 2655 | xfs_lsn_t header_lsn; |
| 2656 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2657 | switch (iclog->ic_state) { |
| 2658 | case XLOG_STATE_ACTIVE: |
| 2659 | case XLOG_STATE_DIRTY: |
| 2660 | /* |
| 2661 | * Skip all iclogs in the ACTIVE & DIRTY states: |
| 2662 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2663 | return false; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2664 | case XLOG_STATE_IOERROR: |
| 2665 | /* |
| 2666 | * Between marking a filesystem SHUTDOWN and stopping the log, |
| 2667 | * we do flush all iclogs to disk (if there wasn't a log I/O |
| 2668 | * error). So, we do want things to go smoothly in case of just |
| 2669 | * a SHUTDOWN w/o a LOG_IO_ERROR. |
| 2670 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2671 | *ioerror = true; |
| 2672 | return false; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2673 | case XLOG_STATE_DONE_SYNC: |
| 2674 | /* |
| 2675 | * Now that we have an iclog that is in the DONE_SYNC state, do |
| 2676 | * one more check here to see if we have chased our tail around. |
| 2677 | * If this is not the lowest lsn iclog, then we will leave it |
| 2678 | * for another completion to process. |
| 2679 | */ |
| 2680 | header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
| 2681 | lowest_lsn = xlog_get_lowest_lsn(log); |
| 2682 | if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) |
| 2683 | return false; |
| 2684 | xlog_state_set_callback(log, iclog, header_lsn); |
| 2685 | return false; |
| 2686 | default: |
| 2687 | /* |
| 2688 | * Can only perform callbacks in order. Since this iclog is not |
| 2689 | * in the DONE_SYNC state, we skip the rest and just try to |
| 2690 | * clean up. |
| 2691 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2692 | return true; |
| 2693 | } |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2694 | } |
| 2695 | |
| 2696 | /* |
| 2697 | * Keep processing entries in the iclog callback list until we come around and |
| 2698 | * it is empty. We need to atomically see that the list is empty and change the |
| 2699 | * state to DIRTY so that we don't miss any more callbacks being added. |
| 2700 | * |
| 2701 | * This function is called with the icloglock held and returns with it held. We |
| 2702 | * drop it while running callbacks, however, as holding it over thousands of |
| 2703 | * callbacks is unnecessary and causes excessive contention if we do. |
| 2704 | */ |
| 2705 | static void |
| 2706 | xlog_state_do_iclog_callbacks( |
| 2707 | struct xlog *log, |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2708 | struct xlog_in_core *iclog) |
| 2709 | __releases(&log->l_icloglock) |
| 2710 | __acquires(&log->l_icloglock) |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2711 | { |
| 2712 | spin_unlock(&log->l_icloglock); |
| 2713 | spin_lock(&iclog->ic_callback_lock); |
| 2714 | while (!list_empty(&iclog->ic_callbacks)) { |
| 2715 | LIST_HEAD(tmp); |
| 2716 | |
| 2717 | list_splice_init(&iclog->ic_callbacks, &tmp); |
| 2718 | |
| 2719 | spin_unlock(&iclog->ic_callback_lock); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2720 | xlog_cil_process_committed(&tmp); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2721 | spin_lock(&iclog->ic_callback_lock); |
| 2722 | } |
| 2723 | |
| 2724 | /* |
| 2725 | * Pick up the icloglock while still holding the callback lock so we |
| 2726 | * serialise against anyone trying to add more callbacks to this iclog |
| 2727 | * now we've finished processing. |
| 2728 | */ |
| 2729 | spin_lock(&log->l_icloglock); |
| 2730 | spin_unlock(&iclog->ic_callback_lock); |
| 2731 | } |
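| | |
| | /* |
| | * The loop above is the classic "splice under lock, run unlocked" |
| | * pattern. A minimal free-standing sketch of the same idea, with a |
| | * hypothetical run_callbacks() helper, for illustration only: |
| | * |
| | * spin_lock(&cb_lock); |
| | * while (!list_empty(&pending)) { |
| | *         LIST_HEAD(tmp); |
| | * |
| | *         list_splice_init(&pending, &tmp); // steal everything queued |
| | *         spin_unlock(&cb_lock); |
| | *         run_callbacks(&tmp);              // drains tmp; may take a while |
| | *         spin_lock(&cb_lock);              // re-check for new arrivals |
| | * } |
| | * spin_unlock(&cb_lock); |
| | * |
| | * Re-checking emptiness with the lock held is what guarantees that a |
| | * callback added while we were unlocked is never missed. |
| | */ |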
| 2732 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2733 | STATIC void |
| 2734 | xlog_state_do_callback( |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2735 | struct xlog *log) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2736 | { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2737 | struct xlog_in_core *iclog; |
| 2738 | struct xlog_in_core *first_iclog; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2739 | bool cycled_icloglock; |
| 2740 | bool ioerror; |
| 2741 | int flushcnt = 0; |
| 2742 | int repeats = 0; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2743 | |
| 2744 | spin_lock(&log->l_icloglock); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2745 | do { |
| 2746 | /* |
| 2747 | * Scan all iclogs starting with the one pointed to by the |
| 2748 | * log. Reset this starting point each time the log is |
| 2749 | * unlocked (during callbacks). |
| 2750 | * |
| 2751 | * Keep looping through iclogs until one full pass is made |
| 2752 | * without running any callbacks. |
| 2753 | */ |
| 2754 | first_iclog = log->l_iclog; |
| 2755 | iclog = log->l_iclog; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2756 | cycled_icloglock = false; |
| 2757 | ioerror = false; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2758 | repeats++; |
| 2759 | |
| 2760 | do { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2761 | if (xlog_state_iodone_process_iclog(log, iclog, |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2762 | &ioerror)) |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2763 | break; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2764 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2765 | if (iclog->ic_state != XLOG_STATE_CALLBACK && |
| 2766 | iclog->ic_state != XLOG_STATE_IOERROR) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2767 | iclog = iclog->ic_next; |
| 2768 | continue; |
| 2769 | } |
| 2770 | |
| 2771 | /* |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2772 | * Running callbacks will drop the icloglock which means |
| 2773 | * we'll have to run at least one more complete loop. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2774 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2775 | cycled_icloglock = true; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2776 | xlog_state_do_iclog_callbacks(log, iclog); |
| 2777 | if (XLOG_FORCED_SHUTDOWN(log)) |
| 2778 | wake_up_all(&iclog->ic_force_wait); |
| 2779 | else |
| 2780 | xlog_state_clean_iclog(log, iclog); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2781 | iclog = iclog->ic_next; |
| 2782 | } while (first_iclog != iclog); |
| 2783 | |
| 2784 | if (repeats > 5000) { |
| 2785 | flushcnt += repeats; |
| 2786 | repeats = 0; |
| 2787 | xfs_warn(log->l_mp, |
| 2788 | "%s: possible infinite loop (%d iterations)", |
| 2789 | __func__, flushcnt); |
| 2790 | } |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2791 | } while (!ioerror && cycled_icloglock); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2792 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2793 | if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || |
| 2794 | log->l_iclog->ic_state == XLOG_STATE_IOERROR) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2795 | wake_up_all(&log->l_flush_wait); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2796 | |
| 2797 | spin_unlock(&log->l_icloglock); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2798 | } |
| 2799 | |
| 2800 | |
| 2801 | /* |
| 2802 | * Finish transitioning this iclog to the dirty state. |
| 2803 | * |
| 2804 | * Make sure that we completely execute this routine only when this is |
| 2805 | * the last call to the iclog. There is a good chance that iclog flushes, |
| 2806 | * when we reach the end of the physical log, get turned into 2 separate |
| 2807 | * calls to bwrite. Hence, one iclog flush could generate two calls to this |
| 2808 | * routine. By using the reference count bwritecnt, we guarantee that only |
| 2809 | * the second completion goes through. |
| 2810 | * |
| 2811 | * Callbacks could take time, so they are done outside the scope of the |
| 2812 | * global state machine log lock. |
| 2813 | */ |
| 2814 | STATIC void |
| 2815 | xlog_state_done_syncing( |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2816 | struct xlog_in_core *iclog) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2817 | { |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 2818 | struct xlog *log = iclog->ic_log; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2819 | |
| 2820 | spin_lock(&log->l_icloglock); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2821 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2822 | |
| 2823 | /* |
| 2824 | * If we got an error, either on the first buffer, or in the case of |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2825 | * split log writes, on the second, we shut down the file system and |
| 2826 | * no further attempts will be made to write iclogs to disk. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2827 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2828 | if (!XLOG_FORCED_SHUTDOWN(log)) { |
| 2829 | ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2830 | iclog->ic_state = XLOG_STATE_DONE_SYNC; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2831 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2832 | |
| 2833 | /* |
| 2834 | * Someone could be sleeping prior to writing out the next |
| 2835 | * iclog buffer; we wake them all. One will get to do the |
| 2836 | * I/O, and the others will wait for the result. |
| 2837 | */ |
| 2838 | wake_up_all(&iclog->ic_write_wait); |
| 2839 | spin_unlock(&log->l_icloglock); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2840 | xlog_state_do_callback(log); |
| 2841 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2842 | |
| 2843 | /* |
| 2844 | * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must |
| 2845 | * sleep. We wait on the flush queue on the head iclog as that should be |
| 2846 | * the first iclog to complete flushing. Hence if all iclogs are syncing, |
| 2847 | * we will wait here and all new writes will sleep until a sync completes. |
| 2848 | * |
| 2849 | * The in-core logs are used in a circular fashion. They are not used |
| 2850 | * out-of-order even when an iclog past the head is free. |
| 2851 | * |
| 2852 | * return: |
| 2853 | * * log_offset where xlog_write() can start writing into the in-core |
| 2854 | * log's data space. |
| 2855 | * * in-core log pointer to which xlog_write() should write. |
| 2856 | * * boolean indicating this is a continued write to an in-core log. |
| 2857 | * If this is the last write, then the in-core log's offset field |
| 2858 | * needs to be incremented, depending on the amount of data which |
| 2859 | * is copied. |
| 2860 | */ |
| 2861 | STATIC int |
| 2862 | xlog_state_get_iclog_space( |
| 2863 | struct xlog *log, |
| 2864 | int len, |
| 2865 | struct xlog_in_core **iclogp, |
| 2866 | struct xlog_ticket *ticket, |
| 2867 | int *continued_write, |
| 2868 | int *logoffsetp) |
| 2869 | { |
| 2870 | int log_offset; |
| 2871 | xlog_rec_header_t *head; |
| 2872 | xlog_in_core_t *iclog; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2873 | |
| 2874 | restart: |
| 2875 | spin_lock(&log->l_icloglock); |
| 2876 | if (XLOG_FORCED_SHUTDOWN(log)) { |
| 2877 | spin_unlock(&log->l_icloglock); |
| 2878 | return -EIO; |
| 2879 | } |
| 2880 | |
| 2881 | iclog = log->l_iclog; |
| 2882 | if (iclog->ic_state != XLOG_STATE_ACTIVE) { |
| 2883 | XFS_STATS_INC(log->l_mp, xs_log_noiclogs); |
| 2884 | |
| 2885 | /* Wait for log writes to have flushed */ |
| 2886 | xlog_wait(&log->l_flush_wait, &log->l_icloglock); |
| 2887 | goto restart; |
| 2888 | } |
| 2889 | |
| 2890 | head = &iclog->ic_header; |
| 2891 | |
| 2892 | atomic_inc(&iclog->ic_refcnt); /* prevents sync */ |
| 2893 | log_offset = iclog->ic_offset; |
| 2894 | |
| 2895 | /* On the 1st write to an iclog, figure out lsn. This works |
| 2896 | * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are |
| 2897 | * committing to. If the offset is set, that's how many blocks |
| 2898 | * must be written. |
| 2899 | */ |
| 2900 | if (log_offset == 0) { |
| 2901 | ticket->t_curr_res -= log->l_iclog_hsize; |
| 2902 | xlog_tic_add_region(ticket, |
| 2903 | log->l_iclog_hsize, |
| 2904 | XLOG_REG_TYPE_LRHEADER); |
| 2905 | head->h_cycle = cpu_to_be32(log->l_curr_cycle); |
| 2906 | head->h_lsn = cpu_to_be64( |
| 2907 | xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); |
| 2908 | ASSERT(log->l_curr_block >= 0); |
| 2909 | } |
| 2910 | |
| 2911 | /* If there is enough room to write everything, then do it. Otherwise, |
| 2912 | * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC |
| 2913 | * bit is on, so this will get flushed out. Don't update ic_offset |
| 2914 | * until you know exactly how many bytes get copied. Therefore, wait |
| 2915 | * until later to update ic_offset. |
| 2916 | * |
| 2917 | * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's |
| 2918 | * can fit into remaining data section. |
| 2919 | */ |
| 2920 | if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2921 | int error = 0; |
| 2922 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2923 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); |
| 2924 | |
| 2925 | /* |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2926 | * If we are the only one writing to this iclog, sync it to |
| 2927 | * disk. We need to do an atomic compare and decrement here to |
| 2928 | * avoid racing with concurrent atomic_dec_and_lock() calls in |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2929 | * xlog_state_release_iclog() when there is more than one |
| 2930 | * reference to the iclog. |
| 2931 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2932 | if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2933 | error = xlog_state_release_iclog(log, iclog); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2934 | spin_unlock(&log->l_icloglock); |
| 2935 | if (error) |
| 2936 | return error; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2937 | goto restart; |
| 2938 | } |
| 2939 | |
| 2940 | /* Do we have enough room to write the full amount in the remainder |
| 2941 | * of this iclog? Or must we continue a write on the next iclog and |
| 2942 | * mark this iclog as completely taken? In the case where we switch |
| 2943 | * iclogs (to mark it taken), this particular iclog will release/sync |
| 2944 | * to disk in xlog_write(). |
| 2945 | */ |
| 2946 | if (len <= iclog->ic_size - iclog->ic_offset) { |
| 2947 | *continued_write = 0; |
| 2948 | iclog->ic_offset += len; |
| 2949 | } else { |
| 2950 | *continued_write = 1; |
| 2951 | xlog_state_switch_iclogs(log, iclog, iclog->ic_size); |
| 2952 | } |
| 2953 | *iclogp = iclog; |
| 2954 | |
| 2955 | ASSERT(iclog->ic_offset <= iclog->ic_size); |
| 2956 | spin_unlock(&log->l_icloglock); |
| 2957 | |
| 2958 | *logoffsetp = log_offset; |
| 2959 | return 0; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2960 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2961 | |
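| | /* |
| | * Example of the continued-write decision above, with made-up sizes: |
| | * for a 32KB iclog (ic_size = 32768) already holding ic_offset = 30000 |
| | * bytes, a 2000-byte write fits, so *continued_write = 0 and ic_offset |
| | * becomes 32000. A 5000-byte write does not fit: the iclog is switched |
| | * to WANT_SYNC, *continued_write = 1, ic_offset is left alone, and |
| | * xlog_write() places what fits here and the remainder in the next |
| | * iclog it obtains. |
| | */ |
| | |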
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2962 | /* |
| 2963 | * The first cnt-1 times a ticket goes through here we don't need to move the |
| 2964 | * grant write head because the permanent reservation has reserved cnt times the |
| 2965 | * unit amount. Release part of current permanent unit reservation and reset |
| 2966 | * current reservation to be one unit's worth. Also move the grant reservation head |
| 2967 | * forward. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2968 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2969 | void |
| 2970 | xfs_log_ticket_regrant( |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2971 | struct xlog *log, |
| 2972 | struct xlog_ticket *ticket) |
| 2973 | { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2974 | trace_xfs_log_ticket_regrant(log, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2975 | |
| 2976 | if (ticket->t_cnt > 0) |
| 2977 | ticket->t_cnt--; |
| 2978 | |
| 2979 | xlog_grant_sub_space(log, &log->l_reserve_head.grant, |
| 2980 | ticket->t_curr_res); |
| 2981 | xlog_grant_sub_space(log, &log->l_write_head.grant, |
| 2982 | ticket->t_curr_res); |
| 2983 | ticket->t_curr_res = ticket->t_unit_res; |
| 2984 | xlog_tic_reset_res(ticket); |
| 2985 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2986 | trace_xfs_log_ticket_regrant_sub(log, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2987 | |
| 2988 | /* re-grant a full unit if we have used up all of the pre-reserved space */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2989 | if (!ticket->t_cnt) { |
| 2990 | xlog_grant_add_space(log, &log->l_reserve_head.grant, |
| 2991 | ticket->t_unit_res); |
| 2992 | trace_xfs_log_ticket_regrant_exit(log, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2993 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2994 | ticket->t_curr_res = ticket->t_unit_res; |
| 2995 | xlog_tic_reset_res(ticket); |
| 2996 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 2997 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 2998 | xfs_log_ticket_put(ticket); |
| 2999 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3000 | |
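| | /* |
| | * Regrant example with illustrative sizes: a permanent ticket with |
| | * t_unit_res = 100KB and t_cnt = 3 was granted 300KB up front. If the |
| | * first transaction commits having consumed 60KB, the code above hands |
| | * the unused 40KB back to both grant heads, drops t_cnt to 2, and |
| | * resets t_curr_res to a full unit, which the remaining pre-reserved |
| | * count already covers. Once t_cnt reaches zero, each further commit |
| | * must add a whole 100KB unit back to the reserve head, because no |
| | * pre-reserved space remains. |
| | */ |
| | |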
| 3001 | /* |
| 3002 | * Give back the space left from a reservation. |
| 3003 | * |
| 3004 | * All the information we need to make a correct determination of space left |
| 3005 | * is present. For non-permanent reservations, things are quite easy. The |
| 3006 | * count should have been decremented to zero. We only need to deal with the |
| 3007 | * space remaining in the current reservation part of the ticket. If the |
| 3008 | * ticket contains a permanent reservation, there may be left over space which |
| 3009 | * needs to be released. A count of N means that N-1 refills of the current |
| 3010 | * reservation can be done before we need to ask for more space. The first |
| 3011 | * one goes to fill up the first current reservation. Once we run out of |
| 3012 | * space, the count will stay at zero and the only space remaining will be |
| 3013 | * in the current reservation field. |
| 3014 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3015 | void |
| 3016 | xfs_log_ticket_ungrant( |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3017 | struct xlog *log, |
| 3018 | struct xlog_ticket *ticket) |
| 3019 | { |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3020 | int bytes; |
| 3021 | |
| 3022 | trace_xfs_log_ticket_ungrant(log, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3023 | |
| 3024 | if (ticket->t_cnt > 0) |
| 3025 | ticket->t_cnt--; |
| 3026 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3027 | trace_xfs_log_ticket_ungrant_sub(log, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3028 | |
| 3029 | /* |
| 3030 | * If this is a permanent reservation ticket, we may be able to free |
| 3031 | * up more space based on the remaining count. |
| 3032 | */ |
| 3033 | bytes = ticket->t_curr_res; |
| 3034 | if (ticket->t_cnt > 0) { |
| 3035 | ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); |
| 3036 | bytes += ticket->t_unit_res*ticket->t_cnt; |
| 3037 | } |
| 3038 | |
| 3039 | xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); |
| 3040 | xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); |
| 3041 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3042 | trace_xfs_log_ticket_ungrant_exit(log, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3043 | |
| 3044 | xfs_log_space_wake(log->l_mp); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3045 | xfs_log_ticket_put(ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3046 | } |
| 3047 | |
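| | /* |
| | * Ungrant arithmetic, continuing the hypothetical 100KB ticket: if the |
| | * ticket is torn down with t_curr_res = 70KB unused and t_cnt = 2 after |
| | * the decrement above, then |
| | * |
| | *         bytes = 70KB + 2 * 100KB = 270KB |
| | * |
| | * is returned to both grant heads: the unused tail of the current |
| | * reservation plus every untouched refill. xfs_log_space_wake() then |
| | * lets reservation waiters retry against the freed space. |
| | */ |
| | |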
| 3048 | /* |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3049 | * This routine will mark the current iclog in the ring as WANT_SYNC and move |
| 3050 | * the current iclog pointer to the next iclog in the ring. |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3051 | */ |
| 3052 | STATIC void |
| 3053 | xlog_state_switch_iclogs( |
| 3054 | struct xlog *log, |
| 3055 | struct xlog_in_core *iclog, |
| 3056 | int eventual_size) |
| 3057 | { |
| 3058 | ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3059 | assert_spin_locked(&log->l_icloglock); |
| 3060 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3061 | if (!eventual_size) |
| 3062 | eventual_size = iclog->ic_offset; |
| 3063 | iclog->ic_state = XLOG_STATE_WANT_SYNC; |
| 3064 | iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); |
| 3065 | log->l_prev_block = log->l_curr_block; |
| 3066 | log->l_prev_cycle = log->l_curr_cycle; |
| 3067 | |
| 3068 | /* roll log?: ic_offset changed later */ |
| 3069 | log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); |
| 3070 | |
| 3071 | /* Round up to next log-sunit */ |
| 3072 | if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && |
| 3073 | log->l_mp->m_sb.sb_logsunit > 1) { |
| 3074 | uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); |
| 3075 | log->l_curr_block = roundup(log->l_curr_block, sunit_bb); |
| 3076 | } |
| 3077 | |
| 3078 | if (log->l_curr_block >= log->l_logBBsize) { |
| 3079 | /* |
| 3080 | * Rewind the current block before the cycle is bumped to make |
| 3081 | * sure that the combined LSN never transiently moves forward |
| 3082 | * when the log wraps to the next cycle. This is to support the |
| 3083 | * unlocked sample of these fields from xlog_valid_lsn(). Most |
| 3084 | * other cases should acquire l_icloglock. |
| 3085 | */ |
| 3086 | log->l_curr_block -= log->l_logBBsize; |
| 3087 | ASSERT(log->l_curr_block >= 0); |
| 3088 | smp_wmb(); |
| 3089 | log->l_curr_cycle++; |
| 3090 | if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) |
| 3091 | log->l_curr_cycle++; |
| 3092 | } |
| 3093 | ASSERT(iclog == log->l_iclog); |
| 3094 | log->l_iclog = iclog->ic_next; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3095 | } |
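| | |
| | /* |
| | * Example of the block accounting above (illustrative geometry): with a |
| | * 32KB iclog (BTOBB(32768) = 64 basic blocks), a one-block header and a |
| | * 16KB log stripe unit (32 basic blocks), switching a full iclog at |
| | * l_curr_block = 100 yields 100 + 64 + 1 = 165, rounded up to the next |
| | * stripe boundary: 192. The 27-block hole is the stripe-unit roundoff |
| | * that xfs_log_calc_unit_res() charges to every ticket up front. |
| | */ |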
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3096 | |
| 3097 | /* |
| 3098 | * Write out all data in the in-core log as of this exact moment in time. |
| 3099 | * |
| 3100 | * Data may be written to the in-core log during this call. However, |
| 3101 | * we don't guarantee this data will be written out. A change from past |
| 3102 | * implementation means this routine will *not* write out zero length LRs. |
| 3103 | * |
| 3104 | * Basically, we try and perform an intelligent scan of the in-core logs. |
| 3105 | * If we determine there is no flushable data, we just return. There is no |
| 3106 | * flushable data if: |
| 3107 | * |
| 3108 | * 1. the current iclog is active and has no data; the previous iclog |
| 3109 | * is in the active or dirty state. |
| 3110 | * 2. the current iclog is dirty, and the previous iclog is in the |
| 3111 | * active or dirty state. |
| 3112 | * |
| 3113 | * We may sleep if: |
| 3114 | * |
| 3115 | * 1. the current iclog is not in the active or dirty state. |
| 3116 | * 2. the current iclog is dirty, and the previous iclog is not in the |
| 3117 | * active or dirty state. |
| 3118 | * 3. the current iclog is active, and there is another thread writing |
| 3119 | * to this particular iclog. |
| 3120 | * 4. a) the current iclog is active and has no other writers |
| 3121 | * b) when we return from flushing out this iclog, it is still |
| 3122 | * not in the active or dirty state. |
| 3123 | */ |
| 3124 | int |
| 3125 | xfs_log_force( |
| 3126 | struct xfs_mount *mp, |
| 3127 | uint flags) |
| 3128 | { |
| 3129 | struct xlog *log = mp->m_log; |
| 3130 | struct xlog_in_core *iclog; |
| 3131 | xfs_lsn_t lsn; |
| 3132 | |
| 3133 | XFS_STATS_INC(mp, xs_log_force); |
| 3134 | trace_xfs_log_force(mp, 0, _RET_IP_); |
| 3135 | |
| 3136 | xlog_cil_force(log); |
| 3137 | |
| 3138 | spin_lock(&log->l_icloglock); |
| 3139 | iclog = log->l_iclog; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3140 | if (iclog->ic_state == XLOG_STATE_IOERROR) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3141 | goto out_error; |
| 3142 | |
| 3143 | if (iclog->ic_state == XLOG_STATE_DIRTY || |
| 3144 | (iclog->ic_state == XLOG_STATE_ACTIVE && |
| 3145 | atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { |
| 3146 | /* |
| 3147 | * If the head is dirty or (active and empty), then we need to |
| 3148 | * look at the previous iclog. |
| 3149 | * |
| 3150 | * If the previous iclog is active or dirty we are done. There |
| 3151 | * is nothing to sync out. Otherwise, we attach ourselves to the |
| 3152 | * previous iclog and go to sleep. |
| 3153 | */ |
| 3154 | iclog = iclog->ic_prev; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3155 | } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { |
| 3156 | if (atomic_read(&iclog->ic_refcnt) == 0) { |
| 3157 | /* |
| 3158 | * We are the only one with access to this iclog. |
| 3159 | * |
| 3160 | * Flush it out now. There should be a roundoff of zero |
| 3161 | * to show that someone has already taken care of the |
| 3162 | * roundoff from the previous sync. |
| 3163 | */ |
| 3164 | atomic_inc(&iclog->ic_refcnt); |
| 3165 | lsn = be64_to_cpu(iclog->ic_header.h_lsn); |
| 3166 | xlog_state_switch_iclogs(log, iclog, 0); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3167 | if (xlog_state_release_iclog(log, iclog)) |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3168 | goto out_error; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3169 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3170 | if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3171 | goto out_unlock; |
| 3172 | } else { |
| 3173 | /* |
| 3174 | * Someone else is writing to this iclog. |
| 3175 | * |
| 3176 | * Use its call to flush out the data. However, the |
| 3177 | * other thread may not force out this LR, so we mark |
| 3178 | * it WANT_SYNC. |
| 3179 | */ |
| 3180 | xlog_state_switch_iclogs(log, iclog, 0); |
| 3181 | } |
| 3182 | } else { |
| 3183 | /* |
| 3184 | * If the head iclog is not active nor dirty, we just attach |
| 3185 | * ourselves to the head and go to sleep if necessary. |
| 3186 | */ |
| 3187 | ; |
| 3188 | } |
| 3189 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3190 | if (flags & XFS_LOG_SYNC) |
| 3191 | return xlog_wait_on_iclog(iclog); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3192 | out_unlock: |
| 3193 | spin_unlock(&log->l_icloglock); |
| 3194 | return 0; |
| 3195 | out_error: |
| 3196 | spin_unlock(&log->l_icloglock); |
| 3197 | return -EIO; |
| 3198 | } |
| 3199 | |
| 3200 | static int |
| 3201 | __xfs_log_force_lsn( |
| 3202 | struct xfs_mount *mp, |
| 3203 | xfs_lsn_t lsn, |
| 3204 | uint flags, |
| 3205 | int *log_flushed, |
| 3206 | bool already_slept) |
| 3207 | { |
| 3208 | struct xlog *log = mp->m_log; |
| 3209 | struct xlog_in_core *iclog; |
| 3210 | |
| 3211 | spin_lock(&log->l_icloglock); |
| 3212 | iclog = log->l_iclog; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3213 | if (iclog->ic_state == XLOG_STATE_IOERROR) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3214 | goto out_error; |
| 3215 | |
| 3216 | while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { |
| 3217 | iclog = iclog->ic_next; |
| 3218 | if (iclog == log->l_iclog) |
| 3219 | goto out_unlock; |
| 3220 | } |
| 3221 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3222 | if (iclog->ic_state == XLOG_STATE_ACTIVE) { |
| 3223 | /* |
| 3224 | * We sleep here if we haven't already slept (e.g. this is the |
| 3225 | * first time we've looked at the correct iclog buf) and the |
| 3226 | * buffer before us is going to be sync'ed. The reason for this |
| 3227 | * is that if we are doing sync transactions here, by waiting |
| 3228 | * for the previous I/O to complete, we can allow a few more |
| 3229 | * transactions into this iclog before we close it down. |
| 3230 | * |
| 3231 | * Otherwise, we mark the buffer WANT_SYNC, and bump up the |
| 3232 | * refcnt so we can release the log (which drops the ref count). |
| 3233 | * The state switch keeps new transaction commits from using |
| 3234 | * this buffer. When the current commits finish writing into |
| 3235 | * the buffer, the refcount will drop to zero and the buffer |
| 3236 | * will go out then. |
| 3237 | */ |
| 3238 | if (!already_slept && |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3239 | (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || |
| 3240 | iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3241 | XFS_STATS_INC(mp, xs_log_force_sleep); |
| 3242 | |
| 3243 | xlog_wait(&iclog->ic_prev->ic_write_wait, |
| 3244 | &log->l_icloglock); |
| 3245 | return -EAGAIN; |
| 3246 | } |
| 3247 | atomic_inc(&iclog->ic_refcnt); |
| 3248 | xlog_state_switch_iclogs(log, iclog, 0); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3249 | if (xlog_state_release_iclog(log, iclog)) |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3250 | goto out_error; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3251 | if (log_flushed) |
| 3252 | *log_flushed = 1; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3253 | } |
| 3254 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3255 | if (flags & XFS_LOG_SYNC) |
| 3256 | return xlog_wait_on_iclog(iclog); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3257 | out_unlock: |
| 3258 | spin_unlock(&log->l_icloglock); |
| 3259 | return 0; |
| 3260 | out_error: |
| 3261 | spin_unlock(&log->l_icloglock); |
| 3262 | return -EIO; |
| 3263 | } |
| 3264 | |
| 3265 | /* |
| 3266 | * Force the in-core log to disk for a specific LSN. |
| 3267 | * |
| 3268 | * Find in-core log with lsn. |
| 3269 | * If it is in the DIRTY state, just return. |
| 3270 | * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC |
| 3271 | * state and go to sleep or return. |
| 3272 | * If it is in any other state, go to sleep or return. |
| 3273 | * |
| 3274 | * Synchronous forces are implemented with a wait queue. All callers trying |
| 3275 | * to force a given lsn to disk must wait on the queue attached to the |
| 3276 | * specific in-core log. When the given in-core log finally completes its write |
| 3277 | * to disk, that thread will wake up all threads waiting on the queue. |
| 3278 | */ |
| 3279 | int |
| 3280 | xfs_log_force_lsn( |
| 3281 | struct xfs_mount *mp, |
| 3282 | xfs_lsn_t lsn, |
| 3283 | uint flags, |
| 3284 | int *log_flushed) |
| 3285 | { |
| 3286 | int ret; |
| 3287 | ASSERT(lsn != 0); |
| 3288 | |
| 3289 | XFS_STATS_INC(mp, xs_log_force); |
| 3290 | trace_xfs_log_force(mp, lsn, _RET_IP_); |
| 3291 | |
| 3292 | lsn = xlog_cil_force_lsn(mp->m_log, lsn); |
| 3293 | if (lsn == NULLCOMMITLSN) |
| 3294 | return 0; |
| 3295 | |
| 3296 | ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); |
| 3297 | if (ret == -EAGAIN) |
| 3298 | ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); |
| 3299 | return ret; |
| 3300 | } |
| 3301 | |
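| | /* |
| | * The two calls above implement the "already_slept" protocol: the first |
| | * pass may sleep waiting on the previous iclog's write and then returns |
| | * -EAGAIN, because the iclog ring can change while we sleep. The retry |
| | * with already_slept = true pushes the target iclog out without |
| | * sleeping on its predecessor again. |
| | */ |
| | |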
| 3302 | /* |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3303 | * Free a used ticket when its refcount falls to zero. |
| 3304 | */ |
| 3305 | void |
| 3306 | xfs_log_ticket_put( |
| 3307 | xlog_ticket_t *ticket) |
| 3308 | { |
| 3309 | ASSERT(atomic_read(&ticket->t_ref) > 0); |
| 3310 | if (atomic_dec_and_test(&ticket->t_ref)) |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3311 | kmem_cache_free(xfs_log_ticket_zone, ticket); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3312 | } |
| 3313 | |
| 3314 | xlog_ticket_t * |
| 3315 | xfs_log_ticket_get( |
| 3316 | xlog_ticket_t *ticket) |
| 3317 | { |
| 3318 | ASSERT(atomic_read(&ticket->t_ref) > 0); |
| 3319 | atomic_inc(&ticket->t_ref); |
| 3320 | return ticket; |
| 3321 | } |
| 3322 | |
| 3323 | /* |
| 3324 | * Figure out the total log space unit (in bytes) that would be |
| 3325 | * required for a log ticket. |
| 3326 | */ |
| 3327 | int |
| 3328 | xfs_log_calc_unit_res( |
| 3329 | struct xfs_mount *mp, |
| 3330 | int unit_bytes) |
| 3331 | { |
| 3332 | struct xlog *log = mp->m_log; |
| 3333 | int iclog_space; |
| 3334 | uint num_headers; |
| 3335 | |
| 3336 | /* |
| 3337 | * Permanent reservations have up to 'cnt'-1 active log operations |
| 3338 | * in the log. A unit in this case is the amount of space for one |
| 3339 | * of these log operations. Normal reservations have a cnt of 1 |
| 3340 | * and their unit amount is the total amount of space required. |
| 3341 | * |
| 3342 | * The following lines of code account for non-transaction data |
| 3343 | * which occupy space in the on-disk log. |
| 3344 | * |
| 3345 | * Normal form of a transaction is: |
| 3346 | * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> |
| 3347 | * and then there are LR hdrs, split-recs and roundoff at end of syncs. |
| 3348 | * |
| 3349 | * We need to account for all the leadup data and trailer data |
| 3350 | * around the transaction data. |
| 3351 | * And then we need to account for the worst case in terms of using |
| 3352 | * more space. |
| 3353 | * The worst case will happen if: |
| 3354 | * - the placement of the transaction happens to be such that the |
| 3355 | * roundoff is at its maximum |
| 3356 | * - the transaction data is synced before the commit record is synced |
| 3357 | * i.e. <transaction-data><roundoff> | <commit-rec><roundoff> |
| 3358 | * Therefore the commit record is in its own Log Record. |
| 3359 | * This can happen as the commit record is called with its |
| 3360 | * own region to xlog_write(). |
| 3361 | * This then means that in the worst case, roundoff can happen for |
| 3362 | * the commit-rec as well. |
| 3363 | * The commit-rec is smaller than padding in this scenario and so it is |
| 3364 | * not added separately. |
| 3365 | */ |
| 3366 | |
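| | /* |
| | * Rough worked example of the accounting below, with illustrative |
| | * geometry (32KB iclogs, 512-byte header, so iclog_space = 32256): a |
| | * 60KB transaction reservation picks up ~40 bytes of op/trans/start |
| | * headers, howmany(60KB, 32256) = 2 LR headers plus matching split-rec |
| | * op headers, one more LR header for the commit record, and two |
| | * stripe-unit (or two BBSIZE) roundoffs on top. |
| | */ |
| | |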
| 3367 | /* for trans header */ |
| 3368 | unit_bytes += sizeof(xlog_op_header_t); |
| 3369 | unit_bytes += sizeof(xfs_trans_header_t); |
| 3370 | |
| 3371 | /* for start-rec */ |
| 3372 | unit_bytes += sizeof(xlog_op_header_t); |
| 3373 | |
| 3374 | /* |
| 3375 | * for LR headers - the space for data in an iclog is the size minus |
| 3376 | * the space used for the headers. If we use the iclog size, then we |
| 3377 | * undercalculate the number of headers required. |
| 3378 | * |
| 3379 | * Furthermore - the addition of op headers for split-recs might |
| 3380 | * increase the space required enough to require more log and op |
| 3381 | * headers, so take that into account too. |
| 3382 | * |
| 3383 | * IMPORTANT: This reservation makes the assumption that if this |
| 3384 | * transaction is the first in an iclog and hence has the LR headers |
| 3385 | * accounted to it, then the remaining space in the iclog is |
| 3386 | * exclusively for this transaction. i.e. if the transaction is larger |
| 3387 | * than the iclog, it will be the only thing in that iclog. |
| 3388 | * Fundamentally, this means we must pass the entire log vector to |
| 3389 | * xlog_write to guarantee this. |
| 3390 | */ |
| 3391 | iclog_space = log->l_iclog_size - log->l_iclog_hsize; |
| 3392 | num_headers = howmany(unit_bytes, iclog_space); |
| 3393 | |
| 3394 | /* for split-recs - ophdrs added when data split over LRs */ |
| 3395 | unit_bytes += sizeof(xlog_op_header_t) * num_headers; |
| 3396 | |
| 3397 | /* add extra header reservations if we overrun */ |
| 3398 | while (!num_headers || |
| 3399 | howmany(unit_bytes, iclog_space) > num_headers) { |
| 3400 | unit_bytes += sizeof(xlog_op_header_t); |
| 3401 | num_headers++; |
| 3402 | } |
| 3403 | unit_bytes += log->l_iclog_hsize * num_headers; |
| 3404 | |
| 3405 | /* for commit-rec LR header - note: padding will subsume the ophdr */ |
| 3406 | unit_bytes += log->l_iclog_hsize; |
| 3407 | |
| 3408 | /* for roundoff padding for transaction data and one for commit record */ |
| 3409 | if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { |
| 3410 | /* log su roundoff */ |
| 3411 | unit_bytes += 2 * mp->m_sb.sb_logsunit; |
| 3412 | } else { |
| 3413 | /* BB roundoff */ |
| 3414 | unit_bytes += 2 * BBSIZE; |
| 3415 | } |
| 3416 | |
| 3417 | return unit_bytes; |
| 3418 | } |
| 3419 | |
| 3420 | /* |
| 3421 | * Allocate and initialise a new log ticket. |
| 3422 | */ |
| 3423 | struct xlog_ticket * |
| 3424 | xlog_ticket_alloc( |
| 3425 | struct xlog *log, |
| 3426 | int unit_bytes, |
| 3427 | int cnt, |
| 3428 | char client, |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3429 | bool permanent) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3430 | { |
| 3431 | struct xlog_ticket *tic; |
| 3432 | int unit_res; |
| 3433 | |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3434 | tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3435 | |
| 3436 | unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); |
| 3437 | |
| 3438 | atomic_set(&tic->t_ref, 1); |
| 3439 | tic->t_task = current; |
| 3440 | INIT_LIST_HEAD(&tic->t_queue); |
| 3441 | tic->t_unit_res = unit_res; |
| 3442 | tic->t_curr_res = unit_res; |
| 3443 | tic->t_cnt = cnt; |
| 3444 | tic->t_ocnt = cnt; |
| 3445 | tic->t_tid = prandom_u32(); |
| 3446 | tic->t_clientid = client; |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3447 | if (permanent) |
| 3448 | tic->t_flags |= XLOG_TIC_PERM_RESERV; |
| 3449 | |
| 3450 | xlog_tic_reset_res(tic); |
| 3451 | |
| 3452 | return tic; |
| 3453 | } |
| 3454 | |
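| | /* |
| | * Sketch of a typical caller (parameter values are illustrative only; |
| | * the real call sites are in xfs_log_reserve() and the CIL code): |
| | * |
| | * tic = xlog_ticket_alloc(log, unit_bytes, cnt, XFS_TRANSACTION, true); |
| | * |
| | * This allocates a permanent-reservation ticket that may be regranted |
| | * cnt - 1 times before more space has to be reserved. |
| | */ |
| | |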
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3455 | #if defined(DEBUG) |
| 3456 | /* |
| 3457 | * Make sure that the destination ptr is within the valid data region of |
| 3458 | * one of the iclogs. This uses backup pointers stored in a different |
| 3459 | * part of the log in case we trash the log structure. |
| 3460 | */ |
| 3461 | STATIC void |
| 3462 | xlog_verify_dest_ptr( |
| 3463 | struct xlog *log, |
| 3464 | void *ptr) |
| 3465 | { |
| 3466 | int i; |
| 3467 | int good_ptr = 0; |
| 3468 | |
| 3469 | for (i = 0; i < log->l_iclog_bufs; i++) { |
| 3470 | if (ptr >= log->l_iclog_bak[i] && |
| 3471 | ptr <= log->l_iclog_bak[i] + log->l_iclog_size) |
| 3472 | good_ptr++; |
| 3473 | } |
| 3474 | |
| 3475 | if (!good_ptr) |
| 3476 | xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); |
| 3477 | } |
| 3478 | |
| 3479 | /* |
| 3480 | * Check to make sure the grant write head didn't just overlap the tail. If |
| 3481 | * the cycles are the same, we can't be overlapping. Otherwise, make sure that |
| 3482 | * the cycles differ by exactly one and check the byte count. |
| 3483 | * |
| 3484 | * This check is run unlocked, so can give false positives. Rather than assert |
| 3485 | * on failures, use a warn-once flag and a panic tag to allow the admin to |
| 3486 | * determine if they want to panic the machine when such an error occurs. For |
| 3487 | * debug kernels this will have the same effect as using an assert but, unlike |
| 3488 | * an assert, it can be turned off at runtime. |
| 3489 | */ |
| 3490 | STATIC void |
| 3491 | xlog_verify_grant_tail( |
| 3492 | struct xlog *log) |
| 3493 | { |
| 3494 | int tail_cycle, tail_blocks; |
| 3495 | int cycle, space; |
| 3496 | |
| 3497 | xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); |
| 3498 | xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); |
| 3499 | if (tail_cycle != cycle) { |
| 3500 | if (cycle - 1 != tail_cycle && |
| 3501 | !(log->l_flags & XLOG_TAIL_WARN)) { |
| 3502 | xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, |
| 3503 | "%s: cycle - 1 != tail_cycle", __func__); |
| 3504 | log->l_flags |= XLOG_TAIL_WARN; |
| 3505 | } |
| 3506 | |
| 3507 | if (space > BBTOB(tail_blocks) && |
| 3508 | !(log->l_flags & XLOG_TAIL_WARN)) { |
| 3509 | xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, |
| 3510 | "%s: space > BBTOB(tail_blocks)", __func__); |
| 3511 | log->l_flags |= XLOG_TAIL_WARN; |
| 3512 | } |
| 3513 | } |
| 3514 | } |
| 3515 | |
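| | /* |
| | * Example of the overlap check above, with made-up values: tail at |
| | * cycle 5, block 1000, write grant head cracked to cycle 6. The head is |
| | * exactly one cycle ahead (cycle - 1 == tail_cycle), so its byte offset |
| | * must not pass the tail: space must stay <= BBTOB(1000) = 512000 |
| | * bytes, otherwise the head has wrapped onto the unflushed tail and the |
| | * XFS_PTAG_LOGRES alert fires (once). |
| | */ |
| | |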
| 3516 | /* check if it will fit */ |
| 3517 | STATIC void |
| 3518 | xlog_verify_tail_lsn( |
| 3519 | struct xlog *log, |
| 3520 | struct xlog_in_core *iclog, |
| 3521 | xfs_lsn_t tail_lsn) |
| 3522 | { |
| 3523 | int blocks; |
| 3524 | |
| 3525 | if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { |
| 3526 | blocks = |
| 3527 | log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); |
| 3528 | if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) |
| 3529 | xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); |
| 3530 | } else { |
| 3531 | ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); |
| 3532 | |
| 3533 | if (BLOCK_LSN(tail_lsn) == log->l_prev_block) |
| 3534 | xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); |
| 3535 | |
| 3536 | blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; |
| 3537 | if (blocks < BTOBB(iclog->ic_offset) + 1) |
| 3538 | xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); |
| 3539 | } |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3540 | } |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3541 | |
| 3542 | /* |
| 3543 | * Perform a number of checks on the iclog before writing to disk. |
| 3544 | * |
| 3545 | * 1. Make sure the iclogs are still circular |
| 3546 | * 2. Make sure we have a good magic number |
| 3547 | * 3. Make sure we don't have magic numbers in the data |
| 3548 | * 4. Check fields of each log operation header for: |
| 3549 | * A. Valid client identifier |
| 3550 | * B. tid ptr value falls in valid ptr space (user space code) |
| 3551 | * C. Length in log record header is correct according to the |
| 3552 | * individual operation headers within record. |
| 3553 | * 5. When a bwrite will occur within 5 blocks of the front of the physical |
| 3554 | * log, check the preceding blocks of the physical log to make sure all |
| 3555 | * the cycle numbers agree with the current cycle number. |
| 3556 | */ |
| 3557 | STATIC void |
| 3558 | xlog_verify_iclog( |
| 3559 | struct xlog *log, |
| 3560 | struct xlog_in_core *iclog, |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3561 | int count) |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3562 | { |
| 3563 | xlog_op_header_t *ophead; |
| 3564 | xlog_in_core_t *icptr; |
| 3565 | xlog_in_core_2_t *xhdr; |
| 3566 | void *base_ptr, *ptr, *p; |
| 3567 | ptrdiff_t field_offset; |
| 3568 | uint8_t clientid; |
| 3569 | int len, i, j, k, op_len; |
| 3570 | int idx; |
| 3571 | |
| 3572 | /* check validity of iclog pointers */ |
| 3573 | spin_lock(&log->l_icloglock); |
| 3574 | icptr = log->l_iclog; |
| 3575 | for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) |
| 3576 | ASSERT(icptr); |
| 3577 | |
| 3578 | if (icptr != log->l_iclog) |
| 3579 | xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); |
| 3580 | spin_unlock(&log->l_icloglock); |
| 3581 | |
| 3582 | /* check log magic numbers */ |
| 3583 | if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) |
| 3584 | xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); |
| 3585 | |
| 3586 | base_ptr = ptr = &iclog->ic_header; |
| 3587 | p = &iclog->ic_header; |
| 3588 | for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { |
| 3589 | if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) |
| 3590 | xfs_emerg(log->l_mp, "%s: unexpected magic num", |
| 3591 | __func__); |
| 3592 | } |
| 3593 | |
| 3594 | /* check fields */ |
| 3595 | len = be32_to_cpu(iclog->ic_header.h_num_logops); |
| 3596 | base_ptr = ptr = iclog->ic_datap; |
| 3597 | ophead = ptr; |
| 3598 | xhdr = iclog->ic_data; |
| 3599 | for (i = 0; i < len; i++) { |
| 3600 | ophead = ptr; |
| 3601 | |
| 3602 | /* clientid is only 1 byte */ |
| 3603 | p = &ophead->oh_clientid; |
| 3604 | field_offset = p - base_ptr; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3605 | if (field_offset & 0x1ff) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3606 | clientid = ophead->oh_clientid; |
| 3607 | } else { |
| 3608 | idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); |
| 3609 | if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { |
| 3610 | j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); |
| 3611 | k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); |
| 3612 | clientid = xlog_get_client_id( |
| 3613 | xhdr[j].hic_xheader.xh_cycle_data[k]); |
| 3614 | } else { |
| 3615 | clientid = xlog_get_client_id( |
| 3616 | iclog->ic_header.h_cycle_data[idx]); |
| 3617 | } |
| 3618 | } |
| 3619 | if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) |
| 3620 | xfs_warn(log->l_mp, |
| 3621 | "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx", |
| 3622 | __func__, clientid, ophead, |
| 3623 | (unsigned long)field_offset); |
| 3624 | |
| 3625 | /* check length */ |
| 3626 | p = &ophead->oh_len; |
| 3627 | field_offset = p - base_ptr; |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3628 | if (field_offset & 0x1ff) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3629 | op_len = be32_to_cpu(ophead->oh_len); |
| 3630 | } else { |
| 3631 | idx = BTOBBT((uintptr_t)&ophead->oh_len - |
| 3632 | (uintptr_t)iclog->ic_datap); |
| 3633 | if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { |
| 3634 | j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); |
| 3635 | k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); |
| 3636 | op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); |
| 3637 | } else { |
| 3638 | op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); |
| 3639 | } |
| 3640 | } |
| 3641 | ptr += sizeof(xlog_op_header_t) + op_len; |
| 3642 | } |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3643 | } |
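| | |
| | /* |
| | * Note on the 0x1ff masking above: by the time this verifier runs from |
| | * xlog_sync(), the first four bytes of every 512-byte sector have been |
| | * overwritten with the cycle number and the displaced bytes stashed in |
| | * h_cycle_data (or in the extended headers, xhdr[], for iclogs spanning |
| | * more sectors than one header can describe). A field whose offset |
| | * lands exactly on a sector boundary (field_offset & 0x1ff == 0) must |
| | * therefore be read back from the stashed copy, indexed by sector |
| | * number idx, split into j/k for the extended header array. |
| | */ |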
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3644 | #endif |
| 3645 | |
| 3646 | /* |
| 3647 | * Mark all iclogs IOERROR. l_icloglock is held by the caller. |
| 3648 | */ |
| 3649 | STATIC int |
| 3650 | xlog_state_ioerror( |
| 3651 | struct xlog *log) |
| 3652 | { |
| 3653 | xlog_in_core_t *iclog, *ic; |
| 3654 | |
| 3655 | iclog = log->l_iclog; |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3656 | if (iclog->ic_state != XLOG_STATE_IOERROR) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3657 | /* |
| 3658 | * Mark all the incore logs IOERROR. |
| 3659 | * From now on, no log flushes will result. |
| 3660 | */ |
| 3661 | ic = iclog; |
| 3662 | do { |
| 3663 | ic->ic_state = XLOG_STATE_IOERROR; |
| 3664 | ic = ic->ic_next; |
| 3665 | } while (ic != iclog); |
| 3666 | return 0; |
| 3667 | } |
| 3668 | /* |
| 3669 | * Return non-zero, if state transition has already happened. |
| 3670 | */ |
| 3671 | return 1; |
| 3672 | } |
| 3673 | |
| 3674 | /* |
| 3675 | * This is called from xfs_force_shutdown, when we're forcibly |
| 3676 | * shutting down the filesystem, typically because of an IO error. |
| 3677 | * Our main objectives here are to make sure that: |
| 3678 | * a. if !logerror, flush the logs to disk. Anything modified |
| 3679 | * after this is ignored. |
| 3680 | * b. the filesystem gets marked 'SHUTDOWN' for all interested |
| 3681 | * parties to find out, 'atomically'. |
| 3682 | * c. those who're sleeping on log reservations, pinned objects and |
| 3683 | * other resources get woken up and told the bad news. |
| 3684 | * d. nothing new gets queued up after (b) and (c) are done. |
| 3685 | * |
| 3686 | * Note: for the !logerror case we need to flush the regions held in memory out |
| 3687 | * to disk first. This needs to be done before the log is marked as shutdown, |
| 3688 | * otherwise the iclog writes will fail. |
| 3689 | */ |
| 3690 | int |
| 3691 | xfs_log_force_umount( |
| 3692 | struct xfs_mount *mp, |
| 3693 | int logerror) |
| 3694 | { |
| 3695 | struct xlog *log; |
| 3696 | int retval; |
| 3697 | |
| 3698 | log = mp->m_log; |
| 3699 | |
| 3700 | /* |
| 3701 | * If this happens during log recovery, don't worry about |
| 3702 | * locking; the log isn't open for business yet. |
| 3703 | */ |
| 3704 | if (!log || |
| 3705 | log->l_flags & XLOG_ACTIVE_RECOVERY) { |
| 3706 | mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; |
| 3707 | if (mp->m_sb_bp) |
| 3708 | mp->m_sb_bp->b_flags |= XBF_DONE; |
| 3709 | return 0; |
| 3710 | } |
| 3711 | |
| 3712 | /* |
| 3713 | * Somebody could've already done the hard work for us. |
| 3714 | * No need to get locks for this. |
| 3715 | */ |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3716 | if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3717 | ASSERT(XLOG_FORCED_SHUTDOWN(log)); |
| 3718 | return 1; |
| 3719 | } |
| 3720 | |
| 3721 | /* |
| 3722 | * Flush all the completed transactions to disk before marking the log |
| 3723 | * being shut down. We need to do it in this order to ensure that |
| 3724 | * completed operations are safely on disk before we shut down, and that |
| 3725 | * we don't have to issue any buffer IO after the shutdown flags are set |
| 3726 | * to guarantee this. |
| 3727 | */ |
| 3728 | if (!logerror) |
| 3729 | xfs_log_force(mp, XFS_LOG_SYNC); |
| 3730 | |
| 3731 | /* |
| 3732 | * mark the filesystem and the log as being in a shutdown state and wake |
| 3733 | * everybody up to tell them the bad news. |
| 3734 | */ |
| 3735 | spin_lock(&log->l_icloglock); |
| 3736 | mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; |
| 3737 | if (mp->m_sb_bp) |
| 3738 | mp->m_sb_bp->b_flags |= XBF_DONE; |
| 3739 | |
| 3740 | /* |
| 3741 | * Mark the log and the iclogs with IO error flags to prevent any |
| 3742 | * further log IO from being issued or completed. |
| 3743 | */ |
| 3744 | log->l_flags |= XLOG_IO_ERROR; |
| 3745 | retval = xlog_state_ioerror(log); |
| 3746 | spin_unlock(&log->l_icloglock); |
| 3747 | |
| 3748 | /* |
| 3749 | * We don't want anybody waiting for log reservations after this. That |
| 3750 | * means we have to wake up everybody queued up on reserveq as well as |
| 3751 | * writeq. In addition, we make sure in xlog_{re}grant_log_space that |
| 3752 | * we don't enqueue anything once the SHUTDOWN flag is set, and this |
| 3753 | * action is protected by the grant locks. |
| 3754 | */ |
| 3755 | xlog_grant_head_wake_all(&log->l_reserve_head); |
| 3756 | xlog_grant_head_wake_all(&log->l_write_head); |
| 3757 | |
| 3758 | /* |
| 3759 | * Wake up everybody waiting on xfs_log_force. Wake the CIL push first |
| 3760 | * as if the log writes were completed. The abort handling in the log |
| 3761 | * item committed callback functions will do this again under lock to |
| 3762 | * avoid races. |
| 3763 | */ |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3764 | spin_lock(&log->l_cilp->xc_push_lock); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3765 | wake_up_all(&log->l_cilp->xc_commit_wait); |
David Brazdil | 0f672f6 | 2019-12-10 10:32:29 +0000 | [diff] [blame] | 3766 | spin_unlock(&log->l_cilp->xc_push_lock); |
Olivier Deprez | 157378f | 2022-04-04 15:47:50 +0200 | [diff] [blame^] | 3767 | xlog_state_do_callback(log); |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3768 | |
Andrew Scull | b4b6d4a | 2019-01-02 15:54:55 +0000 | [diff] [blame] | 3769 | /* return non-zero if log IOERROR transition had already happened */ |
| 3770 | return retval; |
| 3771 | } |
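
/*
 * Editor's sketch (hypothetical, not part of XFS): the ordering in
 * xfs_log_force_umount() above is the load-bearing part - flush while IO
 * still works, then mark the shutdown under l_icloglock, then wake all
 * waiters - since no buffer IO may be issued once the shutdown flags are
 * set. Condensed to its skeleton:
 */
#if 0
static void
shutdown_order_sketch(
	struct xfs_mount	*mp,
	int			logerror)
{
	struct xlog	*log = mp->m_log;

	if (!logerror)
		xfs_log_force(mp, XFS_LOG_SYNC);	/* 1: flush first */

	spin_lock(&log->l_icloglock);
	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;		/* 2: mark atomically */
	log->l_flags |= XLOG_IO_ERROR;
	xlog_state_ioerror(log);			/* l_icloglock held */
	spin_unlock(&log->l_icloglock);

	xlog_grant_head_wake_all(&log->l_reserve_head);	/* 3: wake waiters */
	xlog_grant_head_wake_all(&log->l_write_head);
}
#endif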

STATIC int
xlog_iclogs_empty(
	struct xlog	*log)
{
	xlog_in_core_t	*iclog;

	iclog = log->l_iclog;
	do {
		/*
		 * Endianness does not matter here, zero is zero in
		 * any language.
		 */
		if (iclog->ic_header.h_num_logops)
			return 0;
		iclog = iclog->ic_next;
	} while (iclog != log->l_iclog);
	return 1;
}
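
/*
 * Editor's note (illustrative, not part of XFS): h_num_logops is stored
 * big-endian on disk, but the zero test above needs no byte swap - an
 * all-zero bit pattern reads as zero in either byte order, and any
 * nonzero pattern stays nonzero after swapping. A standalone sketch:
 */
#if 0
static int
is_zero_either_endian(
	unsigned int	raw_be_value)
{
	/* no swab needed: 0x00000000 swaps to 0x00000000 */
	return raw_be_value == 0;
}
#endif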

/*
 * Verify that an LSN stamped into a piece of metadata is valid. This is
 * intended for use in read verifiers on v5 superblocks.
 */
bool
xfs_log_check_lsn(
	struct xfs_mount	*mp,
	xfs_lsn_t		lsn)
{
	struct xlog		*log = mp->m_log;
	bool			valid;

	/*
	 * norecovery mode skips mount-time log processing and unconditionally
	 * resets the in-core LSN. We can't validate in this mode, but
	 * modifications are not allowed anyway, so just return true.
	 */
	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
		return true;

	/*
	 * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
	 * handled by recovery and thus safe to ignore here.
	 */
	if (lsn == NULLCOMMITLSN)
		return true;

	valid = xlog_valid_lsn(mp->m_log, lsn);

	/* warn the user about what's gone wrong before verifier failure */
	if (!valid) {
		spin_lock(&log->l_icloglock);
		xfs_warn(mp,
"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
"Please unmount and run xfs_repair (>= v4.3) to resolve.",
			 CYCLE_LSN(lsn), BLOCK_LSN(lsn),
			 log->l_curr_cycle, log->l_curr_block);
		spin_unlock(&log->l_icloglock);
	}

	return valid;
}
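
/*
 * Editor's sketch (assumptions flagged, not part of XFS): an xfs_lsn_t
 * packs the log cycle number into the high 32 bits and the log block
 * number into the low 32 bits; CYCLE_LSN()/BLOCK_LSN() in the warning
 * above unpack exactly that. The "ahead of current LSN" comparison below
 * is this editor's reconstruction of the idea, not the exact logic of
 * xlog_valid_lsn():
 */
#if 0
static unsigned int
sketch_cycle(
	unsigned long long	lsn)
{
	return (unsigned int)(lsn >> 32);	/* high 32 bits: cycle */
}

static unsigned int
sketch_block(
	unsigned long long	lsn)
{
	return (unsigned int)lsn;		/* low 32 bits: block */
}

/*
 * A metadata LSN is ahead of the head if its cycle is newer, or the
 * cycle matches and its block lies beyond the current block.
 */
static int
sketch_lsn_ahead(
	unsigned long long	meta_lsn,
	unsigned int		cur_cycle,
	unsigned int		cur_block)
{
	return sketch_cycle(meta_lsn) > cur_cycle ||
	       (sketch_cycle(meta_lsn) == cur_cycle &&
		sketch_block(meta_lsn) > cur_block);
}
#endif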

bool
xfs_log_in_recovery(
	struct xfs_mount	*mp)
{
	struct xlog	*log = mp->m_log;

	return log->l_flags & XLOG_ACTIVE_RECOVERY;
}