1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
12#include "xfs_sb.h"
13#include "xfs_mount.h"
14#include "xfs_inode.h"
15#include "xfs_trans.h"
16#include "xfs_trans_priv.h"
17#include "xfs_inode_item.h"
18#include "xfs_quota.h"
19#include "xfs_trace.h"
20#include "xfs_icache.h"
21#include "xfs_bmap_util.h"
22#include "xfs_dquot_item.h"
23#include "xfs_dquot.h"
24#include "xfs_reflink.h"
25
26#include <linux/iversion.h>
27
28/*
29 * Allocate and initialise an xfs_inode.
30 */
31struct xfs_inode *
32xfs_inode_alloc(
33 struct xfs_mount *mp,
34 xfs_ino_t ino)
35{
36 struct xfs_inode *ip;
37
38 /*
39 * if this didn't occur in transactions, we could use
40 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
41 * code up to do this anyway.
42 */
43 ip = kmem_zone_alloc(xfs_inode_zone, 0);
44 if (!ip)
45 return NULL;
46 if (inode_init_always(mp->m_super, VFS_I(ip))) {
47 kmem_zone_free(xfs_inode_zone, ip);
48 return NULL;
49 }
50
51 /* VFS doesn't initialise i_mode! */
52 VFS_I(ip)->i_mode = 0;
53
54 XFS_STATS_INC(mp, vn_active);
55 ASSERT(atomic_read(&ip->i_pincount) == 0);
56 ASSERT(!xfs_isiflocked(ip));
57 ASSERT(ip->i_ino == 0);
58
59 /* initialise the xfs inode */
60 ip->i_ino = ino;
61 ip->i_mount = mp;
62 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
63 ip->i_afp = NULL;
64 ip->i_cowfp = NULL;
65 ip->i_cnextents = 0;
66 ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
67 memset(&ip->i_df, 0, sizeof(ip->i_df));
68 ip->i_flags = 0;
69 ip->i_delayed_blks = 0;
70 memset(&ip->i_d, 0, sizeof(ip->i_d));
71 ip->i_sick = 0;
72 ip->i_checked = 0;
73 INIT_WORK(&ip->i_ioend_work, xfs_end_io);
74 INIT_LIST_HEAD(&ip->i_ioend_list);
75 spin_lock_init(&ip->i_ioend_lock);
76
77 return ip;
78}
79
80STATIC void
81xfs_inode_free_callback(
82 struct rcu_head *head)
83{
84 struct inode *inode = container_of(head, struct inode, i_rcu);
85 struct xfs_inode *ip = XFS_I(inode);
86
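/*
 * The data fork is only torn down for regular files, directories and
 * symlinks; other inode types keep no data fork state to free here.
 */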
87 switch (VFS_I(ip)->i_mode & S_IFMT) {
88 case S_IFREG:
89 case S_IFDIR:
90 case S_IFLNK:
91 xfs_idestroy_fork(ip, XFS_DATA_FORK);
92 break;
93 }
94
95 if (ip->i_afp)
96 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
97 if (ip->i_cowfp)
98 xfs_idestroy_fork(ip, XFS_COW_FORK);
99
100 if (ip->i_itemp) {
101 ASSERT(!test_bit(XFS_LI_IN_AIL,
102 &ip->i_itemp->ili_item.li_flags));
103 xfs_inode_item_destroy(ip);
104 ip->i_itemp = NULL;
105 }
106
107 kmem_zone_free(xfs_inode_zone, ip);
108}
109
110static void
111__xfs_inode_free(
112 struct xfs_inode *ip)
113{
114 /* asserts to verify all state is correct here */
115 ASSERT(atomic_read(&ip->i_pincount) == 0);
116 XFS_STATS_DEC(ip->i_mount, vn_active);
117
118 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
119}
120
121void
122xfs_inode_free(
123 struct xfs_inode *ip)
124{
125 ASSERT(!xfs_isiflocked(ip));
126
127 /*
128 * Because we use RCU freeing we need to ensure the inode always
129 * appears to be reclaimed with an invalid inode number when in the
130 * free state. The ip->i_flags_lock provides the barrier against lookup
131 * races.
132 */
133 spin_lock(&ip->i_flags_lock);
134 ip->i_flags = XFS_IRECLAIM;
135 ip->i_ino = 0;
136 spin_unlock(&ip->i_flags_lock);
137
138 __xfs_inode_free(ip);
139}
140
141/*
142 * Queue a new inode reclaim pass if there are reclaimable inodes and there
143 * isn't a reclaim pass already in progress. By default it runs every 5s based
144 * on the xfs periodic sync default of 30s. Perhaps this should have its own
145 * tunable, but that can be done if this method proves to be ineffective or too
146 * aggressive.
147 */
148static void
149xfs_reclaim_work_queue(
150 struct xfs_mount *mp)
151{
152
153 rcu_read_lock();
154 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
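/*
 * xfs_syncd_centisecs is in centiseconds; "/ 6 * 10" converts it to
 * milliseconds at one sixth of the sync period (5s for the 30s default).
 */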
155 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
156 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
157 }
158 rcu_read_unlock();
159}
160
161/*
162 * This is a fast pass over the inode cache to try to get reclaim moving on as
163 * many inodes as possible in a short period of time. It kicks itself every few
164 * seconds, as well as being kicked by the inode cache shrinker when memory
165 * goes low. It scans as quickly as possible avoiding locked inodes or those
166 * already being flushed, and once done schedules a future pass.
167 */
168void
169xfs_reclaim_worker(
170 struct work_struct *work)
171{
172 struct xfs_mount *mp = container_of(to_delayed_work(work),
173 struct xfs_mount, m_reclaim_work);
174
175 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
176 xfs_reclaim_work_queue(mp);
177}
178
179static void
180xfs_perag_set_reclaim_tag(
181 struct xfs_perag *pag)
182{
183 struct xfs_mount *mp = pag->pag_mount;
184
185 lockdep_assert_held(&pag->pag_ici_lock);
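/* Only the first reclaimable inode in this AG needs to set the perag tag. */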
186 if (pag->pag_ici_reclaimable++)
187 return;
188
189 /* propagate the reclaim tag up into the perag radix tree */
190 spin_lock(&mp->m_perag_lock);
191 radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
192 XFS_ICI_RECLAIM_TAG);
193 spin_unlock(&mp->m_perag_lock);
194
195 /* schedule periodic background inode reclaim */
196 xfs_reclaim_work_queue(mp);
197
198 trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
199}
200
201static void
202xfs_perag_clear_reclaim_tag(
203 struct xfs_perag *pag)
204{
205 struct xfs_mount *mp = pag->pag_mount;
206
207 lockdep_assert_held(&pag->pag_ici_lock);
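/* Only clear the perag tag once the last reclaimable inode is gone. */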
208 if (--pag->pag_ici_reclaimable)
209 return;
210
211 /* clear the reclaim tag from the perag radix tree */
212 spin_lock(&mp->m_perag_lock);
213 radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
214 XFS_ICI_RECLAIM_TAG);
215 spin_unlock(&mp->m_perag_lock);
216 trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
217}
218
219
220/*
221 * We set the inode flag atomically with the radix tree tag.
222 * Once we get tag lookups on the radix tree, this inode flag
223 * can go away.
224 */
225void
226xfs_inode_set_reclaim_tag(
227 struct xfs_inode *ip)
228{
229 struct xfs_mount *mp = ip->i_mount;
230 struct xfs_perag *pag;
231
232 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
233 spin_lock(&pag->pag_ici_lock);
234 spin_lock(&ip->i_flags_lock);
235
236 radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
237 XFS_ICI_RECLAIM_TAG);
238 xfs_perag_set_reclaim_tag(pag);
239 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
240
241 spin_unlock(&ip->i_flags_lock);
242 spin_unlock(&pag->pag_ici_lock);
243 xfs_perag_put(pag);
244}
245
246STATIC void
247xfs_inode_clear_reclaim_tag(
248 struct xfs_perag *pag,
249 xfs_ino_t ino)
250{
251 radix_tree_tag_clear(&pag->pag_ici_root,
252 XFS_INO_TO_AGINO(pag->pag_mount, ino),
253 XFS_ICI_RECLAIM_TAG);
254 xfs_perag_clear_reclaim_tag(pag);
255}
256
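/*
 * Wait for another thread to finish initialising an inode, i.e. for the
 * XFS_INEW flag to be cleared and waiters woken via wake_up_bit().
 */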
257static void
258xfs_inew_wait(
259 struct xfs_inode *ip)
260{
261 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
262 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
263
264 do {
265 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
266 if (!xfs_iflags_test(ip, XFS_INEW))
267 break;
268 schedule();
269 } while (true);
270 finish_wait(wq, &wait.wq_entry);
271}
272
273/*
274 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
275 * part of the structure. This is made more complex by the fact we store
276 * information about the on-disk values in the VFS inode and so we can't just
277 * overwrite the values unconditionally. Hence we save the parameters we
278 * need to retain across reinitialisation, and rewrite them into the VFS inode
279 * after reinitialisation even if it fails.
280 */
281static int
282xfs_reinit_inode(
283 struct xfs_mount *mp,
284 struct inode *inode)
285{
286 int error;
287 uint32_t nlink = inode->i_nlink;
288 uint32_t generation = inode->i_generation;
289 uint64_t version = inode_peek_iversion(inode);
290 umode_t mode = inode->i_mode;
291 dev_t dev = inode->i_rdev;
292
293 error = inode_init_always(mp->m_super, inode);
294
295 set_nlink(inode, nlink);
296 inode->i_generation = generation;
297 inode_set_iversion_queried(inode, version);
298 inode->i_mode = mode;
299 inode->i_rdev = dev;
300 return error;
301}
302
303/*
304 * If we are allocating a new inode, then check what was returned is
305 * actually a free, empty inode. If we are not allocating an inode,
306 * then check we didn't find a free inode.
307 *
308 * Returns:
309 * 0 if the inode free state matches the lookup context
310 * -ENOENT if the inode is free and we are not allocating
311 * -EFSCORRUPTED if there is any state mismatch at all
312 */
313static int
314xfs_iget_check_free_state(
315 struct xfs_inode *ip,
316 int flags)
317{
318 if (flags & XFS_IGET_CREATE) {
319 /* should be a free inode */
320 if (VFS_I(ip)->i_mode != 0) {
321 xfs_warn(ip->i_mount,
322"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
323 ip->i_ino, VFS_I(ip)->i_mode);
324 return -EFSCORRUPTED;
325 }
326
327 if (ip->i_d.di_nblocks != 0) {
328 xfs_warn(ip->i_mount,
329"Corruption detected! Free inode 0x%llx has blocks allocated!",
330 ip->i_ino);
331 return -EFSCORRUPTED;
332 }
333 return 0;
334 }
335
336 /* should be an allocated inode */
337 if (VFS_I(ip)->i_mode == 0)
338 return -ENOENT;
339
340 return 0;
341}
342
343/*
344 * Check the validity of the inode we just found in the cache
345 */
346static int
347xfs_iget_cache_hit(
348 struct xfs_perag *pag,
349 struct xfs_inode *ip,
350 xfs_ino_t ino,
351 int flags,
352 int lock_flags) __releases(RCU)
353{
354 struct inode *inode = VFS_I(ip);
355 struct xfs_mount *mp = ip->i_mount;
356 int error;
357
358 /*
359 * check for re-use of an inode within an RCU grace period due to the
360 * radix tree nodes not being updated yet. We monitor for this by
361 * setting the inode number to zero before freeing the inode structure.
362 * If the inode has been reallocated and set up, then the inode number
363 * will not match, so check for that, too.
364 */
365 spin_lock(&ip->i_flags_lock);
366 if (ip->i_ino != ino) {
367 trace_xfs_iget_skip(ip);
368 XFS_STATS_INC(mp, xs_ig_frecycle);
369 error = -EAGAIN;
370 goto out_error;
371 }
372
373
374 /*
375 * If we are racing with another cache hit that is currently
376 * instantiating this inode or currently recycling it out of
377 * reclaimable state, wait for the initialisation to complete
378 * before continuing.
379 *
380 * XXX(hch): eventually we should do something equivalent to
381 * wait_on_inode to wait for these flags to be cleared
382 * instead of polling for it.
383 */
384 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
385 trace_xfs_iget_skip(ip);
386 XFS_STATS_INC(mp, xs_ig_frecycle);
387 error = -EAGAIN;
388 goto out_error;
389 }
390
391 /*
392 * Check the inode free state is valid. This also detects lookup
393 * racing with unlinks.
394 */
395 error = xfs_iget_check_free_state(ip, flags);
396 if (error)
397 goto out_error;
398
399 /*
400 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
401 * Need to carefully get it back into usable state.
402 */
403 if (ip->i_flags & XFS_IRECLAIMABLE) {
404 trace_xfs_iget_reclaim(ip);
405
406 if (flags & XFS_IGET_INCORE) {
407 error = -EAGAIN;
408 goto out_error;
409 }
410
411 /*
412 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
413 * from stomping over us while we recycle the inode. We can't
414 * clear the radix tree reclaimable tag yet as it requires
415 * pag_ici_lock to be held exclusive.
416 */
417 ip->i_flags |= XFS_IRECLAIM;
418
419 spin_unlock(&ip->i_flags_lock);
420 rcu_read_unlock();
421
422 error = xfs_reinit_inode(mp, inode);
423 if (error) {
424 bool wake;
425 /*
426 * Re-initializing the inode failed, and we are in deep
427 * trouble. Try to re-add it to the reclaim list.
428 */
429 rcu_read_lock();
430 spin_lock(&ip->i_flags_lock);
431 wake = !!__xfs_iflags_test(ip, XFS_INEW);
432 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
433 if (wake)
434 wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
435 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
436 trace_xfs_iget_reclaim_fail(ip);
437 goto out_error;
438 }
439
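/*
 * Retake the locks in the same order as xfs_inode_set_reclaim_tag():
 * pag_ici_lock first, then i_flags_lock.
 */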
440 spin_lock(&pag->pag_ici_lock);
441 spin_lock(&ip->i_flags_lock);
442
443 /*
444 * Clear the per-lifetime state in the inode as we are now
445 * effectively a new inode and need to return to the initial
446 * state before reuse occurs.
447 */
448 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
449 ip->i_flags |= XFS_INEW;
450 xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
451 inode->i_state = I_NEW;
452 ip->i_sick = 0;
453 ip->i_checked = 0;
454
455 ASSERT(!rwsem_is_locked(&inode->i_rwsem));
456 init_rwsem(&inode->i_rwsem);
457
458 spin_unlock(&ip->i_flags_lock);
459 spin_unlock(&pag->pag_ici_lock);
460 } else {
461 /* If the VFS inode is being torn down, pause and try again. */
462 if (!igrab(inode)) {
463 trace_xfs_iget_skip(ip);
464 error = -EAGAIN;
465 goto out_error;
466 }
467
468 /* We've got a live one. */
469 spin_unlock(&ip->i_flags_lock);
470 rcu_read_unlock();
471 trace_xfs_iget_hit(ip);
472 }
473
474 if (lock_flags != 0)
475 xfs_ilock(ip, lock_flags);
476
477 if (!(flags & XFS_IGET_INCORE))
478 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
479 XFS_STATS_INC(mp, xs_ig_found);
480
481 return 0;
482
483out_error:
484 spin_unlock(&ip->i_flags_lock);
485 rcu_read_unlock();
486 return error;
487}
488
489
490static int
491xfs_iget_cache_miss(
492 struct xfs_mount *mp,
493 struct xfs_perag *pag,
494 xfs_trans_t *tp,
495 xfs_ino_t ino,
496 struct xfs_inode **ipp,
497 int flags,
498 int lock_flags)
499{
500 struct xfs_inode *ip;
501 int error;
502 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
503 int iflags;
504
505 ip = xfs_inode_alloc(mp, ino);
506 if (!ip)
507 return -ENOMEM;
508
509 error = xfs_iread(mp, tp, ip, flags);
510 if (error)
511 goto out_destroy;
512
513 if (!xfs_inode_verify_forks(ip)) {
514 error = -EFSCORRUPTED;
515 goto out_destroy;
516 }
517
518 trace_xfs_iget_miss(ip);
519
520
521 /*
522 * Check the inode free state is valid. This also detects lookup
523 * racing with unlinks.
524 */
525 error = xfs_iget_check_free_state(ip, flags);
526 if (error)
527 goto out_destroy;
528
529 /*
530 * Preload the radix tree so we can insert safely under the
531 * write spinlock. Note that we cannot sleep inside the preload
532 * region. Since we can be called from transaction context, don't
533 * recurse into the file system.
534 */
535 if (radix_tree_preload(GFP_NOFS)) {
536 error = -EAGAIN;
537 goto out_destroy;
538 }
539
540 /*
541 * Because the inode hasn't been added to the radix-tree yet it can't
542 * be found by another thread, so we can do the non-sleeping lock here.
543 */
544 if (lock_flags) {
545 if (!xfs_ilock_nowait(ip, lock_flags))
546 BUG();
547 }
548
549 /*
550 * These values must be set before inserting the inode into the radix
551 * tree as the moment it is inserted a concurrent lookup (allowed by the
552 * RCU locking mechanism) can find it and that lookup must see that this
553 * is an inode currently under construction (i.e. that XFS_INEW is set).
554 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
555 * memory barrier that ensures this detection works correctly at lookup
556 * time.
557 */
558 iflags = XFS_INEW;
559 if (flags & XFS_IGET_DONTCACHE)
560 iflags |= XFS_IDONTCACHE;
561 ip->i_udquot = NULL;
562 ip->i_gdquot = NULL;
563 ip->i_pdquot = NULL;
564 xfs_iflags_set(ip, iflags);
565
566 /* insert the new inode */
567 spin_lock(&pag->pag_ici_lock);
568 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
569 if (unlikely(error)) {
570 WARN_ON(error != -EEXIST);
571 XFS_STATS_INC(mp, xs_ig_dup);
572 error = -EAGAIN;
573 goto out_preload_end;
574 }
575 spin_unlock(&pag->pag_ici_lock);
576 radix_tree_preload_end();
577
578 *ipp = ip;
579 return 0;
580
581out_preload_end:
582 spin_unlock(&pag->pag_ici_lock);
583 radix_tree_preload_end();
584 if (lock_flags)
585 xfs_iunlock(ip, lock_flags);
586out_destroy:
587 __destroy_inode(VFS_I(ip));
588 xfs_inode_free(ip);
589 return error;
590}
591
592/*
593 * Look up an inode by number in the given file system.
594 * The inode is looked up in the cache held in each AG.
595 * If the inode is found in the cache, initialise the vfs inode
596 * if necessary.
597 *
598 * If it is not in core, read it in from the file system's device,
599 * add it to the cache and initialise the vfs inode.
600 *
601 * The inode is locked according to the value of the lock_flags parameter.
602 * This flag parameter indicates how and if the inode's IO lock and inode lock
603 * should be taken.
604 *
605 * mp -- the mount point structure for the current file system. It points
606 * to the inode hash table.
607 * tp -- a pointer to the current transaction if there is one. This is
608 * simply passed through to the xfs_iread() call.
609 * ino -- the number of the inode desired. This is the unique identifier
610 * within the file system for the inode being requested.
611 * lock_flags -- flags indicating how to lock the inode. See the comment
612 * for xfs_ilock() for a list of valid values.
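 *
 * A typical caller (illustrative sketch only, not taken from this file):
 *
 *	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
 *	if (error)
 *		return error;
 *	...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_irele(ip);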
613 */
614int
615xfs_iget(
616 xfs_mount_t *mp,
617 xfs_trans_t *tp,
618 xfs_ino_t ino,
619 uint flags,
620 uint lock_flags,
621 xfs_inode_t **ipp)
622{
623 xfs_inode_t *ip;
624 int error;
625 xfs_perag_t *pag;
626 xfs_agino_t agino;
627
628 /*
629 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
630 * doesn't get freed while it's being referenced during a
631 * radix tree traversal here. It assumes this function
632 * acquires only the ILOCK (and therefore it has no need to
633 * involve the IOLOCK in this synchronization).
634 */
635 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
636
637 /* reject inode numbers outside existing AGs */
638 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
639 return -EINVAL;
640
641 XFS_STATS_INC(mp, xs_ig_attempts);
642
643 /* get the perag structure and ensure that it's inode capable */
644 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
645 agino = XFS_INO_TO_AGINO(mp, ino);
646
647again:
648 error = 0;
649 rcu_read_lock();
650 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
651
652 if (ip) {
653 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
654 if (error)
655 goto out_error_or_again;
656 } else {
657 rcu_read_unlock();
658 if (flags & XFS_IGET_INCORE) {
659 error = -ENODATA;
660 goto out_error_or_again;
661 }
662 XFS_STATS_INC(mp, xs_ig_missed);
663
664 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
665 flags, lock_flags);
666 if (error)
667 goto out_error_or_again;
668 }
669 xfs_perag_put(pag);
670
671 *ipp = ip;
672
673 /*
674 * If we have a real type for an on-disk inode, we can setup the inode
675 * now. If it's a new inode being created, xfs_ialloc will handle it.
676 */
677 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
678 xfs_setup_existing_inode(ip);
679 return 0;
680
681out_error_or_again:
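/*
 * -EAGAIN means we raced with inode initialisation, recycling or a radix
 * tree insert collision; back off briefly and retry unless the caller
 * only wanted an in-core lookup.
 */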
682 if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
683 delay(1);
684 goto again;
685 }
686 xfs_perag_put(pag);
687 return error;
688}
689
690/*
691 * "Is this a cached inode that's also allocated?"
692 *
693 * Look up an inode by number in the given file system. If the inode is
694 * in cache and isn't in purgatory, return 1 if the inode is allocated
695 * and 0 if it is not. For all other cases (not in cache, being torn
696 * down, etc.), return a negative error code.
697 *
698 * The caller has to prevent inode allocation and freeing activity,
699 * presumably by locking the AGI buffer. This is to ensure that an
700 * inode cannot transition from allocated to freed until the caller is
701 * ready to allow that. If the inode is in an intermediate state (new,
702 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
703 * inode is not in the cache, -ENOENT will be returned. The caller must
704 * deal with these scenarios appropriately.
705 *
706 * This is a specialized use case for the online scrubber; if you're
707 * reading this, you probably want xfs_iget.
708 */
709int
710xfs_icache_inode_is_allocated(
711 struct xfs_mount *mp,
712 struct xfs_trans *tp,
713 xfs_ino_t ino,
714 bool *inuse)
715{
716 struct xfs_inode *ip;
717 int error;
718
719 error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
720 if (error)
721 return error;
722
723 *inuse = !!(VFS_I(ip)->i_mode);
724 xfs_irele(ip);
725 return 0;
726}
727
728/*
729 * The inode lookup is done in batches to keep the amount of lock traffic and
730 * radix tree lookups to a minimum. The batch size is a trade off between
731 * lookup reduction and stack usage. This is in the reclaim path, so we can't
732 * be too greedy.
733 */
734#define XFS_LOOKUP_BATCH 32
735
736STATIC int
737xfs_inode_ag_walk_grab(
738 struct xfs_inode *ip,
739 int flags)
740{
741 struct inode *inode = VFS_I(ip);
742 bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
743
744 ASSERT(rcu_read_lock_held());
745
746 /*
747 * check for stale RCU freed inode
748 *
749 * If the inode has been reallocated, it doesn't matter if it's not in
750 * the AG we are walking - we are walking for writeback, so if it
751 * passes all the "valid inode" checks and is dirty, then we'll write
752 * it back anyway. If it has been reallocated and is still being
753 * initialised, the XFS_INEW check below will catch it.
754 */
755 spin_lock(&ip->i_flags_lock);
756 if (!ip->i_ino)
757 goto out_unlock_noent;
758
759 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
760 if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
761 __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
762 goto out_unlock_noent;
763 spin_unlock(&ip->i_flags_lock);
764
765 /* nothing to sync during shutdown */
766 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
767 return -EFSCORRUPTED;
768
769 /* If we can't grab the inode, it must be on its way to reclaim. */
770 if (!igrab(inode))
771 return -ENOENT;
772
773 /* inode is valid */
774 return 0;
775
776out_unlock_noent:
777 spin_unlock(&ip->i_flags_lock);
778 return -ENOENT;
779}
780
781STATIC int
782xfs_inode_ag_walk(
783 struct xfs_mount *mp,
784 struct xfs_perag *pag,
785 int (*execute)(struct xfs_inode *ip, int flags,
786 void *args),
787 int flags,
788 void *args,
789 int tag,
790 int iter_flags)
791{
792 uint32_t first_index;
793 int last_error = 0;
794 int skipped;
795 int done;
796 int nr_found;
797
798restart:
799 done = 0;
800 skipped = 0;
801 first_index = 0;
802 nr_found = 0;
803 do {
804 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
805 int error = 0;
806 int i;
807
808 rcu_read_lock();
809
810 if (tag == -1)
811 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
812 (void **)batch, first_index,
813 XFS_LOOKUP_BATCH);
814 else
815 nr_found = radix_tree_gang_lookup_tag(
816 &pag->pag_ici_root,
817 (void **) batch, first_index,
818 XFS_LOOKUP_BATCH, tag);
819
820 if (!nr_found) {
821 rcu_read_unlock();
822 break;
823 }
824
825 /*
826 * Grab the inodes before we drop the lock. If we found
827 * nothing, nr == 0 and the loop will be skipped.
828 */
829 for (i = 0; i < nr_found; i++) {
830 struct xfs_inode *ip = batch[i];
831
832 if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
833 batch[i] = NULL;
834
835 /*
836 * Update the index for the next lookup. Catch
837 * overflows into the next AG range which can occur if
838 * we have inodes in the last block of the AG and we
839 * are currently pointing to the last inode.
840 *
841 * Because we may see inodes that are from the wrong AG
842 * due to RCU freeing and reallocation, only update the
843 * index if it lies in this AG. It was a race that led
844 * us to see this inode, so another lookup from the
845 * same index will not find it again.
846 */
847 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
848 continue;
849 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
850 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
851 done = 1;
852 }
853
854 /* unlock now we've grabbed the inodes. */
855 rcu_read_unlock();
856
857 for (i = 0; i < nr_found; i++) {
858 if (!batch[i])
859 continue;
860 if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
861 xfs_iflags_test(batch[i], XFS_INEW))
862 xfs_inew_wait(batch[i]);
863 error = execute(batch[i], flags, args);
864 xfs_irele(batch[i]);
865 if (error == -EAGAIN) {
866 skipped++;
867 continue;
868 }
869 if (error && last_error != -EFSCORRUPTED)
870 last_error = error;
871 }
872
873 /* bail out if the filesystem is corrupted. */
874 if (error == -EFSCORRUPTED)
875 break;
876
877 cond_resched();
878
879 } while (nr_found && !done);
880
881 if (skipped) {
882 delay(1);
883 goto restart;
884 }
885 return last_error;
886}
887
888/*
889 * Background scanning to trim post-EOF preallocated space. This is queued
890 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
891 */
892void
893xfs_queue_eofblocks(
894 struct xfs_mount *mp)
895{
896 rcu_read_lock();
897 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
898 queue_delayed_work(mp->m_eofblocks_workqueue,
899 &mp->m_eofblocks_work,
900 msecs_to_jiffies(xfs_eofb_secs * 1000));
901 rcu_read_unlock();
902}
903
904void
905xfs_eofblocks_worker(
906 struct work_struct *work)
907{
908 struct xfs_mount *mp = container_of(to_delayed_work(work),
909 struct xfs_mount, m_eofblocks_work);
910
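/*
 * Freeing post-EOF blocks runs transactions, so skip this scan if a
 * filesystem freeze is in progress.
 */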
911 if (!sb_start_write_trylock(mp->m_super))
912 return;
913 xfs_icache_free_eofblocks(mp, NULL);
914 sb_end_write(mp->m_super);
915
916 xfs_queue_eofblocks(mp);
917}
918
919/*
920 * Background scanning to trim preallocated CoW space. This is queued
921 * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
922 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
923 */
924void
925xfs_queue_cowblocks(
926 struct xfs_mount *mp)
927{
928 rcu_read_lock();
929 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
930 queue_delayed_work(mp->m_eofblocks_workqueue,
931 &mp->m_cowblocks_work,
932 msecs_to_jiffies(xfs_cowb_secs * 1000));
933 rcu_read_unlock();
934}
935
936void
937xfs_cowblocks_worker(
938 struct work_struct *work)
939{
940 struct xfs_mount *mp = container_of(to_delayed_work(work),
941 struct xfs_mount, m_cowblocks_work);
942
943 if (!sb_start_write_trylock(mp->m_super))
944 return;
945 xfs_icache_free_cowblocks(mp, NULL);
946 sb_end_write(mp->m_super);
947
948 xfs_queue_cowblocks(mp);
949}
950
951int
952xfs_inode_ag_iterator_flags(
953 struct xfs_mount *mp,
954 int (*execute)(struct xfs_inode *ip, int flags,
955 void *args),
956 int flags,
957 void *args,
958 int iter_flags)
959{
960 struct xfs_perag *pag;
961 int error = 0;
962 int last_error = 0;
963 xfs_agnumber_t ag;
964
965 ag = 0;
966 while ((pag = xfs_perag_get(mp, ag))) {
967 ag = pag->pag_agno + 1;
968 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
969 iter_flags);
970 xfs_perag_put(pag);
971 if (error) {
972 last_error = error;
973 if (error == -EFSCORRUPTED)
974 break;
975 }
976 }
977 return last_error;
978}
979
980int
981xfs_inode_ag_iterator(
982 struct xfs_mount *mp,
983 int (*execute)(struct xfs_inode *ip, int flags,
984 void *args),
985 int flags,
986 void *args)
987{
988 return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
989}
990
991int
992xfs_inode_ag_iterator_tag(
993 struct xfs_mount *mp,
994 int (*execute)(struct xfs_inode *ip, int flags,
995 void *args),
996 int flags,
997 void *args,
998 int tag)
999{
1000 struct xfs_perag *pag;
1001 int error = 0;
1002 int last_error = 0;
1003 xfs_agnumber_t ag;
1004
1005 ag = 0;
1006 while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
1007 ag = pag->pag_agno + 1;
1008 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
1009 0);
1010 xfs_perag_put(pag);
1011 if (error) {
1012 last_error = error;
1013 if (error == -EFSCORRUPTED)
1014 break;
1015 }
1016 }
1017 return last_error;
1018}
1019
1020/*
1021 * Grab the inode for reclaim exclusively.
1022 * Return 0 if we grabbed it, non-zero otherwise.
1023 */
1024STATIC int
1025xfs_reclaim_inode_grab(
1026 struct xfs_inode *ip,
1027 int flags)
1028{
1029 ASSERT(rcu_read_lock_held());
1030
1031 /* quick check for stale RCU freed inode */
1032 if (!ip->i_ino)
1033 return 1;
1034
1035 /*
1036 * If we are asked for non-blocking operation, do unlocked checks to
1037 * see if the inode already is being flushed or in reclaim to avoid
1038 * lock traffic.
1039 */
1040 if ((flags & SYNC_TRYLOCK) &&
1041 __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
1042 return 1;
1043
1044 /*
1045 * The radix tree lock here protects a thread in xfs_iget from racing
1046 * with us starting reclaim on the inode. Once we have the
1047 * XFS_IRECLAIM flag set it will not touch us.
1048 *
1049 * Due to RCU lookup, we may find inodes that have been freed and only
1050 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
1051 * aren't candidates for reclaim at all, so we must check that
1052 * XFS_IRECLAIMABLE is set before proceeding to reclaim.
1053 */
1054 spin_lock(&ip->i_flags_lock);
1055 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
1056 __xfs_iflags_test(ip, XFS_IRECLAIM)) {
1057 /* not a reclaim candidate. */
1058 spin_unlock(&ip->i_flags_lock);
1059 return 1;
1060 }
1061 __xfs_iflags_set(ip, XFS_IRECLAIM);
1062 spin_unlock(&ip->i_flags_lock);
1063 return 0;
1064}
1065
1066/*
1067 * Inodes in different states need to be treated differently. The following
1068 * table lists the inode states and the reclaim actions necessary:
1069 *
1070 * inode state iflush ret required action
1071 * --------------- ---------- ---------------
1072 * bad - reclaim
1073 * shutdown EIO unpin and reclaim
1074 * clean, unpinned 0 reclaim
1075 * stale, unpinned 0 reclaim
1076 * clean, pinned(*) 0 requeue
1077 * stale, pinned EAGAIN requeue
1078 * dirty, async - requeue
1079 * dirty, sync 0 reclaim
1080 *
1081 * (*) dgc: I don't think the clean, pinned state is possible but it gets
1082 * handled anyway given the order of checks implemented.
1083 *
1084 * Also, because we get the flush lock first, we know that any inode that has
1085 * been flushed delwri has had the flush completed by the time we check that
1086 * the inode is clean.
1087 *
1088 * Note that because the inode is flushed delayed write by AIL pushing, the
1089 * flush lock may already be held here and waiting on it can result in very
1090 * long latencies. Hence for sync reclaims, where we wait on the flush lock,
1091 * the caller should push the AIL first before trying to reclaim inodes to
1092 * minimise the amount of time spent waiting. For background reclaim, we only
1093 * bother to reclaim clean inodes anyway.
1094 *
1095 * Hence the order of actions after gaining the locks should be:
1096 * bad => reclaim
1097 * shutdown => unpin and reclaim
1098 * pinned, async => requeue
1099 * pinned, sync => unpin
1100 * stale => reclaim
1101 * clean => reclaim
1102 * dirty, async => requeue
1103 * dirty, sync => flush, wait and reclaim
1104 */
1105STATIC int
1106xfs_reclaim_inode(
1107 struct xfs_inode *ip,
1108 struct xfs_perag *pag,
1109 int sync_mode)
1110{
1111 struct xfs_buf *bp = NULL;
1112 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
1113 int error;
1114
1115restart:
1116 error = 0;
1117 xfs_ilock(ip, XFS_ILOCK_EXCL);
1118 if (!xfs_iflock_nowait(ip)) {
1119 if (!(sync_mode & SYNC_WAIT))
1120 goto out;
1121 xfs_iflock(ip);
1122 }
1123
1124 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1125 xfs_iunpin_wait(ip);
1126 /* xfs_iflush_abort() drops the flush lock */
1127 xfs_iflush_abort(ip, false);
1128 goto reclaim;
1129 }
1130 if (xfs_ipincount(ip)) {
1131 if (!(sync_mode & SYNC_WAIT))
1132 goto out_ifunlock;
1133 xfs_iunpin_wait(ip);
1134 }
1135 if (xfs_inode_clean(ip)) {
1136 xfs_ifunlock(ip);
1137 goto reclaim;
1138 }
1139
1140 /*
1141 * Never flush out dirty data during non-blocking reclaim, as it would
1142 * just contend with AIL pushing trying to do the same job.
1143 */
1144 if (!(sync_mode & SYNC_WAIT))
1145 goto out_ifunlock;
1146
1147 /*
1148 * Now we have an inode that needs flushing.
1149 *
1150 * Note that xfs_iflush will never block on the inode buffer lock, as
1151 * xfs_ifree_cluster() can lock the inode buffer before it locks the
1152 * ip->i_lock, and we are doing the exact opposite here. As a result,
1153 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
1154 * result in an ABBA deadlock with xfs_ifree_cluster().
1155 *
1156 * As xfs_ifree_cluster() must gather all inodes that are active in the
1157 * cache to mark them stale, if we hit this case we don't actually want
1158 * to do IO here - we want the inode marked stale so we can simply
1159 * reclaim it. Hence if we get an EAGAIN error here, just unlock the
1160 * inode, back off and try again. Hopefully the next pass through will
1161 * see the stale flag set on the inode.
1162 */
1163 error = xfs_iflush(ip, &bp);
1164 if (error == -EAGAIN) {
1165 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1166 /* backoff longer than in xfs_ifree_cluster */
1167 delay(2);
1168 goto restart;
1169 }
1170
1171 if (!error) {
1172 error = xfs_bwrite(bp);
1173 xfs_buf_relse(bp);
1174 }
1175
1176reclaim:
1177 ASSERT(!xfs_isiflocked(ip));
1178
1179 /*
1180 * Because we use RCU freeing we need to ensure the inode always appears
1181 * to be reclaimed with an invalid inode number when in the free state.
1182 * We do this as early as possible under the ILOCK so that
1183 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
1184 * detect races with us here. By doing this, we guarantee that once
1185 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
1186 * it will see either a valid inode that will serialise correctly, or it
1187 * will see an invalid inode that it can skip.
1188 */
1189 spin_lock(&ip->i_flags_lock);
1190 ip->i_flags = XFS_IRECLAIM;
1191 ip->i_ino = 0;
1192 spin_unlock(&ip->i_flags_lock);
1193
1194 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1195
1196 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
1197 /*
1198 * Remove the inode from the per-AG radix tree.
1199 *
1200 * Because radix_tree_delete won't complain even if the item was never
1201 * added to the tree assert that it's been there before to catch
1202 * problems with the inode life time early on.
1203 */
1204 spin_lock(&pag->pag_ici_lock);
1205 if (!radix_tree_delete(&pag->pag_ici_root,
1206 XFS_INO_TO_AGINO(ip->i_mount, ino)))
1207 ASSERT(0);
1208 xfs_perag_clear_reclaim_tag(pag);
1209 spin_unlock(&pag->pag_ici_lock);
1210
1211 /*
1212 * Here we do an (almost) spurious inode lock in order to coordinate
1213 * with inode cache radix tree lookups. This is because the lookup
1214 * can reference the inodes in the cache without taking references.
1215 *
1216 * We make that OK here by ensuring that we wait until the inode is
1217 * unlocked after the lookup before we go ahead and free it.
1218 */
1219 xfs_ilock(ip, XFS_ILOCK_EXCL);
1220 xfs_qm_dqdetach(ip);
1221 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1222 ASSERT(xfs_inode_clean(ip));
1223
1224 __xfs_inode_free(ip);
1225 return error;
1226
1227out_ifunlock:
1228 xfs_ifunlock(ip);
1229out:
1230 xfs_iflags_clear(ip, XFS_IRECLAIM);
1231 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1232 /*
1233 * We could return -EAGAIN here to make reclaim rescan the inode tree in
1234 * a short while. However, this just burns CPU time scanning the tree
1235 * waiting for IO to complete and the reclaim work never goes back to
1236 * the idle state. Instead, return 0 to let the next scheduled
1237 * background reclaim attempt to reclaim the inode again.
1238 */
1239 return 0;
1240}
1241
1242/*
1243 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
1244 * corrupted, we still want to try to reclaim all the inodes. If we don't,
1245 * then a shutdown during the filesystem unmount reclaim walk will leak all the
1246 * unreclaimed inodes.
1247 */
1248STATIC int
1249xfs_reclaim_inodes_ag(
1250 struct xfs_mount *mp,
1251 int flags,
1252 int *nr_to_scan)
1253{
1254 struct xfs_perag *pag;
1255 int error = 0;
1256 int last_error = 0;
1257 xfs_agnumber_t ag;
1258 int trylock = flags & SYNC_TRYLOCK;
1259 int skipped;
1260
1261restart:
1262 ag = 0;
1263 skipped = 0;
1264 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1265 unsigned long first_index = 0;
1266 int done = 0;
1267 int nr_found = 0;
1268
1269 ag = pag->pag_agno + 1;
1270
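/*
 * In non-blocking (trylock) mode, skip AGs another reclaimer already
 * holds and resume from the saved per-AG cursor; blocking mode waits
 * for the lock and always scans from the start of the AG.
 */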
1271 if (trylock) {
1272 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
1273 skipped++;
1274 xfs_perag_put(pag);
1275 continue;
1276 }
1277 first_index = pag->pag_ici_reclaim_cursor;
1278 } else
1279 mutex_lock(&pag->pag_ici_reclaim_lock);
1280
1281 do {
1282 struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1283 int i;
1284
1285 rcu_read_lock();
1286 nr_found = radix_tree_gang_lookup_tag(
1287 &pag->pag_ici_root,
1288 (void **)batch, first_index,
1289 XFS_LOOKUP_BATCH,
1290 XFS_ICI_RECLAIM_TAG);
1291 if (!nr_found) {
1292 done = 1;
1293 rcu_read_unlock();
1294 break;
1295 }
1296
1297 /*
1298 * Grab the inodes before we drop the lock. If we found
1299 * nothing, nr == 0 and the loop will be skipped.
1300 */
1301 for (i = 0; i < nr_found; i++) {
1302 struct xfs_inode *ip = batch[i];
1303
1304 if (done || xfs_reclaim_inode_grab(ip, flags))
1305 batch[i] = NULL;
1306
1307 /*
1308 * Update the index for the next lookup. Catch
1309 * overflows into the next AG range which can
1310 * occur if we have inodes in the last block of
1311 * the AG and we are currently pointing to the
1312 * last inode.
1313 *
1314 * Because we may see inodes that are from the
1315 * wrong AG due to RCU freeing and
1316 * reallocation, only update the index if it
1317 * lies in this AG. It was a race that led us
1318 * to see this inode, so another lookup from
1319 * the same index will not find it again.
1320 */
1321 if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
1322 pag->pag_agno)
1323 continue;
1324 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1325 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1326 done = 1;
1327 }
1328
1329 /* unlock now we've grabbed the inodes. */
1330 rcu_read_unlock();
1331
1332 for (i = 0; i < nr_found; i++) {
1333 if (!batch[i])
1334 continue;
1335 error = xfs_reclaim_inode(batch[i], pag, flags);
1336 if (error && last_error != -EFSCORRUPTED)
1337 last_error = error;
1338 }
1339
1340 *nr_to_scan -= XFS_LOOKUP_BATCH;
1341
1342 cond_resched();
1343
1344 } while (nr_found && !done && *nr_to_scan > 0);
1345
1346 if (trylock && !done)
1347 pag->pag_ici_reclaim_cursor = first_index;
1348 else
1349 pag->pag_ici_reclaim_cursor = 0;
1350 mutex_unlock(&pag->pag_ici_reclaim_lock);
1351 xfs_perag_put(pag);
1352 }
1353
1354 /*
1355 * if we skipped any AG, and we still have scan count remaining, do
1356 * another pass this time using blocking reclaim semantics (i.e.
1357 * waiting on the reclaim locks and ignoring the reclaim cursors). This
1358 * ensures that when we get more reclaimers than AGs we block rather
1359 * than spin trying to execute reclaim.
1360 */
1361 if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
1362 trylock = 0;
1363 goto restart;
1364 }
1365 return last_error;
1366}
1367
1368int
1369xfs_reclaim_inodes(
1370 xfs_mount_t *mp,
1371 int mode)
1372{
1373 int nr_to_scan = INT_MAX;
1374
1375 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
1376}
1377
1378/*
1379 * Scan a certain number of inodes for reclaim.
1380 *
1381 * When called we make sure that there is a background (fast) inode reclaim in
1382 * progress, while we will throttle the speed of reclaim via doing synchronous
1383 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1384 * them to be cleaned, which we hope will not be very long due to the
1385 * background walker having already kicked the IO off on those dirty inodes.
1386 */
1387long
1388xfs_reclaim_inodes_nr(
1389 struct xfs_mount *mp,
1390 int nr_to_scan)
1391{
1392 /* kick background reclaimer and push the AIL */
1393 xfs_reclaim_work_queue(mp);
1394 xfs_ail_push_all(mp->m_ail);
1395
1396 return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
1397}
1398
1399/*
1400 * Return the number of reclaimable inodes in the filesystem for
1401 * the shrinker to determine how much to reclaim.
1402 */
1403int
1404xfs_reclaim_inodes_count(
1405 struct xfs_mount *mp)
1406{
1407 struct xfs_perag *pag;
1408 xfs_agnumber_t ag = 0;
1409 int reclaimable = 0;
1410
1411 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
1412 ag = pag->pag_agno + 1;
1413 reclaimable += pag->pag_ici_reclaimable;
1414 xfs_perag_put(pag);
1415 }
1416 return reclaimable;
1417}
1418
1419STATIC int
1420xfs_inode_match_id(
1421 struct xfs_inode *ip,
1422 struct xfs_eofblocks *eofb)
1423{
1424 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1425 !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1426 return 0;
1427
1428 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1429 !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1430 return 0;
1431
1432 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1433 xfs_get_projid(ip) != eofb->eof_prid)
1434 return 0;
1435
1436 return 1;
1437}
1438
1439/*
1440 * A union-based inode filtering algorithm. Process the inode if any of the
1441 * criteria match. This is for global/internal scans only.
1442 */
1443STATIC int
1444xfs_inode_match_id_union(
1445 struct xfs_inode *ip,
1446 struct xfs_eofblocks *eofb)
1447{
1448 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1449 uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1450 return 1;
1451
1452 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1453 gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1454 return 1;
1455
1456 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1457 xfs_get_projid(ip) == eofb->eof_prid)
1458 return 1;
1459
1460 return 0;
1461}
1462
1463STATIC int
1464xfs_inode_free_eofblocks(
1465 struct xfs_inode *ip,
1466 int flags,
1467 void *args)
1468{
1469 int ret = 0;
1470 struct xfs_eofblocks *eofb = args;
1471 int match;
1472
1473 if (!xfs_can_free_eofblocks(ip, false)) {
1474 /* inode could be preallocated or append-only */
1475 trace_xfs_inode_free_eofblocks_invalid(ip);
1476 xfs_inode_clear_eofblocks_tag(ip);
1477 return 0;
1478 }
1479
1480 /*
1481 * If the mapping is dirty the operation can block and wait for some
1482 * time. Unless we are waiting, skip it.
1483 */
1484 if (!(flags & SYNC_WAIT) &&
1485 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1486 return 0;
1487
1488 if (eofb) {
1489 if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1490 match = xfs_inode_match_id_union(ip, eofb);
1491 else
1492 match = xfs_inode_match_id(ip, eofb);
1493 if (!match)
1494 return 0;
1495
1496 /* skip the inode if the file size is too small */
1497 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1498 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1499 return 0;
1500 }
1501
1502 /*
1503 * If the caller is waiting, return -EAGAIN to keep the background
1504 * scanner moving and revisit the inode in a subsequent pass.
1505 */
1506 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1507 if (flags & SYNC_WAIT)
1508 ret = -EAGAIN;
1509 return ret;
1510 }
1511 ret = xfs_free_eofblocks(ip);
1512 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1513
1514 return ret;
1515}
1516
1517static int
1518__xfs_icache_free_eofblocks(
1519 struct xfs_mount *mp,
1520 struct xfs_eofblocks *eofb,
1521 int (*execute)(struct xfs_inode *ip, int flags,
1522 void *args),
1523 int tag)
1524{
1525 int flags = SYNC_TRYLOCK;
1526
1527 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1528 flags = SYNC_WAIT;
1529
1530 return xfs_inode_ag_iterator_tag(mp, execute, flags,
1531 eofb, tag);
1532}
1533
1534int
1535xfs_icache_free_eofblocks(
1536 struct xfs_mount *mp,
1537 struct xfs_eofblocks *eofb)
1538{
1539 return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
1540 XFS_ICI_EOFBLOCKS_TAG);
1541}
1542
1543/*
1544 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
1545 * multiple quotas, we don't know exactly which quota caused an allocation
1546 * failure. We make a best effort by including each quota under low free space
1547 * conditions (less than 1% free space) in the scan.
1548 */
1549static int
1550__xfs_inode_free_quota_eofblocks(
1551 struct xfs_inode *ip,
1552 int (*execute)(struct xfs_mount *mp,
1553 struct xfs_eofblocks *eofb))
1554{
1555 int scan = 0;
1556 struct xfs_eofblocks eofb = {0};
1557 struct xfs_dquot *dq;
1558
1559 /*
1560 * Run a sync scan to increase effectiveness and use the union filter to
1561 * cover all applicable quotas in a single scan.
1562 */
1563 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1564
1565 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
1566 dq = xfs_inode_dquot(ip, XFS_DQ_USER);
1567 if (dq && xfs_dquot_lowsp(dq)) {
1568 eofb.eof_uid = VFS_I(ip)->i_uid;
1569 eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1570 scan = 1;
1571 }
1572 }
1573
1574 if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
1575 dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
1576 if (dq && xfs_dquot_lowsp(dq)) {
1577 eofb.eof_gid = VFS_I(ip)->i_gid;
1578 eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1579 scan = 1;
1580 }
1581 }
1582
1583 if (scan)
1584 execute(ip->i_mount, &eofb);
1585
1586 return scan;
1587}
1588
1589int
1590xfs_inode_free_quota_eofblocks(
1591 struct xfs_inode *ip)
1592{
1593 return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
1594}
1595
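/* Map a per-AG radix tree tag to the in-core inode flag that mirrors it. */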
1596static inline unsigned long
1597xfs_iflag_for_tag(
1598 int tag)
1599{
1600 switch (tag) {
1601 case XFS_ICI_EOFBLOCKS_TAG:
1602 return XFS_IEOFBLOCKS;
1603 case XFS_ICI_COWBLOCKS_TAG:
1604 return XFS_ICOWBLOCKS;
1605 default:
1606 ASSERT(0);
1607 return 0;
1608 }
1609}
1610
1611static void
1612__xfs_inode_set_blocks_tag(
1613 xfs_inode_t *ip,
1614 void (*execute)(struct xfs_mount *mp),
1615 void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
1616 int error, unsigned long caller_ip),
1617 int tag)
1618{
1619 struct xfs_mount *mp = ip->i_mount;
1620 struct xfs_perag *pag;
1621 int tagged;
1622
1623 /*
1624 * Don't bother locking the AG and looking up in the radix trees
1625 * if we already know that we have the tag set.
1626 */
1627 if (ip->i_flags & xfs_iflag_for_tag(tag))
1628 return;
1629 spin_lock(&ip->i_flags_lock);
1630 ip->i_flags |= xfs_iflag_for_tag(tag);
1631 spin_unlock(&ip->i_flags_lock);
1632
1633 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1634 spin_lock(&pag->pag_ici_lock);
1635
1636 tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
1637 radix_tree_tag_set(&pag->pag_ici_root,
1638 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
1639 if (!tagged) {
1640 /* propagate the eofblocks tag up into the perag radix tree */
1641 spin_lock(&ip->i_mount->m_perag_lock);
1642 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1643 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1644 tag);
1645 spin_unlock(&ip->i_mount->m_perag_lock);
1646
1647 /* kick off background trimming */
1648 execute(ip->i_mount);
1649
1650 set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
1651 }
1652
1653 spin_unlock(&pag->pag_ici_lock);
1654 xfs_perag_put(pag);
1655}
1656
1657void
1658xfs_inode_set_eofblocks_tag(
1659 xfs_inode_t *ip)
1660{
1661 trace_xfs_inode_set_eofblocks_tag(ip);
1662 return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
1663 trace_xfs_perag_set_eofblocks,
1664 XFS_ICI_EOFBLOCKS_TAG);
1665}
1666
1667static void
1668__xfs_inode_clear_blocks_tag(
1669 xfs_inode_t *ip,
1670 void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
1671 int error, unsigned long caller_ip),
1672 int tag)
1673{
1674 struct xfs_mount *mp = ip->i_mount;
1675 struct xfs_perag *pag;
1676
1677 spin_lock(&ip->i_flags_lock);
1678 ip->i_flags &= ~xfs_iflag_for_tag(tag);
1679 spin_unlock(&ip->i_flags_lock);
1680
1681 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1682 spin_lock(&pag->pag_ici_lock);
1683
1684 radix_tree_tag_clear(&pag->pag_ici_root,
1685 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
1686 if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
1687 /* clear the eofblocks tag from the perag radix tree */
1688 spin_lock(&ip->i_mount->m_perag_lock);
1689 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1690 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1691 tag);
1692 spin_unlock(&ip->i_mount->m_perag_lock);
1693 clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
1694 }
1695
1696 spin_unlock(&pag->pag_ici_lock);
1697 xfs_perag_put(pag);
1698}
1699
1700void
1701xfs_inode_clear_eofblocks_tag(
1702 xfs_inode_t *ip)
1703{
1704 trace_xfs_inode_clear_eofblocks_tag(ip);
1705 return __xfs_inode_clear_blocks_tag(ip,
1706 trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
1707}
1708
1709/*
1710 * Set ourselves up to free CoW blocks from this file. If it's already clean
1711 * then we can bail out quickly, but otherwise we must back off if the file
1712 * is undergoing some kind of write.
1713 */
1714static bool
1715xfs_prep_free_cowblocks(
1716 struct xfs_inode *ip)
1717{
1718 /*
1719 * Just clear the tag if we have an empty cow fork or none at all. It's
1720 * possible the inode was fully unshared since it was originally tagged.
1721 */
1722 if (!xfs_inode_has_cow_data(ip)) {
1723 trace_xfs_inode_free_cowblocks_invalid(ip);
1724 xfs_inode_clear_cowblocks_tag(ip);
1725 return false;
1726 }
1727
1728 /*
1729 * If the mapping is dirty or under writeback we cannot touch the
1730 * CoW fork. Leave it alone if we're in the midst of a directio.
1731 */
1732 if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
1733 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1734 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
1735 atomic_read(&VFS_I(ip)->i_dio_count))
1736 return false;
1737
1738 return true;
1739}
1740
1741/*
1742 * Automatic CoW Reservation Freeing
1743 *
1744 * These functions automatically garbage collect leftover CoW reservations
1745 * that were made on behalf of a cowextsize hint when we start to run out
1746 * of quota or when the reservations sit around for too long. If the file
1747 * has dirty pages or is undergoing writeback, its CoW reservations will
1748 * be retained.
1749 *
1750 * The actual garbage collection piggybacks off the same code that runs
1751 * the speculative EOF preallocation garbage collector.
1752 */
1753STATIC int
1754xfs_inode_free_cowblocks(
1755 struct xfs_inode *ip,
1756 int flags,
1757 void *args)
1758{
1759 struct xfs_eofblocks *eofb = args;
1760 int match;
1761 int ret = 0;
1762
1763 if (!xfs_prep_free_cowblocks(ip))
1764 return 0;
1765
1766 if (eofb) {
1767 if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1768 match = xfs_inode_match_id_union(ip, eofb);
1769 else
1770 match = xfs_inode_match_id(ip, eofb);
1771 if (!match)
1772 return 0;
1773
1774 /* skip the inode if the file size is too small */
1775 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1776 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1777 return 0;
1778 }
1779
1780 /* Free the CoW blocks */
1781 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1782 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
1783
1784 /*
1785 * Check again, nobody else should be able to dirty blocks or change
1786 * the reflink iflag now that we have the first two locks held.
1787 */
1788 if (xfs_prep_free_cowblocks(ip))
1789 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
1790
1791 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
1792 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1793
1794 return ret;
1795}
1796
1797int
1798xfs_icache_free_cowblocks(
1799 struct xfs_mount *mp,
1800 struct xfs_eofblocks *eofb)
1801{
1802 return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
1803 XFS_ICI_COWBLOCKS_TAG);
1804}
1805
1806int
1807xfs_inode_free_quota_cowblocks(
1808 struct xfs_inode *ip)
1809{
1810 return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
1811}
1812
1813void
1814xfs_inode_set_cowblocks_tag(
1815 xfs_inode_t *ip)
1816{
1817 trace_xfs_inode_set_cowblocks_tag(ip);
1818 return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
1819 trace_xfs_perag_set_cowblocks,
1820 XFS_ICI_COWBLOCKS_TAG);
1821}
1822
1823void
1824xfs_inode_clear_cowblocks_tag(
1825 xfs_inode_t *ip)
1826{
1827 trace_xfs_inode_clear_cowblocks_tag(ip);
1828 return __xfs_inode_clear_blocks_tag(ip,
1829 trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
1830}
1831
1832/* Disable post-EOF and CoW block auto-reclamation. */
1833void
1834xfs_stop_block_reaping(
1835 struct xfs_mount *mp)
1836{
1837 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1838 cancel_delayed_work_sync(&mp->m_cowblocks_work);
1839}
1840
1841/* Enable post-EOF and CoW block auto-reclamation. */
1842void
1843xfs_start_block_reaping(
1844 struct xfs_mount *mp)
1845{
1846 xfs_queue_eofblocks(mp);
1847 xfs_queue_cowblocks(mp);
1848}