Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 457ac9f..e685299 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,7 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
- depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
select FS_IOMAP
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7f96bda..06b68b6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -4,8 +4,8 @@
# All Rights Reserved.
#
-ccflags-y += -I$(src) # needed for trace events
-ccflags-y += -I$(src)/libxfs
+ccflags-y += -I $(srctree)/$(src) # needed for trace events
+ccflags-y += -I $(srctree)/$(src)/libxfs
ccflags-$(CONFIG_XFS_DEBUG) += -g
@@ -49,6 +49,7 @@
xfs_refcount_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
+ xfs_trans_inode.o \
xfs_trans_resv.o \
xfs_types.o \
)
@@ -62,6 +63,7 @@
xfs_attr_inactive.o \
xfs_attr_list.o \
xfs_bmap_util.o \
+ xfs_bio_io.o \
xfs_buf.o \
xfs_dir2_readdir.o \
xfs_discard.o \
@@ -73,15 +75,18 @@
xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
+ xfs_health.o \
xfs_icache.o \
xfs_ioctl.o \
xfs_iomap.o \
xfs_iops.o \
xfs_inode.o \
xfs_itable.o \
+ xfs_iwalk.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_pwork.o \
xfs_reflink.o \
xfs_stats.o \
xfs_super.o \
@@ -103,12 +108,7 @@
xfs_rmap_item.o \
xfs_log_recover.o \
xfs_trans_ail.o \
- xfs_trans_bmap.o \
- xfs_trans_buf.o \
- xfs_trans_extfree.o \
- xfs_trans_inode.o \
- xfs_trans_refcount.o \
- xfs_trans_rmap.o \
+ xfs_trans_buf.o
# optional features
xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
@@ -142,6 +142,8 @@
common.o \
dabtree.o \
dir.o \
+ fscounters.o \
+ health.o \
ialloc.o \
inode.o \
parent.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index fdd9d6e..da031b9 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,15 +3,10 @@
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/mm.h>
-#include <linux/sched/mm.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/blkdev.h>
+#include "xfs.h"
#include <linux/backing-dev.h>
-#include "kmem.h"
#include "xfs_message.h"
+#include "xfs_trace.h"
void *
kmem_alloc(size_t size, xfs_km_flags_t flags)
@@ -20,9 +15,11 @@
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
+ trace_kmem_alloc(size, flags, _RET_IP_);
+
do {
ptr = kmalloc(size, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ if (ptr || (flags & KM_MAYFAIL))
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
@@ -33,28 +30,24 @@
} while (1);
}
-void *
-kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+
+/*
+ * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
+ * we need to tell memory reclaim that we are in such a context via
+ * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
+ * and potentially deadlocking.
+ */
+static void *
+__kmem_vmalloc(size_t size, xfs_km_flags_t flags)
{
unsigned nofs_flag = 0;
void *ptr;
- gfp_t lflags;
+ gfp_t lflags = kmem_flags_convert(flags);
- ptr = kmem_alloc(size, flags | KM_MAYFAIL);
- if (ptr)
- return ptr;
-
- /*
- * __vmalloc() will allocate data pages and auxillary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
- * here. Hence we need to tell memory reclaim that we are in such a
- * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
- * the filesystem here and potentially deadlocking.
- */
if (flags & KM_NOFS)
nofs_flag = memalloc_nofs_save();
- lflags = kmem_flags_convert(flags);
ptr = __vmalloc(size, lflags, PAGE_KERNEL);
if (flags & KM_NOFS)
@@ -63,6 +56,44 @@
return ptr;
}
+/*
+ * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned
+ * to the @align_mask. We only guarantee alignment up to page size, we'll clamp
+ * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE
+ * aligned region.
+ */
+void *
+kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags)
+{
+ void *ptr;
+
+ trace_kmem_alloc_io(size, flags, _RET_IP_);
+
+ if (WARN_ON_ONCE(align_mask >= PAGE_SIZE))
+ align_mask = PAGE_SIZE - 1;
+
+ ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+ if (ptr) {
+ if (!((uintptr_t)ptr & align_mask))
+ return ptr;
+ kfree(ptr);
+ }
+ return __kmem_vmalloc(size, flags);
+}
+
+void *
+kmem_alloc_large(size_t size, xfs_km_flags_t flags)
+{
+ void *ptr;
+
+ trace_kmem_alloc_large(size, flags, _RET_IP_);
+
+ ptr = kmem_alloc(size, flags | KM_MAYFAIL);
+ if (ptr)
+ return ptr;
+ return __kmem_vmalloc(size, flags);
+}
+
void *
kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
{
@@ -70,9 +101,11 @@
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
+ trace_kmem_realloc(newsize, flags, _RET_IP_);
+
do {
ptr = krealloc(old, newsize, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ if (ptr || (flags & KM_MAYFAIL))
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
@@ -90,9 +123,10 @@
gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
+ trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
do {
ptr = kmem_cache_alloc(zone, lflags);
- if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+ if (ptr || (flags & KM_MAYFAIL))
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8e6b3ba..8170d95 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -16,8 +16,6 @@
*/
typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u)
-#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
@@ -32,15 +30,11 @@
{
gfp_t lflags;
- BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
+ BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
- if (flags & KM_NOSLEEP) {
- lflags = GFP_ATOMIC | __GFP_NOWARN;
- } else {
- lflags = GFP_KERNEL | __GFP_NOWARN;
- if (flags & KM_NOFS)
- lflags &= ~__GFP_FS;
- }
+ lflags = GFP_KERNEL | __GFP_NOWARN;
+ if (flags & KM_NOFS)
+ lflags &= ~__GFP_FS;
/*
* Default page/slab allocator behavior is to retry for ever
@@ -59,6 +53,7 @@
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
@@ -124,4 +119,12 @@
return kmem_zone_alloc(zone, flags | KM_ZERO);
}
+static inline struct page *
+kmem_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 9345802..14fbdf2 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -10,6 +10,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
@@ -19,18 +20,19 @@
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
static struct xfs_buf *
xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
- int flags,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
if (!bp)
return NULL;
@@ -42,6 +44,12 @@
return bp;
}
+static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
/*
* Generic btree root block init function
*/
@@ -51,7 +59,62 @@
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+ xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+}
+
+/* Finish initializing a free space btree. */
+static void
+xfs_freesp_init_recs(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_alloc_rec *arec;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+
+ if (is_log_ag(mp, id)) {
+ struct xfs_alloc_rec *nrec;
+ xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
+ mp->m_sb.sb_logstart);
+
+ ASSERT(start >= mp->m_ag_prealloc_blocks);
+ if (start != mp->m_ag_prealloc_blocks) {
+ /*
+ * Modify first record to pad stripe align of log
+ */
+ arec->ar_blockcount = cpu_to_be32(start -
+ mp->m_ag_prealloc_blocks);
+ nrec = arec + 1;
+
+ /*
+ * Insert second record at start of internal log
+ * which then gets trimmed.
+ */
+ nrec->ar_startblock = cpu_to_be32(
+ be32_to_cpu(arec->ar_startblock) +
+ be32_to_cpu(arec->ar_blockcount));
+ arec = nrec;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+ /*
+ * Change record start to after the internal log
+ */
+ be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks);
+ }
+
+ /*
+ * Calculate the record block count and check for the case where
+ * the log might have consumed all available space in the AG. If
+ * so, reset the record count to 0 to avoid exposure of an invalid
+ * record start block.
+ */
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock));
+ if (!arec->ar_blockcount)
+ block->bb_numrecs = 0;
}
/*
@@ -63,13 +126,8 @@
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_alloc_rec *arec;
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(id->agsize -
- be32_to_cpu(arec->ar_startblock));
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
}
static void
@@ -78,13 +136,8 @@
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_alloc_rec *arec;
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(id->agsize -
- be32_to_cpu(arec->ar_startblock));
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
}
/*
@@ -99,7 +152,7 @@
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_rmap_rec *rrec;
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
/*
* mark the AG header regions as static metadata The BNO
@@ -147,6 +200,18 @@
rrec->rm_offset = 0;
be16_add_cpu(&block->bb_numrecs, 1);
}
+
+ /* account for the log space */
+ if (is_log_ag(mp, id)) {
+ rrec = XFS_RMAP_REC_ADDR(block,
+ be16_to_cpu(block->bb_numrecs) + 1);
+ rrec->rm_startblock = cpu_to_be32(
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart));
+ rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
}
/*
@@ -207,6 +272,14 @@
agf->agf_refcount_level = cpu_to_be32(1);
agf->agf_refcount_blocks = cpu_to_be32(1);
}
+
+ if (is_log_ag(mp, id)) {
+ int64_t logblocks = mp->m_sb.sb_logblocks;
+
+ be32_add_cpu(&agf->agf_freeblks, -logblocks);
+ agf->agf_longest = cpu_to_be32(id->agsize -
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks);
+ }
}
static void
@@ -271,7 +344,7 @@
{
struct xfs_buf *bp;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
if (!bp)
return -ENOMEM;
@@ -339,14 +412,14 @@
{ /* BNO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_cntbt_buf_ops,
.work = &xfs_cntroot_init,
.need_init = true
},
@@ -361,7 +434,7 @@
{ /* FINO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_inobt_buf_ops,
+ .ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
.need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
@@ -414,7 +487,6 @@
struct aghdr_init_data *id,
xfs_extlen_t len)
{
- struct xfs_owner_info oinfo;
struct xfs_buf *bp;
struct xfs_agi *agi;
struct xfs_agf *agf;
@@ -448,17 +520,69 @@
/*
* Free the new space.
*
- * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+ * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
* this doesn't actually exist in the rmap btree.
*/
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
error = xfs_rmap_free(tp, bp, id->agno,
be32_to_cpu(agf->agf_length) - len,
- len, &oinfo);
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE);
if (error)
return error;
return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
be32_to_cpu(agf->agf_length) - len),
- len, &oinfo, XFS_AG_RESV_NONE);
+ len, &XFS_RMAP_OINFO_SKIP_UPDATE,
+ XFS_AG_RESV_NONE);
+}
+
+/* Retrieve AG geometry. */
+int
+xfs_ag_get_geometry(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_ag_geometry *ageo)
+{
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ struct xfs_perag *pag;
+ unsigned int freeblks;
+ int error;
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return -EINVAL;
+
+ /* Lock the AG headers. */
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ if (error)
+ return error;
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ if (error)
+ goto out_agi;
+ pag = xfs_perag_get(mp, agno);
+
+ /* Fill out form. */
+ memset(ageo, 0, sizeof(*ageo));
+ ageo->ag_number = agno;
+
+ agi = XFS_BUF_TO_AGI(agi_bp);
+ ageo->ag_icount = be32_to_cpu(agi->agi_count);
+ ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
+
+ agf = XFS_BUF_TO_AGF(agf_bp);
+ ageo->ag_length = be32_to_cpu(agf->agf_length);
+ freeblks = pag->pagf_freeblks +
+ pag->pagf_flcount +
+ pag->pagf_btreeblks -
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE);
+ ageo->ag_freeblks = freeblks;
+ xfs_ag_geom_health(pag, ageo);
+
+ /* Release resources. */
+ xfs_perag_put(pag);
+ xfs_buf_relse(agf_bp);
+out_agi:
+ xfs_buf_relse(agi_bp);
+ return error;
}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 412702e..5166322 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -26,5 +26,7 @@
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
struct aghdr_init_data *id, xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_ag_geometry *ageo);
#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc..87a9747 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -9,20 +9,12 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
-#include "xfs_bit.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ag_resv.h"
-#include "xfs_trans_space.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
@@ -281,7 +273,7 @@
*/
ask = used = 0;
- mp->m_inotbt_nores = true;
+ mp->m_finobt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index e1c0c0d..533b04a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -13,7 +13,6 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
@@ -21,7 +20,6 @@
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -41,8 +39,6 @@
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
- xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
/*
* Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
@@ -555,7 +551,7 @@
xfs_agfl_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
int i;
@@ -568,9 +564,9 @@
if (!xfs_sb_version_hascrc(&mp->m_sb))
return NULL;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
return __this_address;
- if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -596,7 +592,7 @@
xfs_agfl_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
/*
@@ -621,7 +617,7 @@
xfs_agfl_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -643,6 +639,7 @@
const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
.verify_struct = xfs_agfl_verify,
@@ -699,6 +696,107 @@
*/
/*
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_small(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur *ccur, /* optional by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
+{
+ int error = 0;
+ xfs_agblock_t fbno = NULLAGBLOCK;
+ xfs_extlen_t flen = 0;
+ int i = 0;
+
+ /*
+ * If a cntbt cursor is provided, try to allocate the largest record in
+ * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
+ * allocation. Make sure to respect minleft even when pulling from the
+ * freelist.
+ */
+ if (ccur)
+ error = xfs_btree_decrement(ccur, 0, &i);
+ if (error)
+ goto error;
+ if (i) {
+ error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+ goto out;
+ }
+
+ if (args->minlen != 1 || args->alignment != 1 ||
+ args->resv == XFS_AG_RESV_AGFL ||
+ (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
+ args->minleft))
+ goto out;
+
+ error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ if (error)
+ goto error;
+ if (fbno == NULLAGBLOCK)
+ goto out;
+
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_alloc_allow_busy_reuse(args->datatype));
+
+ if (xfs_alloc_is_userdata(args->datatype)) {
+ struct xfs_buf *bp;
+
+ bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ xfs_trans_binval(args->tp, bp);
+ }
+ *fbnop = args->agbno = fbno;
+ *flenp = args->len = 1;
+ XFS_WANT_CORRUPTED_GOTO(args->mp,
+ fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+ error);
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that doesn't live in the
+ * free space, we need to clear out the OWN_AG rmap.
+ */
+ error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ goto error;
+
+ *stat = 0;
+ return 0;
+
+out:
+ /*
+ * Can't do the allocation, give up.
+ */
+ if (flen < args->minlen) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_small_notenough(args);
+ flen = 0;
+ }
+ *fbnop = fbno;
+ *flenp = flen;
+ *stat = 1;
+ trace_xfs_alloc_small_done(args);
+ return 0;
+
+error:
+ trace_xfs_alloc_small_error(args);
+ return error;
+}
+
+/*
* Allocate a variable extent in the allocation group agno.
* Type and bno are used to determine where in the allocation group the
* extent will start.
@@ -1582,140 +1680,32 @@
}
/*
- * Deal with the case where only small freespaces remain.
- * Either return the contents of the last freespace record,
- * or allocate space from the freelist if there is nothing in the tree.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_small(
- xfs_alloc_arg_t *args, /* allocation argument structure */
- xfs_btree_cur_t *ccur, /* by-size cursor */
- xfs_agblock_t *fbnop, /* result block number */
- xfs_extlen_t *flenp, /* result length */
- int *stat) /* status: 0-freelist, 1-normal/none */
-{
- struct xfs_owner_info oinfo;
- int error;
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
- int i;
-
- if ((error = xfs_btree_decrement(ccur, 0, &i)))
- goto error0;
- if (i) {
- if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- }
- /*
- * Nothing in the btree, try the freelist. Make sure
- * to respect minleft even when pulling from the
- * freelist.
- */
- else if (args->minlen == 1 && args->alignment == 1 &&
- args->resv != XFS_AG_RESV_AGFL &&
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
- > args->minleft)) {
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
- if (error)
- goto error0;
- if (fbno != NULLAGBLOCK) {
- xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- xfs_alloc_allow_busy_reuse(args->datatype));
-
- if (xfs_alloc_is_userdata(args->datatype)) {
- xfs_buf_t *bp;
-
- bp = xfs_btree_get_bufs(args->mp, args->tp,
- args->agno, fbno, 0);
- if (!bp) {
- error = -EFSCORRUPTED;
- goto error0;
- }
- xfs_trans_binval(args->tp, bp);
- }
- args->len = 1;
- args->agbno = fbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error0);
- args->wasfromfl = 1;
- trace_xfs_alloc_small_freelist(args);
-
- /*
- * If we're feeding an AGFL block to something that
- * doesn't live in the free space, we need to clear
- * out the OWN_AG rmap.
- */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
- error = xfs_rmap_free(args->tp, args->agbp, args->agno,
- fbno, 1, &oinfo);
- if (error)
- goto error0;
-
- *stat = 0;
- return 0;
- }
- /*
- * Nothing in the freelist.
- */
- else
- flen = 0;
- }
- /*
- * Can't allocate from the freelist for some reason.
- */
- else {
- fbno = NULLAGBLOCK;
- flen = 0;
- }
- /*
- * Can't do the allocation, give up.
- */
- if (flen < args->minlen) {
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_small_notenough(args);
- flen = 0;
- }
- *fbnop = fbno;
- *flenp = flen;
- *stat = 1;
- trace_xfs_alloc_small_done(args);
- return 0;
-
-error0:
- trace_xfs_alloc_small_error(args);
- return error;
-}
-
-/*
* Free the extent starting at agno/bno for length.
*/
STATIC int
xfs_free_ag_extent(
- xfs_trans_t *tp,
- xfs_buf_t *agbp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
{
- xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
- xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
- int error; /* error return value */
- xfs_agblock_t gtbno; /* start of right neighbor block */
- xfs_extlen_t gtlen; /* length of right neighbor block */
- int haveleft; /* have a left neighbor block */
- int haveright; /* have a right neighbor block */
- int i; /* temp, result code */
- xfs_agblock_t ltbno; /* start of left neighbor block */
- xfs_extlen_t ltlen; /* length of left neighbor block */
- xfs_mount_t *mp; /* mount point struct for filesystem */
- xfs_agblock_t nbno; /* new starting block of freespace */
- xfs_extlen_t nlen; /* new length of freespace */
- xfs_perag_t *pag; /* per allocation group data */
+ struct xfs_mount *mp;
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ xfs_agblock_t gtbno; /* start of right neighbor */
+ xfs_extlen_t gtlen; /* length of right neighbor */
+ xfs_agblock_t ltbno; /* start of left neighbor */
+ xfs_extlen_t ltlen; /* length of left neighbor */
+ xfs_agblock_t nbno; /* new starting block of freesp */
+ xfs_extlen_t nlen; /* new length of freespace */
+ int haveleft; /* have a left neighbor */
+ int haveright; /* have a right neighbor */
+ int i;
+ int error;
bno_cur = cnt_cur = NULL;
mp = tp->t_mountp;
@@ -2043,6 +2033,7 @@
xfs_extlen_t alloc_len, longest;
xfs_extlen_t reservation; /* blocks that are still reserved */
int available;
+ xfs_extlen_t agflcount;
if (flags & XFS_ALLOC_FLAG_FREEING)
return true;
@@ -2055,8 +2046,13 @@
if (longest < alloc_len)
return false;
- /* do we have enough free space remaining for the allocation? */
- available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+ /*
+ * Do we have enough free space remaining for the allocation? Don't
+ * account extra agfl blocks because we are about to defer free them,
+ * making them unavailable until the current transaction commits.
+ */
+ agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
+ available = (int)(pag->pagf_freeblks + agflcount -
reservation - min_free - args->minleft);
if (available < (int)max(args->total, alloc_len))
return false;
@@ -2090,7 +2086,7 @@
if (error)
return error;
- bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0);
+ bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
if (!bp)
return -EFSCORRUPTED;
xfs_trans_binval(tp, bp);
@@ -2209,7 +2205,7 @@
ASSERT(xfs_bmap_free_item_zone != NULL);
ASSERT(oinfo != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
new->xefi_oinfo = *oinfo;
@@ -2238,6 +2234,9 @@
xfs_extlen_t need; /* total blocks needed in freelist */
int error = 0;
+ /* deferred ops (AGFL block frees) require permanent transactions */
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
if (!pag->pagf_init) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
if (error)
@@ -2314,10 +2313,11 @@
* repair/rmap.c in xfsprogs for details.
*/
memset(&targs, 0, sizeof(targs));
+ /* struct copy below */
if (flags & XFS_ALLOC_FLAG_NORMAP)
- xfs_rmap_skip_owner_update(&targs.oinfo);
+ targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
else
- xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
+ targs.oinfo = XFS_RMAP_OINFO_AG;
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
@@ -2435,7 +2435,6 @@
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
- xfs_perag_put(pag);
logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
if (btreeblk) {
@@ -2443,6 +2442,7 @@
pag->pagf_btreeblks++;
logflags |= XFS_AGF_BTREEBLKS;
}
+ xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
@@ -2577,7 +2577,7 @@
xfs_agf_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -2588,8 +2588,10 @@
return __this_address;
}
- if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2633,7 +2635,7 @@
xfs_agf_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -2650,7 +2652,7 @@
xfs_agf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -2671,6 +2673,7 @@
const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
.verify_struct = xfs_agf_verify,
@@ -3008,21 +3011,21 @@
* Just break up the extent address and hand off to xfs_free_ag_extent
* after fixing up the freelist.
*/
-int /* error */
+int
__xfs_free_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t bno, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo, /* extent owner */
- enum xfs_ag_resv_type type, /* block reservation type */
- bool skip_discard)
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *agbp;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
- int error;
- unsigned int busy_flags = 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ int error;
+ unsigned int busy_flags = 0;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -3134,7 +3137,7 @@
/*
* Walk all the blocks in the AGFL. The @walk_fn can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ * error code or XFS_ITER_*.
*/
int
xfs_agfl_walk(
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 00cd5ec..d6ed5d2 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -182,7 +182,7 @@
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
- struct xfs_owner_info *oinfo, /* extent owner */
+ const struct xfs_owner_info *oinfo, /* extent owner */
enum xfs_ag_resv_type type, /* block reservation type */
bool skip_discard);
@@ -191,7 +191,7 @@
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
return __xfs_free_extent(tp, bno, len, oinfo, type, false);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8..2a94543 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -17,7 +17,6 @@
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
@@ -292,53 +291,39 @@
xfs_allocbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
/*
- * magic number and level verification
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
*
- * During growfs operations, we can't verify the exact level or owner as
- * the perag is not fully initialised and hence not attached to the
- * buffer. In this case, check against the maximum tree depth.
- *
- * Similarly, during log recovery we will have a perag structure
- * attached, but the agf information will not yet have been initialised
- * from the on disk AGF. Again, we can only check against maximum limits
- * in this case.
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
*/
level = be16_to_cpu(block->bb_level);
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
return __this_address;
- break;
- case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
- return __this_address;
- break;
- default:
+ } else if (level >= mp->m_ag_maxlevels)
return __this_address;
- }
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
@@ -377,13 +362,23 @@
}
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
- .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
.verify_struct = xfs_allocbt_verify,
};
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
STATIC int
xfs_bnobt_keys_inorder(
@@ -448,7 +443,7 @@
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_bnobt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_bnobt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +465,7 @@
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_cntbt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_cntbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index c6299f8..510ca69 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -9,23 +9,18 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr_remote.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
@@ -102,7 +97,10 @@
* Overall external interface routines.
*========================================================================*/
-/* Retrieve an extended attribute and its value. Must have ilock. */
+/*
+ * Retrieve an extended attribute and its value. Must have ilock.
+ * Returns 0 on successful retrieval, otherwise an error.
+ */
int
xfs_attr_get_ilocked(
struct xfs_inode *ip,
@@ -120,12 +118,28 @@
return xfs_attr_node_get(args);
}
-/* Retrieve an extended attribute by name, and its value. */
+/*
+ * Retrieve an extended attribute by name, and its value if requested.
+ *
+ * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
+ * just an indication whether the attribute exists and the size of the value if
+ * it exists. The size is returned in @valuelenp,
+ *
+ * If the attribute is found, but exceeds the size limit set by the caller in
+ * @valuelenp, return -ERANGE with the size of the attribute that was found in
+ * @valuelenp.
+ *
+ * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
+ * existence of the attribute has been determined. On success, return that
+ * buffer to the caller and leave them to free it. On failure, free any
+ * allocated buffer and ensure the buffer pointer returned to the caller is
+ * null.
+ */
int
xfs_attr_get(
struct xfs_inode *ip,
const unsigned char *name,
- unsigned char *value,
+ unsigned char **value,
int *valuelenp,
int flags)
{
@@ -133,6 +147,8 @@
uint lock_mode;
int error;
+ ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+
XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -142,17 +158,29 @@
if (error)
return error;
- args.value = value;
- args.valuelen = *valuelenp;
/* Entirely possible to look up a name which doesn't exist */
args.op_flags = XFS_DA_OP_OKNOENT;
+ if (flags & ATTR_ALLOC)
+ args.op_flags |= XFS_DA_OP_ALLOCVAL;
+ else
+ args.value = *value;
+ args.valuelen = *valuelenp;
lock_mode = xfs_ilock_attr_map_shared(ip);
error = xfs_attr_get_ilocked(ip, &args);
xfs_iunlock(ip, lock_mode);
-
*valuelenp = args.valuelen;
- return error == -EEXIST ? 0 : error;
+
+ /* on error, we have to clean up allocated value buffers */
+ if (error) {
+ if (flags & ATTR_ALLOC) {
+ kmem_free(args.value);
+ *value = NULL;
+ }
+ return error;
+ }
+ *value = args.value;
+ return 0;
}
/*
@@ -191,6 +219,121 @@
return nblks;
}
+STATIC int
+xfs_attr_try_sf_addname(
+ struct xfs_inode *dp,
+ struct xfs_da_args *args)
+{
+
+ struct xfs_mount *mp = dp->i_mount;
+ int error, error2;
+
+ error = xfs_attr_shortform_addname(args);
+ if (error == -ENOSPC)
+ return error;
+
+ /*
+ * Commit the shortform mods, and we're done.
+ * NOTE: this is also the error path (EEXIST, etc).
+ */
+ if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(args->trans);
+
+ error2 = xfs_trans_commit(args->trans);
+ args->trans = NULL;
+ return error ? error : error2;
+}
+
+/*
+ * Set the attribute specified in @args.
+ */
+int
+xfs_attr_set_args(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *leaf_bp = NULL;
+ int error;
+
+ /*
+ * If the attribute list is non-existent or a shortform list,
+ * upgrade it to a single-leaf-block attribute list.
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
+
+ /*
+ * Try to add the attr to the attribute list in the inode.
+ */
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC)
+ return error;
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block.
+ * GROT: another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a
+ * concurrent AIL push cannot grab the half-baked leaf
+ * buffer and run into problems with the write verifier.
+ * Once we're done rolling the transaction we can release
+ * the hold and add the attr to the leaf.
+ */
+ xfs_trans_bhold(args->trans, leaf_bp);
+ error = xfs_defer_finish(&args->trans);
+ xfs_trans_bhold_release(args->trans, leaf_bp);
+ if (error) {
+ xfs_trans_brelse(args->trans, leaf_bp);
+ return error;
+ }
+ }
+
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ error = xfs_attr_leaf_addname(args);
+ else
+ error = xfs_attr_node_addname(args);
+ return error;
+}
+
+/*
+ * Remove the attribute specified in @args.
+ */
+int
+xfs_attr_remove_args(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ int error;
+
+ if (!xfs_inode_hasattr(dp)) {
+ error = -ENOATTR;
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+ error = xfs_attr_shortform_remove(args);
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_removename(args);
+ } else {
+ error = xfs_attr_node_removename(args);
+ }
+
+ return error;
+}
+
int
xfs_attr_set(
struct xfs_inode *dp,
@@ -200,11 +343,10 @@
int flags)
{
struct xfs_mount *mp = dp->i_mount;
- struct xfs_buf *leaf_bp = NULL;
struct xfs_da_args args;
struct xfs_trans_res tres;
int rsvd = (flags & ATTR_ROOT) != 0;
- int error, err2, local;
+ int error, local;
XFS_STATS_INC(mp, xs_attr_set);
@@ -255,93 +397,17 @@
error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_trans_cancel(args.trans);
- return error;
- }
+ if (error)
+ goto out_trans_cancel;
xfs_trans_ijoin(args.trans, dp, 0);
-
- /*
- * If the attribute list is non-existent or a shortform list,
- * upgrade it to a single-leaf-block attribute list.
- */
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
-
- /*
- * Build initial attribute list (if required).
- */
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
- xfs_attr_shortform_create(&args);
-
- /*
- * Try to add the attr to the attribute list in
- * the inode.
- */
- error = xfs_attr_shortform_addname(&args);
- if (error != -ENOSPC) {
- /*
- * Commit the shortform mods, and we're done.
- * NOTE: this is also the error path (EEXIST, etc).
- */
- ASSERT(args.trans != NULL);
-
- /*
- * If this is a synchronous mount, make sure that
- * the transaction goes to disk before returning
- * to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
-
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_trans_ichgtime(args.trans, dp,
- XFS_ICHGTIME_CHG);
- }
- err2 = xfs_trans_commit(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- return error ? error : err2;
- }
-
- /*
- * It won't fit in the shortform, transform to a leaf block.
- * GROT: another possible req'mt for a double-split btree op.
- */
- error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
- if (error)
- goto out;
- /*
- * Prevent the leaf buffer from being unlocked so that a
- * concurrent AIL push cannot grab the half-baked leaf
- * buffer and run into problems with the write verifier.
- */
- xfs_trans_bhold(args.trans, leaf_bp);
- error = xfs_defer_finish(&args.trans);
- if (error)
- goto out;
-
- /*
- * Commit the leaf transformation. We'll need another (linked)
- * transaction to add the new attribute to the leaf, which
- * means that we have to hold & join the leaf buffer here too.
- */
- error = xfs_trans_roll_inode(&args.trans, dp);
- if (error)
- goto out;
- xfs_trans_bjoin(args.trans, leaf_bp);
- leaf_bp = NULL;
- }
-
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
- error = xfs_attr_leaf_addname(&args);
- else
- error = xfs_attr_node_addname(&args);
+ error = xfs_attr_set_args(&args);
if (error)
- goto out;
+ goto out_trans_cancel;
+ if (!args.trans) {
+ /* shortform attribute has already been committed */
+ goto out_unlock;
+ }
/*
* If this is a synchronous mount, make sure that the
@@ -358,17 +424,14 @@
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(args.trans);
+out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
return error;
-out:
- if (leaf_bp)
- xfs_trans_brelse(args.trans, leaf_bp);
+out_trans_cancel:
if (args.trans)
xfs_trans_cancel(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+ goto out_unlock;
}
/*
@@ -423,17 +486,7 @@
*/
xfs_trans_ijoin(args.trans, dp, 0);
- if (!xfs_inode_hasattr(dp)) {
- error = -ENOATTR;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
- error = xfs_attr_shortform_remove(&args);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_removename(&args);
- } else {
- error = xfs_attr_node_removename(&args);
- }
-
+ error = xfs_attr_remove_args(&args);
if (error)
goto out;
@@ -748,6 +801,8 @@
*
* This leaf block cannot have a "remote" value, we only call this routine
* if bmap_one_block() says there is only one block (ie: no remote blks).
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
*/
STATIC int
xfs_attr_leaf_get(xfs_da_args_t *args)
@@ -769,9 +824,6 @@
}
error = xfs_attr3_leaf_getvalue(bp, args);
xfs_trans_brelse(args->trans, bp);
- if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
- error = xfs_attr_rmtval_get(args);
- }
return error;
}
@@ -1248,11 +1300,13 @@
}
/*
- * Look up a filename in a node attribute list.
+ * Retrieve the attribute data from a node attribute list.
*
* This routine gets called for any attribute fork that has more than one
* block, ie: both true Btree attr lists and for single-leaf-blocks with
* "remote" values taking up more blocks.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
*/
STATIC int
xfs_attr_node_get(xfs_da_args_t *args)
@@ -1274,24 +1328,21 @@
error = xfs_da3_node_lookup_int(state, &retval);
if (error) {
retval = error;
- } else if (retval == -EEXIST) {
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->bp != NULL);
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
-
- /*
- * Get the value, local or "remote"
- */
- retval = xfs_attr3_leaf_getvalue(blk->bp, args);
- if (!retval && (args->rmtblkno > 0)
- && !(args->flags & ATTR_KERNOVAL)) {
- retval = xfs_attr_rmtval_get(args);
- }
+ goto out_release;
}
+ if (retval != -EEXIST)
+ goto out_release;
+
+ /*
+ * Get the value, local or "remote"
+ */
+ blk = &state->path.blk[state->path.active - 1];
+ retval = xfs_attr3_leaf_getvalue(blk->bp, args);
/*
* If not in a transaction, we have to release all the buffers.
*/
+out_release:
for (i = 0; i < state->path.active; i++) {
xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
@@ -1300,3 +1351,20 @@
xfs_da_state_free(state);
return retval;
}
+
+/* Returns true if the attribute entry name is valid. */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
similarity index 91%
rename from fs/xfs/xfs_attr.h
rename to fs/xfs/libxfs/xfs_attr.h
index 033ff8c..94badfa 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -37,6 +37,7 @@
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
+#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */
#define XFS_ATTR_FLAGS \
{ ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
@@ -47,7 +48,8 @@
{ ATTR_REPLACE, "REPLACE" }, \
{ ATTR_KERNOTIME, "KERNOTIME" }, \
{ ATTR_KERNOVAL, "KERNOVAL" }, \
- { ATTR_INCOMPLETE, "INCOMPLETE" }
+ { ATTR_INCOMPLETE, "INCOMPLETE" }, \
+ { ATTR_ALLOC, "ALLOC" }
/*
* The maximum size (into the kernel or returned from the kernel) of an
@@ -112,7 +114,13 @@
struct xfs_inode *dp; /* inode */
struct attrlist_cursor_kern *cursor; /* position in list */
char *alist; /* output buffer */
- int seen_enough; /* T/F: seen enough of list? */
+
+ /*
+ * Abort attribute list iteration if non-zero. Can be used to pass
+ * error values to the xfs_attr_list caller.
+ */
+ int seen_enough;
+
ssize_t count; /* num used entries */
int dupcnt; /* count dup hashvals seen */
int bufsize; /* total buffer size */
@@ -137,12 +145,14 @@
int xfs_inode_hasattr(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- unsigned char *value, int *valuelenp, int flags);
+ unsigned char **value, int *valuelenp, int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
unsigned char *value, int valuelen, int flags);
+int xfs_attr_set_args(struct xfs_da_args *args);
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6fc5425..f0089e8 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -10,14 +10,12 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr_sf.h"
@@ -27,7 +25,6 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_dir2.h"
#include "xfs_log.h"
@@ -240,30 +237,19 @@
struct xfs_buf *bp)
{
struct xfs_attr3_icleaf_hdr ichdr;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *entries;
- uint16_t end;
+ uint32_t end; /* must be 32bit - see below */
int i;
+ xfs_failaddr_t fa;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
- return __this_address;
-
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
- return __this_address;
- }
/*
* In recovery there is a transient state where count == 0 is valid
* because we may have transitioned an empty shortform attr to a leaf
@@ -293,6 +279,11 @@
/*
* Quickly check the freemap information. Attribute data has to be
* aligned to 4-byte boundaries, and likewise for the free space.
+ *
+ * Note that for 64k block size filesystems, the freemap entries cannot
+ * overflow as they are only be16 fields. However, when checking end
+ * pointer of the freemap, we have to be careful to detect overflows and
+ * so use uint32_t for those checks.
*/
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
@@ -303,7 +294,9 @@
return __this_address;
if (ichdr.freemap[i].size & 0x3)
return __this_address;
- end = ichdr.freemap[i].base + ichdr.freemap[i].size;
+
+ /* be care of 16 bit overflows here */
+ end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
if (end < ichdr.freemap[i].base)
return __this_address;
if (end > mp->m_attr_geo->blksize)
@@ -317,7 +310,7 @@
xfs_attr3_leaf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -347,7 +340,7 @@
xfs_attr3_leaf_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -362,6 +355,8 @@
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
.name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
.verify_struct = xfs_attr3_leaf_verify,
@@ -398,6 +393,50 @@
return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
}
+static int
+xfs_attr_copy_value(
+ struct xfs_da_args *args,
+ unsigned char *value,
+ int valuelen)
+{
+ /*
+ * No copy if all we have to do is get the length
+ */
+ if (args->flags & ATTR_KERNOVAL) {
+ args->valuelen = valuelen;
+ return 0;
+ }
+
+ /*
+ * No copy if the length of the existing buffer is too small
+ */
+ if (args->valuelen < valuelen) {
+ args->valuelen = valuelen;
+ return -ERANGE;
+ }
+
+ if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
+ args->value = kmem_alloc_large(valuelen, 0);
+ if (!args->value)
+ return -ENOMEM;
+ }
+ args->valuelen = valuelen;
+
+ /* remote block xattr requires IO for copy-in */
+ if (args->rmtblkno)
+ return xfs_attr_rmtval_get(args);
+
+ /*
+ * This is to prevent a GCC warning because the remote xattr case
+ * doesn't have a value to pass in. In that case, we never reach here,
+ * but GCC can't work that out and so throws a "passing NULL to
+ * memcpy" warning.
+ */
+ if (!value)
+ return -EINVAL;
+ memcpy(args->value, value, valuelen);
+ return 0;
+}
/*========================================================================
* External routines when attribute fork size < XFS_LITINO(mp).
@@ -725,15 +764,19 @@
}
/*
- * Look up a name in a shortform attribute list structure.
+ * Retreive the attribute value and length.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
*/
-/*ARGSUSED*/
int
-xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+xfs_attr_shortform_getvalue(
+ struct xfs_da_args *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- int i;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int i;
ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
@@ -746,18 +789,8 @@
continue;
if (!xfs_attr_namesp_match(args->flags, sfe->flags))
continue;
- if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = sfe->valuelen;
- return -EEXIST;
- }
- if (args->valuelen < sfe->valuelen) {
- args->valuelen = sfe->valuelen;
- return -ERANGE;
- }
- args->valuelen = sfe->valuelen;
- memcpy(args->value, &sfe->nameval[args->namelen],
- args->valuelen);
- return -EEXIST;
+ return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+ sfe->valuelen);
}
return -ENOATTR;
}
@@ -787,38 +820,23 @@
ifp = dp->i_afp;
sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
- tmpbuffer = kmem_alloc(size, KM_SLEEP);
+ tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
sf = (xfs_attr_shortform_t *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+ xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
bp = NULL;
error = xfs_da_grow_inode(args, &blkno);
- if (error) {
- /*
- * If we hit an IO error middle of the transaction inside
- * grow_inode(), we may have inconsistent data. Bail out.
- */
- if (error == -EIO)
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
ASSERT(blkno == 0);
error = xfs_attr3_leaf_create(args, blkno, &bp);
- if (error) {
- /* xfs_attr3_leaf_create may not have instantiated a block */
- if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
- goto out;
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
- memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
+ if (error)
goto out;
- }
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
@@ -867,7 +885,7 @@
struct xfs_attr3_icleaf_hdr leafhdr;
int bytes;
int i;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
@@ -990,7 +1008,7 @@
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
if (!tmpbuffer)
return -ENOMEM;
@@ -1453,7 +1471,7 @@
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ tmpbuffer = kmem_alloc(args->geo->blksize, 0);
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1527,7 +1545,7 @@
{
struct xfs_attr3_icleaf_hdr ichdr1;
struct xfs_attr3_icleaf_hdr ichdr2;
- struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
+ struct xfs_mount *mp = leaf1_bp->b_mount;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
@@ -2172,7 +2190,7 @@
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2355,6 +2373,10 @@
/*
* Get the value associated with an attribute name from a leaf attribute
* list structure.
+ *
+ * If ATTR_KERNOVAL is specified, only the length needs to be returned.
+ * Unlike a lookup, we only return an error if the attribute does not
+ * exist or we can't retrieve the value.
*/
int
xfs_attr3_leaf_getvalue(
@@ -2366,7 +2388,6 @@
struct xfs_attr_leaf_entry *entry;
struct xfs_attr_leaf_name_local *name_loc;
struct xfs_attr_leaf_name_remote *name_rmt;
- int valuelen;
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
@@ -2378,36 +2399,19 @@
name_loc = xfs_attr3_leaf_name_local(leaf, args->index);
ASSERT(name_loc->namelen == args->namelen);
ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
- valuelen = be16_to_cpu(name_loc->valuelen);
- if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = valuelen;
- return 0;
- }
- if (args->valuelen < valuelen) {
- args->valuelen = valuelen;
- return -ERANGE;
- }
- args->valuelen = valuelen;
- memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
- } else {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
- ASSERT(name_rmt->namelen == args->namelen);
- ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
- args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
- args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
- args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
- args->rmtvaluelen);
- if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = args->rmtvaluelen;
- return 0;
- }
- if (args->valuelen < args->rmtvaluelen) {
- args->valuelen = args->rmtvaluelen;
- return -ERANGE;
- }
- args->valuelen = args->rmtvaluelen;
+ return xfs_attr_copy_value(args,
+ &name_loc->nameval[args->namelen],
+ be16_to_cpu(name_loc->valuelen));
}
- return 0;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
+ ASSERT(name_rmt->namelen == args->namelen);
+ ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+ args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
+ args->rmtvaluelen);
+ return xfs_attr_copy_value(args, NULL, args->rmtvaluelen);
}
/*========================================================================
@@ -2570,7 +2574,7 @@
{
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
entries = xfs_attr3_leaf_entryp(bp->b_addr);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c..3e39b7d 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -16,18 +16,10 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_trans_space.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
#include "xfs_error.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -79,6 +71,7 @@
static xfs_failaddr_t
xfs_attr3_rmt_verify(
struct xfs_mount *mp,
+ struct xfs_buf *bp,
void *ptr,
int fsbsize,
xfs_daddr_t bno)
@@ -87,7 +80,7 @@
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -110,7 +103,7 @@
bool check_crc,
xfs_failaddr_t *failaddr)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
char *ptr;
int len;
xfs_daddr_t bno;
@@ -131,7 +124,7 @@
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -174,7 +167,7 @@
xfs_attr3_rmt_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
int blksize = mp->m_attr_geo->blksize;
char *ptr;
@@ -193,7 +186,7 @@
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -220,6 +213,7 @@
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
.verify_struct = xfs_attr3_rmt_verify_struct,
@@ -364,6 +358,8 @@
/*
* Read the value associated with an attribute from the out-of-line buffer
* that we stored it in.
+ *
+ * Returns 0 on successful retrieval, otherwise an error.
*/
int
xfs_attr_rmtval_get(
@@ -533,7 +529,7 @@
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt);
if (!bp)
return -ENOMEM;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 40ce5f3..7071ff9 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -5,7 +5,6 @@
*/
#include "xfs.h"
#include "xfs_log_format.h"
-#include "xfs_bit.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a476703..02469d5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -13,14 +13,10 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -32,7 +28,6 @@
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
#include "xfs_trace.h"
-#include "xfs_symlink.h"
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
@@ -370,7 +365,7 @@
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -454,7 +449,7 @@
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -536,7 +531,7 @@
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
bool skip_discard)
{
struct xfs_extent_free_item *new; /* new element */
@@ -558,13 +553,13 @@
#endif
ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
new->xefi_startblock = bno;
new->xefi_blockcount = (xfs_extlen_t)len;
if (oinfo)
new->xefi_oinfo = *oinfo;
else
- xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+ new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
new->xefi_skip_discard = skip_discard;
trace_xfs_bmap_free_defer(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
@@ -577,47 +572,49 @@
*/
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
*/
STATIC int /* error */
xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- /* REFERENCED */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
- struct xfs_ifork *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
struct xfs_owner_info oinfo;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
#ifdef DEBUG
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
xfs_btree_check_lptr(cur, cbno, 1));
#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
return error;
@@ -635,7 +632,7 @@
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -730,7 +727,7 @@
cur->bc_private.b.allocated++;
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
if (!abp) {
error = -EFSCORRUPTED;
goto out_unreserve_dquot;
@@ -795,6 +792,7 @@
*/
void
xfs_bmap_local_to_extents_empty(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
int whichfork)
{
@@ -811,6 +809,7 @@
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -843,7 +842,7 @@
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags = XFS_ILOG_CORE;
goto done;
}
@@ -876,7 +875,7 @@
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
tp->t_firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno);
/*
* Initialize the block, copy the data and log the remote buffer.
@@ -890,7 +889,7 @@
/* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
- xfs_bmap_local_to_extents_empty(ip, whichfork);
+ xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
ifp->if_u1.if_root = NULL;
@@ -1019,6 +1018,34 @@
return -EFSCORRUPTED;
}
+/* Set an inode attr fork off based on the format */
+int
+xfs_bmap_set_attrforkoff(
+ struct xfs_inode *ip,
+ int size,
+ int *version)
+{
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+ *version = 2;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/*
* Convert inode from non-attributed to attributed.
* Must not be in a transaction, ip must not be locked.
@@ -1070,28 +1097,11 @@
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
- else if (mp->m_flags & XFS_MOUNT_ATTR2)
- version = 2;
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
+ error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ if (error)
goto trans_cancel;
- }
-
ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
switch (ip->i_d.di_format) {
@@ -1178,7 +1188,10 @@
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
+ if (unlikely(level == 0)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
@@ -1187,7 +1200,7 @@
* pointer (leftmost) at each level.
*/
while (level-- > 0) {
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
goto out;
@@ -1260,7 +1273,7 @@
*/
if (bno == NULLFSBLOCK)
break;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
goto out;
@@ -1683,10 +1696,13 @@
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
/*
* Filling in all of a previously delayed allocation extent.
- * The right neighbor is contiguous, the left is not.
+ * The right neighbor is contiguous, the left is not. Take care
+ * with delay -> unwritten extent allocation here because the
+ * delalloc record we are overwriting is always written.
*/
PREV.br_startblock = new->br_startblock;
PREV.br_blockcount += RIGHT.br_blockcount;
+ PREV.br_state = new->br_state;
xfs_iext_next(ifp, &bma->icur);
xfs_iext_remove(bma->ip, &bma->icur, state);
@@ -1971,11 +1987,8 @@
}
/* add reverse mapping unless caller opted out */
- if (!(bma->flags & XFS_BMAPI_NORMAP)) {
- error = xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
- if (error)
- goto done;
- }
+ if (!(bma->flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(bma->tp, bma->ip, whichfork, new);
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -1990,6 +2003,9 @@
goto done;
}
+ if (da_new != da_old)
+ xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+
if (bma->cur) {
da_new += bma->cur->bc_private.b.allocated;
bma->cur->bc_private.b.allocated = 0;
@@ -2015,7 +2031,7 @@
/*
* Convert an unwritten allocation to a real allocation or vice versa.
*/
-STATIC int /* error */
+int /* error */
xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
@@ -2454,9 +2470,7 @@
}
/* update reverse mappings */
- error = xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
- if (error)
- goto done;
+ xfs_rmap_convert_extent(mp, tp, ip, whichfork, new);
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -2621,6 +2635,7 @@
/*
* Nothing to do for disk quota accounting here.
*/
+ xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
}
}
@@ -2814,11 +2829,8 @@
}
/* add reverse mapping unless caller opted out */
- if (!(flags & XFS_BMAPI_NORMAP)) {
- error = xfs_rmap_map_extent(tp, ip, whichfork, new);
- if (error)
- goto done;
- }
+ if (!(flags & XFS_BMAPI_NORMAP))
+ xfs_rmap_map_extent(tp, ip, whichfork, new);
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -3333,8 +3345,10 @@
* already have quota reservation and there's nothing to do
* yet.
*/
- if (ap->wasdel)
+ if (ap->wasdel) {
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
return;
+ }
/*
* Otherwise, we've allocated blocks in a hole. The transaction
@@ -3353,8 +3367,10 @@
/* data/attr fork only */
ap->ip->i_d.di_nblocks += args->len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
+ if (ap->wasdel) {
ap->ip->i_delayed_blks -= args->len;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ }
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
args->len);
@@ -3439,7 +3455,7 @@
args.tp = ap->tp;
args.mp = mp;
args.fsbno = ap->blkno;
- xfs_rmap_skip_owner_update(&args.oinfo);
+ args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
/* Trim the allocation back to the maximum an AG can fit. */
args.maxlen = min(ap->length, mp->m_ag_max_usable);
@@ -3671,17 +3687,6 @@
}
}
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
- struct xfs_bmbt_irec *irec,
- struct xfs_inode *ip)
-
-{
- xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
- i_size_read(VFS_I(ip))));
-}
-
/*
* Trim the returned map to the required bounds
*/
@@ -3824,15 +3829,28 @@
XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!ifp) {
+ /* No CoW fork? Return a hole. */
+ if (whichfork == XFS_COW_FORK) {
+ mval->br_startoff = bno;
+ mval->br_startblock = HOLESTARTBLOCK;
+ mval->br_blockcount = len;
+ mval->br_state = XFS_EXT_NORM;
+ *nmap = 1;
+ return 0;
+ }
- /* No CoW fork? Return a hole. */
- if (whichfork == XFS_COW_FORK && !ifp) {
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount = len;
- mval->br_state = XFS_EXT_NORM;
- *nmap = 1;
- return 0;
+ /*
+ * A missing attr ifork implies that the inode says we're in
+ * extents or btree format but failed to pass the inode fork
+ * verifier while trying to load it. Treat that as a file
+ * corruption too.
+ */
+#ifdef DEBUG
+ xfs_alert(mp, "%s: inode %llu missing fork %d",
+ __func__, ip->i_ino, whichfork);
+#endif /* DEBUG */
+ return -EFSCORRUPTED;
}
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -3961,6 +3979,7 @@
ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip->i_mount, alen + indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
@@ -4081,8 +4100,7 @@
* extents to real extents when we're about to write the data.
*/
if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
- (bma->flags & XFS_BMAPI_PREALLOC) &&
- xfs_sb_version_hasextflgbit(&mp->m_sb))
+ (bma->flags & XFS_BMAPI_PREALLOC))
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
@@ -4190,6 +4208,44 @@
return 0;
}
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
/*
* Map file blocks to filesystem blocks, and allocate blocks or convert the
* extent state if necessary. Details behaviour is controlled by the flags
@@ -4207,9 +4263,13 @@
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4234,9 +4294,7 @@
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(tp != NULL ||
- (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
- (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4269,34 +4327,21 @@
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!tp || tp->t_firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- bma.minleft = 1;
- } else {
- bma.minleft = 0;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
goto error0;
}
- n = 0;
- end = bno + len;
- obno = bno;
-
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
- bma.tp = tp;
- bma.ip = ip;
- bma.total = total;
- bma.datatype = 0;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ n = 0;
+ end = bno + len;
+ obno = bno;
while (bno < end && n < *nmap) {
bool need_alloc = false, wasdelay = false;
@@ -4310,26 +4355,7 @@
ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
(flags & XFS_BMAPI_COWFORK)));
- if (flags & XFS_BMAPI_DELALLOC) {
- /*
- * For the COW fork we can reasonably get a
- * request for converting an extent that races
- * with other threads already having converted
- * part of it, as there converting COW to
- * regular blocks is not protected using the
- * IOLOCK.
- */
- ASSERT(flags & XFS_BMAPI_COWFORK);
- if (!(flags & XFS_BMAPI_COWFORK)) {
- error = -EIO;
- goto error0;
- }
-
- if (eof || bno >= end)
- break;
- } else {
- need_alloc = true;
- }
+ need_alloc = true;
} else if (isnullstartblock(bma.got.br_startblock)) {
wasdelay = true;
}
@@ -4338,8 +4364,7 @@
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if ((need_alloc || wasdelay) &&
- !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4370,12 +4395,9 @@
* If this is a CoW allocation, record the data in
* the refcount btree for orphan recovery.
*/
- if (whichfork == XFS_COW_FORK) {
- error = xfs_refcount_alloc_cow_extent(tp,
- bma.blkno, bma.length);
- if (error)
- goto error0;
- }
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
}
/* Deal with the allocated space we found. */
@@ -4407,49 +4429,126 @@
}
*nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- ASSERT(bma.cur);
- error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
- &tmp_logflags, whichfork);
- bma.logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork));
- error = 0;
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
error0:
- /*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent records if we've converted to btree format.
- */
- if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- bma.logflags &= ~xfs_ilog_fext(whichfork);
- else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- bma.logflags &= ~xfs_ilog_fbroot(whichfork);
- /*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
- */
- if (bma.logflags)
- xfs_trans_log_inode(tp, ip, bma.logflags);
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
- if (bma.cur) {
- xfs_btree_del_cursor(bma.cur, error);
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmalloca bma = { NULL };
+ struct xfs_trans *tp;
+ int error;
+
+ /*
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
}
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+
+ /*
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
+ */
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ if (whichfork == XFS_COW_FORK)
+ bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK)
+ xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -4523,13 +4622,7 @@
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -4751,8 +4844,10 @@
da_diff = da_old - da_new;
if (!isrt)
da_diff += del->br_blockcount;
- if (da_diff)
+ if (da_diff) {
xfs_mod_fdblocks(mp, da_diff, false);
+ xfs_mod_delalloc(mp, -da_diff);
+ }
return error;
}
@@ -5041,18 +5136,14 @@
}
/* remove reverse mapping */
- error = xfs_rmap_unmap_extent(tp, ip, whichfork, del);
- if (error)
- goto done;
+ xfs_rmap_unmap_extent(tp, ip, whichfork, del);
/*
* If we need to, add to list of extents to delete.
*/
if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
- error = xfs_refcount_decrease_extent(tp, del);
- if (error)
- goto done;
+ xfs_refcount_decrease_extent(tp, del);
} else {
__xfs_bmap_add_free(tp, del->br_startblock,
del->br_blockcount, NULL,
@@ -5245,8 +5336,7 @@
* unmapping part of it. But we can't really
* get rid of part of a realtime extent.
*/
- if (del.br_state == XFS_EXT_UNWRITTEN ||
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ if (del.br_state == XFS_EXT_UNWRITTEN) {
/*
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
@@ -5296,10 +5386,9 @@
del.br_blockcount -= mod;
del.br_startoff += mod;
del.br_startblock += mod;
- } else if ((del.br_startoff == start &&
- (del.br_state == XFS_EXT_UNWRITTEN ||
- tp->t_blk_res == 0)) ||
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ } else if (del.br_startoff == start &&
+ (del.br_state == XFS_EXT_UNWRITTEN ||
+ tp->t_blk_res == 0)) {
/*
* Can't make it unwritten. There isn't
* a full extent here so just skip it.
@@ -5395,24 +5484,11 @@
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- /*
- * transform from btree to extents, give it cur
- */
- else if (xfs_bmap_wants_extents(ip, whichfork)) {
- ASSERT(cur != NULL);
- error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
}
- /*
- * transform from extents to local?
- */
- error = 0;
+
error0:
/*
* Log everything. Do this after conversion, there's no point in
@@ -5551,6 +5627,11 @@
if (error)
return error;
+ /* change to extent format if required after extent removal */
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork);
+ if (error)
+ return error;
+
done:
xfs_iext_remove(ip, icur, 0);
xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
@@ -5558,12 +5639,11 @@
&new);
/* update reverse mapping. rmap functions merge the rmaps for us */
- error = xfs_rmap_unmap_extent(tp, ip, whichfork, got);
- if (error)
- return error;
+ xfs_rmap_unmap_extent(tp, ip, whichfork, got);
memcpy(&new, got, sizeof(new));
new.br_startoff = left->br_startoff + left->br_blockcount;
- return xfs_rmap_map_extent(tp, ip, whichfork, &new);
+ xfs_rmap_map_extent(tp, ip, whichfork, &new);
+ return 0;
}
static int
@@ -5602,10 +5682,9 @@
got);
/* update reverse mapping */
- error = xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
- if (error)
- return error;
- return xfs_rmap_map_extent(tp, ip, whichfork, got);
+ xfs_rmap_unmap_extent(tp, ip, whichfork, &prev);
+ xfs_rmap_map_extent(tp, ip, whichfork, got);
+ return 0;
}
int
@@ -6001,7 +6080,7 @@
bmap->br_blockcount,
bmap->br_state);
- bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+ bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS);
INIT_LIST_HEAD(&bi->bi_list);
bi->bi_type = type;
bi->bi_owner = ip;
@@ -6013,29 +6092,29 @@
}
/* Map an extent into a file. */
-int
+void
xfs_bmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_bmap_is_update_needed(PREV))
- return 0;
+ return;
- return __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
}
/* Unmap an extent out of a file. */
-int
+void
xfs_bmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_bmap_is_update_needed(PREV))
- return 0;
+ return;
- return __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+ __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
}
/*
@@ -6114,11 +6193,7 @@
XFS_FSB_TO_AGNO(mp, endfsb))
return __this_address;
}
- if (irec->br_state != XFS_EXT_NORM) {
- if (whichfork != XFS_DATA_FORK)
- return __this_address;
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
- return __this_address;
- }
+ if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+ return __this_address;
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b6e9b63..e2798c6 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC 0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
/* Skip online discard of freed extents */
#define XFS_BMAPI_NODISCARD 0x1000
@@ -117,8 +111,6 @@
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
@@ -179,13 +171,21 @@
!isnullstartblock(irec->br_startblock);
}
+/*
+ * Check the mapping for obviously garbage allocations that could trash the
+ * filesystem immediately.
+ */
+#define xfs_valid_startblock(ip, startblock) \
+ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip))
+
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
-void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
-void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
+void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
- xfs_filblks_t len, struct xfs_owner_info *oinfo,
+ xfs_filblks_t len, const struct xfs_owner_info *oinfo,
bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
@@ -227,13 +227,20 @@
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+ unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
static inline void
xfs_bmap_add_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
- struct xfs_owner_info *oinfo)
+ const struct xfs_owner_info *oinfo)
{
__xfs_bmap_add_free(tp, bno, len, oinfo, false);
}
@@ -255,9 +262,9 @@
enum xfs_bmap_intent_type type, int whichfork,
xfs_fileoff_t startoff, xfs_fsblock_t startblock,
xfs_filblks_t *blockcount, xfs_exntst_t state);
-int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
static inline int xfs_bmap_fork_to_state(int whichfork)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2..ffe608d 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -11,10 +11,8 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
@@ -22,7 +20,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_rmap.h"
/*
@@ -403,21 +400,35 @@
union xfs_btree_key *k1,
union xfs_btree_key *k2)
{
- return (int64_t)be64_to_cpu(k1->bmbt.br_startoff) -
- be64_to_cpu(k2->bmbt.br_startoff);
+ uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
+ uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
+
+ /*
+ * Note: This routine previously casted a and b to int64 and subtracted
+ * them to generate a result. This lead to problems if b was the
+ * "maximum" key value (all ones) being signed incorrectly, hence this
+ * somewhat less efficient version.
+ */
+ if (a > b)
+ return 1;
+ if (b > a)
+ return -1;
+ return 0;
}
static xfs_failaddr_t
xfs_bmbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_failaddr_t fa;
unsigned int level;
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -425,11 +436,6 @@
fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_BMAP_MAGIC):
- break;
- default:
- return __this_address;
}
/*
@@ -481,6 +487,8 @@
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
.name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
.verify_struct = xfs_bmbt_verify,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 34c6d7b..71de937 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -11,16 +11,13 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
@@ -276,7 +273,7 @@
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -288,7 +285,7 @@
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
@@ -314,7 +311,7 @@
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -326,11 +323,11 @@
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
- return __this_address;
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}
@@ -691,14 +688,13 @@
xfs_btree_get_bufl(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock) /* lock flags for get_buf */
+ xfs_fsblock_t fsbno) /* file system block number */
{
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
}
/*
@@ -710,15 +706,14 @@
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock) /* lock flags for get_buf */
+ xfs_agblock_t agbno) /* allocation group block number */
{
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
}
/*
@@ -845,7 +840,6 @@
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
int refval, /* ref count value for buffer */
const struct xfs_buf_ops *ops)
@@ -858,7 +852,7 @@
return -EFSCORRUPTED;
d = XFS_FSB_TO_DADDR(mp, fsbno);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp, ops);
+ mp->m_bsize, 0, &bp, ops);
if (error)
return error;
if (bp)
@@ -1185,11 +1179,10 @@
xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags)
+ __u64 owner)
{
xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
- btnum, level, numrecs, owner, flags);
+ btnum, level, numrecs, owner, 0);
}
STATIC void
@@ -1288,7 +1281,6 @@
xfs_btree_get_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
- int flags,
struct xfs_btree_block **block,
struct xfs_buf **bpp)
{
@@ -1296,14 +1288,11 @@
xfs_daddr_t d;
int error;
- /* need to sort out how callers deal with failures first */
- ASSERT(!(flags & XBF_TRYLOCK));
-
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags);
+ mp->m_bsize, 0);
if (!*bpp)
return -ENOMEM;
@@ -2706,7 +2695,7 @@
XFS_BTREE_STATS_INC(cur, alloc);
/* Set up the new block as "right". */
- error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+ error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
if (error)
goto error0;
@@ -2961,7 +2950,7 @@
XFS_BTREE_STATS_INC(cur, alloc);
/* Copy the root into a real block. */
- error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+ error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
if (error)
goto error0;
@@ -3058,7 +3047,7 @@
XFS_BTREE_STATS_INC(cur, alloc);
/* Set up the new block. */
- error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+ error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
if (error)
goto error0;
@@ -4433,7 +4422,7 @@
struct xfs_buf *bp,
uint64_t owner)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -4454,7 +4443,7 @@
struct xfs_buf *bp,
unsigned int max_recs)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
/* numrecs verification */
@@ -4477,14 +4466,12 @@
* btree block
*
* @bp: buffer containing the btree block
- * @max_recs: pointer to the m_*_mxr max records field in the xfs mount
- * @pag_max_level: pointer to the per-ag max level field
*/
xfs_failaddr_t
xfs_btree_sblock_v5hdr_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
@@ -4510,7 +4497,7 @@
struct xfs_buf *bp,
unsigned int max_recs)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_agblock_t agno;
@@ -4611,7 +4598,7 @@
/* Callback */
error = fn(cur, recp, priv);
- if (error < 0 || error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error)
break;
advloop:
@@ -4713,8 +4700,7 @@
*/
if (ldiff >= 0 && hdiff >= 0) {
error = fn(cur, recp, priv);
- if (error < 0 ||
- error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error)
break;
} else if (hdiff < 0) {
/* Record is larger than high key; pop. */
@@ -4785,8 +4771,7 @@
* Query a btree for all records overlapping a given interval of keys. The
* supplied function will be called with each record found; return one of the
* XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
- * code. This function returns XFS_BTREE_QUERY_RANGE_ABORT, zero, or a
- * negative error code.
+ * code. This function returns -ECANCELED, zero, or a negative error code.
*/
int
xfs_btree_query_range(
@@ -4902,7 +4887,7 @@
union xfs_btree_rec *rec,
void *priv)
{
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/* Is there a record covering a given range of keys? */
@@ -4917,7 +4902,7 @@
error = xfs_btree_query_range(cur, low, high,
&xfs_btree_has_record_helper, NULL);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ if (error == -ECANCELED) {
*exists = true;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index e3b3e9d..ced1e65 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -301,8 +301,7 @@
xfs_btree_get_bufl(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock); /* lock flags for get_buf */
+ xfs_fsblock_t fsbno); /* file system block number */
/*
* Get a buffer for the block, return it with no data read.
@@ -313,8 +312,7 @@
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock); /* lock flags for get_buf */
+ xfs_agblock_t agbno); /* allocation group block number */
/*
* Check for the cursor referring to the last block at the given level.
@@ -345,7 +343,6 @@
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
int refval, /* ref count value for buffer */
const struct xfs_buf_ops *ops);
@@ -383,8 +380,7 @@
xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags);
+ __u64 owner);
void
xfs_btree_init_block_int(
@@ -468,9 +464,13 @@
uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
-/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */
+/*
+ * Return codes for the query range iterator function are 0 to continue
+ * iterating, and non-zero to stop iterating. Any non-zero value will be
+ * passed up to the _query_range caller. The special value -ECANCELED can be
+ * used to stop iteration, because _query_range never generates that error
+ * code on its own.
+ */
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec, void *priv);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee9..4fd1223 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -12,20 +12,14 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
#include "xfs_bmap.h"
-#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -116,35 +110,52 @@
kmem_zone_free(xfs_da_state_zone, state);
}
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static xfs_failaddr_t
xfs_da3_node_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
const struct xfs_dir_ops *ops;
+ xfs_failaddr_t fa;
ops = xfs_dir_get_ops(mp, NULL);
ops->node_hdr_from_disk(&ichdr, hdr);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (ichdr.magic != XFS_DA3_NODE_MAGIC)
- return __this_address;
-
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_DA_NODE_MAGIC)
- return __this_address;
- }
if (ichdr.level == 0)
return __this_address;
if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -169,7 +180,7 @@
xfs_da3_node_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -257,6 +268,8 @@
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
.verify_struct = xfs_da3_node_verify_struct,
@@ -474,10 +487,8 @@
ASSERT(state->path.active == 0);
oldblk = &state->path.blk[0];
error = xfs_da3_root_split(state, oldblk, addblk);
- if (error) {
- addblk->bp = NULL;
- return error; /* GROT: dir is inconsistent */
- }
+ if (error)
+ goto out;
/*
* Update pointers to the node which used to be block 0 and just got
@@ -492,7 +503,10 @@
*/
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
- ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+ if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
node = addblk->bp->b_addr;
node->hdr.info.back = cpu_to_be32(oldblk->blkno);
xfs_trans_log_buf(state->args->trans, addblk->bp,
@@ -501,15 +515,19 @@
}
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
- ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+ if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
node = addblk->bp->b_addr;
node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
xfs_trans_log_buf(state->args->trans, addblk->bp,
XFS_DA_LOGRANGE(node, &node->hdr.info,
sizeof(node->hdr.info)));
}
+out:
addblk->bp = NULL;
- return 0;
+ return error;
}
/*
@@ -2080,7 +2098,7 @@
* If we didn't get it and the block might work if fragmented,
* try without the CONTIG flag. Loop until we get it all.
*/
- mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+ mapp = kmem_alloc(sizeof(*mapp) * count, 0);
for (b = *bno, mapi = 0; b < *bno + count; ) {
nmap = min(XFS_BMAP_MAX_NMAP, count);
c = (int)(*bno + count - b);
@@ -2462,7 +2480,7 @@
if (nirecs > 1) {
map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
if (!map)
return -ENOMEM;
*mapp = map;
@@ -2521,7 +2539,7 @@
*/
if (nfsb != 1)
irecs = kmem_zalloc(sizeof(irec) * nfsb,
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
nirecs = nfsb;
error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 84dd865..ae0bbd2 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -81,13 +81,15 @@
#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
+#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
{ XFS_DA_OP_RENAME, "RENAME" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
- { XFS_DA_OP_CILOOKUP, "CILOOKUP" }
+ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
+ { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
/*
* Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index b39053d..b1ae572 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -11,11 +11,8 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
/*
* Shortform directory ops
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3b..ae654e0 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@
return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
}
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index e792b16..2255752 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -9,8 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
@@ -172,7 +170,13 @@
* reoccur.
*/
-static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX];
+static const struct xfs_defer_op_type *defer_op_types[] = {
+ [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
+ [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+};
/*
* For each pending item in the intake list, log its intent item and the
@@ -185,15 +189,15 @@
{
struct list_head *li;
struct xfs_defer_pending *dfp;
+ const struct xfs_defer_op_type *ops;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
- dfp->dfp_intent = dfp->dfp_type->create_intent(tp,
- dfp->dfp_count);
+ ops = defer_op_types[dfp->dfp_type];
+ dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- list_sort(tp->t_mountp, &dfp->dfp_work,
- dfp->dfp_type->diff_items);
+ list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
list_for_each(li, &dfp->dfp_work)
- dfp->dfp_type->log_item(tp, dfp->dfp_intent, li);
+ ops->log_item(tp, dfp->dfp_intent, li);
}
}
@@ -204,14 +208,16 @@
struct list_head *dop_pending)
{
struct xfs_defer_pending *dfp;
+ const struct xfs_defer_op_type *ops;
trace_xfs_defer_trans_abort(tp, _RET_IP_);
/* Abort intent items that don't have a done item. */
list_for_each_entry(dfp, dop_pending, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
if (dfp->dfp_intent && !dfp->dfp_done) {
- dfp->dfp_type->abort_intent(dfp->dfp_intent);
+ ops->abort_intent(dfp->dfp_intent);
dfp->dfp_intent = NULL;
}
}
@@ -266,13 +272,15 @@
trace_xfs_defer_trans_roll(tp, _RET_IP_);
- /* Roll the transaction. */
+ /*
+ * Roll the transaction. Rolling always given a new transaction (even
+ * if committing the old one fails!) to hand back to the caller, so we
+ * join the held resources to the new transaction so that we always
+ * return with the held resources joined to @tpp, no matter what
+ * happened.
+ */
error = xfs_trans_roll(tpp);
tp = *tpp;
- if (error) {
- trace_xfs_defer_trans_roll_error(tp, error);
- return error;
- }
/* Rejoin the joined inodes. */
for (i = 0; i < ipcount; i++)
@@ -284,6 +292,8 @@
xfs_trans_bhold(tp, bplist[i]);
}
+ if (error)
+ trace_xfs_defer_trans_roll_error(tp, error);
return error;
}
@@ -315,18 +325,20 @@
struct xfs_defer_pending *pli;
struct list_head *pwi;
struct list_head *n;
+ const struct xfs_defer_op_type *ops;
/*
* Free the pending items. Caller should already have arranged
* for the intent items to be released.
*/
list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
+ ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_cancel_list(mp, dfp);
list_del(&dfp->dfp_list);
list_for_each_safe(pwi, n, &dfp->dfp_work) {
list_del(pwi);
dfp->dfp_count--;
- dfp->dfp_type->cancel_item(pwi);
+ ops->cancel_item(pwi);
}
ASSERT(dfp->dfp_count == 0);
kmem_free(dfp);
@@ -350,7 +362,7 @@
struct list_head *n;
void *state;
int error = 0;
- void (*cleanup_fn)(struct xfs_trans *, void *, int);
+ const struct xfs_defer_op_type *ops;
LIST_HEAD(dop_pending);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -373,18 +385,18 @@
/* Log an intent-done item for the first pending item. */
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
+ ops = defer_op_types[dfp->dfp_type];
trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
- dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+ dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
dfp->dfp_count);
- cleanup_fn = dfp->dfp_type->finish_cleanup;
/* Finish the work items. */
state = NULL;
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
- error = dfp->dfp_type->finish_item(*tp, li,
- dfp->dfp_done, &state);
+ error = ops->finish_item(*tp, li, dfp->dfp_done,
+ &state);
if (error == -EAGAIN) {
/*
* Caller wants a fresh transaction;
@@ -400,8 +412,8 @@
* xfs_defer_cancel will take care of freeing
* all these lists and stuff.
*/
- if (cleanup_fn)
- cleanup_fn(*tp, state, error);
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(*tp, state, error);
goto out;
}
}
@@ -413,20 +425,19 @@
* a Fresh Transaction while Finishing
* Deferred Work" above.
*/
- dfp->dfp_intent = dfp->dfp_type->create_intent(*tp,
+ dfp->dfp_intent = ops->create_intent(*tp,
dfp->dfp_count);
dfp->dfp_done = NULL;
list_for_each(li, &dfp->dfp_work)
- dfp->dfp_type->log_item(*tp, dfp->dfp_intent,
- li);
+ ops->log_item(*tp, dfp->dfp_intent, li);
} else {
/* Done with the dfp, free it. */
list_del(&dfp->dfp_list);
kmem_free(dfp);
}
- if (cleanup_fn)
- cleanup_fn(*tp, state, error);
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(*tp, state, error);
}
out:
@@ -486,8 +497,10 @@
struct list_head *li)
{
struct xfs_defer_pending *dfp = NULL;
+ const struct xfs_defer_op_type *ops;
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
/*
* Add the item to a pending item at the end of the intake list.
@@ -497,15 +510,15 @@
if (!list_empty(&tp->t_dfops)) {
dfp = list_last_entry(&tp->t_dfops,
struct xfs_defer_pending, dfp_list);
- if (dfp->dfp_type->type != type ||
- (dfp->dfp_type->max_items &&
- dfp->dfp_count >= dfp->dfp_type->max_items))
+ ops = defer_op_types[dfp->dfp_type];
+ if (dfp->dfp_type != type ||
+ (ops->max_items && dfp->dfp_count >= ops->max_items))
dfp = NULL;
}
if (!dfp) {
dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
- KM_SLEEP | KM_NOFS);
- dfp->dfp_type = defer_op_types[type];
+ KM_NOFS);
+ dfp->dfp_type = type;
dfp->dfp_intent = NULL;
dfp->dfp_done = NULL;
dfp->dfp_count = 0;
@@ -517,14 +530,6 @@
dfp->dfp_count++;
}
-/* Initialize a deferred operation list. */
-void
-xfs_defer_init_op_type(
- const struct xfs_defer_op_type *type)
-{
- defer_op_types[type->type] = type;
-}
-
/*
* Move deferred ops from one transaction to another and reset the source to
* initial state. This is primarily used to carry state forward across
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 2584a5b..7c28d76 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -9,20 +9,6 @@
struct xfs_defer_op_type;
/*
- * Save a log intent item and a list of extents, so that we can replay
- * whatever action had to happen to the extent list and file the log done
- * item.
- */
-struct xfs_defer_pending {
- const struct xfs_defer_op_type *dfp_type; /* function pointers */
- struct list_head dfp_list; /* pending items */
- void *dfp_intent; /* log intent item */
- void *dfp_done; /* log done item */
- struct list_head dfp_work; /* work items */
- unsigned int dfp_count; /* # extent items */
-};
-
-/*
* Header for deferred operation list.
*/
enum xfs_defer_ops_type {
@@ -34,6 +20,20 @@
XFS_DEFER_OPS_TYPE_MAX,
};
+/*
+ * Save a log intent item and a list of extents, so that we can replay
+ * whatever action had to happen to the extent list and file the log done
+ * item.
+ */
+struct xfs_defer_pending {
+ struct list_head dfp_list; /* pending items */
+ struct list_head dfp_work; /* work items */
+ void *dfp_intent; /* log intent item */
+ void *dfp_done; /* log done item */
+ unsigned int dfp_count; /* # extent items */
+ enum xfs_defer_ops_type dfp_type;
+};
+
void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
struct list_head *h);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
@@ -43,8 +43,6 @@
/* Description of a deferred type. */
struct xfs_defer_op_type {
- enum xfs_defer_ops_type type;
- unsigned int max_items;
void (*abort_intent)(void *);
void *(*create_done)(struct xfs_trans *, void *, unsigned int);
int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
@@ -54,8 +52,13 @@
int (*diff_items)(void *, struct list_head *, struct list_head *);
void *(*create_intent)(struct xfs_trans *, uint);
void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+ unsigned int max_items;
};
-void xfs_defer_init_op_type(const struct xfs_defer_op_type *type);
+extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
+extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
+extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
+extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152c..867c5de 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -5,20 +5,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -114,9 +110,9 @@
nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
if (!mp->m_dir_geo || !mp->m_attr_geo) {
kmem_free(mp->m_dir_geo);
kmem_free(mp->m_attr_geo);
@@ -221,7 +217,7 @@
if (error)
return error;
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -258,7 +254,7 @@
XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -357,7 +353,7 @@
* lockdep Doing this avoids having to add a bunch of lockdep class
* annotations into the reclaim path for the ilock.
*/
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
args->geo = dp->i_mount->m_dir_geo;
args->name = name->name;
args->namelen = name->len;
@@ -426,7 +422,7 @@
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_remove);
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -487,7 +483,7 @@
if (rval)
return rval;
- args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args = kmem_zalloc(sizeof(*args), KM_NOFS);
if (!args)
return -ENOMEM;
@@ -703,3 +699,20 @@
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
+
+/* Returns true if the directory entry name is valid. */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b..f542447 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed591..49e4bc3 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -6,22 +6,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_log.h"
/*
@@ -50,21 +47,19 @@
xfs_dir3_block_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -73,7 +68,7 @@
xfs_dir3_block_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -90,7 +85,7 @@
xfs_dir3_block_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -112,6 +107,8 @@
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
.verify_struct = xfs_dir3_block_verify,
@@ -1095,11 +1092,11 @@
* Copy the directory into a temporary buffer.
* Then pitch the incore inode data so we can make extents.
*/
- sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+ sfp = kmem_alloc(ifp->if_bytes, 0);
memcpy(sfp, oldsfp, ifp->if_bytes);
xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
- xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+ xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK);
dp->i_d.di_size = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c6..2c79be4 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -6,19 +6,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_log.h"
static xfs_failaddr_t xfs_dir2_data_freefind_verify(
@@ -50,14 +47,13 @@
int i; /* leaf index */
int lastfree; /* last entry was unused */
xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
- xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_mount *mp = bp->b_mount;
char *p; /* current data position */
int stale; /* count of stale leaves */
struct xfs_name name;
const struct xfs_dir_ops *ops;
struct xfs_da_geometry *geo;
- mp = bp->b_target->bt_mount;
geo = mp->m_dir_geo;
/*
@@ -249,21 +245,19 @@
xfs_dir3_data_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -300,7 +294,7 @@
xfs_dir3_data_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -317,7 +311,7 @@
xfs_dir3_data_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -339,6 +333,8 @@
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
.name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +342,8 @@
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e..a53e458 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -6,12 +6,11 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
@@ -20,8 +19,6 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-#include "xfs_log.h"
/*
* Local function declarations.
@@ -142,66 +139,46 @@
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_failaddr_t fa;
- ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- uint16_t magic3;
-
- magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
- : XFS_DIR3_LEAFN_MAGIC;
-
- if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
- return __this_address;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
- return __this_address;
- } else {
- if (leaf->hdr.info.magic != cpu_to_be16(magic))
- return __this_address;
- }
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
-__read_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
static void
-__write_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -216,60 +193,22 @@
xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
.name = "xfs_dir3_leaf1",
- .verify_read = xfs_dir3_leaf1_read_verify,
- .verify_write = xfs_dir3_leaf1_write_verify,
- .verify_struct = xfs_dir3_leaf1_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.name = "xfs_dir3_leafn",
- .verify_read = xfs_dir3_leafn_read_verify,
- .verify_write = xfs_dir3_leafn_write_verify,
- .verify_struct = xfs_dir3_leafn_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
int
@@ -621,43 +560,40 @@
*/
int /* error */
xfs_dir2_leaf_addname(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
__be16 *bestsp; /* freespace table in leaf */
- int compact; /* need to compact leaves */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
+ __be16 *tagp; /* end of data entry */
struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data block entry */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
int error; /* error return value */
int grown; /* allocated new data block */
- int highstale; /* index of next stale leaf */
+ int highstale = 0; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- struct xfs_buf *lbp; /* leaf's buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
int lfloglow; /* low leaf logging index */
int lfloghigh; /* high leaf logging index */
- int lowstale; /* index of prev stale leaf */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ int lowstale = 0; /* index of prev stale leaf */
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
- __be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
- struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
- dp = args->dp;
- tp = args->trans;
-
error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb343..705c4f5 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -6,12 +6,11 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
@@ -20,7 +19,6 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_log.h"
/*
@@ -34,8 +32,6 @@
static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
int index, xfs_da_state_blk_t *dblk,
int *rval);
-static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
- xfs_da_state_blk_t *fblk);
/*
* Check internal consistency of a leafn block.
@@ -84,23 +80,21 @@
xfs_dir3_free_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
- return __this_address;
}
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -112,7 +106,7 @@
xfs_dir3_free_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -129,7 +123,7 @@
xfs_dir3_free_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -151,6 +145,8 @@
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
.name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
.verify_struct = xfs_dir3_free_verify,
@@ -426,24 +422,22 @@
static int /* error */
xfs_dir2_leafn_add(
struct xfs_buf *bp, /* leaf buffer */
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
int compact; /* compacting stale leaves */
- xfs_inode_t *dp; /* incore directory inode */
- int highstale; /* next stale entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int highstale = 0; /* next stale entry */
int lfloghigh; /* high leaf entry logging */
int lfloglow; /* low leaf entry logging */
- int lowstale; /* previous stale entry */
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
+ int lowstale = 0; /* previous stale entry */
trace_xfs_dir2_leafn_add(args, index);
- dp = args->dp;
- leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
@@ -745,7 +739,8 @@
ents = dp->d_ops->leaf_ents_p(leaf);
xfs_dir3_leaf_check(dp, bp);
- ASSERT(leafhdr.count > 0);
+ if (leafhdr.count <= 0)
+ return -EFSCORRUPTED;
/*
* Look up the hash value in the leaf entries.
@@ -1614,6 +1609,353 @@
}
/*
+ * Add a new data block to the directory at the free space index that the caller
+ * has specified.
+ */
+static int
+xfs_dir2_node_add_datablk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbno,
+ struct xfs_buf **dbpp,
+ struct xfs_buf **fbpp,
+ int *findex)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_dir2_free *free = NULL;
+ xfs_dir2_db_t fbno;
+ struct xfs_buf *fbp;
+ struct xfs_buf *dbp;
+ __be16 *bests = NULL;
+ int error;
+
+ /* Not allowed to allocate, return failure. */
+ if (args->total == 0)
+ return -ENOSPC;
+
+ /* Allocate and initialize the new data block. */
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbno);
+ if (error)
+ return error;
+ error = xfs_dir3_data_init(args, *dbno, &dbp);
+ if (error)
+ return error;
+
+ /*
+ * Get the freespace block corresponding to the data block
+ * that was just allocated.
+ */
+ fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno), &fbp);
+ if (error)
+ return error;
+
+ /*
+ * If there wasn't a freespace block, the read will
+ * return a NULL fbp. Allocate and initialize a new one.
+ */
+ if (!fbp) {
+ error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fbno);
+ if (error)
+ return error;
+
+ if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
+ xfs_alert(mp,
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
+ __func__, (unsigned long long)dp->i_ino,
+ (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
+ (long long)*dbno, (long long)fbno);
+ if (fblk) {
+ xfs_alert(mp,
+ " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
+ fblk, (unsigned long long)fblk->blkno,
+ fblk->index, fblk->magic);
+ } else {
+ xfs_alert(mp, " ... fblk is NULL");
+ }
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ /* Get a buffer for the new block. */
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
+ if (error)
+ return error;
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+ /* Remember the first slot as our empty slot. */
+ freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ dp->d_ops->free_max_bests(args->geo);
+ } else {
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ }
+
+ /* Set the freespace block index from the data block number. */
+ *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
+
+ /* Extend the freespace table if the new data block is off the end. */
+ if (*findex >= freehdr.nvalid) {
+ ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
+ freehdr.nvalid = *findex + 1;
+ bests[*findex] = cpu_to_be16(NULLDATAOFF);
+ }
+
+ /*
+ * If this entry was for an empty data block (this should always be
+ * true) then update the header.
+ */
+ if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+ freehdr.nused++;
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
+ }
+
+ /* Update the freespace value for the new block in the table. */
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bests[*findex] = bf[0].length;
+
+ *dbpp = dbp;
+ *fbpp = fbp;
+ return 0;
+}
+
+static int
+xfs_dir2_node_find_freeblk(
+ struct xfs_da_args *args,
+ struct xfs_da_state_blk *fblk,
+ xfs_dir2_db_t *dbnop,
+ struct xfs_buf **fbpp,
+ int *findexp,
+ int length)
+{
+ struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_dir2_free *free = NULL;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_buf *fbp = NULL;
+ xfs_dir2_db_t firstfbno;
+ xfs_dir2_db_t lastfbno;
+ xfs_dir2_db_t ifbno = -1;
+ xfs_dir2_db_t dbno = -1;
+ xfs_dir2_db_t fbno;
+ xfs_fileoff_t fo;
+ __be16 *bests = NULL;
+ int findex = 0;
+ int error;
+
+ /*
+ * If we came in with a freespace block that means that lookup
+ * found an entry with our hash value. This is the freespace
+ * block for that data entry.
+ */
+ if (fblk) {
+ fbp = fblk->bp;
+ free = fbp->b_addr;
+ findex = fblk->index;
+ if (findex >= 0) {
+ /* caller already found the freespace for us. */
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+ ASSERT(findex < freehdr.nvalid);
+ ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(bests[findex]) >= length);
+ dbno = freehdr.firstdb + findex;
+ goto found_block;
+ }
+
+ /*
+ * The data block looked at didn't have enough room.
+ * We'll start at the beginning of the freespace entries.
+ */
+ ifbno = fblk->blkno;
+ xfs_trans_brelse(tp, fbp);
+ fbp = NULL;
+ fblk->bp = NULL;
+ }
+
+ /*
+ * If we don't have a data block yet, we're going to scan the freespace
+ * data for a data block with enough free space in it.
+ */
+ error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK);
+ if (error)
+ return error;
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
+ firstfbno = xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET);
+
+ for (fbno = lastfbno - 1; fbno >= firstfbno; fbno--) {
+ /* If it's ifbno we already looked at it. */
+ if (fbno == ifbno)
+ continue;
+
+ /*
+ * Read the block. There can be holes in the freespace blocks,
+ * so this might not succeed. This should be really rare, so
+ * there's no reason to avoid it.
+ */
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
+ if (error)
+ return error;
+ if (!fbp)
+ continue;
+
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+
+ /* Scan the free entry array for a large enough free space. */
+ for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
+ if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(bests[findex]) >= length) {
+ dbno = freehdr.firstdb + findex;
+ goto found_block;
+ }
+ }
+
+ /* Didn't find free space, go on to next free block */
+ xfs_trans_brelse(tp, fbp);
+ }
+
+found_block:
+ *dbnop = dbno;
+ *fbpp = fbp;
+ *findexp = findex;
+ return 0;
+}
+
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int
+xfs_dir2_node_addname_int(
+ struct xfs_da_args *args, /* operation arguments */
+ struct xfs_da_state_blk *fblk) /* optional freespace block */
+{
+ struct xfs_dir2_data_unused *dup; /* data unused entry pointer */
+ struct xfs_dir2_data_entry *dep; /* data entry pointer */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_free *bf;
+ struct xfs_dir2_free *free = NULL; /* freespace block structure */
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *dbp; /* data block buffer */
+ struct xfs_buf *fbp; /* freespace buffer */
+ xfs_dir2_data_aoff_t aoff;
+ xfs_dir2_db_t dbno; /* data block number */
+ int error; /* error return value */
+ int findex; /* freespace entry index */
+ int length; /* length of the new entry */
+ int logfree = 0; /* need to log free entry */
+ int needlog = 0; /* need to log data header */
+ int needscan = 0; /* need to rescan data frees */
+ __be16 *tagp; /* data entry tag pointer */
+ __be16 *bests;
+
+ length = dp->d_ops->data_entsize(args->namelen);
+ error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
+ length);
+ if (error)
+ return error;
+
+ /*
+ * Now we know if we must allocate blocks, so if we are checking whether
+ * we can insert without allocation then we can return now.
+ */
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+ if (dbno == -1)
+ return -ENOSPC;
+ return 0;
+ }
+
+ /*
+ * If we don't have a data block, we need to allocate one and make
+ * the freespace entries refer to it.
+ */
+ if (dbno == -1) {
+ /* we're going to have to log the free block index later */
+ logfree = 1;
+ error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
+ &findex);
+ } else {
+ /* Read the data block in. */
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
+ -1, &dbp);
+ }
+ if (error)
+ return error;
+
+ /* setup for data block up now */
+ hdr = dbp->b_addr;
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ ASSERT(be16_to_cpu(bf[0].length) >= length);
+
+ /* Point to the existing unused space. */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+
+ /* Mark the first part of the unused space, inuse for us. */
+ aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
+ error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
+ &needlog, &needscan);
+ if (error) {
+ xfs_trans_brelse(tp, dbp);
+ return error;
+ }
+
+ /* Fill in the new entry and log it. */
+ dep = (xfs_dir2_data_entry_t *)dup;
+ dep->inumber = cpu_to_be64(args->inumber);
+ dep->namelen = args->namelen;
+ memcpy(dep->name, args->name, dep->namelen);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
+ *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+
+ /* Rescan the freespace and log the data block if needed. */
+ if (needscan)
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
+ if (needlog)
+ xfs_dir2_data_log_header(args, dbp);
+
+ /* If the freespace block entry is now wrong, update it. */
+ free = fbp->b_addr;
+ bests = dp->d_ops->free_bests_p(free);
+ if (bests[findex] != bf[0].length) {
+ bests[findex] = bf[0].length;
+ logfree = 1;
+ }
+
+ /* Log the freespace entry if needed. */
+ if (logfree)
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
+
+ /* Return the data block and offset in args. */
+ args->blkno = (xfs_dablk_t)dbno;
+ args->index = be16_to_cpu(*tagp);
+ return 0;
+}
+
+/*
* Top-level node form directory addname routine.
*/
int /* error */
@@ -1683,383 +2025,6 @@
}
/*
- * Add the data entry for a node-format directory name addition.
- * The leaf entry is added in xfs_dir2_leafn_add.
- * We may enter with a freespace block that the lookup found.
- */
-static int /* error */
-xfs_dir2_node_addname_int(
- xfs_da_args_t *args, /* operation arguments */
- xfs_da_state_blk_t *fblk) /* optional freespace block */
-{
- xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_dir2_db_t dbno; /* data block number */
- struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data entry pointer */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
- int error; /* error return value */
- xfs_dir2_db_t fbno; /* freespace block number */
- struct xfs_buf *fbp; /* freespace buffer */
- int findex; /* freespace entry index */
- xfs_dir2_free_t *free=NULL; /* freespace block structure */
- xfs_dir2_db_t ifbno; /* initial freespace block no */
- xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
- int length; /* length of the new entry */
- int logfree; /* need to log free entry */
- xfs_mount_t *mp; /* filesystem mount point */
- int needlog; /* need to log data header */
- int needscan; /* need to rescan data frees */
- __be16 *tagp; /* data entry tag pointer */
- xfs_trans_t *tp; /* transaction pointer */
- __be16 *bests;
- struct xfs_dir3_icfree_hdr freehdr;
- struct xfs_dir2_data_free *bf;
- xfs_dir2_data_aoff_t aoff;
-
- dp = args->dp;
- mp = dp->i_mount;
- tp = args->trans;
- length = dp->d_ops->data_entsize(args->namelen);
- /*
- * If we came in with a freespace block that means that lookup
- * found an entry with our hash value. This is the freespace
- * block for that data entry.
- */
- if (fblk) {
- fbp = fblk->bp;
- /*
- * Remember initial freespace block number.
- */
- ifbno = fblk->blkno;
- free = fbp->b_addr;
- findex = fblk->index;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
- /*
- * This means the free entry showed that the data block had
- * space for our entry, so we remembered it.
- * Use that data block.
- */
- if (findex >= 0) {
- ASSERT(findex < freehdr.nvalid);
- ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
- ASSERT(be16_to_cpu(bests[findex]) >= length);
- dbno = freehdr.firstdb + findex;
- } else {
- /*
- * The data block looked at didn't have enough room.
- * We'll start at the beginning of the freespace entries.
- */
- dbno = -1;
- findex = 0;
- }
- } else {
- /*
- * Didn't come in with a freespace block, so no data block.
- */
- ifbno = dbno = -1;
- fbp = NULL;
- findex = 0;
- }
-
- /*
- * If we don't have a data block yet, we're going to scan the
- * freespace blocks looking for one. Figure out what the
- * highest freespace block number is.
- */
- if (dbno == -1) {
- xfs_fileoff_t fo; /* freespace block number */
-
- if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
- return error;
- lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
- fbno = ifbno;
- }
- /*
- * While we haven't identified a data block, search the freeblock
- * data for a good data block. If we find a null freeblock entry,
- * indicating a hole in the data blocks, remember that.
- */
- while (dbno == -1) {
- /*
- * If we don't have a freeblock in hand, get the next one.
- */
- if (fbp == NULL) {
- /*
- * Happens the first time through unless lookup gave
- * us a freespace block to start with.
- */
- if (++fbno == 0)
- fbno = xfs_dir2_byte_to_db(args->geo,
- XFS_DIR2_FREE_OFFSET);
- /*
- * If it's ifbno we already looked at it.
- */
- if (fbno == ifbno)
- fbno++;
- /*
- * If it's off the end we're done.
- */
- if (fbno >= lastfbno)
- break;
- /*
- * Read the block. There can be holes in the
- * freespace blocks, so this might not succeed.
- * This should be really rare, so there's no reason
- * to avoid it.
- */
- error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, fbno),
- &fbp);
- if (error)
- return error;
- if (!fbp)
- continue;
- free = fbp->b_addr;
- findex = 0;
- }
- /*
- * Look at the current free entry. Is it good enough?
- *
- * The bests initialisation should be where the bufer is read in
- * the above branch. But gcc is too stupid to realise that bests
- * and the freehdr are actually initialised if they are placed
- * there, so we have to do it here to avoid warnings. Blech.
- */
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
- be16_to_cpu(bests[findex]) >= length)
- dbno = freehdr.firstdb + findex;
- else {
- /*
- * Are we done with the freeblock?
- */
- if (++findex == freehdr.nvalid) {
- /*
- * Drop the block.
- */
- xfs_trans_brelse(tp, fbp);
- fbp = NULL;
- if (fblk && fblk->bp)
- fblk->bp = NULL;
- }
- }
- }
- /*
- * If we don't have a data block, we need to allocate one and make
- * the freespace entries refer to it.
- */
- if (unlikely(dbno == -1)) {
- /*
- * Not allowed to allocate, return failure.
- */
- if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
- return -ENOSPC;
-
- /*
- * Allocate and initialize the new data block.
- */
- if (unlikely((error = xfs_dir2_grow_inode(args,
- XFS_DIR2_DATA_SPACE,
- &dbno)) ||
- (error = xfs_dir3_data_init(args, dbno, &dbp))))
- return error;
-
- /*
- * If (somehow) we have a freespace block, get rid of it.
- */
- if (fbp)
- xfs_trans_brelse(tp, fbp);
- if (fblk && fblk->bp)
- fblk->bp = NULL;
-
- /*
- * Get the freespace block corresponding to the data block
- * that was just allocated.
- */
- fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
- error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, fbno),
- &fbp);
- if (error)
- return error;
-
- /*
- * If there wasn't a freespace block, the read will
- * return a NULL fbp. Allocate and initialize a new one.
- */
- if (!fbp) {
- error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
- &fbno);
- if (error)
- return error;
-
- if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
- xfs_alert(mp,
-"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
- __func__, (unsigned long long)dp->i_ino,
- (long long)dp->d_ops->db_to_fdb(
- args->geo, dbno),
- (long long)dbno, (long long)fbno,
- (unsigned long long)ifbno, lastfbno);
- if (fblk) {
- xfs_alert(mp,
- " fblk "PTR_FMT" blkno %llu index %d magic 0x%x",
- fblk,
- (unsigned long long)fblk->blkno,
- fblk->index,
- fblk->magic);
- } else {
- xfs_alert(mp, " ... fblk is NULL");
- }
- XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- /*
- * Get a buffer for the new block.
- */
- error = xfs_dir3_free_get_buf(args, fbno, &fbp);
- if (error)
- return error;
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
- /*
- * Remember the first slot as our empty slot.
- */
- freehdr.firstdb =
- (fbno - xfs_dir2_byte_to_db(args->geo,
- XFS_DIR2_FREE_OFFSET)) *
- dp->d_ops->free_max_bests(args->geo);
- } else {
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- }
-
- /*
- * Set the freespace block index from the data block number.
- */
- findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
- /*
- * If it's after the end of the current entries in the
- * freespace block, extend that table.
- */
- if (findex >= freehdr.nvalid) {
- ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
- freehdr.nvalid = findex + 1;
- /*
- * Tag new entry so nused will go up.
- */
- bests[findex] = cpu_to_be16(NULLDATAOFF);
- }
- /*
- * If this entry was for an empty data block
- * (this should always be true) then update the header.
- */
- if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
- freehdr.nused++;
- dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_header(args, fbp);
- }
- /*
- * Update the real value in the table.
- * We haven't allocated the data entry yet so this will
- * change again.
- */
- hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
- bests[findex] = bf[0].length;
- logfree = 1;
- }
- /*
- * We had a data block so we don't have to make a new one.
- */
- else {
- /*
- * If just checking, we succeeded.
- */
- if (args->op_flags & XFS_DA_OP_JUSTCHECK)
- return 0;
-
- /*
- * Read the data block in.
- */
- error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, dbno),
- -1, &dbp);
- if (error)
- return error;
- hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
- logfree = 0;
- }
- ASSERT(be16_to_cpu(bf[0].length) >= length);
- /*
- * Point to the existing unused space.
- */
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(bf[0].offset));
- needscan = needlog = 0;
- /*
- * Mark the first part of the unused space, inuse for us.
- */
- aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
- error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length,
- &needlog, &needscan);
- if (error) {
- xfs_trans_brelse(tp, dbp);
- return error;
- }
- /*
- * Fill in the new entry and log it.
- */
- dep = (xfs_dir2_data_entry_t *)dup;
- dep->inumber = cpu_to_be64(args->inumber);
- dep->namelen = args->namelen;
- memcpy(dep->name, args->name, dep->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(args, dbp, dep);
- /*
- * Rescan the block for bestfree if needed.
- */
- if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
- /*
- * Log the data block header if needed.
- */
- if (needlog)
- xfs_dir2_data_log_header(args, dbp);
- /*
- * If the freespace entry is now wrong, update it.
- */
- bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
- if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
- bests[findex] = bf[0].length;
- logfree = 1;
- }
- /*
- * Log the freespace entry if needed.
- */
- if (logfree)
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
- /*
- * Return the data block and offset in args, then drop the data block.
- */
- args->blkno = (xfs_dablk_t)dbno;
- args->index = be16_to_cpu(*tagp);
- return 0;
-}
-
-/*
* Lookup an entry in a node-format directory.
* All the real work happens in xfs_da3_node_lookup_int.
* The only real output is the inode number of the entry.
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 585dfdb..85f14fc 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -5,16 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_trace.h"
@@ -167,7 +164,7 @@
* can free the block and copy the formatted data into the inode literal
* area.
*/
- dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+ dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
hdr = bp->b_addr;
/*
@@ -439,7 +436,7 @@
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
- buf = kmem_alloc(old_isize, KM_SLEEP);
+ buf = kmem_alloc(old_isize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
memcpy(oldsfp, sfp, old_isize);
/*
@@ -1099,7 +1096,7 @@
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, KM_SLEEP);
+ buf = kmem_alloc(oldsize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
@@ -1172,7 +1169,7 @@
* Don't want xfs_idata_realloc copying the data here.
*/
oldsize = dp->i_df.if_bytes;
- buf = kmem_alloc(oldsize, KM_SLEEP);
+ buf = kmem_alloc(oldsize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f37..e8bd688 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -16,8 +16,6 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
int
xfs_calc_dquots_per_chunk(
@@ -110,7 +108,7 @@
/*
* Do some primitive error checking on ondisk dquot data structures.
*/
-int
+void
xfs_dqblk_repair(
struct xfs_mount *mp,
struct xfs_dqblk *dqb,
@@ -133,8 +131,6 @@
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
-
- return 0;
}
STATIC bool
@@ -226,7 +222,7 @@
xfs_dquot_buf_verify_struct(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
return xfs_dquot_buf_verify(mp, bp, false);
}
@@ -235,7 +231,7 @@
xfs_dquot_buf_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (!xfs_dquot_buf_verify_crc(mp, bp, false))
return;
@@ -252,7 +248,7 @@
xfs_dquot_buf_readahead_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
xfs_dquot_buf_verify(mp, bp, true) != NULL) {
@@ -270,13 +266,15 @@
xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_dquot_buf_verify(mp, bp, false);
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
.name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +282,8 @@
const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a1..79e6c4f 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
-#define XFS_ERRTAG_MAX 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index afbe336..c968b60 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -287,6 +287,8 @@
{
if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
/* check for unknown features in the fs */
if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
@@ -357,12 +359,6 @@
(sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
{
return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
@@ -920,6 +916,9 @@
/*
* Values for di_format
+ *
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
*/
typedef enum xfs_dinode_fmt {
XFS_DINODE_FMT_DEV, /* xfs_dev_t */
@@ -929,6 +928,13 @@
XFS_DINODE_FMT_UUID /* added long ago, but never used */
} xfs_dinode_fmt_t;
+#define XFS_INODE_FORMAT_STR \
+ { XFS_DINODE_FMT_DEV, "dev" }, \
+ { XFS_DINODE_FMT_LOCAL, "local" }, \
+ { XFS_DINODE_FMT_EXTENTS, "extent" }, \
+ { XFS_DINODE_FMT_BTREE, "btree" }, \
+ { XFS_DINODE_FMT_UUID, "uuid" }
+
/*
* Inode minimum and maximum sizes.
*/
@@ -1065,7 +1071,7 @@
#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1)
#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
-#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
+#define XFS_INO_AGINO_BITS(mp) ((mp)->m_ino_geo.agino_log)
#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
#define XFS_INO_BITS(mp) \
XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
@@ -1087,6 +1093,8 @@
((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
+#define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp)))
#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index f3aa593..e9371a8 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -97,7 +97,7 @@
* For use by backup and restore programs to set the XFS on-disk inode
* fields di_dmevmask and di_dmstate. These must be set to exactly and
* only values previously obtained via xfs_bulkstat! (Specifically the
- * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ * struct xfs_bstat fields bs_dmevmask and bs_dmstate.)
*/
#ifndef HAVE_FSDMIDATA
struct fsdmidata {
@@ -124,7 +124,7 @@
/*
* Output for XFS_IOC_FSGEOMETRY_V1
*/
-typedef struct xfs_fsop_geom_v1 {
+struct xfs_fsop_geom_v1 {
__u32 blocksize; /* filesystem (data) block size */
__u32 rtextsize; /* realtime extent size */
__u32 agblocks; /* fsblocks in an AG */
@@ -145,12 +145,39 @@
__u32 logsectsize; /* log sector size, bytes */
__u32 rtsectsize; /* realtime sector size, bytes */
__u32 dirblocksize; /* directory block size, bytes */
-} xfs_fsop_geom_v1_t;
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V4
+ */
+struct xfs_fsop_geom_v4 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+};
/*
* Output for XFS_IOC_FSGEOMETRY
*/
-typedef struct xfs_fsop_geom {
+struct xfs_fsop_geom {
__u32 blocksize; /* filesystem (data) block size */
__u32 rtextsize; /* realtime extent size */
__u32 agblocks; /* fsblocks in an AG */
@@ -171,8 +198,18 @@
__u32 logsectsize; /* log sector size, bytes */
__u32 rtsectsize; /* realtime sector size, bytes */
__u32 dirblocksize; /* directory block size, bytes */
- __u32 logsunit; /* log stripe unit, bytes */
-} xfs_fsop_geom_t;
+ __u32 logsunit; /* log stripe unit, bytes */
+ uint32_t sick; /* o: unhealthy fs & rt metadata */
+ uint32_t checked; /* o: checked fs & rt metadata */
+ __u64 reserved[17]; /* reserved space */
+};
+
+#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
+#define XFS_FSOP_GEOM_SICK_UQUOTA (1 << 1) /* user quota */
+#define XFS_FSOP_GEOM_SICK_GQUOTA (1 << 2) /* group quota */
+#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
+#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
+#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -188,28 +225,30 @@
__u64 resblks_avail;
} xfs_fsop_resblks_t;
-#define XFS_FSOP_GEOM_VERSION 0
+#define XFS_FSOP_GEOM_VERSION 0
+#define XFS_FSOP_GEOM_VERSION_V5 5
-#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */
-#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */
-#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */
-#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */
-#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */
-#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */
-#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */
-#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
-#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
-#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
-#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
-#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
-#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
-#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
-#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */
-#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_ATTR (1 << 0) /* attributes in use */
+#define XFS_FSOP_GEOM_FLAGS_NLINK (1 << 1) /* 32-bit nlink values */
+#define XFS_FSOP_GEOM_FLAGS_QUOTA (1 << 2) /* quotas enabled */
+#define XFS_FSOP_GEOM_FLAGS_IALIGN (1 << 3) /* inode alignment */
+#define XFS_FSOP_GEOM_FLAGS_DALIGN (1 << 4) /* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED (1 << 5) /* read-only shared */
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG (1 << 6) /* special extent flag */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2 (1 << 7) /* directory version 2 */
+#define XFS_FSOP_GEOM_FLAGS_LOGV2 (1 << 8) /* log format version 2 */
+#define XFS_FSOP_GEOM_FLAGS_SECTOR (1 << 9) /* sector sizes >1BB */
+#define XFS_FSOP_GEOM_FLAGS_ATTR2 (1 << 10) /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 (1 << 11) /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI (1 << 12) /* ASCII only CI names */
+ /* -- Do not use -- (1 << 13) SGI parent pointers */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB (1 << 14) /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB (1 << 15) /* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE (1 << 16) /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT (1 << 17) /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
/*
* Minimum and maximum sizes need for growth checks.
@@ -238,6 +277,31 @@
(s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
/*
+ * Output for XFS_IOC_AG_GEOMETRY
+ */
+struct xfs_ag_geometry {
+ uint32_t ag_number; /* i/o: AG number */
+ uint32_t ag_length; /* o: length in blocks */
+ uint32_t ag_freeblks; /* o: free space */
+ uint32_t ag_icount; /* o: inodes allocated */
+ uint32_t ag_ifree; /* o: inodes free */
+ uint32_t ag_sick; /* o: sick things in ag */
+ uint32_t ag_checked; /* o: checked metadata in ag */
+ uint32_t ag_flags; /* i/o: flags for this ag */
+ uint64_t ag_reserved[12];/* o: zero */
+};
+#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
+#define XFS_AG_GEOM_SICK_AGF (1 << 1) /* AGF header */
+#define XFS_AG_GEOM_SICK_AGFL (1 << 2) /* AGFL header */
+#define XFS_AG_GEOM_SICK_AGI (1 << 3) /* AGI header */
+#define XFS_AG_GEOM_SICK_BNOBT (1 << 4) /* free space by block */
+#define XFS_AG_GEOM_SICK_CNTBT (1 << 5) /* free space by length */
+#define XFS_AG_GEOM_SICK_INOBT (1 << 6) /* inode index */
+#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
+#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+
+/*
* Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
*/
typedef struct xfs_growfs_data {
@@ -264,7 +328,7 @@
__s32 tv_nsec; /* and nanoseconds */
} xfs_bstime_t;
-typedef struct xfs_bstat {
+struct xfs_bstat {
__u64 bs_ino; /* inode number */
__u16 bs_mode; /* type and mode */
__u16 bs_nlink; /* number of links */
@@ -285,12 +349,70 @@
#define bs_projid bs_projid_lo /* (previously just bs_projid) */
__u16 bs_forkoff; /* inode fork offset in bytes */
__u16 bs_projid_hi; /* higher part of project id */
- unsigned char bs_pad[6]; /* pad space, unused */
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ unsigned char bs_pad[2]; /* pad space, unused */
__u32 bs_cowextsize; /* cow extent size */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
-} xfs_bstat_t;
+};
+
+/* New bulkstat structure that reports v5 features and fixes padding issues */
+struct xfs_bulkstat {
+ uint64_t bs_ino; /* inode number */
+ uint64_t bs_size; /* file size */
+
+ uint64_t bs_blocks; /* number of blocks */
+ uint64_t bs_xflags; /* extended flags */
+
+ int64_t bs_atime; /* access time, seconds */
+ int64_t bs_mtime; /* modify time, seconds */
+
+ int64_t bs_ctime; /* inode change time, seconds */
+ int64_t bs_btime; /* creation time, seconds */
+
+ uint32_t bs_gen; /* generation count */
+ uint32_t bs_uid; /* user id */
+ uint32_t bs_gid; /* group id */
+ uint32_t bs_projectid; /* project id */
+
+ uint32_t bs_atime_nsec; /* access time, nanoseconds */
+ uint32_t bs_mtime_nsec; /* modify time, nanoseconds */
+ uint32_t bs_ctime_nsec; /* inode change time, nanoseconds */
+ uint32_t bs_btime_nsec; /* creation time, nanoseconds */
+
+ uint32_t bs_blksize; /* block size */
+ uint32_t bs_rdev; /* device value */
+ uint32_t bs_cowextsize_blks; /* cow extent size hint, blocks */
+ uint32_t bs_extsize_blks; /* extent size hint, blocks */
+
+ uint32_t bs_nlink; /* number of links */
+ uint32_t bs_extents; /* number of extents */
+ uint32_t bs_aextents; /* attribute number of extents */
+ uint16_t bs_version; /* structure version */
+ uint16_t bs_forkoff; /* inode fork offset in bytes */
+
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ uint16_t bs_mode; /* type and mode */
+ uint16_t bs_pad2; /* zeroed */
+
+ uint64_t bs_pad[7]; /* zeroed */
+};
+
+#define XFS_BULKSTAT_VERSION_V1 (1)
+#define XFS_BULKSTAT_VERSION_V5 (5)
+
+/* bs_sick flags */
+#define XFS_BS_SICK_INODE (1 << 0) /* inode core */
+#define XFS_BS_SICK_BMBTD (1 << 1) /* data fork */
+#define XFS_BS_SICK_BMBTA (1 << 2) /* attr fork */
+#define XFS_BS_SICK_BMBTC (1 << 3) /* cow fork */
+#define XFS_BS_SICK_DIR (1 << 4) /* directory */
+#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */
+#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */
/*
* Project quota id helpers (previously projid was 16bit only
@@ -298,7 +420,7 @@
* to retain compatibility with "old" filesystems).
*/
static inline uint32_t
-bstat_get_projid(struct xfs_bstat *bs)
+bstat_get_projid(const struct xfs_bstat *bs)
{
return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
}
@@ -306,23 +428,79 @@
/*
* The user-level BulkStat Request interface structure.
*/
-typedef struct xfs_fsop_bulkreq {
+struct xfs_fsop_bulkreq {
__u64 __user *lastip; /* last inode # pointer */
__s32 icount; /* count of entries in buffer */
void __user *ubuffer;/* user buffer for inode desc. */
__s32 __user *ocount; /* output count pointer */
-} xfs_fsop_bulkreq_t;
-
+};
/*
* Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
*/
-typedef struct xfs_inogrp {
+struct xfs_inogrp {
__u64 xi_startino; /* starting inode number */
__s32 xi_alloccount; /* # bits set in allocmask */
__u64 xi_allocmask; /* mask of allocated inodes */
-} xfs_inogrp_t;
+};
+/* New inumbers structure that reports v5 features and fixes padding issues */
+struct xfs_inumbers {
+ uint64_t xi_startino; /* starting inode number */
+ uint64_t xi_allocmask; /* mask of allocated inodes */
+ uint8_t xi_alloccount; /* # bits set in allocmask */
+ uint8_t xi_version; /* version */
+ uint8_t xi_padding[6]; /* zero */
+};
+
+#define XFS_INUMBERS_VERSION_V1 (1)
+#define XFS_INUMBERS_VERSION_V5 (5)
+
+/* Header for bulk inode requests. */
+struct xfs_bulk_ireq {
+ uint64_t ino; /* I/O: start with this inode */
+ uint32_t flags; /* I/O: operation flags */
+ uint32_t icount; /* I: count of entries in buffer */
+ uint32_t ocount; /* O: count of entries filled out */
+ uint32_t agno; /* I: see comment for IREQ_AGNO */
+ uint64_t reserved[5]; /* must be zero */
+};
+
+/*
+ * Only return results from the specified @agno. If @ino is zero, start
+ * with the first inode of @agno.
+ */
+#define XFS_BULK_IREQ_AGNO (1 << 0)
+
+/*
+ * Return bulkstat information for a single inode, where @ino value is a
+ * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
+ * values below. Not compatible with XFS_BULK_IREQ_AGNO.
+ */
+#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL)
+
+/* Operate on the root directory inode. */
+#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
+
+/*
+ * ioctl structures for v5 bulkstat and inumbers requests
+ */
+struct xfs_bulkstat_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_bulkstat bulkstat[];
+};
+#define XFS_BULKSTAT_REQ_SIZE(nr) (sizeof(struct xfs_bulkstat_req) + \
+ (nr) * sizeof(struct xfs_bulkstat))
+
+struct xfs_inumbers_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_inumbers inumbers[];
+};
+#define XFS_INUMBERS_REQ_SIZE(nr) (sizeof(struct xfs_inumbers_req) + \
+ (nr) * sizeof(struct xfs_inumbers))
/*
* Error injection.
@@ -453,7 +631,7 @@
xfs_off_t sx_offset; /* offset into file */
xfs_off_t sx_length; /* leng from offset */
char sx_pad[16]; /* pad space, unused */
- xfs_bstat_t sx_stat; /* stat of target b4 copy */
+ struct xfs_bstat sx_stat; /* stat of target b4 copy */
} xfs_swapext_t;
/*
@@ -502,9 +680,10 @@
#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */
#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
+#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 24
+#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
@@ -590,6 +769,7 @@
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/* XFS_IOC_GETFSMAP ------ hoisted 59 */
#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
+#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -620,8 +800,11 @@
#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
-#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
+#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4)
#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t)
+#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
+#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
+#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
new file mode 100644
index 0000000..272005a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_HEALTH_H__
+#define __XFS_HEALTH_H__
+
+/*
+ * In-Core Filesystem Health Assessments
+ * =====================================
+ *
+ * We'd like to be able to summarize the current health status of the
+ * filesystem so that the administrator knows when it's necessary to schedule
+ * some downtime for repairs. Until then, we would also like to avoid abrupt
+ * shutdowns due to corrupt metadata.
+ *
+ * The online scrub feature evaluates the health of all filesystem metadata.
+ * When scrub detects corruption in a piece of metadata it will set the
+ * corresponding sickness flag, and repair will clear it if successful. If
+ * problems remain at unmount time, we can also request manual intervention by
+ * logging a notice to run xfs_repair.
+ *
+ * Each health tracking group uses a pair of fields for reporting. The
+ * "checked" field tell us if a given piece of metadata has ever been examined,
+ * and the "sick" field tells us if that piece was found to need repairs.
+ * Therefore we can conclude that for a given sick flag value:
+ *
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked => has not been examined since mount
+ */
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_inode;
+struct xfs_fsop_geom;
+
+/* Observable health issues for metadata spanning the entire filesystem. */
+#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
+#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
+#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
+#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+
+/* Observable health issues for realtime volume metadata. */
+#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
+#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */
+
+/* Observable health issues for AG metadata. */
+#define XFS_SICK_AG_SB (1 << 0) /* superblock */
+#define XFS_SICK_AG_AGF (1 << 1) /* AGF header */
+#define XFS_SICK_AG_AGFL (1 << 2) /* AGFL header */
+#define XFS_SICK_AG_AGI (1 << 3) /* AGI header */
+#define XFS_SICK_AG_BNOBT (1 << 4) /* free space by block */
+#define XFS_SICK_AG_CNTBT (1 << 5) /* free space by length */
+#define XFS_SICK_AG_INOBT (1 << 6) /* inode index */
+#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
+#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+
+/* Observable health issues for inode metadata. */
+#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
+#define XFS_SICK_INO_BMBTD (1 << 1) /* data fork */
+#define XFS_SICK_INO_BMBTA (1 << 2) /* attr fork */
+#define XFS_SICK_INO_BMBTC (1 << 3) /* cow fork */
+#define XFS_SICK_INO_DIR (1 << 4) /* directory */
+#define XFS_SICK_INO_XATTR (1 << 5) /* extended attributes */
+#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+
+/* Primary evidence of health problems in a given group. */
+#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
+ XFS_SICK_FS_UQUOTA | \
+ XFS_SICK_FS_GQUOTA | \
+ XFS_SICK_FS_PQUOTA)
+
+#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
+ XFS_SICK_RT_SUMMARY)
+
+#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
+ XFS_SICK_AG_AGF | \
+ XFS_SICK_AG_AGFL | \
+ XFS_SICK_AG_AGI | \
+ XFS_SICK_AG_BNOBT | \
+ XFS_SICK_AG_CNTBT | \
+ XFS_SICK_AG_INOBT | \
+ XFS_SICK_AG_FINOBT | \
+ XFS_SICK_AG_RMAPBT | \
+ XFS_SICK_AG_REFCNTBT)
+
+#define XFS_SICK_INO_PRIMARY (XFS_SICK_INO_CORE | \
+ XFS_SICK_INO_BMBTD | \
+ XFS_SICK_INO_BMBTA | \
+ XFS_SICK_INO_BMBTC | \
+ XFS_SICK_INO_DIR | \
+ XFS_SICK_INO_XATTR | \
+ XFS_SICK_INO_SYMLINK | \
+ XFS_SICK_INO_PARENT)
+
+/* These functions must be provided by the xfs implementation. */
+
+void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_health_unmount(struct xfs_mount *mp);
+
+/* Now some helpers. */
+
+static inline bool
+xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_fs_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_fs_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_rt_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_rt_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_ag_is_healthy(struct xfs_perag *pag)
+{
+ return !xfs_ag_has_sickness(pag, -1U);
+}
+
+static inline bool
+xfs_inode_is_healthy(struct xfs_inode *ip)
+{
+ return !xfs_inode_has_sickness(ip, -1U);
+}
+
+void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
+void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+
+#endif /* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a8f6db7..588d446 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -12,17 +12,14 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_icreate_item.h"
@@ -31,20 +28,6 @@
#include "xfs_log.h"
#include "xfs_rmap.h"
-
-/*
- * Allocation group level functions.
- */
-int
-xfs_ialloc_cluster_alignment(
- struct xfs_mount *mp)
-{
- if (xfs_sb_version_hasalign(&mp->m_sb) &&
- mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
- return mp->m_sb.sb_inoalignmt;
- return 1;
-}
-
/*
* Lookup a record by ino in the btree given by cur.
*/
@@ -288,7 +271,7 @@
{
struct xfs_buf *fbuf;
struct xfs_dinode *free;
- int nbufs, blks_per_cluster, inodes_per_cluster;
+ int nbufs;
int version;
int i, j;
xfs_daddr_t d;
@@ -299,9 +282,7 @@
* sizes, manipulate the inodes in buffers which are multiples of the
* blocks size.
*/
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
- nbufs = length / blks_per_cluster;
+ nbufs = length / M_IGEO(mp)->blocks_per_cluster;
/*
* Figure out what version number to use in the inodes we create. If
@@ -312,7 +293,7 @@
*
* For v3 inodes, we also need to write the inode number into the inode,
* so calculate the first inode number of the chunk here as
- * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+ * XFS_AGB_TO_AGINO() only works within a filesystem block, not
* across multiple filesystem blocks (such as a cluster) and so cannot
* be used in the cluster buffer loop below.
*
@@ -324,8 +305,7 @@
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
version = 3;
- ino = XFS_AGINO_TO_INO(mp, agno,
- XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
+ ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
/*
* log the initialisation that is about to take place as an
@@ -345,9 +325,11 @@
/*
* Get the block.
*/
- d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno +
+ (j * M_IGEO(mp)->blocks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * blks_per_cluster,
+ mp->m_bsize *
+ M_IGEO(mp)->blocks_per_cluster,
XBF_UNMAPPED);
if (!fbuf)
return -ENOMEM;
@@ -355,7 +337,7 @@
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
- for (i = 0; i < inodes_per_cluster; i++) {
+ for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
uint isize = xfs_dinode_size(version);
@@ -445,7 +427,7 @@
return;
/* calculate the inode offset and align startino */
- offset = mod << mp->m_sb.sb_inopblog;
+ offset = XFS_AGB_TO_AGINO(mp, mod);
*startino -= offset;
/*
@@ -618,35 +600,37 @@
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
-STATIC int /* error code or 0 */
+STATIC int
xfs_ialloc_ag_alloc(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* alloc group buffer */
- int *alloc)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ int *alloc)
{
- xfs_agi_t *agi; /* allocation group header */
- xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_agnumber_t agno;
- int error;
- xfs_agino_t newino; /* new first inode's number */
- xfs_agino_t newlen; /* new number of inodes */
- int isaligned = 0; /* inode allocation at stripe unit */
- /* boundary */
- uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
+ struct xfs_agi *agi;
+ struct xfs_alloc_arg args;
+ xfs_agnumber_t agno;
+ int error;
+ xfs_agino_t newino; /* new first inode's number */
+ xfs_agino_t newlen; /* new number of inodes */
+ int isaligned = 0; /* inode allocation at stripe */
+ /* unit boundary */
+ /* init. to full chunk */
+ uint16_t allocmask = (uint16_t) -1;
struct xfs_inobt_rec_incore rec;
- struct xfs_perag *pag;
- int do_sparse = 0;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
+ int do_sparse = 0;
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
args.fsbno = NULLFSBLOCK;
- xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
+ args.oinfo = XFS_RMAP_OINFO_INODES;
#ifdef DEBUG
/* randomly do sparse inode allocations */
if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
- args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+ igeo->ialloc_min_blks < igeo->ialloc_blks)
do_sparse = prandom_u32() & 1;
#endif
@@ -654,12 +638,12 @@
* Locking will ensure that we don't have two callers in here
* at one time.
*/
- newlen = args.mp->m_ialloc_inos;
- if (args.mp->m_maxicount &&
+ newlen = igeo->ialloc_inos;
+ if (igeo->maxicount &&
percpu_counter_read_positive(&args.mp->m_icount) + newlen >
- args.mp->m_maxicount)
+ igeo->maxicount)
return -ENOSPC;
- args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+ args.minlen = args.maxlen = igeo->ialloc_blks;
/*
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
@@ -669,7 +653,7 @@
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
- args.mp->m_ialloc_blks;
+ igeo->ialloc_blks;
if (do_sparse)
goto sparse_alloc;
if (likely(newino != NULLAGINO &&
@@ -692,10 +676,10 @@
* but not to use them in the actual exact allocation.
*/
args.alignment = 1;
- args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
+ args.minalignslop = igeo->cluster_align - 1;
/* Allow space for the inode btree to split. */
- args.minleft = args.mp->m_in_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -722,12 +706,12 @@
* pieces, so don't need alignment anyway.
*/
isaligned = 0;
- if (args.mp->m_sinoalign) {
+ if (igeo->ialloc_align) {
ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
- args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+ args.alignment = igeo->cluster_align;
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
@@ -743,7 +727,7 @@
/*
* Allow space for the inode btree to split.
*/
- args.minleft = args.mp->m_in_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -756,7 +740,7 @@
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
- args.alignment = xfs_ialloc_cluster_alignment(args.mp);
+ args.alignment = igeo->cluster_align;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -766,7 +750,7 @@
* the sparse allocation length is smaller than a full chunk.
*/
if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
- args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+ igeo->ialloc_min_blks < igeo->ialloc_blks &&
args.fsbno == NULLFSBLOCK) {
sparse_alloc:
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -775,7 +759,7 @@
args.alignment = args.mp->m_sb.sb_spino_align;
args.prod = 1;
- args.minlen = args.mp->m_ialloc_min_blks;
+ args.minlen = igeo->ialloc_min_blks;
args.maxlen = args.minlen;
/*
@@ -791,13 +775,13 @@
args.min_agbno = args.mp->m_sb.sb_inoalignmt;
args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
args.mp->m_sb.sb_inoalignmt) -
- args.mp->m_ialloc_blks;
+ igeo->ialloc_blks;
error = xfs_alloc_vextent(&args);
if (error)
return error;
- newlen = args.len << args.mp->m_sb.sb_inopblog;
+ newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
ASSERT(newlen <= XFS_INODES_PER_CHUNK);
allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
}
@@ -825,7 +809,7 @@
/*
* Convert the results.
*/
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+ newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
if (xfs_inobt_issparse(~allocmask)) {
/*
@@ -1008,7 +992,7 @@
* space needed for alignment of inode chunks when checking the
* longest contiguous free space in the AG - this prevents us
* from getting ENOSPC because we have free space larger than
- * m_ialloc_blks but alignment constraints prevent us from using
+ * ialloc_blks but alignment constraints prevent us from using
* it.
*
* If we can't find an AG with space for full alignment slack to
@@ -1017,9 +1001,9 @@
* if we fail allocation due to alignment issues then it is most
* likely a real ENOSPC condition.
*/
- ineed = mp->m_ialloc_min_blks;
+ ineed = M_IGEO(mp)->ialloc_min_blks;
if (flags && ineed > 1)
- ineed += xfs_ialloc_cluster_alignment(mp);
+ ineed += M_IGEO(mp)->cluster_align;
longest = pag->pagf_longest;
if (!longest)
longest = pag->pagf_flcount > 0;
@@ -1705,6 +1689,7 @@
int noroom = 0;
xfs_agnumber_t start_agno;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
int okalloc = 1;
if (*IO_agbp) {
@@ -1735,9 +1720,9 @@
* Read rough value of mp->m_icount by percpu_counter_read_positive,
* which will sacrifice the preciseness but improve the performance.
*/
- if (mp->m_maxicount &&
- percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
- > mp->m_maxicount) {
+ if (igeo->maxicount &&
+ percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
+ > igeo->maxicount) {
noroom = 1;
okalloc = 0;
}
@@ -1849,14 +1834,13 @@
int nextbit;
xfs_agblock_t agbno;
int contigblk;
- struct xfs_owner_info oinfo;
DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
- mp->m_ialloc_blks, &oinfo);
+ M_IGEO(mp)->ialloc_blks,
+ &XFS_RMAP_OINFO_INODES);
return;
}
@@ -1900,7 +1884,7 @@
ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
- contigblk, &oinfo);
+ contigblk, &XFS_RMAP_OINFO_INODES);
/* reset range to current bit and carry on... */
startidx = endidx = nextbit;
@@ -2265,7 +2249,7 @@
/* check that the returned record contains the required inode */
if (rec.ir_startino > agino ||
- rec.ir_startino + mp->m_ialloc_inos <= agino)
+ rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
return -EINVAL;
/* for untrusted inodes check it is allocated first */
@@ -2292,7 +2276,6 @@
xfs_agblock_t agbno; /* block number of inode in the alloc group */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agnumber_t agno; /* allocation group number */
- int blks_per_cluster; /* num blocks per inode cluster */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
xfs_agblock_t cluster_agbno; /* first block in inode cluster */
int error; /* error code */
@@ -2338,8 +2321,6 @@
return -EINVAL;
}
- blks_per_cluster = xfs_icluster_size_fsb(mp);
-
/*
* For bulkstat and handle lookups, we have an untrusted inode number
* that we have to verify is valid. We cannot do this just by reading
@@ -2359,7 +2340,7 @@
* If the inode cluster size is the same as the blocksize or
* smaller we get to the buffer by simple arithmetics.
*/
- if (blks_per_cluster == 1) {
+ if (M_IGEO(mp)->blocks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
@@ -2375,8 +2356,8 @@
* find the location. Otherwise we have to do a btree
* lookup to find the location.
*/
- if (mp->m_inoalign_mask) {
- offset_agbno = agbno & mp->m_inoalign_mask;
+ if (M_IGEO(mp)->inoalign_mask) {
+ offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
@@ -2388,12 +2369,13 @@
out_map:
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
- ((offset_agbno / blks_per_cluster) * blks_per_cluster);
+ ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
+ M_IGEO(mp)->blocks_per_cluster);
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
- imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
/*
@@ -2415,20 +2397,6 @@
}
/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
- xfs_mount_t *mp) /* file system mount structure */
-{
- uint inodes;
-
- inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
- mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr,
- inodes);
-}
-
-/*
* Log specified fields for the ag hdr (inode section). The growth of the agi
* structure over time requires that we interpret the buffer as two logical
* regions delineated by the end of the unlinked list. This is due to the size
@@ -2499,7 +2467,7 @@
xfs_agi_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
int i;
@@ -2514,7 +2482,7 @@
/*
* Validate the magic number of the agi block.
*/
- if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
return __this_address;
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return __this_address;
@@ -2551,7 +2519,7 @@
xfs_agi_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -2568,7 +2536,7 @@
xfs_agi_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -2588,6 +2556,7 @@
const struct xfs_buf_ops xfs_agi_buf_ops = {
.name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
.verify_struct = xfs_agi_verify,
@@ -2726,8 +2695,8 @@
xfs_agino_t low;
xfs_agino_t high;
- low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0);
- high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1;
+ low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
+ high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
return xfs_ialloc_has_inode_record(cur, low, high, exists);
}
@@ -2773,3 +2742,115 @@
*freecount = ci.freecount;
return 0;
}
+
+/*
+ * Initialize inode-related geometry information.
+ *
+ * Compute the inode btree min and max levels and set maxicount.
+ *
+ * Set the inode cluster size. This may still be overridden by the file
+ * system block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to keep a
+ * constant ratio of inode per cluster buffer, but only if mkfs has set the
+ * inode alignment value appropriately for larger cluster sizes.
+ *
+ * Then compute the inode cluster alignment information.
+ */
+void
+xfs_ialloc_setup_geometry(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ uint64_t icount;
+ uint inodes;
+
+ /* Compute inode btree geometry. */
+ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+ igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
+ igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
+
+ igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
+ sbp->sb_inopblock);
+ igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ igeo->ialloc_min_blks = sbp->sb_spino_align;
+ else
+ igeo->ialloc_min_blks = igeo->ialloc_blks;
+
+ /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
+ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+ igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
+ inodes);
+
+ /*
+ * Set the maximum inode count for this filesystem, being careful not
+ * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
+ * users should never get here due to failing sb verification, but
+ * certain users (xfs_db) need to be usable even with corrupt metadata.
+ */
+ if (sbp->sb_imax_pct && igeo->ialloc_blks) {
+ /*
+ * Make sure the maximum inode count is a multiple
+ * of the units we allocate inodes in.
+ */
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ do_div(icount, igeo->ialloc_blks);
+ igeo->maxicount = XFS_FSB_TO_INO(mp,
+ icount * igeo->ialloc_blks);
+ } else {
+ igeo->maxicount = 0;
+ }
+
+ /*
+ * Compute the desired size of an inode cluster buffer size, which
+ * starts at 8K and (on v5 filesystems) scales up with larger inode
+ * sizes.
+ *
+ * Preserve the desired inode cluster size because the sparse inodes
+ * feature uses that desired size (not the actual size) to compute the
+ * sparse inode alignment. The mount code validates this value, so we
+ * cannot change the behavior.
+ */
+ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int new_size = igeo->inode_cluster_size_raw;
+
+ new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+ if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+ igeo->inode_cluster_size_raw = new_size;
+ }
+
+ /* Calculate inode cluster ratios. */
+ if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
+ igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
+ igeo->inode_cluster_size_raw);
+ else
+ igeo->blocks_per_cluster = 1;
+ igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
+ igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
+
+ /* Calculate inode cluster alignment. */
+ if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
+ igeo->cluster_align = mp->m_sb.sb_inoalignmt;
+ else
+ igeo->cluster_align = 1;
+ igeo->inoalign_mask = igeo->cluster_align - 1;
+ igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
+
+ /*
+ * If we are using stripe alignment, check whether
+ * the stripe unit is a multiple of the inode alignment
+ */
+ if (mp->m_dalign && igeo->inoalign_mask &&
+ !(mp->m_dalign & igeo->inoalign_mask))
+ igeo->ialloc_align = mp->m_dalign;
+ else
+ igeo->ialloc_align = 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index e936b7c..323592d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -23,16 +23,6 @@
* sparse chunks */
};
-/* Calculate and return the number of filesystem blocks per inode cluster */
-static inline int
-xfs_icluster_size_fsb(
- struct xfs_mount *mp)
-{
- if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
- return 1;
- return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
-}
-
/*
* Make an inode pointer out of the buffer/offset.
*/
@@ -96,13 +86,6 @@
uint flags); /* flags for inode btree lookup */
/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
- struct xfs_mount *mp); /* file system mount structure */
-
-/*
* Log specified fields for the ag hdr (inode section)
*/
void
@@ -168,5 +151,6 @@
int *stat);
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 86c5020..b82992f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -11,14 +11,12 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
@@ -28,7 +26,7 @@
struct xfs_btree_cur *cur,
int level)
{
- return cur->bc_mp->m_inobt_mnr[level != 0];
+ return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0];
}
STATIC struct xfs_btree_cur *
@@ -84,7 +82,7 @@
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT);
+ args.oinfo = XFS_RMAP_OINFO_INOBT;
args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
@@ -124,7 +122,7 @@
union xfs_btree_ptr *new,
int *stat)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
@@ -136,12 +134,9 @@
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
- struct xfs_owner_info oinfo;
-
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
- &oinfo, resv);
+ &XFS_RMAP_OINFO_INOBT, resv);
}
STATIC int
@@ -157,7 +152,7 @@
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_free_block(cur, bp);
return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}
@@ -167,7 +162,7 @@
struct xfs_btree_cur *cur,
int level)
{
- return cur->bc_mp->m_inobt_mxr[level != 0];
+ return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0];
}
STATIC void
@@ -258,11 +253,14 @@
xfs_inobt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_failaddr_t fa;
unsigned int level;
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
/*
* During growfs operations, we can't verify the exact owner as the
* perag is not fully initialised and hence not attached to the buffer.
@@ -273,26 +271,19 @@
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_IBT_CRC_MAGIC):
- case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_IBT_MAGIC):
- case cpu_to_be32(XFS_FIBT_MAGIC):
- break;
- default:
- return __this_address;
}
/* level verification */
level = be16_to_cpu(block->bb_level);
- if (level >= mp->m_in_maxlevels)
+ if (level >= M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
+ return xfs_btree_sblock_verify(bp,
+ M_IGEO(mp)->inobt_mxr[level != 0]);
}
static void
@@ -331,6 +322,16 @@
const struct xfs_buf_ops xfs_inobt_buf_ops = {
.name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
.verify_struct = xfs_inobt_verify,
@@ -392,7 +393,7 @@
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
- .buf_ops = &xfs_inobt_buf_ops,
+ .buf_ops = &xfs_finobt_buf_ops,
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
@@ -538,15 +539,57 @@
static xfs_extlen_t
xfs_inobt_max_size(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
{
+ xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
+
/* Bail out if we're uninitialized, which can happen in mkfs. */
- if (mp->m_inobt_mxr[0] == 0)
+ if (M_IGEO(mp)->inobt_mxr[0] == 0)
return 0;
- return xfs_btree_calc_size(mp->m_inobt_mnr,
- (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
- XFS_INODES_PER_CHUNK);
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
+ (uint64_t)agblocks * mp->m_sb.sb_inopblock /
+ XFS_INODES_PER_CHUNK);
+}
+
+/* Read AGI and create inobt cursor. */
+int
+xfs_inobt_cur(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t which,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(*agi_bpp == NULL);
+ ASSERT(*curpp == NULL);
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
+ if (error)
+ return error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which);
+ if (!cur) {
+ xfs_trans_brelse(tp, *agi_bpp);
+ *agi_bpp = NULL;
+ return -ENOMEM;
+ }
+ *curpp = cur;
+ return 0;
}
static int
@@ -557,15 +600,14 @@
xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
- struct xfs_buf *agbp;
- struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_btree_cur *cur = NULL;
int error;
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp);
if (error)
return error;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
@@ -594,7 +636,7 @@
if (error)
return error;
- *ask += xfs_inobt_max_size(mp);
+ *ask += xfs_inobt_max_size(mp, agno);
*used += tree_len;
return 0;
}
@@ -605,5 +647,5 @@
struct xfs_mount *mp,
unsigned long long len)
{
- return xfs_btree_calc_size(mp->m_inobt_mnr, len);
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index ebdd0c6..951305e 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -64,5 +64,8 @@
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, xfs_btnum_t btnum,
+ struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd07..7bc8740 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -3,18 +3,14 @@
* Copyright (c) 2017 Christoph Hellwig.
*/
-#include <linux/cache.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_bmap.h"
#include "xfs_trace.h"
/*
@@ -614,16 +610,15 @@
}
/*
- * Increment the sequence counter if we are on a COW fork. This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed. We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
*/
-static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
+static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
{
- if (state & BMAP_COWFORK)
- WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
void
@@ -638,7 +633,7 @@
struct xfs_iext_leaf *new = NULL;
int nr_entries, i;
- xfs_iext_inc_seq(ifp, state);
+ xfs_iext_inc_seq(ifp);
if (ifp->if_height == 0)
xfs_iext_alloc_root(ifp, cur);
@@ -880,7 +875,7 @@
ASSERT(ifp->if_u1.if_root != NULL);
ASSERT(xfs_iext_valid(ifp, cur));
- xfs_iext_inc_seq(ifp, state);
+ xfs_iext_inc_seq(ifp);
nr_entries = xfs_iext_leaf_nr_entries(ifp, leaf, cur->pos) - 1;
for (i = cur->pos; i < nr_entries; i++)
@@ -988,7 +983,7 @@
{
struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
- xfs_iext_inc_seq(ifp, state);
+ xfs_iext_inc_seq(ifp);
if (cur->pos == 0) {
struct xfs_bmbt_irec old;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8c..28ab3c5 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,11 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_cksum.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_ialloc.h"
@@ -33,12 +31,9 @@
xfs_buf_t *bp)
{
int i;
- int j;
xfs_dinode_t *dip;
- j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
- for (i = 0; i < j; i++) {
+ for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
if (!dip->di_next_unlinked) {
xfs_alert(mp,
@@ -80,7 +75,7 @@
struct xfs_buf *bp,
bool readahead)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_agnumber_t agno;
int i;
int ni;
@@ -97,10 +92,9 @@
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- (unlinked_ino == NULLAGINO ||
- xfs_verify_agino(mp, agno, unlinked_ino));
+ xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -147,12 +141,16 @@
const struct xfs_buf_ops xfs_inode_buf_ops = {
.name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
- .name = "xxfs_inode_ra",
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index f9acf1d..c643bee 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -3,10 +3,10 @@
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/log2.h>
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -19,12 +19,10 @@
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_attr_sf.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
-#include "xfs_shared.h"
kmem_zone_t *xfs_ifork_zone;
@@ -96,7 +94,7 @@
return 0;
ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
+ ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
@@ -149,7 +147,7 @@
if (size) {
real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -304,7 +302,7 @@
}
ifp->if_broot_bytes = size;
- ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
+ ifp->if_broot = kmem_alloc(size, KM_NOFS);
ASSERT(ifp->if_broot != NULL);
/*
* Copy and convert from the on-disk structure
@@ -369,7 +367,7 @@
*/
if (ifp->if_broot_bytes == 0) {
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
- ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -384,7 +382,7 @@
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -410,7 +408,7 @@
else
new_size = 0;
if (new_size > 0) {
- new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+ new_broot = kmem_alloc(new_size, KM_NOFS);
/*
* First copy over the btree block header.
*/
@@ -494,7 +492,7 @@
* We enforce that here.
*/
ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
- roundup(new_size, 4), KM_SLEEP | KM_NOFS);
+ roundup(new_size, 4), KM_NOFS);
ifp->if_bytes = new_size;
}
@@ -685,7 +683,7 @@
return;
ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
ip->i_cowfp->if_flags = XFS_IFEXTENTS;
ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
ip->i_cnextents = 0;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2..00c62ce 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@
*/
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* cow fork mod counter */
+ unsigned int if_seq; /* fork mod counter */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 1b542ec..7f55eb3 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -12,9 +12,7 @@
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_trans_space.h"
-#include "xfs_inode.h"
#include "xfs_da_btree.h"
-#include "xfs_attr_leaf.h"
#include "xfs_bmap_btree.h"
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 4bfdd5f..b2113b1 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -142,7 +142,7 @@
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
+extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
xfs_dqid_t id, uint type);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 542aa14..9a7fadb 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
@@ -19,7 +18,6 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_refcount.h"
@@ -1176,7 +1174,7 @@
/*
* Record a refcount intent for later processing.
*/
-static int
+static void
__xfs_refcount_add(
struct xfs_trans *tp,
enum xfs_refcount_intent_type type,
@@ -1191,44 +1189,43 @@
blockcount);
ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
- KM_SLEEP | KM_NOFS);
+ KM_NOFS);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_startblock = startblock;
ri->ri_blockcount = blockcount;
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
- return 0;
}
/*
* Increase the reference count of the blocks backing a file's extent.
*/
-int
+void
xfs_refcount_increase_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
- return 0;
+ return;
- return __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE,
- PREV->br_startblock, PREV->br_blockcount);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock,
+ PREV->br_blockcount);
}
/*
* Decrease the reference count of the blocks backing a file's extent.
*/
-int
+void
xfs_refcount_decrease_extent(
struct xfs_trans *tp,
struct xfs_bmbt_irec *PREV)
{
if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb))
- return 0;
+ return;
- return __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE,
- PREV->br_startblock, PREV->br_blockcount);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock,
+ PREV->br_blockcount);
}
/*
@@ -1543,47 +1540,40 @@
}
/* Record a CoW staging extent in the refcount btree. */
-int
+void
xfs_refcount_alloc_cow_extent(
struct xfs_trans *tp,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
struct xfs_mount *mp = tp->t_mountp;
- int error;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return 0;
+ return;
- error = __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
- if (error)
- return error;
+ __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len);
/* Add rmap entry */
- return xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ xfs_rmap_alloc_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
}
/* Forget a CoW staging event in the refcount btree. */
-int
+void
xfs_refcount_free_cow_extent(
struct xfs_trans *tp,
xfs_fsblock_t fsb,
xfs_extlen_t len)
{
struct xfs_mount *mp = tp->t_mountp;
- int error;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return 0;
+ return;
/* Remove rmap entry */
- error = xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
+ xfs_rmap_free_extent(tp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), len, XFS_RMAP_OWN_COW);
- if (error)
- return error;
-
- return __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
+ __xfs_refcount_add(tp, XFS_REFCOUNT_FREE_COW, fsb, len);
}
struct xfs_refcount_recovery {
@@ -1604,7 +1594,7 @@
if (be32_to_cpu(rec->refc.rc_refcount) != 1)
return -EFSCORRUPTED;
- rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+ rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
list_add_tail(&rr->rr_list, debris);
@@ -1681,10 +1671,8 @@
/* Free the orphan record */
agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
- error = xfs_refcount_free_cow_extent(tp, fsb,
+ xfs_refcount_free_cow_extent(tp, fsb,
rr->rr_rrec.rc_blockcount);
- if (error)
- goto out_trans;
/* Free the block. */
xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 1d9c518..2097955 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -29,9 +29,9 @@
xfs_extlen_t ri_blockcount;
};
-extern int xfs_refcount_increase_extent(struct xfs_trans *tp,
+void xfs_refcount_increase_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
-extern int xfs_refcount_decrease_extent(struct xfs_trans *tp,
+void xfs_refcount_decrease_extent(struct xfs_trans *tp,
struct xfs_bmbt_irec *irec);
extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
@@ -45,10 +45,10 @@
xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
xfs_extlen_t *flen, bool find_end_of_shared);
-extern int xfs_refcount_alloc_cow_extent(struct xfs_trans *tp,
- xfs_fsblock_t fsb, xfs_extlen_t len);
-extern int xfs_refcount_free_cow_extent(struct xfs_trans *tp,
- xfs_fsblock_t fsb, xfs_extlen_t len);
+void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
+void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb,
+ xfs_extlen_t len);
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 1aaa01c..38529db 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -12,12 +12,10 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "xfs_bmap.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_rmap.h"
@@ -70,7 +68,7 @@
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
xfs_refc_block(args.mp));
- xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+ args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
@@ -106,15 +104,13 @@
struct xfs_buf *agbp = cur->bc_private.a.agbp;
struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
- struct xfs_owner_info oinfo;
int error;
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
- error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC,
XFS_AG_RESV_METADATA);
if (error)
return error;
@@ -205,13 +201,13 @@
xfs_refcountbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
- if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -266,6 +262,7 @@
const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
.verify_read = xfs_refcountbt_read_verify,
.verify_write = xfs_refcountbt_write_verify,
.verify_struct = xfs_refcountbt_verify,
@@ -428,6 +425,15 @@
tree_len = be32_to_cpu(agf->agf_refcount_blocks);
xfs_trans_brelse(tp, agbp);
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ agblocks -= mp->m_sb.sb_logblocks;
+
*ask += xfs_refcountbt_max_size(mp, agblocks);
*used += tree_len;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 245af45..38e9414 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -10,24 +10,17 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
@@ -175,7 +168,6 @@
union xfs_btree_rec *rec,
struct xfs_rmap_irec *irec)
{
- irec->rm_flags = 0;
irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock);
irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount);
irec->rm_owner = be64_to_cpu(rec->rmap.rm_owner);
@@ -261,15 +253,15 @@
rec->rm_flags);
if (rec->rm_owner != info->high.rm_owner)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
*info->irec = *rec;
*info->stat = 1;
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/*
@@ -312,7 +304,7 @@
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_find_left_neighbor_helper, &info);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
@@ -337,16 +329,16 @@
rec->rm_flags);
if (rec->rm_owner != info->high.rm_owner)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
(rec->rm_offset > info->high.rm_offset ||
rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
*info->irec = *rec;
*info->stat = 1;
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/*
@@ -384,7 +376,7 @@
cur->bc_private.a.agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_lookup_le_range_helper, &info);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
@@ -458,21 +450,21 @@
*/
STATIC int
xfs_rmap_unmap(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- uint64_t ltoff;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
- bool ignore_off;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool ignore_off;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) ||
@@ -653,16 +645,16 @@
*/
int
xfs_rmap_free(
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *cur;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
@@ -710,23 +702,23 @@
*/
STATIC int
xfs_rmap_map(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- struct xfs_rmap_irec gtrec;
- int have_gt;
- int have_lt;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags = 0;
- bool ignore_off;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
+ bool ignore_off;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
ASSERT(owner != 0);
@@ -890,16 +882,16 @@
*/
int
xfs_rmap_alloc(
- struct xfs_trans *tp,
- struct xfs_buf *agbp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_btree_cur *cur;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_btree_cur *cur;
+ int error;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
return 0;
@@ -929,16 +921,16 @@
*/
STATIC int
xfs_rmap_convert(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec r[4]; /* neighbor extent entries */
- /* left is 0, right is 1, prev is 2 */
- /* new is 3 */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
uint64_t owner;
uint64_t offset;
uint64_t new_endoff;
@@ -1354,16 +1346,16 @@
*/
STATIC int
xfs_rmap_convert_shared(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec r[4]; /* neighbor extent entries */
- /* left is 0, right is 1, prev is 2 */
- /* new is 3 */
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec r[4]; /* neighbor extent entries */
+ /* left is 0, right is 1, */
+ /* prev is 2, new is 3 */
uint64_t owner;
uint64_t offset;
uint64_t new_endoff;
@@ -1743,20 +1735,20 @@
*/
STATIC int
xfs_rmap_unmap_shared(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- uint64_t ltoff;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ uint64_t ltoff;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
@@ -1905,22 +1897,22 @@
*/
STATIC int
xfs_rmap_map_shared(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- bool unwritten,
- struct xfs_owner_info *oinfo)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ bool unwritten,
+ const struct xfs_owner_info *oinfo)
{
- struct xfs_mount *mp = cur->bc_mp;
- struct xfs_rmap_irec ltrec;
- struct xfs_rmap_irec gtrec;
- int have_gt;
- int have_lt;
- int error = 0;
- int i;
- uint64_t owner;
- uint64_t offset;
- unsigned int flags = 0;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_rmap_irec ltrec;
+ struct xfs_rmap_irec gtrec;
+ int have_gt;
+ int have_lt;
+ int error = 0;
+ int i;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags = 0;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
@@ -2275,7 +2267,7 @@
* Record a rmap intent; the list is kept sorted first by AG and then by
* increasing age.
*/
-static int
+static void
__xfs_rmap_add(
struct xfs_trans *tp,
enum xfs_rmap_intent_type type,
@@ -2294,7 +2286,7 @@
bmap->br_blockcount,
bmap->br_state);
- ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_SLEEP | KM_NOFS);
+ ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS);
INIT_LIST_HEAD(&ri->ri_list);
ri->ri_type = type;
ri->ri_owner = owner;
@@ -2302,11 +2294,10 @@
ri->ri_bmap = *bmap;
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
- return 0;
}
/* Map an extent into a file. */
-int
+void
xfs_rmap_map_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
@@ -2314,15 +2305,15 @@
struct xfs_bmbt_irec *PREV)
{
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
- return 0;
+ return;
- return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
whichfork, PREV);
}
/* Unmap an extent out of a file. */
-int
+void
xfs_rmap_unmap_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
@@ -2330,9 +2321,9 @@
struct xfs_bmbt_irec *PREV)
{
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
- return 0;
+ return;
- return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
whichfork, PREV);
}
@@ -2343,7 +2334,7 @@
* Note that tp can be NULL here as no transaction is used for COW fork
* unwritten conversion.
*/
-int
+void
xfs_rmap_convert_extent(
struct xfs_mount *mp,
struct xfs_trans *tp,
@@ -2352,15 +2343,15 @@
struct xfs_bmbt_irec *PREV)
{
if (!xfs_rmap_update_is_needed(mp, whichfork))
- return 0;
+ return;
- return __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
+ __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
-int
+void
xfs_rmap_alloc_extent(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -2371,18 +2362,18 @@
struct xfs_bmbt_irec bmap;
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
- return 0;
+ return;
bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- return __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_ALLOC, owner, XFS_DATA_FORK, &bmap);
}
/* Schedule the deletion of an rmap for non-file data. */
-int
+void
xfs_rmap_free_extent(
struct xfs_trans *tp,
xfs_agnumber_t agno,
@@ -2393,14 +2384,14 @@
struct xfs_bmbt_irec bmap;
if (!xfs_rmap_update_is_needed(tp->t_mountp, XFS_DATA_FORK))
- return 0;
+ return;
bmap.br_startblock = XFS_AGB_TO_FSB(tp->t_mountp, agno, bno);
bmap.br_blockcount = len;
bmap.br_startoff = 0;
bmap.br_state = XFS_EXT_NORM;
- return __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
+ __xfs_rmap_add(tp, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap);
}
/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */
@@ -2459,18 +2450,18 @@
*/
int
xfs_rmap_record_exists(
- struct xfs_btree_cur *cur,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
- bool *has_rmap)
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool *has_rmap)
{
- uint64_t owner;
- uint64_t offset;
- unsigned int flags;
- int has_record;
- struct xfs_rmap_irec irec;
- int error;
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ int has_record;
+ struct xfs_rmap_irec irec;
+ int error;
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
@@ -2518,7 +2509,7 @@
((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
return 0;
rks->has_rmap = true;
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
}
/*
@@ -2530,7 +2521,7 @@
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
bool *has_rmap)
{
struct xfs_rmap_irec low = {0};
@@ -2547,6 +2538,37 @@
error = xfs_rmap_query_range(cur, &low, &high,
xfs_rmap_has_other_keys_helper, &rks);
+ if (error < 0)
+ return error;
+
*has_rmap = rks.has_rmap;
- return error;
+ return 0;
}
+
+const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
+ .oi_owner = XFS_RMAP_OWN_NULL,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = {
+ .oi_owner = XFS_RMAP_OWN_UNKNOWN,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_FS = {
+ .oi_owner = XFS_RMAP_OWN_FS,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_LOG = {
+ .oi_owner = XFS_RMAP_OWN_LOG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_AG = {
+ .oi_owner = XFS_RMAP_OWN_AG,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = {
+ .oi_owner = XFS_RMAP_OWN_INOBT,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_INODES = {
+ .oi_owner = XFS_RMAP_OWN_INODES,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_REFC = {
+ .oi_owner = XFS_RMAP_OWN_REFC,
+};
+const struct xfs_owner_info XFS_RMAP_OINFO_COW = {
+ .oi_owner = XFS_RMAP_OWN_COW,
+};
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 157dc72..abe6334 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -7,16 +7,6 @@
#define __XFS_RMAP_H__
static inline void
-xfs_rmap_ag_owner(
- struct xfs_owner_info *oi,
- uint64_t owner)
-{
- oi->oi_owner = owner;
- oi->oi_offset = 0;
- oi->oi_flags = 0;
-}
-
-static inline void
xfs_rmap_ino_bmbt_owner(
struct xfs_owner_info *oi,
xfs_ino_t ino,
@@ -43,27 +33,13 @@
oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
}
-static inline void
-xfs_rmap_skip_owner_update(
- struct xfs_owner_info *oi)
-{
- xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL);
-}
-
static inline bool
xfs_rmap_should_skip_owner_update(
- struct xfs_owner_info *oi)
+ const struct xfs_owner_info *oi)
{
return oi->oi_owner == XFS_RMAP_OWN_NULL;
}
-static inline void
-xfs_rmap_any_owner_update(
- struct xfs_owner_info *oi)
-{
- xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN);
-}
-
/* Reverse mapping functions. */
struct xfs_buf;
@@ -92,6 +68,7 @@
if (offset & ~(XFS_RMAP_OFF_MASK | XFS_RMAP_OFF_FLAGS))
return -EFSCORRUPTED;
irec->rm_offset = XFS_RMAP_OFF(offset);
+ irec->rm_flags = 0;
if (offset & XFS_RMAP_OFF_ATTR_FORK)
irec->rm_flags |= XFS_RMAP_ATTR_FORK;
if (offset & XFS_RMAP_OFF_BMBT_BLOCK)
@@ -103,12 +80,12 @@
static inline void
xfs_owner_info_unpack(
- struct xfs_owner_info *oinfo,
- uint64_t *owner,
- uint64_t *offset,
- unsigned int *flags)
+ const struct xfs_owner_info *oinfo,
+ uint64_t *owner,
+ uint64_t *offset,
+ unsigned int *flags)
{
- unsigned int r = 0;
+ unsigned int r = 0;
*owner = oinfo->oi_owner;
*offset = oinfo->oi_offset;
@@ -137,10 +114,10 @@
int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp,
xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
- struct xfs_owner_info *oinfo);
+ const struct xfs_owner_info *oinfo);
int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len,
- struct xfs_owner_info *oinfo);
+ const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
@@ -185,16 +162,16 @@
};
/* functions for updating the rmapbt based on bmbt map/unmap operations */
-int xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+void xfs_rmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *imap);
-int xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
+void xfs_rmap_convert_extent(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *imap);
-int xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_alloc_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
-int xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
+void xfs_rmap_free_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner);
void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
@@ -218,11 +195,21 @@
int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, bool *exists);
int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
bool *has_rmap);
int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo,
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo,
bool *has_rmap);
int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
+extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_FS;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_AG;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC;
+extern const struct xfs_owner_info XFS_RMAP_OINFO_COW;
+
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf04..fc78efa 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -9,18 +9,14 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
@@ -292,7 +288,7 @@
xfs_rmapbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
@@ -310,7 +306,7 @@
* from the on disk AGF. Again, we can only check against maximum limits
* in this case.
*/
- if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +361,7 @@
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
.verify_read = xfs_rmapbt_read_verify,
.verify_write = xfs_rmapbt_write_verify,
.verify_struct = xfs_rmapbt_verify,
@@ -577,6 +574,15 @@
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
xfs_trans_brelse(tp, agbp);
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ agblocks -= mp->m_sb.sb_logblocks;
+
/* Reserve 1% of the AG or enough for 1 block per record. */
*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
*used += tree_len;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index b228c82..8ea1efc 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -13,15 +13,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
-#include "xfs_icache.h"
#include "xfs_rtalloc.h"
@@ -505,6 +497,12 @@
uint first = (uint)((char *)sp - (char *)bp->b_addr);
*sp += delta;
+ if (mp->m_rsum_cache) {
+ if (*sp == 0 && log == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno]++;
+ if (*sp != 0 && log < mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ }
xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
}
if (sum)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 081f46e..ac6cdca 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -10,26 +10,20 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_log.h"
#include "xfs_rmap_btree.h"
-#include "xfs_bmap.h"
#include "xfs_refcount_btree.h"
#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
+#include "xfs_health.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -225,10 +219,11 @@
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
uint32_t agcount = 0;
uint32_t rem;
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
}
@@ -684,7 +679,7 @@
struct xfs_buf *bp)
{
struct xfs_sb sb;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
int error;
@@ -750,7 +745,7 @@
struct xfs_buf *bp)
{
struct xfs_sb sb;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
int error;
@@ -781,12 +776,14 @@
const struct xfs_buf_ops xfs_sb_buf_ops = {
.name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
@@ -796,12 +793,14 @@
*
* Mount initialization code establishing various mount
* fields from the superblock associated with the given
- * mount structure
+ * mount structure.
+ *
+ * Inode geometry are calculated in xfs_ialloc_setup_geometry.
*/
void
xfs_sb_mount_common(
- struct xfs_mount *mp,
- struct xfs_sb *sbp)
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
{
mp->m_agfrotor = mp->m_agirotor = 0;
mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -809,7 +808,6 @@
mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
- mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
@@ -819,11 +817,6 @@
mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
- mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
- mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
- mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
@@ -840,14 +833,6 @@
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
- mp->m_ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
- sbp->sb_inopblock);
- mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-
- if (sbp->sb_spino_align)
- mp->m_ialloc_min_blks = sbp->sb_spino_align;
- else
- mp->m_ialloc_min_blks = mp->m_ialloc_blks;
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
}
@@ -874,7 +859,7 @@
uint64_t bfreelst = 0;
uint64_t btree = 0;
uint64_t fdblocks;
- int error;
+ int error = 0;
for (index = 0; index < agcount; index++) {
/*
@@ -902,7 +887,7 @@
/*
* If the new summary counts are obviously incorrect, fail the
* mount operation because that implies the AGFs are also corrupt.
- * Clear BAD_SUMMARY so that we don't unmount with a dirty log, which
+ * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
* will prevent xfs_repair from fixing anything.
*/
if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
@@ -920,7 +905,7 @@
xfs_reinit_percpu_counters(mp);
out:
- mp->m_flags &= ~XFS_MOUNT_BAD_SUMMARY;
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
return error;
}
@@ -935,7 +920,7 @@
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
+ struct xfs_buf *bp = xfs_trans_getsb(tp, mp);
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
@@ -943,7 +928,7 @@
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
- xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
}
/*
@@ -1001,7 +986,7 @@
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
- XFS_FSS_TO_BB(mp, 1), 0);
+ XFS_FSS_TO_BB(mp, 1));
/*
* If we get an error reading or writing alternate superblocks,
* continue. xfs_repair chooses the "best" superblock based
@@ -1065,7 +1050,7 @@
if (error)
return error;
- bp = xfs_trans_getsb(tp, mp, 0);
+ bp = xfs_trans_getsb(tp, mp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
xfs_trans_set_sync(tp);
@@ -1081,7 +1066,7 @@
return error;
}
-int
+void
xfs_fs_geometry(
struct xfs_sb *sbp,
struct xfs_fsop_geom *geo,
@@ -1105,17 +1090,18 @@
memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid));
if (struct_version < 2)
- return 0;
+ return;
geo->sunit = sbp->sb_unit;
geo->swidth = sbp->sb_width;
if (struct_version < 3)
- return 0;
+ return;
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
- XFS_FSOP_GEOM_FLAGS_DIRV2;
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
+ XFS_FSOP_GEOM_FLAGS_EXTFLG;
if (xfs_sb_version_hasattr(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_sb_version_hasquota(sbp))
@@ -1124,8 +1110,6 @@
geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
if (xfs_sb_version_hasdalign(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
- if (xfs_sb_version_hasextflgbit(sbp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG;
if (xfs_sb_version_hassector(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
if (xfs_sb_version_hasasciici(sbp))
@@ -1156,14 +1140,17 @@
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
if (struct_version < 4)
- return 0;
+ return;
if (xfs_sb_version_haslogv2(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
geo->logsunit = sbp->sb_logsunit;
- return 0;
+ if (struct_version < 5)
+ return;
+
+ geo->version = XFS_FSOP_GEOM_VERSION_V5;
}
/* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 13564d6..92465a9 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -33,7 +33,7 @@
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
-extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
+extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe..c45acbd 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
@@ -63,7 +65,6 @@
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
-#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
@@ -134,4 +135,46 @@
struct xfs_inode *ip, struct xfs_ifork *ifp);
xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
+/* Computed inode geometry for the filesystem. */
+struct xfs_ino_geometry {
+ /* Maximum inode count in this filesystem. */
+ uint64_t maxicount;
+
+ /* Actual inode cluster buffer size, in bytes. */
+ unsigned int inode_cluster_size;
+
+ /*
+ * Desired inode cluster buffer size, in bytes. This value is not
+ * rounded up to at least one filesystem block, which is necessary for
+ * the sole purpose of validating sb_spino_align. Runtime code must
+ * only ever use inode_cluster_size.
+ */
+ unsigned int inode_cluster_size_raw;
+
+ /* Inode cluster sizes, adjusted to be at least 1 fsb. */
+ unsigned int inodes_per_cluster;
+ unsigned int blocks_per_cluster;
+
+ /* Inode cluster alignment. */
+ unsigned int cluster_align;
+ unsigned int cluster_align_inodes;
+ unsigned int inoalign_mask; /* mask sb_inoalignmt if used */
+
+ unsigned int inobt_mxr[2]; /* max inobt btree records */
+ unsigned int inobt_mnr[2]; /* min inobt btree records */
+ unsigned int inobt_maxlevels; /* max inobt btree levels. */
+
+ /* Size of inode allocations under normal operation. */
+ unsigned int ialloc_inos;
+ unsigned int ialloc_blks;
+
+ /* Minimum inode blocks for a sparse allocation. */
+ unsigned int ialloc_min_blks;
+
+ /* stripe unit inode alignment */
+ unsigned int ialloc_align;
+
+ unsigned int agino_log; /* #bits for agino in inum */
+};
+
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 95374ab..3b8260c 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -11,12 +11,8 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -90,12 +86,12 @@
xfs_symlink_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -116,7 +112,7 @@
xfs_symlink_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
/* no verification of non-crc buffers */
@@ -136,7 +132,7 @@
xfs_symlink_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -159,6 +155,7 @@
const struct xfs_buf_ops xfs_symlink_buf_ops = {
.name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
.verify_struct = xfs_symlink_verify,
@@ -199,7 +196,10 @@
ifp->if_bytes - 1);
}
-/* Verify the consistency of an inline symlink. */
+/*
+ * Verify the in-memory consistency of an inline symlink data fork. This
+ * does not do on-disk format checks.
+ */
xfs_failaddr_t
xfs_symlink_shortform_verify(
struct xfs_inode *ip)
@@ -215,9 +215,12 @@
size = ifp->if_bytes;
endp = sfp + size;
- /* Zero length symlinks can exist while we're deleting a remote one. */
- if (size == 0)
- return NULL;
+ /*
+ * Zero length symlinks should never occur in memory as they are
+ * never alllowed to exist on disk.
+ */
+ if (!size)
+ return __this_address;
/* No negative sizes or overly long symlink targets. */
if (size < 0 || size > XFS_SYMLINK_MAXLEN)
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
similarity index 96%
rename from fs/xfs/xfs_trans_inode.c
rename to fs/xfs/libxfs/xfs_trans_inode.c
index 5429273..a9ad909 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,13 +8,10 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
-#include "xfs_trace.h"
#include <linux/iversion.h>
@@ -69,6 +66,10 @@
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
+ if (flags & XFS_ICHGTIME_CREATE) {
+ ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
+ ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+ }
}
/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f99a7ae..d12bbd5 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -15,12 +15,10 @@
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
-#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -136,9 +134,10 @@
xfs_calc_inobt_res(
struct xfs_mount *mp)
{
- return xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -167,7 +166,7 @@
* includes:
*
* the allocation btrees: 2 trees * (max depth - 1) * block size
- * the inode chunk: m_ialloc_blks * N
+ * the inode chunk: m_ino_geo.ialloc_blks * N
*
* The size N of the inode chunk reservation depends on whether it is for
* allocation or free and which type of create transaction is in use. An inode
@@ -193,7 +192,7 @@
size = XFS_FSB_TO_B(mp, 1);
}
- res += xfs_calc_buf_res(mp->m_ialloc_blks, size);
+ res += xfs_calc_buf_res(M_IGEO(mp)->ialloc_blks, size);
return res;
}
@@ -307,7 +306,7 @@
struct xfs_mount *mp)
{
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- 2 * max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+ 2 * M_IGEO(mp)->inode_cluster_size;
}
/*
@@ -345,7 +344,7 @@
xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
{
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+ M_IGEO(mp)->inode_cluster_size;
}
/*
@@ -876,9 +875,13 @@
resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+ /* growdata requires permanent res; it can free space to the last AG */
+ resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+ resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
/* The following transaction are logged in logical format */
resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
- resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index a62fb95..88221c7 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -56,9 +56,9 @@
#define XFS_DIRREMOVE_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
- ((mp)->m_ialloc_blks + \
+ (M_IGEO(mp)->ialloc_blks + \
(xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
- ((mp)->m_in_maxlevels - 1)))
+ (M_IGEO(mp)->inobt_maxlevels - 1)))
/*
* Space reservation values for various transactions.
@@ -94,7 +94,8 @@
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
#define XFS_IFREE_SPACE_RES(mp) \
- (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ? \
+ M_IGEO(mp)->inobt_maxlevels : 0)
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 33a5ca3..4f59554 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -7,19 +7,10 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_rmap.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
/* Find the size of the AG, in blocks. */
xfs_agblock_t
@@ -87,16 +78,15 @@
* Calculate the first inode, which will be in the first
* cluster-aligned block after the AGFL.
*/
- bno = round_up(XFS_AGFL_BLOCK(mp) + 1,
- xfs_ialloc_cluster_alignment(mp));
- *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0);
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
+ *first = XFS_AGB_TO_AGINO(mp, bno);
/*
* Calculate the last inode, which will be at the end of the
* last (aligned) cluster that can be allocated in the AG.
*/
- bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp));
- *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1;
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
+ *last = XFS_AGB_TO_AGINO(mp, bno) - 1;
}
/*
@@ -117,6 +107,19 @@
}
/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -173,7 +176,7 @@
}
/* Calculate the range of valid icount values. */
-static void
+void
xfs_icount_range(
struct xfs_mount *mp,
unsigned long long *min,
@@ -205,3 +208,14 @@
xfs_icount_range(mp, &min, &max);
return icount >= min && icount <= max;
}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b9e6c89..300b3e9 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -100,15 +100,37 @@
*/
#define MAXNAMELEN 256
+/*
+ * This enum is used in string mapping in xfs_trace.h; please keep the
+ * TRACE_DEFINE_ENUMs for it up to date.
+ */
typedef enum {
XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
} xfs_lookup_t;
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+ { XFS_LOOKUP_EQi, "eq" }, \
+ { XFS_LOOKUP_LEi, "le" }, \
+ { XFS_LOOKUP_GEi, "ge" }
+
+/*
+ * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
+ * please keep the TRACE_DEFINE_ENUMs for it up to date.
+ */
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
} xfs_btnum_t;
+#define XFS_BTNUM_STRINGS \
+ { XFS_BTNUM_BNOi, "bnobt" }, \
+ { XFS_BTNUM_CNTi, "cntbt" }, \
+ { XFS_BTNUM_RMAPi, "rmapbt" }, \
+ { XFS_BTNUM_BMAPi, "bmbt" }, \
+ { XFS_BTNUM_INOi, "inobt" }, \
+ { XFS_BTNUM_FINOi, "finobt" }, \
+ { XFS_BTNUM_REFCi, "refcbt" }
+
struct xfs_name {
const unsigned char *name;
int len;
@@ -147,6 +169,14 @@
xfs_exntst_t br_state; /* extent state */
} xfs_bmbt_irec_t;
+/* per-AG block reservation types */
+enum xfs_ag_resv_type {
+ XFS_AG_RESV_NONE = 0,
+ XFS_AG_RESV_AGFL,
+ XFS_AG_RESV_METADATA,
+ XFS_AG_RESV_RMAPBT,
+};
+
/*
* Type verifier functions
*/
@@ -161,10 +191,15 @@
xfs_agino_t *first, xfs_agino_t *last);
bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
+void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
+ unsigned long long *max);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 3068a93..ba0f747 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -9,20 +9,13 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
-#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Superblock */
@@ -32,7 +25,6 @@
struct xfs_scrub *sc,
struct xfs_buf *bp)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agnumber_t agno = sc->sm->sm_agno;
xfs_agblock_t agbno;
@@ -49,8 +41,7 @@
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
/* scrub teardown will take care of sc->sa for us */
@@ -401,7 +392,7 @@
if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
return;
if (!have) {
- if (agf->agf_freeblks != be32_to_cpu(0))
+ if (agf->agf_freeblks != cpu_to_be32(0))
xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
return;
}
@@ -484,7 +475,6 @@
xchk_agf_xref(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
int error;
@@ -502,8 +492,7 @@
xchk_agf_xref_freeblks(sc);
xchk_agf_xref_cntbt(sc);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_agf_xref_btreeblks(sc);
xchk_xref_is_not_shared(sc, agbno, 1);
xchk_agf_xref_refcblks(sc);
@@ -518,6 +507,7 @@
{
struct xfs_mount *mp = sc->mp;
struct xfs_agf *agf;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -590,6 +580,16 @@
if (agfl_count != 0 && fl_count != agfl_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ /* Do the incore counters match? */
+ pag = xfs_perag_get(mp, agno);
+ if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ xfs_perag_put(pag);
+
xchk_agf_xref(sc);
out:
return error;
@@ -598,7 +598,6 @@
/* AGFL */
struct xchk_agfl_info {
- struct xfs_owner_info oinfo;
unsigned int sz_entries;
unsigned int nr_entries;
xfs_agblock_t *entries;
@@ -609,15 +608,14 @@
STATIC void
xchk_agfl_block_xref(
struct xfs_scrub *sc,
- xfs_agblock_t agbno,
- struct xfs_owner_info *oinfo)
+ xfs_agblock_t agbno)
{
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xchk_xref_is_owned_by(sc, agbno, 1, oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG);
xchk_xref_is_not_shared(sc, agbno, 1);
}
@@ -638,10 +636,10 @@
else
xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
- xchk_agfl_block_xref(sc, agbno, priv);
+ xchk_agfl_block_xref(sc, agbno);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
return 0;
}
@@ -662,7 +660,6 @@
xchk_agfl_xref(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
int error;
@@ -678,8 +675,7 @@
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
/*
@@ -732,10 +728,9 @@
}
/* Check the blocks in the AGFL. */
- xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
sc->sa.agfl_bp, xchk_agfl_block, &sai);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ if (error == -ECANCELED) {
error = 0;
goto out_free;
}
@@ -791,7 +786,6 @@
xchk_agi_xref(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t agbno;
int error;
@@ -808,8 +802,7 @@
xchk_xref_is_used_space(sc, agbno, 1);
xchk_xref_is_not_inode_chunk(sc, agbno, 1);
xchk_agi_xref_icounts(sc);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
/* scrub teardown will take care of sc->sa for us */
@@ -822,6 +815,7 @@
{
struct xfs_mount *mp = sc->mp;
struct xfs_agi *agi;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -875,25 +869,31 @@
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (agino == NULLAGINO)
- continue;
- if (!xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
if (agi->agi_pad32 != cpu_to_be32(0))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ /* Do the incore counters match? */
+ pag = xfs_perag_get(mp, agno);
+ if (pag->pagi_count != be32_to_cpu(agi->agi_count))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ xfs_perag_put(pag);
+
xchk_agi_xref(sc);
out:
return error;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index f7568a4..7a1a38b 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -9,22 +9,17 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
-#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -341,23 +336,19 @@
struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTB_CRC_MAGIC,
+ .buf_ops = &xfs_bnobt_buf_ops,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTC_CRC_MAGIC,
+ .buf_ops = &xfs_cntbt_buf_ops,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
- .magic = XFS_RMAP_CRC_MAGIC,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
- .magic = XFS_REFC_CRC_MAGIC,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -646,7 +637,6 @@
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
struct xfs_bitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
@@ -708,8 +698,8 @@
goto err;
/* Dump any AGFL overflow. */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
- return xrep_reap_extents(sc, &agfl_extents, &oinfo, XFS_AG_RESV_AGFL);
+ return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ XFS_AG_RESV_AGFL);
err:
xfs_bitmap_destroy(&agfl_extents);
return error;
@@ -876,12 +866,10 @@
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_IBT_CRC_MAGIC,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_FIBT_CRC_MAGIC,
+ .buf_ops = &xfs_finobt_buf_ops,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 376bcb5..5533e48 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,19 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Set us up to scrub free space btrees.
@@ -104,7 +97,6 @@
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t bno;
xfs_extlen_t len;
- int error = 0;
bno = be32_to_cpu(rec->alloc.ar_startblock);
len = be32_to_cpu(rec->alloc.ar_blockcount);
@@ -116,7 +108,7 @@
xchk_allocbt_xref(bs->sc, bno, len);
- return error;
+ return 0;
}
/* Scrub the freespace btrees for some AG. */
@@ -125,12 +117,10 @@
struct xfs_scrub *sc,
xfs_btnum_t which)
{
- struct xfs_owner_info oinfo;
struct xfs_btree_cur *cur;
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
- return xchk_btree(sc, cur, xchk_allocbt_rec, &oinfo, NULL);
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL);
}
int
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90..0edc7f8 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -9,26 +9,62 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
-#include "scrub/trace.h"
+#include "scrub/attr.h"
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
+/*
+ * Allocate enough memory to hold an attr value and attr block bitmaps,
+ * reallocating the buffer if necessary. Buffer contents are not preserved
+ * across a reallocation.
+ */
+int
+xchk_setup_xattr_buf(
+ struct xfs_scrub *sc,
+ size_t value_size,
+ xfs_km_flags_t flags)
+{
+ size_t sz;
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ /*
+ * We need enough space to read an xattr value from the file or enough
+ * space to hold three copies of the xattr free space bitmap. We don't
+ * need the buffer space for both purposes at the same time.
+ */
+ sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+ sz = max_t(size_t, sz, value_size);
+
+ /*
+ * If there's already a buffer, figure out if we need to reallocate it
+ * to accommodate a larger size.
+ */
+ if (ab) {
+ if (sz <= ab->sz)
+ return 0;
+ kmem_free(ab);
+ sc->buf = NULL;
+ }
+
+ /*
+ * Don't zero the buffer upon allocation to avoid runtime overhead.
+ * All users must be careful never to read uninitialized contents.
+ */
+ ab = kmem_alloc_large(sizeof(*ab) + sz, flags);
+ if (!ab)
+ return -ENOMEM;
+
+ ab->sz = sz;
+ sc->buf = ab;
+ return 0;
+}
/* Set us up to scrub an inode's extended attributes. */
int
@@ -36,19 +72,18 @@
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- size_t sz;
+ int error;
/*
- * Allocate the buffer without the inode lock held. We need enough
- * space to read every xattr value in the file or enough space to
- * hold three copies of the xattr free space bitmap. (Not both at
- * the same time.)
+ * We failed to get memory while checking attrs, so this time try to
+ * get all the memory we're ever going to need. Allocate the buffer
+ * without the inode lock held, which means we can sleep.
*/
- sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) *
- BITS_TO_LONGS(sc->mp->m_attr_geo->blksize));
- sc->buf = kmem_zalloc_large(sz, KM_SLEEP);
- if (!sc->buf)
- return -ENOMEM;
+ if (sc->flags & XCHK_TRY_HARDER) {
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0);
+ if (error)
+ return error;
+ }
return xchk_setup_inode_contents(sc, ip, 0);
}
@@ -82,12 +117,36 @@
sx = container_of(context, struct xchk_xattr, context);
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = error;
+ return;
+ }
+
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
xchk_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
+ /*
+ * Try to allocate enough memory to extrat the attr value. If that
+ * doesn't work, we overload the seen_enough variable to convey
+ * the error message back to the main scrub function.
+ */
+ error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error) {
+ context->seen_enough = error;
+ return;
+ }
+
args.flags = ATTR_KERNOTIME;
if (flags & XFS_ATTR_ROOT)
args.flags |= ATTR_ROOT;
@@ -100,12 +159,10 @@
args.namelen = namelen;
args.hashval = xfs_da_hashname(args.name, args.namelen);
args.trans = context->tp;
- args.value = sx->sc->buf;
- args.valuelen = XATTR_SIZE_MAX;
+ args.value = xchk_xattr_valuebuf(sx->sc);
+ args.valuelen = valuelen;
error = xfs_attr_get_ilocked(context->dp, &args);
- if (error == -EEXIST)
- error = 0;
if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
&error))
goto fail_xref;
@@ -159,13 +216,12 @@
unsigned long *map,
struct xfs_attr3_icleaf_hdr *leafhdr)
{
- unsigned long *freemap;
- unsigned long *dstmap;
+ unsigned long *freemap = xchk_xattr_freemap(sc);
+ unsigned long *dstmap = xchk_xattr_dstmap(sc);
unsigned int mapsize = sc->mp->m_attr_geo->blksize;
int i;
/* Construct bitmap of freemap contents. */
- freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
bitmap_zero(freemap, mapsize);
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
if (!xchk_xattr_set_map(sc, freemap,
@@ -175,7 +231,6 @@
}
/* Look for bits that are set in freemap and are marked in use. */
- dstmap = freemap + BITS_TO_LONGS(mapsize);
return bitmap_and(dstmap, freemap, map, mapsize) == 0;
}
@@ -190,13 +245,13 @@
char *buf_end,
struct xfs_attr_leafblock *leaf,
struct xfs_attr3_icleaf_hdr *leafhdr,
- unsigned long *usedmap,
struct xfs_attr_leaf_entry *ent,
int idx,
unsigned int *usedbytes,
__u32 *last_hashval)
{
struct xfs_mount *mp = ds->state->mp;
+ unsigned long *usedmap = xchk_xattr_usedmap(ds->sc);
char *name_end;
struct xfs_attr_leaf_name_local *lentry;
struct xfs_attr_leaf_name_remote *rentry;
@@ -256,16 +311,26 @@
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *ent;
struct xfs_attr_leaf_entry *entries;
- unsigned long *usedmap = ds->sc->buf;
+ unsigned long *usedmap;
char *buf_end;
size_t off;
__u32 last_hashval = 0;
unsigned int usedbytes = 0;
unsigned int hdrsize;
int i;
+ int error;
if (*last_checked == blk->blkno)
return 0;
+
+ /* Allocate memory for block usage checking. */
+ error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
+ if (error)
+ return error;
+ usedmap = xchk_xattr_usedmap(ds->sc);
+
*last_checked = blk->blkno;
bitmap_zero(usedmap, mp->m_attr_geo->blksize);
@@ -313,7 +378,7 @@
/* Check the entry and nameval. */
xchk_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
- usedmap, ent, i, &usedbytes, &last_hashval);
+ ent, i, &usedbytes, &last_hashval);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
@@ -453,6 +518,10 @@
error = xfs_attr_list_int_ilocked(&sx.context);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
goto out;
+
+ /* Did our listent function try to return any errors? */
+ if (sx.context.seen_enough < 0)
+ error = sx.context.seen_enough;
out:
return error;
}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
new file mode 100644
index 0000000..13a1d2e
--- /dev/null
+++ b/fs/xfs/scrub/attr.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_ATTR_H__
+#define __XFS_SCRUB_ATTR_H__
+
+/*
+ * Temporary storage for online scrub and repair of extended attributes.
+ */
+struct xchk_xattr_buf {
+ /* Size of @buf, in bytes. */
+ size_t sz;
+
+ /*
+ * Memory buffer -- either used for extracting attr values while
+ * walking the attributes; or for computing attr block bitmaps when
+ * checking the attribute tree.
+ *
+ * Each bitmap contains enough bits to track every byte in an attr
+ * block (rounded up to the size of an unsigned long). The attr block
+ * used space bitmap starts at the beginning of the buffer; the free
+ * space bitmap follows immediately after; and we have a third buffer
+ * for storing intermediate bitmap results.
+ */
+ uint8_t buf[0];
+};
+
+/* A place to store attribute values. */
+static inline uint8_t *
+xchk_xattr_valuebuf(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ return ab->buf;
+}
+
+/* A bitmap of space usage computed by walking an attr leaf block. */
+static inline unsigned long *
+xchk_xattr_usedmap(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ return (unsigned long *)ab->buf;
+}
+
+/* A bitmap of free space computed by walking attr leaf block free info. */
+static inline unsigned long *
+xchk_xattr_freemap(
+ struct xfs_scrub *sc)
+{
+ return xchk_xattr_usedmap(sc) +
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+/* A bitmap used to hold temporary results. */
+static inline unsigned long *
+xchk_xattr_dstmap(
+ struct xfs_scrub *sc)
+{
+ return xchk_xattr_freemap(sc) +
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size,
+ xfs_km_flags_t flags);
+
+#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index fdadc9e..3d47d11 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -10,11 +10,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "scrub/xfs_scrub.h"
-#include "scrub/scrub.h"
-#include "scrub/common.h"
-#include "scrub/trace.h"
-#include "scrub/repair.h"
#include "scrub/bitmap.h"
/*
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3..fa6ea64 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -9,27 +9,19 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/* Set us up with an inode's bmap. */
int
@@ -83,6 +75,7 @@
xfs_fileoff_t lastoff;
bool is_rt;
bool is_shared;
+ bool was_loaded;
int whichfork;
};
@@ -221,25 +214,20 @@
/* Cross-reference a single rtdev extent record. */
STATIC void
-xchk_bmap_rt_extent_xref(
- struct xchk_bmap_info *info,
+xchk_bmap_rt_iextent_xref(
struct xfs_inode *ip,
- struct xfs_btree_cur *cur,
+ struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
- if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return;
-
xchk_xref_is_used_rt_space(info->sc, irec->br_startblock,
irec->br_blockcount);
}
/* Cross-reference a single datadev extent record. */
STATIC void
-xchk_bmap_extent_xref(
- struct xchk_bmap_info *info,
+xchk_bmap_iextent_xref(
struct xfs_inode *ip,
- struct xfs_btree_cur *cur,
+ struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
@@ -248,9 +236,6 @@
xfs_extlen_t len;
int error;
- if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return;
-
agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
len = irec->br_blockcount;
@@ -281,22 +266,42 @@
xchk_ag_free(info->sc, &info->sc->sa);
}
+/*
+ * Directories and attr forks should never have blocks that can't be addressed
+ * by a xfs_dablk_t.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t off;
+
+ if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK)
+ return;
+
+ if (!xfs_verify_dablk(mp, irec->br_startoff))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ off = irec->br_startoff + irec->br_blockcount - 1;
+ if (!xfs_verify_dablk(mp, off))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork, off);
+}
+
/* Scrub a single extent record. */
STATIC int
-xchk_bmap_extent(
+xchk_bmap_iextent(
struct xfs_inode *ip,
- struct xfs_btree_cur *cur,
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
- struct xfs_buf *bp = NULL;
xfs_filblks_t end;
int error = 0;
- if (cur)
- xfs_btree_get_block(cur, 0, &bp);
-
/*
* Check for out-of-order extents. This record could have come
* from the incore list, for which there is no ordering check.
@@ -305,6 +310,8 @@
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -345,10 +352,13 @@
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
if (info->is_rt)
- xchk_bmap_rt_extent_xref(info, ip, cur, irec);
+ xchk_bmap_rt_iextent_xref(ip, info, irec);
else
- xchk_bmap_extent_xref(info, ip, cur, irec);
+ xchk_bmap_iextent_xref(ip, info, irec);
info->lastoff = irec->br_startoff + irec->br_blockcount;
return error;
@@ -361,10 +371,13 @@
union xfs_btree_rec *rec)
{
struct xfs_bmbt_irec irec;
+ struct xfs_bmbt_irec iext_irec;
+ struct xfs_iext_cursor icur;
struct xchk_bmap_info *info = bs->private;
struct xfs_inode *ip = bs->cur->bc_private.b.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
uint64_t owner;
int i;
@@ -383,9 +396,26 @@
}
}
- /* Set up the in-core record and scrub it. */
+ /*
+ * Check that the incore extent tree contains an extent that matches
+ * this one exactly. We validate those cached bmaps later, so we don't
+ * need to check them here. If the incore extent tree was just loaded
+ * from disk by the scrubber, we assume that its contents match what's
+ * on disk (we still hold the ILOCK) and skip the equivalence check.
+ */
+ if (!info->was_loaded)
+ return 0;
+
xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
- return xchk_bmap_extent(ip, bs->cur, info, &irec);
+ if (!xfs_iext_lookup_extent(ip, ifp, irec.br_startoff, &icur,
+ &iext_irec) ||
+ irec.br_startoff != iext_irec.br_startoff ||
+ irec.br_startblock != iext_irec.br_startblock ||
+ irec.br_blockcount != iext_irec.br_blockcount ||
+ irec.br_state != iext_irec.br_state)
+ xchk_fblock_set_corrupt(bs->sc, info->whichfork,
+ irec.br_startoff);
+ return 0;
}
/* Scan the btree records. */
@@ -396,15 +426,26 @@
struct xchk_bmap_info *info)
{
struct xfs_owner_info oinfo;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
struct xfs_btree_cur *cur;
int error;
+ /* Load the incore bmap cache if it's not loaded. */
+ info->was_loaded = ifp->if_flags & XFS_IFEXTENTS;
+ if (!info->was_loaded) {
+ error = xfs_iread_extents(sc->tp, ip, whichfork);
+ if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
+ goto out;
+ }
+
+ /* Check the btree structure. */
cur = xfs_bmbt_init_cursor(mp, sc->tp, ip, whichfork);
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
error = xchk_btree(sc, cur, xchk_bmapbt_rec, &oinfo, info);
xfs_btree_del_cursor(cur, error);
+out:
return error;
}
@@ -481,7 +522,7 @@
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
return 0;
}
@@ -510,7 +551,7 @@
sbcri.sc = sc;
sbcri.whichfork = whichfork;
error = xfs_rmap_query_all(cur, xchk_bmap_check_rmap, &sbcri);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
error = 0;
xfs_btree_del_cursor(cur, error);
@@ -652,13 +693,6 @@
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
- /* Now try to scrub the in-memory extent list. */
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(sc->tp, ip, whichfork);
- if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
- goto out;
- }
-
/* Find the offset of the last extent in the mapping. */
error = xfs_bmap_last_offset(ip, &endoff, whichfork);
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
@@ -670,7 +704,7 @@
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- break;
+ goto out;
if (isnullstartblock(irec.br_startblock))
continue;
if (irec.br_startoff >= endoff) {
@@ -678,7 +712,7 @@
irec.br_startoff);
goto out;
}
- error = xchk_bmap_extent(ip, NULL, &info, &irec);
+ error = xchk_bmap_iextent(ip, &info, &irec);
if (error)
goto out;
}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 4ae959f..f52a7b8 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -9,14 +9,7 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -415,8 +408,17 @@
struct xfs_btree_cur *cur = bs->cur;
struct check_owner *co;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+ /*
+ * In theory, xfs_btree_get_block should only give us a null buffer
+ * pointer for the root of a root-in-inode btree type, but we need
+ * to check defensively here in case the cursor state is also screwed
+ * up.
+ */
+ if (bp == NULL) {
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
+ }
/*
* We want to cross-reference each btree block with the bnobt
@@ -583,31 +585,32 @@
*/
int
xchk_btree(
- struct xfs_scrub *sc,
- struct xfs_btree_cur *cur,
- xchk_btree_rec_fn scrub_fn,
- struct xfs_owner_info *oinfo,
- void *private)
+ struct xfs_scrub *sc,
+ struct xfs_btree_cur *cur,
+ xchk_btree_rec_fn scrub_fn,
+ const struct xfs_owner_info *oinfo,
+ void *private)
{
- struct xchk_btree bs = { NULL };
- union xfs_btree_ptr ptr;
- union xfs_btree_ptr *pp;
- union xfs_btree_rec *recp;
- struct xfs_btree_block *block;
- int level;
- struct xfs_buf *bp;
- struct check_owner *co;
- struct check_owner *n;
- int i;
- int error = 0;
+ struct xchk_btree bs = {
+ .cur = cur,
+ .scrub_rec = scrub_fn,
+ .oinfo = oinfo,
+ .firstrec = true,
+ .private = private,
+ .sc = sc,
+ };
+ union xfs_btree_ptr ptr;
+ union xfs_btree_ptr *pp;
+ union xfs_btree_rec *recp;
+ struct xfs_btree_block *block;
+ int level;
+ struct xfs_buf *bp;
+ struct check_owner *co;
+ struct check_owner *n;
+ int i;
+ int error = 0;
/* Initialize scrub state */
- bs.cur = cur;
- bs.scrub_rec = scrub_fn;
- bs.oinfo = oinfo;
- bs.firstrec = true;
- bs.private = private;
- bs.sc = sc;
for (i = 0; i < XFS_BTREE_MAXLEVELS; i++)
bs.firstkey[i] = true;
INIT_LIST_HEAD(&bs.to_check);
diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h
index aada763..5572e47 100644
--- a/fs/xfs/scrub/btree.h
+++ b/fs/xfs/scrub/btree.h
@@ -31,21 +31,21 @@
struct xchk_btree {
/* caller-provided scrub state */
- struct xfs_scrub *sc;
- struct xfs_btree_cur *cur;
- xchk_btree_rec_fn scrub_rec;
- struct xfs_owner_info *oinfo;
- void *private;
+ struct xfs_scrub *sc;
+ struct xfs_btree_cur *cur;
+ xchk_btree_rec_fn scrub_rec;
+ const struct xfs_owner_info *oinfo;
+ void *private;
/* internal scrub state */
- union xfs_btree_rec lastrec;
- bool firstrec;
- union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
- bool firstkey[XFS_BTREE_MAXLEVELS];
- struct list_head to_check;
+ union xfs_btree_rec lastrec;
+ bool firstrec;
+ union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS];
+ bool firstkey[XFS_BTREE_MAXLEVELS];
+ struct list_head to_check;
};
int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
- xchk_btree_rec_fn scrub_fn, struct xfs_owner_info *oinfo,
+ xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo,
void *private);
#endif /* __XFS_SCRUB_BTREE_H__ */
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 346b02a..1887605 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -9,22 +9,16 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
-#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
@@ -32,12 +26,11 @@
#include "xfs_trans_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
-#include "scrub/btree.h"
#include "scrub/repair.h"
+#include "scrub/health.h"
/* Common code for the metadata scrubbers. */
@@ -208,6 +201,15 @@
trace_xchk_ino_preen(sc, ino, __return_address);
}
+/* Record something being wrong with the filesystem primary superblock. */
+void
+xchk_set_corrupt(
+ struct xfs_scrub *sc)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_fs_error(sc, 0, __return_address);
+}
+
/* Record a corrupt block. */
void
xchk_block_set_corrupt(
@@ -313,8 +315,8 @@
*/
struct xchk_rmap_ownedby_info {
- struct xfs_owner_info *oinfo;
- xfs_filblks_t *blocks;
+ const struct xfs_owner_info *oinfo;
+ xfs_filblks_t *blocks;
};
STATIC int
@@ -347,15 +349,15 @@
xchk_count_rmap_ownedby_ag(
struct xfs_scrub *sc,
struct xfs_btree_cur *cur,
- struct xfs_owner_info *oinfo,
+ const struct xfs_owner_info *oinfo,
xfs_filblks_t *blocks)
{
- struct xchk_rmap_ownedby_info sroi;
+ struct xchk_rmap_ownedby_info sroi = {
+ .oinfo = oinfo,
+ .blocks = blocks,
+ };
- sroi.oinfo = oinfo;
*blocks = 0;
- sroi.blocks = blocks;
-
return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
&sroi);
}
@@ -458,13 +460,18 @@
struct xfs_mount *mp = sc->mp;
xfs_agnumber_t agno = sa->agno;
- if (sa->agf_bp) {
+ xchk_perag_get(sc->mp, sa);
+ if (sa->agf_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
/* Set up a bnobt cursor for cross-referencing. */
sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
agno, XFS_BTNUM_BNO);
if (!sa->bno_cur)
goto err;
+ }
+ if (sa->agf_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
agno, XFS_BTNUM_CNT);
@@ -473,7 +480,8 @@
}
/* Set up a inobt cursor for cross-referencing. */
- if (sa->agi_bp) {
+ if (sa->agi_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
agno, XFS_BTNUM_INO);
if (!sa->ino_cur)
@@ -481,7 +489,8 @@
}
/* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
agno, XFS_BTNUM_FINO);
if (!sa->fino_cur)
@@ -489,7 +498,8 @@
}
/* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
agno);
if (!sa->rmap_cur)
@@ -497,7 +507,8 @@
}
/* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
sa->agf_bp, agno);
if (!sa->refc_cur)
@@ -884,3 +895,21 @@
}
return -EDEADLOCK;
}
+
+/* Pause background reaping of resources. */
+void
+xchk_stop_reaping(
+ struct xfs_scrub *sc)
+{
+ sc->flags |= XCHK_REAPING_DISABLED;
+ xfs_stop_block_reaping(sc->mp);
+}
+
+/* Restart background reaping of resources. */
+void
+xchk_start_reaping(
+ struct xfs_scrub *sc)
+{
+ xfs_start_block_reaping(sc->mp);
+ sc->flags &= ~XCHK_REAPING_DISABLED;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 2d4324d..003a772 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -39,6 +39,7 @@
struct xfs_buf *bp);
void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino);
+void xchk_set_corrupt(struct xfs_scrub *sc);
void xchk_block_set_corrupt(struct xfs_scrub *sc,
struct xfs_buf *bp);
void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
@@ -105,6 +106,7 @@
return -ENOENT;
}
#endif
+int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -116,7 +118,7 @@
void xchk_ag_btcur_free(struct xchk_ag *sa);
int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
- struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
+ const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip,
bool force_log);
@@ -137,5 +139,7 @@
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+void xchk_stop_reaping(struct xfs_scrub *sc);
+void xchk_start_reaping(struct xfs_scrub *sc);
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4..77ff9f9 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -9,20 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -286,7 +278,11 @@
/* Compare upper level pointer to sibling pointer. */
if (ds->state->altpath.blk[level].blkno != sibling)
xchk_da_set_corrupt(ds, level);
- xfs_trans_brelse(ds->dargs.trans, ds->state->altpath.blk[level].bp);
+ if (ds->state->altpath.blk[level].bp) {
+ xfs_trans_brelse(ds->dargs.trans,
+ ds->state->altpath.blk[level].bp);
+ ds->state->altpath.blk[level].bp = NULL;
+ }
out:
return error;
}
@@ -574,6 +570,11 @@
/* Drill another level deeper. */
blkno = be32_to_cpu(key->before);
level++;
+ if (level >= XFS_DA_NODE_MAXDEPTH) {
+ /* Too deep! */
+ xchk_da_set_corrupt(&ds, level - 1);
+ break;
+ }
ds.tree_level--;
error = xchk_da_btree_block(&ds, level, blkno);
if (error)
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d7..1e2e117 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -9,24 +9,14 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
-#include "xfs_itable.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
#include "scrub/dabtree.h"
/* Set us up to scrub directories. */
@@ -129,6 +119,12 @@
goto out;
}
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
new file mode 100644
index 0000000..98f82d7
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * The basics of filesystem summary counter checking are that we iterate the
+ * AGs counting the number of free blocks, free space btree blocks, per-AG
+ * reservations, inodes, delayed allocation reservations, and free inodes.
+ * Then we compare what we computed against the in-core counters.
+ *
+ * However, the reality is that summary counters are a tricky beast to check.
+ * While we /could/ freeze the filesystem and scramble around the AGs counting
+ * the free blocks, in practice we prefer not do that for a scan because
+ * freezing is costly. To get around this, we added a per-cpu counter of the
+ * delalloc reservations so that we can rotor around the AGs relatively
+ * quickly, and we allow the counts to be slightly off because we're not taking
+ * any locks while we do this.
+ *
+ * So the first thing we do is warm up the buffer cache in the setup routine by
+ * walking all the AGs to make sure the incore per-AG structure has been
+ * initialized. The expected value calculation then iterates the incore per-AG
+ * structures as quickly as it can. We snapshot the percpu counters before and
+ * after this operation and use the difference in counter values to guess at
+ * our tolerance for mismatch between expected and actual counter values.
+ */
+
+/*
+ * Since the expected value computation is lockless but only browses incore
+ * values, the percpu counters should be fairly close to each other. However,
+ * we'll allow ourselves to be off by at least this (arbitrary) amount.
+ */
+#define XCHK_FSCOUNT_MIN_VARIANCE (512)
+
+/*
+ * Make sure the per-AG structure has been initialized from the on-disk header
+ * contents and trust that the incore counters match the ondisk counters. (The
+ * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
+ * summary counters after checking all AG headers). Do this from the setup
+ * function so that the inner AG aggregation loop runs as quickly as possible.
+ *
+ * This function runs during the setup phase /before/ we start checking any
+ * metadata.
+ */
+STATIC int
+xchk_fscount_warmup(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_buf *agf_bp = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+
+ if (pag->pagi_init && pag->pagf_init)
+ goto next_loop_perag;
+
+ /* Lock both AG headers. */
+ error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+ if (error)
+ break;
+ error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+ if (error)
+ break;
+ error = -ENOMEM;
+ if (!agf_bp || !agi_bp)
+ break;
+
+ /*
+ * These are supposed to be initialized by the header read
+ * function.
+ */
+ error = -EFSCORRUPTED;
+ if (!pag->pagi_init || !pag->pagf_init)
+ break;
+
+ xfs_buf_relse(agf_bp);
+ agf_bp = NULL;
+ xfs_buf_relse(agi_bp);
+ agi_bp = NULL;
+next_loop_perag:
+ xfs_perag_put(pag);
+ pag = NULL;
+ error = 0;
+
+ if (fatal_signal_pending(current))
+ break;
+ }
+
+ if (agf_bp)
+ xfs_buf_relse(agf_bp);
+ if (agi_bp)
+ xfs_buf_relse(agi_bp);
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
+}
+
+int
+xchk_setup_fscounters(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ struct xchk_fscounters *fsc;
+ int error;
+
+ sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+ if (!sc->buf)
+ return -ENOMEM;
+ fsc = sc->buf;
+
+ xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
+
+ /* We must get the incore counters set up before we can proceed. */
+ error = xchk_fscount_warmup(sc);
+ if (error)
+ return error;
+
+ /*
+ * Pause background reclaim while we're scrubbing to reduce the
+ * likelihood of background perturbations to the counters throwing off
+ * our calculations.
+ */
+ xchk_stop_reaping(sc);
+
+ return xchk_trans_alloc(sc, 0);
+}
+
+/*
+ * Calculate what the global in-core counters ought to be from the incore
+ * per-AG structure. Callers can compare this to the actual in-core counters
+ * to estimate by how much both in-core and on-disk counters need to be
+ * adjusted.
+ */
+STATIC int
+xchk_fscount_aggregate_agcounts(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag;
+ uint64_t delayed;
+ xfs_agnumber_t agno;
+ int tries = 8;
+
+retry:
+ fsc->icount = 0;
+ fsc->ifree = 0;
+ fsc->fdblocks = 0;
+
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+
+ /* This somehow got unset since the warmup? */
+ if (!pag->pagi_init || !pag->pagf_init) {
+ xfs_perag_put(pag);
+ return -EFSCORRUPTED;
+ }
+
+ /* Count all the inodes */
+ fsc->icount += pag->pagi_count;
+ fsc->ifree += pag->pagi_freecount;
+
+ /* Add up the free/freelist/bnobt/cntbt blocks */
+ fsc->fdblocks += pag->pagf_freeblks;
+ fsc->fdblocks += pag->pagf_flcount;
+ fsc->fdblocks += pag->pagf_btreeblks;
+
+ /*
+ * Per-AG reservations are taken out of the incore counters,
+ * so they must be left out of the free blocks computation.
+ */
+ fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
+ fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
+
+ xfs_perag_put(pag);
+
+ if (fatal_signal_pending(current))
+ break;
+ }
+
+ /*
+ * The global incore space reservation is taken from the incore
+ * counters, so leave that out of the computation.
+ */
+ fsc->fdblocks -= mp->m_resblks_avail;
+
+ /*
+ * Delayed allocation reservations are taken out of the incore counters
+ * but not recorded on disk, so leave them and their indlen blocks out
+ * of the computation.
+ */
+ delayed = percpu_counter_sum(&mp->m_delalloc_blks);
+ fsc->fdblocks -= delayed;
+
+ trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
+ delayed);
+
+
+ /* Bail out if the values we compute are totally nonsense. */
+ if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
+ fsc->fdblocks > mp->m_sb.sb_dblocks ||
+ fsc->ifree > fsc->icount_max)
+ return -EFSCORRUPTED;
+
+ /*
+ * If ifree > icount then we probably had some perturbation in the
+ * counters while we were calculating things. We'll try a few times
+ * to maintain ifree <= icount before giving up.
+ */
+ if (fsc->ifree > fsc->icount) {
+ if (tries--)
+ goto retry;
+ xchk_set_incomplete(sc);
+ return 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Is the @counter reasonably close to the @expected value?
+ *
+ * We neither locked nor froze anything in the filesystem while aggregating the
+ * per-AG data to compute the @expected value, which means that the counter
+ * could have changed. We know the @old_value of the summation of the counter
+ * before the aggregation, and we re-sum the counter now. If the expected
+ * value falls between the two summations, we're ok.
+ *
+ * Otherwise, we /might/ have a problem. If the change in the summations is
+ * more than we want to tolerate, the filesystem is probably busy and we should
+ * just send back INCOMPLETE and see if userspace will try again.
+ */
+static inline bool
+xchk_fscount_within_range(
+ struct xfs_scrub *sc,
+ const int64_t old_value,
+ struct percpu_counter *counter,
+ uint64_t expected)
+{
+ int64_t min_value, max_value;
+ int64_t curr_value = percpu_counter_sum(counter);
+
+ trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
+ old_value);
+
+ /* Negative values are always wrong. */
+ if (curr_value < 0)
+ return false;
+
+ /* Exact matches are always ok. */
+ if (curr_value == expected)
+ return true;
+
+ min_value = min(old_value, curr_value);
+ max_value = max(old_value, curr_value);
+
+ /* Within the before-and-after range is ok. */
+ if (expected >= min_value && expected <= max_value)
+ return true;
+
+ /*
+ * If the difference between the two summations is too large, the fs
+ * might just be busy and so we'll mark the scrub incomplete. Return
+ * true here so that we don't mark the counter corrupt.
+ *
+ * XXX: In the future when userspace can grant scrub permission to
+ * quiesce the filesystem to solve the outsized variance problem, this
+ * check should be moved up and the return code changed to signal to
+ * userspace that we need quiesce permission.
+ */
+ if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
+ xchk_set_incomplete(sc);
+ return true;
+ }
+
+ return false;
+}
+
+/* Check the superblock counters. */
+int
+xchk_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_fscounters *fsc = sc->buf;
+ int64_t icount, ifree, fdblocks;
+ int error;
+
+ /* Snapshot the percpu counters. */
+ icount = percpu_counter_sum(&mp->m_icount);
+ ifree = percpu_counter_sum(&mp->m_ifree);
+ fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+ /* No negative values, please! */
+ if (icount < 0 || ifree < 0 || fdblocks < 0)
+ xchk_set_corrupt(sc);
+
+ /* See if icount is obviously wrong. */
+ if (icount < fsc->icount_min || icount > fsc->icount_max)
+ xchk_set_corrupt(sc);
+
+ /* See if fdblocks is obviously wrong. */
+ if (fdblocks > mp->m_sb.sb_dblocks)
+ xchk_set_corrupt(sc);
+
+ /*
+ * If ifree exceeds icount by more than the minimum variance then
+ * something's probably wrong with the counters.
+ */
+ if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
+ xchk_set_corrupt(sc);
+
+ /* Walk the incore AG headers to calculate the expected counters. */
+ error = xchk_fscount_aggregate_agcounts(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare the in-core counters with whatever we counted. */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
+ xchk_set_corrupt(sc);
+
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
+ xchk_set_corrupt(sc);
+
+ if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
+ fsc->fdblocks))
+ xchk_set_corrupt(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
new file mode 100644
index 0000000..b2f6028
--- /dev/null
+++ b/fs/xfs/scrub/health.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_btree.h"
+#include "xfs_sb.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+
+/*
+ * Scrub and In-Core Filesystem Health Assessments
+ * ===============================================
+ *
+ * Online scrub and repair have the time and the ability to perform stronger
+ * checks than we can do from the metadata verifiers, because they can
+ * cross-reference records between data structures. Therefore, scrub is in a
+ * good position to update the online filesystem health assessments to reflect
+ * the good/bad state of the data structure.
+ *
+ * We therefore extend scrub in the following ways to achieve this:
+ *
+ * 1. Create a "sick_mask" field in the scrub context. When we're setting up a
+ * scrub call, set this to the default XFS_SICK_* flag(s) for the selected
+ * scrub type (call it A). Scrub and repair functions can override the default
+ * sick_mask value if they choose.
+ *
+ * 2. If the scrubber returns a runtime error code, we exit making no changes
+ * to the incore sick state.
+ *
+ * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore
+ * sick flags before exiting.
+ *
+ * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore
+ * sick flags. If the user didn't want to repair then we exit, leaving the
+ * metadata structure unfixed and the sick flag set.
+ *
+ * 5. Now we know that A is corrupt and the user wants to repair, so run the
+ * repairer. If the repairer returns an error code, we exit with that error
+ * code, having made no further changes to the incore sick state.
+ *
+ * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean,
+ * use sick_mask to clear the incore sick flags. This should have the effect
+ * that A is no longer marked sick.
+ *
+ * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and
+ * use sick_mask to set the incore sick flags. This should have no externally
+ * visible effect since we already set them in step (4).
+ *
+ * There are some complications to this story, however. For certain types of
+ * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild
+ * both structures at the same time. The following principles apply to this
+ * type of repair strategy:
+ *
+ * 8. Any repair function that rebuilds multiple structures should update
+ * sick_mask_visible to reflect whatever other structures are rebuilt, and
+ * verify that all the rebuilt structures can pass a scrub check. The outcomes
+ * of 5-7 still apply, but with a sick_mask that covers everything being
+ * rebuilt.
+ */
+
+/* Map our scrub type to a sick mask and a set of health update functions. */
+
+enum xchk_health_group {
+ XHG_FS = 1,
+ XHG_RT,
+ XHG_AG,
+ XHG_INO,
+};
+
+struct xchk_health_map {
+ enum xchk_health_group group;
+ unsigned int sick_mask;
+};
+
+static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB },
+ [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF },
+ [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL },
+ [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI },
+ [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT },
+ [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT },
+ [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT },
+ [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT },
+ [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT },
+ [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT },
+ [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE },
+ [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD },
+ [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA },
+ [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC },
+ [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR },
+ [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR },
+ [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK },
+ [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP },
+ [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY },
+ [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA },
+ [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
+ [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+};
+
+/* Return the health status mask for this scrub type. */
+unsigned int
+xchk_health_mask_for_scrub_type(
+ __u32 scrub_type)
+{
+ return type_to_health_flag[scrub_type].sick_mask;
+}
+
+/*
+ * Update filesystem health assessments based on what we found and did.
+ *
+ * If the scrubber finds errors, we mark sick whatever's mentioned in
+ * sick_mask, no matter whether this is a first scan or an
+ * evaluation of repair effectiveness.
+ *
+ * Otherwise, no direct corruption was found, so mark whatever's in
+ * sick_mask as healthy.
+ */
+void
+xchk_update_health(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag;
+ bool bad;
+
+ if (!sc->sick_mask)
+ return;
+
+ bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT);
+ switch (type_to_health_flag[sc->sm->sm_type].group) {
+ case XHG_AG:
+ pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
+ if (bad)
+ xfs_ag_mark_sick(pag, sc->sick_mask);
+ else
+ xfs_ag_mark_healthy(pag, sc->sick_mask);
+ xfs_perag_put(pag);
+ break;
+ case XHG_INO:
+ if (!sc->ip)
+ return;
+ if (bad)
+ xfs_inode_mark_sick(sc->ip, sc->sick_mask);
+ else
+ xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
+ break;
+ case XHG_FS:
+ if (bad)
+ xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+ else
+ xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
+ break;
+ case XHG_RT:
+ if (bad)
+ xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+ else
+ xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+
+/* Is the given per-AG btree healthy enough for scanning? */
+bool
+xchk_ag_btree_healthy_enough(
+ struct xfs_scrub *sc,
+ struct xfs_perag *pag,
+ xfs_btnum_t btnum)
+{
+ unsigned int mask = 0;
+
+ /*
+ * We always want the cursor if it's the same type as whatever we're
+ * scrubbing, even if we already know the structure is corrupt.
+ *
+ * Otherwise, we're only interested in the btree for cross-referencing.
+ * If we know the btree is bad then don't bother, just set XFAIL.
+ */
+ switch (btnum) {
+ case XFS_BTNUM_BNO:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
+ return true;
+ mask = XFS_SICK_AG_BNOBT;
+ break;
+ case XFS_BTNUM_CNT:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
+ return true;
+ mask = XFS_SICK_AG_CNTBT;
+ break;
+ case XFS_BTNUM_INO:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+ return true;
+ mask = XFS_SICK_AG_INOBT;
+ break;
+ case XFS_BTNUM_FINO:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
+ return true;
+ mask = XFS_SICK_AG_FINOBT;
+ break;
+ case XFS_BTNUM_RMAP:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
+ return true;
+ mask = XFS_SICK_AG_RMAPBT;
+ break;
+ case XFS_BTNUM_REFC:
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
+ return true;
+ mask = XFS_SICK_AG_REFCNTBT;
+ break;
+ default:
+ ASSERT(0);
+ return true;
+ }
+
+ if (xfs_ag_has_sickness(pag, mask)) {
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
+ return false;
+ }
+
+ return true;
+}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
new file mode 100644
index 0000000..d0b938d
--- /dev/null
+++ b/fs/xfs/scrub/health.h
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_HEALTH_H__
+#define __XFS_SCRUB_HEALTH_H__
+
+unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
+void xchk_update_health(struct xfs_scrub *sc);
+bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+
+#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 224dba9..6817587 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -9,21 +9,14 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
-#include "xfs_log.h"
-#include "xfs_trans_priv.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -39,11 +32,22 @@
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- return xchk_setup_ag_btree(sc, ip, sc->try_harder);
+ return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER);
}
/* Inode btree scrubber. */
+struct xchk_iallocbt {
+ /* Number of inodes we see while scanning inobt. */
+ unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
+};
+
/*
* If we're checking the finobt, cross-reference with the inobt.
* Otherwise we're checking the inobt; if there is an finobt, make sure
@@ -82,15 +86,12 @@
xfs_agblock_t agbno,
xfs_extlen_t len)
{
- struct xfs_owner_info oinfo;
-
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return;
xchk_xref_is_used_space(sc, agbno, len);
xchk_iallocbt_chunk_xref_other(sc, irec, agino);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
- xchk_xref_is_owned_by(sc, agbno, len, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES);
xchk_xref_is_not_shared(sc, agbno, len);
}
@@ -126,42 +127,58 @@
return hweight64(freemask);
}
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
struct xchk_btree *bs,
- xfs_ino_t fsino,
- xfs_agino_t chunkino,
- xfs_agino_t clusterino,
struct xfs_inobt_rec_incore *irec,
- struct xfs_buf *bp)
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
{
- struct xfs_dinode *dip;
struct xfs_mount *mp = bs->cur->bc_mp;
- bool inode_is_free = false;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
bool freemask_ok;
- bool inuse;
int error = 0;
if (xchk_should_terminate(bs->sc, &error))
return error;
- dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- (dip->di_version >= 3 &&
- be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
- if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
- inode_is_free = true;
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
- fsino + clusterino, &inuse);
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
- freemask_ok = inode_is_free ^ !!(dip->di_mode);
- if (!bs->sc->try_harder && !freemask_ok)
+ freemask_ok = irec_free ^ !!(dip->di_mode);
+ if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok)
return -EDEADLOCK;
} else if (error < 0) {
/*
@@ -172,7 +189,7 @@
goto out;
} else {
/* Inode is all there. */
- freemask_ok = inode_is_free ^ inuse;
+ freemask_ok = irec_free ^ ino_inuse;
}
if (!freemask_ok)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -180,91 +197,225 @@
return 0;
}
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
struct xchk_btree *bs,
- struct xfs_inobt_rec_incore *irec)
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
{
- struct xfs_owner_info oinfo;
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_dinode *dip;
- struct xfs_buf *bp;
- xfs_ino_t fsino;
- xfs_agino_t nr_inodes;
- xfs_agino_t agino;
- xfs_agino_t chunkino;
- xfs_agino_t clusterino;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t agbno;
- int blks_per_cluster;
- uint16_t holemask;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
uint16_t ir_holemask;
int error = 0;
- /* Make sure the freemask matches the inode records. */
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ M_IGEO(mp)->inodes_per_cluster);
- for (agino = irec->ir_startino;
- agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
- agino += blks_per_cluster * mp->m_sb.sb_inopblock) {
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
- chunkino = agino - irec->ir_startino;
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
- /* Compute the holemask mask for this cluster. */
- for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
- clusterino += XFS_INODES_PER_HOLEMASK_BIT)
- holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
- XFS_INODES_PER_HOLEMASK_BIT);
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
- /* The whole cluster must be a hole or not a hole. */
- ir_holemask = (irec->ir_holemask & holemask);
- if (ir_holemask != holemask && ir_holemask != 0) {
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
+ mp->m_sb.sb_inodelog;
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be a hole or not a hole. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ M_IGEO(mp)->blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+ 0, 0);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- continue;
+ break;
}
- /* If any part of this is a hole, skip it. */
- if (ir_holemask) {
- xchk_xref_is_not_owned_by(bs->sc, agbno,
- blks_per_cluster, &oinfo);
- continue;
- }
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
- xchk_xref_is_owned_by(bs->sc, agbno, blks_per_cluster,
- &oinfo);
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
- /* Grab the inode cluster buffer. */
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
- agbno);
- imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
- imap.im_boffset = 0;
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
- &dip, &bp, 0, 0);
- if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
- &error))
- continue;
-
- /* Which inodes are free? */
- for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
- error = xchk_iallocbt_check_cluster_freemask(bs,
- fsino, chunkino, clusterino, irec, bp);
- if (error) {
- xfs_trans_brelse(bs->cur->bc_tp, bp);
- return error;
- }
- }
-
- xfs_trans_brelse(bs->cur->bc_tp, bp);
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
}
return error;
}
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ igeo->cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
+
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
+ }
+ return;
+ }
+
+ /* inobt records must be aligned to cluster and inoalignmnt size. */
+ if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster;
+}
+
/* Scrub an inobt/finobt record. */
STATIC int
xchk_iallocbt_rec(
@@ -272,12 +423,11 @@
union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_filblks_t *inode_blocks = bs->private;
+ struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
- xfs_agblock_t agbno;
xfs_extlen_t len;
int holecount;
int i;
@@ -304,14 +454,11 @@
goto out;
}
- /* Make sure this record is aligned to cluster and inoalignmnt size. */
- agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
- if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
- (agbno & (xfs_icluster_size_fsb(mp) - 1)))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
- *inode_blocks += XFS_B_TO_FSB(mp,
- irec.ir_count * mp->m_sb.sb_inodesize);
+ iabt->inodes += irec.ir_count;
/* Handle non-sparse inodes */
if (!xfs_inobt_issparse(irec.ir_holemask)) {
@@ -322,7 +469,7 @@
if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
goto out;
- goto check_freemask;
+ goto check_clusters;
}
/* Check each chunk of a sparse inode cluster. */
@@ -348,8 +495,8 @@
holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-check_freemask:
- error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -366,7 +513,6 @@
struct xfs_scrub *sc,
int which)
{
- struct xfs_owner_info oinfo;
xfs_filblks_t blocks;
xfs_extlen_t inobt_blocks = 0;
xfs_extlen_t finobt_blocks = 0;
@@ -388,9 +534,8 @@
return;
}
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_INOBT, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != inobt_blocks + finobt_blocks)
@@ -405,21 +550,21 @@
xchk_iallocbt_xref_rmap_inodes(
struct xfs_scrub *sc,
int which,
- xfs_filblks_t inode_blocks)
+ unsigned long long inodes)
{
- struct xfs_owner_info oinfo;
xfs_filblks_t blocks;
+ xfs_filblks_t inode_blocks;
int error;
if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
return;
/* Check that we saw as many inode blocks as the rmap knows about. */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_INODES, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
+ inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize);
if (blocks != inode_blocks)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
@@ -431,14 +576,16 @@
xfs_btnum_t which)
{
struct xfs_btree_cur *cur;
- struct xfs_owner_info oinfo;
- xfs_filblks_t inode_blocks = 0;
+ struct xchk_iallocbt iabt = {
+ .inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
+ };
int error;
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT);
cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
- error = xchk_btree(sc, cur, xchk_iallocbt_rec, &oinfo,
- &inode_blocks);
+ error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
+ &iabt);
if (error)
return error;
@@ -452,7 +599,7 @@
* to inode chunks with free inodes.
*/
if (which == XFS_BTNUM_INO)
- xchk_iallocbt_xref_rmap_inodes(sc, which, inode_blocks);
+ xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
return error;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index e386c9b..6d483ab 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -9,27 +9,17 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_icache.h"
-#include "xfs_inode_buf.h"
-#include "xfs_inode_fork.h"
#include "xfs_ialloc.h"
#include "xfs_da_format.h"
#include "xfs_reflink.h"
#include "xfs_rmap.h"
-#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Grab total control of the inode metadata. It doesn't matter here if
@@ -509,7 +499,6 @@
xfs_ino_t ino,
struct xfs_dinode *dip)
{
- struct xfs_owner_info oinfo;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
int error;
@@ -526,8 +515,7 @@
xchk_xref_is_used_space(sc, agbno, 1);
xchk_inode_xref_finobt(sc, ino);
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
- xchk_xref_is_owned_by(sc, agbno, 1, &oinfo);
+ xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES);
xchk_xref_is_not_shared(sc, agbno, 1);
xchk_inode_xref_bmap(sc, dip);
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 1c9d7c7..c962bd5 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -9,21 +9,13 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Set us up to scrub parents. */
int
@@ -320,7 +312,7 @@
* If we failed to lock the parent inode even after a retry, just mark
* this scrub incomplete and return.
*/
- if (sc->try_harder && error == -EDEADLOCK) {
+ if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
error = 0;
xchk_set_incomplete(sc);
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 782d582..0a33b44 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -9,24 +9,13 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_dquot.h"
-#include "xfs_dquot_item.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
static inline uint
@@ -60,7 +49,7 @@
dqtype = xchk_quota_to_dqtype(sc);
if (dqtype == 0)
return -EINVAL;
- sc->has_quotaofflock = true;
+ sc->flags |= XCHK_HAS_QUOTAOFFLOCK;
mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
@@ -144,7 +133,7 @@
if (bsoft > bhard)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- if (ihard > mp->m_maxicount)
+ if (ihard > M_IGEO(mp)->maxicount)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
if (isoft > ihard)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index e8c82b0..0cab11a 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,22 +7,12 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Set us up to scrub reference count btrees.
@@ -351,7 +341,6 @@
xfs_extlen_t len;
xfs_nlink_t refcount;
bool has_cowflag;
- int error = 0;
bno = be32_to_cpu(rec->refc.rc_startblock);
len = be32_to_cpu(rec->refc.rc_blockcount);
@@ -376,14 +365,13 @@
xchk_refcountbt_xref(bs->sc, bno, len, refcount);
- return error;
+ return 0;
}
/* Make sure we have as many refc blocks as the rmap says. */
STATIC void
xchk_refcount_xref_rmap(
struct xfs_scrub *sc,
- struct xfs_owner_info *oinfo,
xfs_filblks_t cow_blocks)
{
xfs_extlen_t refcbt_blocks = 0;
@@ -397,17 +385,16 @@
error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks);
if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error))
return;
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_REFC, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != refcbt_blocks)
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
/* Check that we saw as many cow blocks as the rmap knows about. */
- xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW);
- error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo,
- &blocks);
+ error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
+ &XFS_RMAP_OINFO_COW, &blocks);
if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
return;
if (blocks != cow_blocks)
@@ -419,17 +406,15 @@
xchk_refcountbt(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
xfs_agblock_t cow_blocks = 0;
int error;
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
- &oinfo, &cow_blocks);
+ &XFS_RMAP_OINFO_REFC, &cow_blocks);
if (error)
return error;
- xchk_refcount_xref_rmap(sc, &oinfo, cow_blocks);
+ xchk_refcount_xref_rmap(sc, cow_blocks);
return 0;
}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 9f08dd9..b70a88b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -9,27 +9,21 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
-#include "xfs_trans_space.h"
#include "xfs_quota.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -44,8 +38,7 @@
int
xrep_attempt(
struct xfs_inode *ip,
- struct xfs_scrub *sc,
- bool *fixed)
+ struct xfs_scrub *sc)
{
int error = 0;
@@ -64,13 +57,13 @@
* scrub so that we can tell userspace if we fixed the problem.
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
- *fixed = true;
+ sc->flags |= XREP_ALREADY_FIXED;
return -EAGAIN;
case -EDEADLOCK:
case -EAGAIN:
/* Tell the caller to try again having grabbed all the locks. */
- if (!sc->try_harder) {
- sc->try_harder = true;
+ if (!(sc->flags & XCHK_TRY_HARDER)) {
+ sc->flags |= XCHK_TRY_HARDER;
return -EAGAIN;
}
/*
@@ -135,10 +128,16 @@
if (sc->sa.agfl_bp)
xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
- /* Roll the transaction. */
+ /*
+ * Roll the transaction. We still own the buffer and the buffer lock
+ * regardless of whether or not the roll succeeds. If the roll fails,
+ * the buffers will be released during teardown on our way out of the
+ * kernel. If it succeeds, we join them to the new transaction and
+ * move on.
+ */
error = xfs_trans_roll(&sc->tp);
if (error)
- goto out_release;
+ return error;
/* Join AG headers to the new transaction. */
if (sc->sa.agi_bp)
@@ -149,21 +148,6 @@
xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
return 0;
-
-out_release:
- /*
- * Rolling failed, so release the hold on the buffers. The
- * buffers will be released during teardown on our way out
- * of the kernel.
- */
- if (sc->sa.agi_bp)
- xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
- if (sc->sa.agf_bp)
- xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
-
- return error;
}
/*
@@ -297,14 +281,14 @@
/* Allocate a block in an AG. */
int
xrep_alloc_ag_block(
- struct xfs_scrub *sc,
- struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv)
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv)
{
- struct xfs_alloc_arg args = {0};
- xfs_agblock_t bno;
- int error;
+ struct xfs_alloc_arg args = {0};
+ xfs_agblock_t bno;
+ int error;
switch (resv) {
case XFS_AG_RESV_AGFL:
@@ -365,9 +349,9 @@
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
XFS_FSB_TO_BB(mp, 1), 0);
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
- xfs_trans_log_buf(tp, bp, 0, bp->b_length);
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
bp->b_ops = ops;
*bpp = bp;
@@ -503,7 +487,6 @@
struct xfs_scrub *sc,
xfs_agblock_t agbno)
{
- struct xfs_owner_info oinfo;
int error;
/* Make sure there's space on the freelist. */
@@ -516,9 +499,8 @@
* create an rmap for the block prior to merging it or else other
* parts will break.
*/
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
- &oinfo);
+ &XFS_RMAP_OINFO_AG);
if (error)
return error;
@@ -536,17 +518,17 @@
/* Dispose of a single block. */
STATIC int
xrep_reap_block(
- struct xfs_scrub *sc,
- xfs_fsblock_t fsbno,
- struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type resv)
+ struct xfs_scrub *sc,
+ xfs_fsblock_t fsbno,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
{
- struct xfs_btree_cur *cur;
- struct xfs_buf *agf_bp = NULL;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- bool has_other_rmap;
- int error;
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
@@ -610,15 +592,15 @@
/* Dispose of every block of every extent in the bitmap. */
int
xrep_reap_extents(
- struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap,
- struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
+ struct xfs_scrub *sc,
+ struct xfs_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
- xfs_fsblock_t fsbno;
- int error = 0;
+ struct xfs_bitmap_range *bmr;
+ struct xfs_bitmap_range *n;
+ xfs_fsblock_t fsbno;
+ int error = 0;
ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
@@ -682,7 +664,7 @@
{
xfs_agblock_t *agbno = priv;
- return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+ return (*agbno == bno) ? -ECANCELED : 0;
}
/* Does this block match the btree information passed in? */
@@ -692,13 +674,14 @@
struct xrep_find_ag_btree *fab,
uint64_t owner,
xfs_agblock_t agbno,
- bool *found_it)
+ bool *done_with_block)
{
struct xfs_mount *mp = ri->sc->mp;
struct xfs_buf *bp;
struct xfs_btree_block *btblock;
xfs_daddr_t daddr;
- int error;
+ int block_level;
+ int error = 0;
daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
@@ -711,42 +694,123 @@
if (owner == XFS_RMAP_OWN_AG) {
error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
xrep_findroot_agfl_walk, &agbno);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == -ECANCELED)
return 0;
if (error)
return error;
}
+ /*
+ * Read the buffer into memory so that we can see if it's a match for
+ * our btree type. We have no clue if it is beforehand, and we want to
+ * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
+ * will cause needless disk reads in subsequent calls to this function)
+ * and logging metadata verifier failures.
+ *
+ * Therefore, pass in NULL buffer ops. If the buffer was already in
+ * memory from some other caller it will already have b_ops assigned.
+ * If it was in memory from a previous unsuccessful findroot_block
+ * call, the buffer won't have b_ops but it should be clean and ready
+ * for us to try to verify if the read call succeeds. The same applies
+ * if the buffer wasn't in memory at all.
+ *
+ * Note: If we never match a btree type with this buffer, it will be
+ * left in memory with NULL b_ops. This shouldn't be a problem unless
+ * the buffer gets written.
+ */
error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
mp->m_bsize, 0, &bp, NULL);
if (error)
return error;
- /*
- * Does this look like a block matching our fs and higher than any
- * other block we've found so far? If so, reattach buffer verifiers
- * so the AIL won't complain if the buffer is also dirty.
- */
+ /* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
- if (be32_to_cpu(btblock->bb_magic) != fab->magic)
- goto out;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- goto out;
- bp->b_ops = fab->buf_ops;
-
- /* Ignore this block if it's lower in the tree than we've seen. */
- if (fab->root != NULLAGBLOCK &&
- xfs_btree_get_level(btblock) < fab->height)
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
goto out;
- /* Make sure we pass the verifiers. */
- bp->b_ops->verify_read(bp);
- if (bp->b_error)
+ /*
+ * If the buffer already has ops applied and they're not the ones for
+ * this btree type, we know this block doesn't match the btree and we
+ * can bail out.
+ *
+ * If the buffer ops match ours, someone else has already validated
+ * the block for us, so we can move on to checking if this is a root
+ * block candidate.
+ *
+ * If the buffer does not have ops, nobody has successfully validated
+ * the contents and the buffer cannot be dirty. If the magic, uuid,
+ * and structure match this btree type then we'll move on to checking
+ * if it's a root block candidate. If there is no match, bail out.
+ */
+ if (bp->b_ops) {
+ if (bp->b_ops != fab->buf_ops)
+ goto out;
+ } else {
+ ASSERT(!xfs_trans_buf_is_dirty(bp));
+ if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid))
+ goto out;
+ /*
+ * Read verifiers can reference b_ops, so we set the pointer
+ * here. If the verifier fails we'll reset the buffer state
+ * to what it was before we touched the buffer.
+ */
+ bp->b_ops = fab->buf_ops;
+ fab->buf_ops->verify_read(bp);
+ if (bp->b_error) {
+ bp->b_ops = NULL;
+ bp->b_error = 0;
+ goto out;
+ }
+
+ /*
+ * Some read verifiers will (re)set b_ops, so we must be
+ * careful not to change b_ops after running the verifier.
+ */
+ }
+
+ /*
+ * This block passes the magic/uuid and verifier tests for this btree
+ * type. We don't need the caller to try the other tree types.
+ */
+ *done_with_block = true;
+
+ /*
+ * Compare this btree block's level to the height of the current
+ * candidate root block.
+ *
+ * If the level matches the root we found previously, throw away both
+ * blocks because there can't be two candidate roots.
+ *
+ * If level is lower in the tree than the root we found previously,
+ * ignore this block.
+ */
+ block_level = xfs_btree_get_level(btblock);
+ if (block_level + 1 == fab->height) {
+ fab->root = NULLAGBLOCK;
goto out;
- fab->root = agbno;
- fab->height = xfs_btree_get_level(btblock) + 1;
- *found_it = true;
+ } else if (block_level < fab->height) {
+ goto out;
+ }
+
+ /*
+ * This is the highest block in the tree that we've found so far.
+ * Update the btree height to reflect what we've learned from this
+ * block.
+ */
+ fab->height = block_level + 1;
+
+ /*
+ * If this block doesn't have sibling pointers, then it's the new root
+ * block candidate. Otherwise, the root will be found farther up the
+ * tree.
+ */
+ if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
+ btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+ fab->root = agbno;
+ else
+ fab->root = NULLAGBLOCK;
trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
@@ -768,7 +832,7 @@
struct xrep_findroot *ri = priv;
struct xrep_find_ag_btree *fab;
xfs_agblock_t b;
- bool found_it;
+ bool done;
int error = 0;
/* Ignore anything that isn't AG metadata. */
@@ -777,16 +841,16 @@
/* Otherwise scan each block + btree type. */
for (b = 0; b < rec->rm_blockcount; b++) {
- found_it = false;
+ done = false;
for (fab = ri->btree_info; fab->buf_ops; fab++) {
if (rec->rm_owner != fab->rmap_owner)
continue;
error = xrep_findroot_block(ri, fab,
rec->rm_owner, rec->rm_startblock + b,
- &found_it);
+ &done);
if (error)
return error;
- if (found_it)
+ if (done)
break;
}
}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 9de321e..60c61d7 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -15,14 +15,15 @@
/* Repair helpers */
-int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc, bool *fixed);
+int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
-int xrep_alloc_ag_block(struct xfs_scrub *sc, struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno, enum xfs_ag_resv_type resv);
+int xrep_alloc_ag_block(struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv);
int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
@@ -32,7 +33,7 @@
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist);
int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist,
- struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -41,9 +42,6 @@
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
- /* in: magic number of the btree */
- uint32_t magic;
-
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
@@ -66,8 +64,7 @@
static inline int xrep_attempt(
struct xfs_inode *ip,
- struct xfs_scrub *sc,
- bool *fixed)
+ struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
}
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 5e293c1..8d4cefd 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -9,21 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Set us up to scrub reverse mapping btrees.
@@ -174,24 +165,21 @@
xchk_rmapbt(
struct xfs_scrub *sc)
{
- struct xfs_owner_info oinfo;
-
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec,
- &oinfo, NULL);
+ &XFS_RMAP_OINFO_AG, NULL);
}
/* xref check that the extent is owned by a given owner */
static inline void
xchk_xref_check_owner(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo,
- bool should_have_rmap)
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo,
+ bool should_have_rmap)
{
- bool has_rmap;
- int error;
+ bool has_rmap;
+ int error;
if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
return;
@@ -207,10 +195,10 @@
/* xref check that the extent is owned by a given owner */
void
xchk_xref_is_owned_by(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
xchk_xref_check_owner(sc, bno, len, oinfo, true);
}
@@ -218,10 +206,10 @@
/* xref check that the extent is not owned by a given owner */
void
xchk_xref_is_not_owned_by(
- struct xfs_scrub *sc,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_scrub *sc,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ const struct xfs_owner_info *oinfo)
{
xchk_xref_check_owner(sc, bno, len, oinfo, false);
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bb..c642bc2 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -9,19 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Set us up with the realtime metadata locked. */
int
@@ -141,9 +134,8 @@
startext = fsbno;
endext = fsbno + len - 1;
do_div(startext, sc->mp->m_sb.sb_rextsize);
- if (do_div(endext, sc->mp->m_sb.sb_rextsize))
- endext++;
- extcount = endext - startext;
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4bfae1e..15c8c5f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -9,37 +9,18 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_icache.h"
-#include "xfs_itable.h"
-#include "xfs_alloc.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_refcount.h"
-#include "xfs_refcount_btree.h"
-#include "xfs_rmap.h"
-#include "xfs_rmap_btree.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_log.h"
-#include "xfs_trans_priv.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
-#include "scrub/btree.h"
#include "scrub/repair.h"
+#include "scrub/health.h"
/*
* Online Scrub and Repair
@@ -186,8 +167,12 @@
xfs_irele(sc->ip);
sc->ip = NULL;
}
- if (sc->has_quotaofflock)
+ if (sc->flags & XCHK_REAPING_DISABLED)
+ xchk_start_reaping(sc);
+ if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
+ }
if (sc->buf) {
kmem_free(sc->buf);
sc->buf = NULL;
@@ -347,6 +332,12 @@
.scrub = xchk_quota,
.repair = xrep_notsupported,
},
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
+ .type = ST_FS,
+ .setup = xchk_setup_fscounters,
+ .scrub = xchk_fscounters,
+ .repair = xrep_notsupported,
+ },
};
/* This isn't a stable feature, warn once per day. */
@@ -412,19 +403,6 @@
goto out;
}
- error = -EOPNOTSUPP;
- /*
- * We won't scrub any filesystem that doesn't have the ability
- * to record unwritten extents. The option was made default in
- * 2003, removed from mkfs in 2007, and cannot be disabled in
- * v5, so if we find a filesystem without this flag it's either
- * really old or totally unsupported. Avoid it either way.
- * We also don't support v1-v3 filesystems, which aren't
- * mountable.
- */
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
- goto out;
-
/*
* We only want to repair read-write v5+ filesystems. Defer the check
* for ops->repair until after our scrub confirms that we need to
@@ -479,10 +457,14 @@
struct xfs_inode *ip,
struct xfs_scrub_metadata *sm)
{
- struct xfs_scrub sc;
+ struct xfs_scrub sc = {
+ .mp = ip->i_mount,
+ .sm = sm,
+ .sa = {
+ .agno = NULLAGNUMBER,
+ },
+ };
struct xfs_mount *mp = ip->i_mount;
- bool try_harder = false;
- bool already_fixed = false;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -504,21 +486,17 @@
xchk_experimental_warning(mp);
+ sc.ops = &meta_scrub_ops[sm->sm_type];
+ sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
/* Set up for the operation. */
- memset(&sc, 0, sizeof(sc));
- sc.mp = ip->i_mount;
- sc.sm = sm;
- sc.ops = &meta_scrub_ops[sm->sm_type];
- sc.try_harder = try_harder;
- sc.sa.agno = NULLAGNUMBER;
error = sc.ops->setup(&sc, ip);
if (error)
goto out_teardown;
/* Scrub for errors. */
error = sc.ops->scrub(&sc);
- if (!try_harder && error == -EDEADLOCK) {
+ if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
* Tear down everything we hold, then set up again with
@@ -527,12 +505,15 @@
error = xchk_teardown(&sc, ip, 0);
if (error)
goto out;
- try_harder = true;
+ sc.flags |= XCHK_TRY_HARDER;
goto retry_op;
} else if (error)
goto out_teardown;
- if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
+ xchk_update_health(&sc);
+
+ if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc.flags & XREP_ALREADY_FIXED)) {
bool needs_fix;
/* Let debug users force us into the repair routines. */
@@ -555,10 +536,13 @@
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(ip, &sc, &already_fixed);
+ error = xrep_attempt(ip, &sc);
if (error == -EAGAIN) {
- if (sc.try_harder)
- try_harder = true;
+ /*
+ * Either the repair function succeeded or it couldn't
+ * get all the resources it needs; either way, we go
+ * back to the beginning and call the scrub function.
+ */
error = xchk_teardown(&sc, ip, 0);
if (error) {
xrep_failure(mp);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index af323b2..ad1ceb4 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -62,13 +62,27 @@
struct xfs_inode *ip;
void *buf;
uint ilock_flags;
- bool try_harder;
- bool has_quotaofflock;
+
+ /* See the XCHK/XREP state flags below. */
+ unsigned int flags;
+
+ /*
+ * The XFS_SICK_* flags that correspond to the metadata being scrubbed
+ * or repaired. We will use this mask to update the in-core fs health
+ * status with whatever we find.
+ */
+ unsigned int sick_mask;
/* State tracking for single-AG operations. */
struct xchk_ag sa;
};
+/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
+#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
+#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */
+#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */
+#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
+
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
@@ -113,6 +127,7 @@
return -ENOENT;
}
#endif
+int xchk_fscounters(struct xfs_scrub *sc);
/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
@@ -122,9 +137,9 @@
void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len);
void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo);
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno,
- xfs_extlen_t len, struct xfs_owner_info *oinfo);
+ xfs_extlen_t len, const struct xfs_owner_info *oinfo);
void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno,
xfs_extlen_t len);
void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno,
@@ -138,4 +153,12 @@
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif
+struct xchk_fscounters {
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+};
+
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index f7ebaa9..5641ae5 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -9,19 +9,11 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
#include "xfs_symlink.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Set us up to scrub a symbolic link. */
int
@@ -30,7 +22,7 @@
struct xfs_inode *ip)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, KM_SLEEP);
+ sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 96feaf8..9eaab2e 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -10,15 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_trans.h"
-#include "xfs_bit.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
-#include "scrub/common.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4e20f0e..3362bae 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -12,6 +12,73 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer. Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+
+#define XFS_SCRUB_TYPE_STRINGS \
+ { XFS_SCRUB_TYPE_PROBE, "probe" }, \
+ { XFS_SCRUB_TYPE_SB, "sb" }, \
+ { XFS_SCRUB_TYPE_AGF, "agf" }, \
+ { XFS_SCRUB_TYPE_AGFL, "agfl" }, \
+ { XFS_SCRUB_TYPE_AGI, "agi" }, \
+ { XFS_SCRUB_TYPE_BNOBT, "bnobt" }, \
+ { XFS_SCRUB_TYPE_CNTBT, "cntbt" }, \
+ { XFS_SCRUB_TYPE_INOBT, "inobt" }, \
+ { XFS_SCRUB_TYPE_FINOBT, "finobt" }, \
+ { XFS_SCRUB_TYPE_RMAPBT, "rmapbt" }, \
+ { XFS_SCRUB_TYPE_REFCNTBT, "refcountbt" }, \
+ { XFS_SCRUB_TYPE_INODE, "inode" }, \
+ { XFS_SCRUB_TYPE_BMBTD, "bmapbtd" }, \
+ { XFS_SCRUB_TYPE_BMBTA, "bmapbta" }, \
+ { XFS_SCRUB_TYPE_BMBTC, "bmapbtc" }, \
+ { XFS_SCRUB_TYPE_DIR, "directory" }, \
+ { XFS_SCRUB_TYPE_XATTR, "xattr" }, \
+ { XFS_SCRUB_TYPE_SYMLINK, "symlink" }, \
+ { XFS_SCRUB_TYPE_PARENT, "parent" }, \
+ { XFS_SCRUB_TYPE_RTBITMAP, "rtbitmap" }, \
+ { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \
+ { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
+ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
+ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
+
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
int error),
@@ -36,10 +103,10 @@
__entry->flags = sm->sm_flags;
__entry->error = error;
),
- TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d",
+ TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->inum,
__entry->gen,
@@ -78,9 +145,9 @@
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->bno,
__entry->error,
@@ -109,11 +176,11 @@
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->error,
__entry->ret_ip)
@@ -144,9 +211,9 @@
__entry->bno = bno;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->agno,
__entry->bno,
__entry->ret_ip)
@@ -158,6 +225,7 @@
void *ret_ip), \
TP_ARGS(sc, daddr, ret_ip))
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error);
DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error);
DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen);
@@ -176,10 +244,10 @@
__entry->type = sc->sm->sm_type;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->ret_ip)
)
@@ -213,11 +281,11 @@
__entry->offset = offset;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->offset,
__entry->ret_ip)
);
@@ -244,9 +312,9 @@
__entry->type = sc->sm->sm_type;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->ret_ip)
);
@@ -278,10 +346,10 @@
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -321,12 +389,12 @@
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -360,10 +428,10 @@
__entry->ptr = cur->bc_ptrs[level];
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -400,12 +468,12 @@
__entry->ptr = cur->bc_ptrs[level];
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS",
+ TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->whichfork,
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->ptr,
__entry->agno,
@@ -439,10 +507,10 @@
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_ptrs[level];
),
- TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d",
+ TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
- __entry->btnum,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->agno,
__entry->bno,
__entry->level,
@@ -473,13 +541,116 @@
__entry->error = error;
__entry->ret_ip = ret_ip;
),
- TP_printk("dev %d:%d type %u xref error %d ret_ip %pF",
+ TP_printk("dev %d:%d type %s xref error %d ret_ip %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
__entry->error,
__entry->ret_ip)
);
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
+TRACE_EVENT(xchk_fscounters_calc,
+ TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
+ uint64_t fdblocks, uint64_t delalloc),
+ TP_ARGS(mp, icount, ifree, fdblocks, delalloc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int64_t, icount_sb)
+ __field(uint64_t, icount_calculated)
+ __field(int64_t, ifree_sb)
+ __field(uint64_t, ifree_calculated)
+ __field(int64_t, fdblocks_sb)
+ __field(uint64_t, fdblocks_calculated)
+ __field(uint64_t, delalloc)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->icount_sb = mp->m_sb.sb_icount;
+ __entry->icount_calculated = icount;
+ __entry->ifree_sb = mp->m_sb.sb_ifree;
+ __entry->ifree_calculated = ifree;
+ __entry->fdblocks_sb = mp->m_sb.sb_fdblocks;
+ __entry->fdblocks_calculated = fdblocks;
+ __entry->delalloc = delalloc;
+ ),
+ TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount_sb,
+ __entry->icount_calculated,
+ __entry->ifree_sb,
+ __entry->ifree_calculated,
+ __entry->fdblocks_sb,
+ __entry->fdblocks_calculated,
+ __entry->delalloc)
+)
+
+TRACE_EVENT(xchk_fscounters_within_range,
+ TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value,
+ int64_t old_value),
+ TP_ARGS(mp, expected, curr_value, old_value),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint64_t, expected)
+ __field(int64_t, curr_value)
+ __field(int64_t, old_value)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->expected = expected;
+ __entry->curr_value = curr_value;
+ __entry->old_value = old_value;
+ ),
+ TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->expected,
+ __entry->curr_value,
+ __entry->old_value)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
@@ -598,11 +769,11 @@
__entry->agbno = agbno;
__entry->btnum = btnum;
),
- TP_printk("dev %d:%d agno %u agbno %u btnum %d",
+ TP_printk("dev %d:%d agno %u agbno %u btree %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
- __entry->btnum)
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
)
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 8039e35..96d7071 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -4,16 +4,14 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_acl.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
-#include <linux/slab.h>
-#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
@@ -114,7 +112,7 @@
{
struct xfs_inode *ip = XFS_I(inode);
struct posix_acl *acl = NULL;
- struct xfs_acl *xfs_acl;
+ struct xfs_acl *xfs_acl = NULL;
unsigned char *ea_name;
int error;
int len;
@@ -137,12 +135,8 @@
* go out to the disk.
*/
len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
- if (!xfs_acl)
- return ERR_PTR(-ENOMEM);
-
- error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
- &len, ATTR_ROOT);
+ error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
+ ATTR_ALLOC | ATTR_ROOT);
if (error) {
/*
* If the attribute doesn't exist make sure we have a negative
@@ -153,8 +147,8 @@
} else {
acl = xfs_acl_from_disk(xfs_acl, len,
XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ kmem_free(xfs_acl);
}
- kmem_free(xfs_acl);
return acl;
}
@@ -182,7 +176,7 @@
struct xfs_acl *xfs_acl;
int len = XFS_ACL_MAX_SIZE(ip->i_mount);
- xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
+ xfs_acl = kmem_zalloc_large(len, 0);
if (!xfs_acl)
return -ENOMEM;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 49f5f58..f16d5f1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -12,23 +12,19 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
-#include <linux/writeback.h>
/*
* structure owned by writepages passed to individual writepage calls
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
@@ -62,7 +58,7 @@
static void
xfs_finish_page_writeback(
struct inode *inode,
- struct bio_vec *bvec,
+ struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
@@ -97,7 +93,7 @@
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
- int i;
+ struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
@@ -109,7 +105,7 @@
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
@@ -137,8 +133,7 @@
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
- XFS_TRANS_NOFS, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
if (error)
return error;
@@ -232,17 +227,24 @@
* IO write completion.
*/
STATIC void
-xfs_end_io(
- struct work_struct *work)
+xfs_end_ioend(
+ struct xfs_ioend *ioend)
{
- struct xfs_ioend *ioend =
- container_of(work, struct xfs_ioend, io_work);
+ struct list_head ioend_list;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
+ unsigned int nofs_flag;
int error;
/*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ /*
* Just clean up the in-memory strutures if the fs has been shut down.
*/
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -255,35 +257,140 @@
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
error = xfs_setfilesize_ioend(ioend, error);
+ list_replace_init(&ioend->io_list, &ioend_list);
xfs_destroy_ioend(ioend, error);
+
+ while (!list_empty(&ioend_list)) {
+ ioend = list_first_entry(&ioend_list, struct xfs_ioend,
+ io_list);
+ list_del_init(&ioend->io_list);
+ xfs_destroy_ioend(ioend, error);
+ }
+
+ memalloc_nofs_restore(nofs_flag);
+}
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+xfs_ioend_can_merge(
+ struct xfs_ioend *ioend,
+ struct xfs_ioend *next)
+{
+ if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ return false;
+ if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
+ return false;
+ if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
+ (next->io_state == XFS_EXT_UNWRITTEN))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ return true;
+}
+
+/*
+ * If the to be merged ioend has a preallocated transaction for file
+ * size updates we need to ensure the ioend it is merged into also
+ * has one. If it already has one we can simply cancel the transaction
+ * as it is guaranteed to be clean.
+ */
+static void
+xfs_ioend_merge_append_transactions(
+ struct xfs_ioend *ioend,
+ struct xfs_ioend *next)
+{
+ if (!ioend->io_append_trans) {
+ ioend->io_append_trans = next->io_append_trans;
+ next->io_append_trans = NULL;
+ } else {
+ xfs_setfilesize_ioend(next, -ECANCELED);
+ }
+}
+
+/* Try to merge adjacent completions. */
+STATIC void
+xfs_ioend_try_merge(
+ struct xfs_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct xfs_ioend *next_ioend;
+
+ while (!list_empty(more_ioends)) {
+ next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
+ io_list);
+ if (!xfs_ioend_can_merge(ioend, next_ioend))
+ break;
+ list_move_tail(&next_ioend->io_list, &ioend->io_list);
+ ioend->io_size += next_ioend->io_size;
+ if (next_ioend->io_append_trans)
+ xfs_ioend_merge_append_transactions(ioend, next_ioend);
+ }
+}
+
+/* list_sort compare function for ioends */
+static int
+xfs_ioend_compare(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_ioend *ia;
+ struct xfs_ioend *ib;
+
+ ia = container_of(a, struct xfs_ioend, io_list);
+ ib = container_of(b, struct xfs_ioend, io_list);
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ else if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+/* Finish all pending io completions. */
+void
+xfs_end_io(
+ struct work_struct *work)
+{
+ struct xfs_inode *ip;
+ struct xfs_ioend *ioend;
+ struct list_head completion_list;
+ unsigned long flags;
+
+ ip = container_of(work, struct xfs_inode, i_ioend_work);
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ list_replace_init(&ip->i_ioend_list, &completion_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+
+ list_sort(NULL, &completion_list, xfs_ioend_compare);
+
+ while (!list_empty(&completion_list)) {
+ ioend = list_first_entry(&completion_list, struct xfs_ioend,
+ io_list);
+ list_del_init(&ioend->io_list);
+ xfs_ioend_try_merge(ioend, &completion_list);
+ xfs_end_ioend(ioend);
+ }
}
STATIC void
@@ -291,16 +398,92 @@
struct bio *bio)
{
struct xfs_ioend *ioend = bio->bi_private;
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned long flags;
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
- queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
- queue_work(mp->m_data_workqueue, &ioend->io_work);
- else
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN ||
+ ioend->io_append_trans != NULL) {
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ if (list_empty(&ip->i_ioend_list))
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
+ &ip->i_ioend_work));
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+ } else
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a dellalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have moved from the COW to the data
+ * fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
@@ -310,26 +493,16 @@
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +519,19 @@
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
* Check if this is offset is covered by a COW extents, and if yes use
@@ -382,30 +543,16 @@
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -417,49 +564,65 @@
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging the return real extent might be larger than the
+ * original delalloc one. Trim the return extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
@@ -471,7 +634,7 @@
* reference to the ioend to ensure that the ioend completion is only done once
* all bios have been submitted and the ioend is really done.
*
- * If @fail is non-zero, it means that we have a situation where some part of
+ * If @status is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked paged for writeback
* and unlocked them. In this situation, we need to fail the bio and ioend
* rather than submit it to IO. This typically only happens on a filesystem
@@ -483,33 +646,33 @@
struct xfs_ioend *ioend,
int status)
{
- /* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
- /*
- * Yuk. This can do memory allocation, but is not a
- * transactional operation so everything is done in GFP_KERNEL
- * context. That can deadlock, because we hold pages in
- * writeback state and GFP_KERNEL allocations can block on them.
- * Hence we must operate in nofs conditions here.
- */
- unsigned nofs_flag;
+ unsigned int nofs_flag;
- nofs_flag = memalloc_nofs_save();
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ /* Convert CoW extents to regular */
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
- memalloc_nofs_restore(nofs_flag);
}
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
+ memalloc_nofs_restore(nofs_flag);
+
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
- ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
/*
* If we are failing the IO now, just mark the ioend with an
@@ -523,7 +686,6 @@
return status;
}
- ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
submit_bio(ioend->io_bio);
return 0;
}
@@ -531,10 +693,12 @@
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
- sector_t sector)
+ sector_t sector,
+ struct writeback_control *wbc)
{
struct xfs_ioend *ioend;
struct bio *bio;
@@ -542,14 +706,17 @@
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_write_hint = inode->i_write_hint;
+ wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
- INIT_WORK(&ioend->io_work, xfs_end_io);
ioend->io_append_trans = NULL;
ioend->io_bio = bio;
return ioend;
@@ -562,24 +729,22 @@
* so that the bi_private linkage is set up in the right direction for the
* traversal in xfs_destroy_ioend().
*/
-static void
+static struct bio *
xfs_chain_bio(
- struct xfs_ioend *ioend,
- struct writeback_control *wbc,
- struct block_device *bdev,
- sector_t sector)
+ struct bio *prev)
{
struct bio *new;
new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
- bio_set_dev(new, bdev);
- new->bi_iter.bi_sector = sector;
- bio_chain(ioend->io_bio, new);
- bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
- ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
- submit_bio(ioend->io_bio);
- ioend->io_bio = new;
+ bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new->bi_iter.bi_sector = bio_end_sector(prev);
+ new->bi_opf = prev->bi_opf;
+ new->bi_write_hint = prev->bi_write_hint;
+
+ bio_chain(prev, new);
+ bio_get(prev); /* for xfs_destroy_ioend */
+ submit_bio(prev);
+ return new;
}
/*
@@ -601,29 +766,37 @@
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
unsigned len = i_blocksize(inode);
unsigned poff = offset & (PAGE_SIZE - 1);
+ bool merged, same_page = false;
sector_t sector;
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector, wbc);
}
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
- if (iop)
- atomic_inc(&iop->write_count);
- if (bio_full(wpc->ioend->io_bio))
- xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+ &same_page);
+
+ if (iop && !same_page)
+ atomic_inc(&iop->write_count);
+
+ if (!merged) {
+ if (bio_full(wpc->ioend->io_bio, len))
+ wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
+ wbc_account_cgroup_owner(wbc, page, len);
}
STATIC void
@@ -721,7 +894,7 @@
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
@@ -916,9 +1089,7 @@
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_INVALID,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
@@ -932,9 +1103,7 @@
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_INVALID,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -981,7 +1150,7 @@
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 9af8679..45a1ea2 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,35 +9,15 @@
extern struct bio_set xfs_ioend_bioset;
/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
- XFS_IO_INVALID, /* initial state */
- XFS_IO_DELALLOC, /* covers delalloc region */
- XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
- XFS_IO_OVERWRITE, /* covers already allocated extent */
- XFS_IO_COW, /* covers copy-on-write extent */
- XFS_IO_HOLE, /* covers region without any block allocation */
-};
-
-#define XFS_IO_TYPES \
- { XFS_IO_INVALID, "invalid" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }, \
- { XFS_IO_HOLE, "hole" }
-
-/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
- unsigned int io_type; /* delalloc / unwritten */
+ int io_fork; /* inode fork written back */
+ xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
- struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
struct bio *io_bio; /* bio being built */
struct bio io_inline_bio; /* MUST BE LAST! */
@@ -48,7 +28,6 @@
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
-extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 228821b..a640a28 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -15,18 +15,13 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_attr_remote.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
-#include "xfs_trace.h"
#include "xfs_dir2.h"
-#include "xfs_defer.h"
/*
* Look at all the extents for this logical region,
@@ -121,7 +116,7 @@
int size;
int tmp;
int i;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
@@ -152,7 +147,7 @@
* Allocate storage for a list of all the "remote" value extents.
*/
size = count * sizeof(xfs_attr_inactive_list_t);
- list = kmem_alloc(size, KM_SLEEP);
+ list = kmem_alloc(size, 0);
/*
* Identify each of the "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a580340..00758fd 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -6,25 +6,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_dir2.h"
STATIC int
@@ -114,7 +109,7 @@
* It didn't all fit, so we have to sort everything on hashval.
*/
sbsize = sf->hdr.count * sizeof(*sbuf);
- sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
+ sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
/*
* Scan the attribute list for the rest of the entries, storing
@@ -555,6 +550,7 @@
attrlist_ent_t *aep;
int arraytop;
+ ASSERT(!context->seen_enough);
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
new file mode 100644
index 0000000..e2148f2
--- /dev/null
+++ b/fs/xfs/xfs_bio_io.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig.
+ */
+#include "xfs.h"
+
+static inline unsigned int bio_max_vecs(unsigned int count)
+{
+ return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+}
+
+int
+xfs_rw_bdev(
+ struct block_device *bdev,
+ sector_t sector,
+ unsigned int count,
+ char *data,
+ unsigned int op)
+
+{
+ unsigned int is_vmalloc = is_vmalloc_addr(data);
+ unsigned int left = count;
+ int error;
+ struct bio *bio;
+
+ if (is_vmalloc && op == REQ_OP_WRITE)
+ flush_kernel_vmap_range(data, count);
+
+ bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
+ bio_set_dev(bio, bdev);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = op | REQ_META | REQ_SYNC;
+
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
+
+ while (bio_add_page(bio, page, len, off) != len) {
+ struct bio *prev = bio;
+
+ bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
+ bio_copy_dev(bio, prev);
+ bio->bi_iter.bi_sector = bio_end_sector(prev);
+ bio->bi_opf = prev->bi_opf;
+ bio_chain(prev, bio);
+
+ submit_bio(prev);
+ }
+
+ data += len;
+ left -= len;
+ } while (left > 0);
+
+ error = submit_bio_wait(bio);
+ bio_put(bio);
+
+ if (is_vmalloc && op == REQ_OP_READ)
+ invalidate_kernel_vmap_range(data, count);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index ce45f06..83d24e9 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -9,17 +9,16 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_shared.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_bmap_item.h"
#include "xfs_log.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
-#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
@@ -96,15 +95,6 @@
}
/*
- * Pinning has no meaning for an bui item, so just return.
- */
-STATIC void
-xfs_bui_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an BUI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the BUI transaction has been successfully committed to make it
@@ -123,71 +113,22 @@
}
/*
- * BUI items have no locking or pushing. However, since BUIs are pulled from
- * the AIL when their corresponding BUDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the BUI out of
- * the AIL.
- */
-STATIC uint
-xfs_bui_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The BUI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an BUD isn't going to be
* constructed and thus we free the BUI here directly.
*/
STATIC void
-xfs_bui_item_unlock(
+xfs_bui_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_bui_release(BUI_ITEM(lip));
+ xfs_bui_release(BUI_ITEM(lip));
}
-/*
- * The BUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_bui_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The BUI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_bui_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all bui log items.
- */
static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
- .iop_pin = xfs_bui_item_pin,
.iop_unpin = xfs_bui_item_unpin,
- .iop_unlock = xfs_bui_item_unlock,
- .iop_committed = xfs_bui_item_committed,
- .iop_push = xfs_bui_item_push,
- .iop_committing = xfs_bui_item_committing,
+ .iop_release = xfs_bui_item_release,
};
/*
@@ -200,7 +141,7 @@
{
struct xfs_bui_log_item *buip;
- buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+ buip = kmem_zone_zalloc(xfs_bui_zone, 0);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -249,127 +190,242 @@
}
/*
- * Pinning has no meaning for an bud item, so just return.
- */
-STATIC void
-xfs_bud_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an bud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_bud_item_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an bud item. It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_bud_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The BUD is either committed or aborted if the transaction is cancelled. If
* the transaction is cancelled, drop our reference to the BUI and free the
* BUD.
*/
STATIC void
-xfs_bud_item_unlock(
+xfs_bud_item_release(
struct xfs_log_item *lip)
{
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_bui_release(budp->bud_buip);
- kmem_zone_free(xfs_bud_zone, budp);
- }
-}
-
-/*
- * When the bud item is committed to disk, all we need to do is delete our
- * reference to our partner bui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_bud_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_bud_log_item *budp = BUD_ITEM(lip);
-
- /*
- * Drop the BUI reference regardless of whether the BUD has been
- * aborted. Once the BUD transaction is constructed, it is the sole
- * responsibility of the BUD to release the BUI (even if the BUI is
- * aborted due to log I/O error).
- */
xfs_bui_release(budp->bud_buip);
kmem_zone_free(xfs_bud_zone, budp);
-
- return (xfs_lsn_t)-1;
}
-/*
- * The BUD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_bud_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all bud log items.
- */
static const struct xfs_item_ops xfs_bud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
.iop_size = xfs_bud_item_size,
.iop_format = xfs_bud_item_format,
- .iop_pin = xfs_bud_item_pin,
- .iop_unpin = xfs_bud_item_unpin,
- .iop_unlock = xfs_bud_item_unlock,
- .iop_committed = xfs_bud_item_committed,
- .iop_push = xfs_bud_item_push,
- .iop_committing = xfs_bud_item_committing,
+ .iop_release = xfs_bud_item_release,
};
-/*
- * Allocate and initialize an bud item with the given number of extents.
- */
-struct xfs_bud_log_item *
-xfs_bud_init(
- struct xfs_mount *mp,
+static struct xfs_bud_log_item *
+xfs_trans_get_bud(
+ struct xfs_trans *tp,
struct xfs_bui_log_item *buip)
-
{
- struct xfs_bud_log_item *budp;
+ struct xfs_bud_log_item *budp;
- budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
- xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops);
+ budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+ xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+ &xfs_bud_item_ops);
budp->bud_buip = buip;
budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+ xfs_trans_add_item(tp, &budp->bud_item);
return budp;
}
/*
+ * Finish an bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails to support the BUI/BUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_bmap_update(
+ struct xfs_trans *tp,
+ struct xfs_bud_log_item *budp,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
+{
+ int error;
+
+ error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
+ startblock, blockcount, state);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the BUI and frees the BUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ return error;
+}
+
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_bmap_intent *ba;
+ struct xfs_bmap_intent *bb;
+
+ ba = container_of(a, struct xfs_bmap_intent, bi_list);
+ bb = container_of(b, struct xfs_bmap_intent, bi_list);
+ return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
+}
+
+/* Get an BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_bui_log_item *buip;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+ ASSERT(tp != NULL);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ ASSERT(buip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &buip->bui_item);
+ return buip;
+}
+
+/* Set the map extent flags for this mapping. */
+static void
+xfs_trans_set_bmap_flags(
+ struct xfs_map_extent *bmap,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ bmap->me_flags = 0;
+ switch (type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ bmap->me_flags = type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (state == XFS_EXT_UNWRITTEN)
+ bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Log bmap updates in the intent item. */
+STATIC void
+xfs_bmap_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_bui_log_item *buip = intent;
+ struct xfs_bmap_intent *bmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+ ASSERT(next_extent < buip->bui_format.bui_nextents);
+ map = &buip->bui_format.bui_extents[next_extent];
+ map->me_owner = bmap->bi_owner->i_ino;
+ map->me_startblock = bmap->bi_bmap.br_startblock;
+ map->me_startoff = bmap->bi_bmap.br_startoff;
+ map->me_len = bmap->bi_bmap.br_blockcount;
+ xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+ bmap->bi_bmap.br_state);
+}
+
+/* Get an BUD so we can process all the deferred rmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_bud(tp, intent);
+}
+
+/* Process a deferred rmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_bmap_intent *bmap;
+ xfs_filblks_t count;
+ int error;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ count = bmap->bi_bmap.br_blockcount;
+ error = xfs_trans_log_finish_bmap_update(tp, done_item,
+ bmap->bi_type,
+ bmap->bi_owner, bmap->bi_whichfork,
+ bmap->bi_bmap.br_startoff,
+ bmap->bi_bmap.br_startblock,
+ &count,
+ bmap->bi_bmap.br_state);
+ if (!error && count > 0) {
+ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+ bmap->bi_bmap.br_blockcount = count;
+ return -EAGAIN;
+ }
+ kmem_free(bmap);
+ return error;
+}
+
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+ void *intent)
+{
+ xfs_bui_release(intent);
+}
+
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bmap;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ kmem_free(bmap);
+}
+
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_bmap_update_diff_items,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .log_item = xfs_bmap_update_log_item,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+};
+
+/*
* Process a bmap update intent item that was recovered from the log.
* We need to update some inode's bmbt.
*/
@@ -486,9 +542,7 @@
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
irec.br_state = state;
- error = xfs_bmap_unmap_extent(tp, ip, &irec);
- if (error)
- goto err_inode;
+ xfs_bmap_unmap_extent(tp, ip, &irec);
}
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 89e043a..ad479cc 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -75,8 +75,6 @@
extern struct kmem_zone *xfs_bud_zone;
struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
-struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
- struct xfs_bui_log_item *);
void xfs_bui_item_free(struct xfs_bui_log_item *);
void xfs_bui_release(struct xfs_bui_log_item *);
int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6de8d90..4f44370 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -12,12 +12,10 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
-#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -28,11 +26,8 @@
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
-#include "xfs_rmap_btree.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
-#include "xfs_refcount.h"
/* Kernel only BMAP related definitions and functions */
@@ -44,9 +39,9 @@
xfs_daddr_t
xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
{
- return (XFS_IS_REALTIME_INODE(ip) ? \
- (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
- XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_BB(ip->i_mount, fsb);
+ return XFS_FSB_TO_DADDR(ip->i_mount, fsb);
}
/*
@@ -276,7 +271,7 @@
struct xfs_btree_block *block, *nextblock;
int numrecs;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
return error;
@@ -287,7 +282,7 @@
/* Not at node above leaves, count this level of nodes */
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
while (nextbno != NULLFSBLOCK) {
- error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+ error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -321,7 +316,7 @@
if (nextbno == NULLFSBLOCK)
break;
bno = nextbno;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -406,10 +401,10 @@
struct xfs_bmbt_irec *got)
{
struct kgetbmap *p = out + bmv->bmv_entries;
- bool shared = false, trimmed = false;
+ bool shared = false;
int error;
- error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, got, &shared);
if (error)
return error;
@@ -869,6 +864,7 @@
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
+ xfs_fileoff_t endoffset_fsb;
int nimaps;
int quota_flag;
int rt;
@@ -896,7 +892,8 @@
imapp = &imaps[0];
nimaps = 1;
startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
- allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+ endoffset_fsb = XFS_B_TO_FSB(mp, offset + count);
+ allocatesize_fsb = endoffset_fsb - startoffset_fsb;
/*
* Allocate file space until done or until there is an error
@@ -1042,45 +1039,7 @@
goto out_unlock;
}
-static int
-xfs_adjust_extent_unmap_boundaries(
- struct xfs_inode *ip,
- xfs_fileoff_t *startoffset_fsb,
- xfs_fileoff_t *endoffset_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap;
- int nimap, error;
- xfs_extlen_t mod = 0;
-
- nimap = 1;
- error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
- if (error)
- return error;
-
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod);
- if (mod)
- *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
-
- nimap = 1;
- error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
- if (error)
- return error;
-
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && mod != mp->m_sb.sb_rextsize)
- *endoffset_fsb -= mod;
- }
-
- return 0;
-}
-
-static int
+int
xfs_flush_unmap_range(
struct xfs_inode *ip,
xfs_off_t offset,
@@ -1133,19 +1092,8 @@
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/*
- * Need to zero the stuff we're not freeing, on disk. If it's a RT file
- * and we can't use unwritten extents then we actually need to ensure
- * to zero the whole extent, otherwise we just need to take of block
- * boundaries, and xfs_bunmapi will handle the rest.
+ * Need to zero the stuff we're not freeing, on disk.
*/
- if (XFS_IS_REALTIME_INODE(ip) &&
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
- &endoffset_fsb);
- if (error)
- return error;
- }
-
if (endoffset_fsb > startoffset_fsb) {
while (!done) {
error = xfs_unmap_extent(ip, startoffset_fsb,
@@ -1175,9 +1123,9 @@
* page could be mmap'd and iomap_zero_range doesn't do that for us.
* Writeback of the eof page will do this, albeit clumsily.
*/
- if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) {
+ if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- (offset + len) & ~PAGE_MASK, LLONG_MAX);
+ round_down(offset + len, PAGE_SIZE), LLONG_MAX);
}
return error;
@@ -1211,16 +1159,13 @@
* by virtue of the hole punch.
*/
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out;
+ if (error || xfs_is_always_cow_inode(ip))
+ return error;
- error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ return xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
-out:
- return error;
-
}
static int
@@ -1244,11 +1189,7 @@
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
- error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
- if (error)
- return error;
- error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- offset >> PAGE_SHIFT, -1);
+ error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
if (error)
return error;
@@ -1593,24 +1534,16 @@
trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
/* Remove the mapping from the donor file. */
- error = xfs_bmap_unmap_extent(tp, tip, &uirec);
- if (error)
- goto out;
+ xfs_bmap_unmap_extent(tp, tip, &uirec);
/* Remove the mapping from the source file. */
- error = xfs_bmap_unmap_extent(tp, ip, &irec);
- if (error)
- goto out;
+ xfs_bmap_unmap_extent(tp, ip, &irec);
/* Map the donor file's blocks into the source file. */
- error = xfs_bmap_map_extent(tp, ip, &uirec);
- if (error)
- goto out;
+ xfs_bmap_map_extent(tp, ip, &uirec);
/* Map the source file's blocks into the donor file. */
- error = xfs_bmap_map_extent(tp, tip, &irec);
- if (error)
- goto out;
+ xfs_bmap_map_extent(tp, tip, &irec);
error = xfs_defer_finish(tpp);
tp = *tpp;
@@ -1824,6 +1757,12 @@
if (error)
goto out_unlock;
+ if (xfs_inode_has_cow_data(tip)) {
+ error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+ if (error)
+ return error;
+ }
+
/*
* Extent "swapping" with rmap requires a permanent reservation and
* a block reservation because it's really just a remap operation
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 87363d1..7a78229 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -80,4 +80,7 @@
int whichfork, xfs_extnum_t *nextents,
xfs_filblks_t *count);
+int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+
#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e839907..0abba17 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -4,24 +4,9 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include <linux/migrate.h>
#include <linux/backing-dev.h>
-#include <linux/freezer.h>
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -37,6 +22,32 @@
#define xb_to_gfp(flags) \
((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ * b_sema (caller holds)
+ * b_lock
+ *
+ * xfs_buf_stale:
+ * b_sema (caller holds)
+ * b_lock
+ * lru_lock
+ *
+ * xfs_buf_rele:
+ * b_lock
+ * pag_buf_lock
+ * lru_lock
+ *
+ * xfs_buftarg_wait_rele
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ */
static inline int
xfs_buf_is_vmapped(
@@ -187,7 +198,7 @@
}
}
-struct xfs_buf *
+static struct xfs_buf *
_xfs_buf_alloc(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
@@ -217,6 +228,7 @@
sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock);
bp->b_target = target;
+ bp->b_mount = target->bt_mount;
bp->b_flags = flags;
/*
@@ -237,12 +249,11 @@
bp->b_maps[i].bm_len = map[i].bm_len;
bp->b_length += map[i].bm_len;
}
- bp->b_io_length = bp->b_length;
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(target->bt_mount, xb_create);
+ XFS_STATS_INC(bp->b_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -334,6 +345,15 @@
unsigned short page_count, i;
xfs_off_t start, end;
int error;
+ xfs_km_flags_t kmflag_mask = 0;
+
+ /*
+ * assure zeroed buffer for non-read cases.
+ */
+ if (!(flags & XBF_READ)) {
+ kmflag_mask |= KM_ZERO;
+ gfp_mask |= __GFP_ZERO;
+ }
/*
* for buffers that are contained within a single page, just allocate
@@ -342,7 +362,9 @@
*/
size = BBTOB(bp->b_length);
if (size < PAGE_SIZE) {
- bp->b_addr = kmem_alloc(size, KM_NOFS);
+ int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
+ bp->b_addr = kmem_alloc_io(size, align_mask,
+ KM_NOFS | kmflag_mask);
if (!bp->b_addr) {
/* low memory - use alloc_page loop instead */
goto use_alloc_page;
@@ -357,7 +379,7 @@
}
bp->b_offset = offset_in_page(bp->b_addr);
bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = virt_to_page(bp->b_addr);
+ bp->b_pages[0] = kmem_to_page(bp->b_addr);
bp->b_page_count = 1;
bp->b_flags |= _XBF_KMEM;
return 0;
@@ -399,12 +421,12 @@
current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
+ XFS_STATS_INC(bp->b_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
+ XFS_STATS_INC(bp->b_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -749,6 +771,41 @@
return xfs_buf_submit(bp);
}
+/*
+ * Reverify a buffer found in cache without an attached ->b_ops.
+ *
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
+ *
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
+ */
+int
+xfs_buf_reverify(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+ ASSERT(bp->b_error == 0);
+
+ if (!ops || bp->b_ops)
+ return 0;
+
+ bp->b_ops = ops;
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ bp->b_flags &= ~XBF_DONE;
+ return bp->b_error;
+}
+
xfs_buf_t *
xfs_buf_read_map(
struct xfs_buftarg *target,
@@ -762,26 +819,32 @@
flags |= XBF_READ;
bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (bp) {
- trace_xfs_buf_read(bp, flags, _RET_IP_);
+ if (!bp)
+ return NULL;
- if (!(bp->b_flags & XBF_DONE)) {
- XFS_STATS_INC(target->bt_mount, xb_get_read);
- bp->b_ops = ops;
- _xfs_buf_read(bp, flags);
- } else if (flags & XBF_ASYNC) {
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
- xfs_buf_relse(bp);
- return NULL;
- } else {
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- }
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!(bp->b_flags & XBF_DONE)) {
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ _xfs_buf_read(bp, flags);
+ return bp;
}
+ xfs_buf_reverify(bp, ops);
+
+ if (flags & XBF_ASYNC) {
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
return bp;
}
@@ -842,83 +905,6 @@
return 0;
}
-/*
- * Return a buffer allocated as an empty buffer and associated to external
- * memory via xfs_buf_associate_memory() back to it's empty state.
- */
-void
-xfs_buf_set_empty(
- struct xfs_buf *bp,
- size_t numblks)
-{
- if (bp->b_pages)
- _xfs_buf_free_pages(bp);
-
- bp->b_pages = NULL;
- bp->b_page_count = 0;
- bp->b_addr = NULL;
- bp->b_length = numblks;
- bp->b_io_length = numblks;
-
- ASSERT(bp->b_map_count == 1);
- bp->b_bn = XFS_BUF_DADDR_NULL;
- bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
- bp->b_maps[0].bm_len = bp->b_length;
-}
-
-static inline struct page *
-mem_to_page(
- void *addr)
-{
- if ((!is_vmalloc_addr(addr))) {
- return virt_to_page(addr);
- } else {
- return vmalloc_to_page(addr);
- }
-}
-
-int
-xfs_buf_associate_memory(
- xfs_buf_t *bp,
- void *mem,
- size_t len)
-{
- int rval;
- int i = 0;
- unsigned long pageaddr;
- unsigned long offset;
- size_t buflen;
- int page_count;
-
- pageaddr = (unsigned long)mem & PAGE_MASK;
- offset = (unsigned long)mem - pageaddr;
- buflen = PAGE_ALIGN(len + offset);
- page_count = buflen >> PAGE_SHIFT;
-
- /* Free any previous set of page pointers */
- if (bp->b_pages)
- _xfs_buf_free_pages(bp);
-
- bp->b_pages = NULL;
- bp->b_addr = mem;
-
- rval = _xfs_buf_get_pages(bp, page_count);
- if (rval)
- return rval;
-
- bp->b_offset = offset;
-
- for (i = 0; i < bp->b_page_count; i++) {
- bp->b_pages[i] = mem_to_page((void *)pageaddr);
- pageaddr += PAGE_SIZE;
- }
-
- bp->b_io_length = BTOBB(len);
- bp->b_length = BTOBB(buflen);
-
- return 0;
-}
-
xfs_buf_t *
xfs_buf_get_uncached(
struct xfs_buftarg *target,
@@ -1006,8 +992,18 @@
ASSERT(atomic_read(&bp->b_hold) > 0);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ /*
+ * We grab the b_lock here first to serialise racing xfs_buf_rele()
+ * calls. The pag_buf_lock being taken on the last reference only
+ * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+ * to last reference we drop here is not serialised against the last
+ * reference until we take bp->b_lock. Hence if we don't grab b_lock
+ * first, the last "release" reference can win the race to the lock and
+ * free the buffer before the second-to-last reference is processed,
+ * leading to a use-after-free scenario.
+ */
spin_lock(&bp->b_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1103,7 +1099,7 @@
trace_xfs_buf_lock(bp, _RET_IP_);
if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
- xfs_log_force(bp->b_target->bt_mount, 0);
+ xfs_log_force(bp->b_mount, 0);
down(&bp->b_sema);
trace_xfs_buf_lock_done(bp, _RET_IP_);
@@ -1192,7 +1188,7 @@
struct xfs_buf *bp)
{
INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
- queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
+ queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
}
void
@@ -1211,7 +1207,7 @@
struct xfs_buf *bp,
const char *func)
{
- xfs_alert(bp->b_target->bt_mount,
+ xfs_alert(bp->b_mount,
"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
-bp->b_error);
@@ -1230,10 +1226,8 @@
XBF_WRITE_FAIL | XBF_DONE);
error = xfs_buf_submit(bp);
- if (error) {
- xfs_force_shutdown(bp->b_target->bt_mount,
- SHUTDOWN_META_IO_ERROR);
- }
+ if (error)
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
return error;
}
@@ -1359,21 +1353,8 @@
*/
bp->b_error = 0;
- /*
- * Initialize the I/O completion workqueue if we haven't yet or the
- * submitter has not opted to specify a custom one.
- */
- if (!bp->b_ioend_wq)
- bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
-
if (bp->b_flags & XBF_WRITE) {
op = REQ_OP_WRITE;
- if (bp->b_flags & XBF_SYNCIO)
- op_flags = REQ_SYNC;
- if (bp->b_flags & XBF_FUA)
- op_flags |= REQ_FUA;
- if (bp->b_flags & XBF_FLUSH)
- op_flags |= REQ_PREFLUSH;
/*
* Run the write verifier callback function if it exists. If
@@ -1383,12 +1364,12 @@
if (bp->b_ops) {
bp->b_ops->verify_write(bp);
if (bp->b_error) {
- xfs_force_shutdown(bp->b_target->bt_mount,
+ xfs_force_shutdown(bp->b_mount,
SHUTDOWN_CORRUPT_INCORE);
return;
}
} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
/*
* non-crc filesystems don't attach verifiers during
@@ -1420,7 +1401,7 @@
* subsequent call.
*/
offset = bp->b_offset;
- size = BBTOB(bp->b_io_length);
+ size = BBTOB(bp->b_length);
blk_start_plug(&plug);
for (i = 0; i < bp->b_map_count; i++) {
xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
@@ -1466,12 +1447,11 @@
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
/* on shutdown we stale and complete the buffer immediately */
- if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
xfs_buf_ioerror(bp, -EIO);
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_ioend(bp);
+ xfs_buf_ioend(bp);
return -EIO;
}
@@ -1537,16 +1517,11 @@
return page_address(page) + (offset & (PAGE_SIZE-1));
}
-/*
- * Move data into or out of a buffer.
- */
void
-xfs_buf_iomove(
- xfs_buf_t *bp, /* buffer to process */
- size_t boff, /* starting buffer offset */
- size_t bsize, /* length to copy */
- void *data, /* data address */
- xfs_buf_rw_t mode) /* read/write/zero flag */
+xfs_buf_zero(
+ struct xfs_buf *bp,
+ size_t boff,
+ size_t bsize)
{
size_t bend;
@@ -1559,23 +1534,13 @@
page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
page = bp->b_pages[page_index];
csize = min_t(size_t, PAGE_SIZE - page_offset,
- BBTOB(bp->b_io_length) - boff);
+ BBTOB(bp->b_length) - boff);
ASSERT((csize + page_offset) <= PAGE_SIZE);
- switch (mode) {
- case XBRW_ZERO:
- memset(page_address(page) + page_offset, 0, csize);
- break;
- case XBRW_READ:
- memcpy(data, page_address(page) + page_offset, csize);
- break;
- case XBRW_WRITE:
- memcpy(page_address(page) + page_offset, data, csize);
- }
+ memset(page_address(page) + page_offset, 0, csize);
boff += csize;
- data += csize;
}
}
@@ -1787,7 +1752,7 @@
{
xfs_buftarg_t *btp;
- btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
+ btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
@@ -1926,7 +1891,6 @@
struct list_head *wait_list)
{
struct xfs_buf *bp, *n;
- LIST_HEAD (submit_list);
int pinned = 0;
struct blk_plug plug;
@@ -1989,6 +1953,13 @@
* is only safely useable for callers that can track I/O completion by higher
* level means, e.g. AIL pushing as the @buffer_list is consumed in this
* function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
*/
int
xfs_buf_delwri_submit_nowait(
@@ -2116,9 +2087,45 @@
* This allows userspace to disrupt buffer caching for debug/testing
* purposes.
*/
- if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
- XFS_ERRTAG_BUF_LRU_REF))
+ if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
lru_ref = 0;
atomic_set(&bp->b_lru_ref, lru_ref);
}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4e3171a..f6ce17d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -21,12 +21,6 @@
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-typedef enum {
- XBRW_READ = 1, /* transfer into target memory */
- XBRW_WRITE = 2, /* transfer from target memory */
- XBRW_ZERO = 3, /* Zero target memory */
-} xfs_buf_rw_t;
-
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
@@ -34,12 +28,7 @@
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */
-
-/* I/O hints for the BIO layer */
-#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA (1 << 11)/* force cache write through mode */
-#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
+#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
/* flags used only as arguments to access routines */
#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
@@ -49,7 +38,6 @@
#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
-#define _XBF_COMPOUND (1 << 23)/* compound buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -62,15 +50,11 @@
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
- { XBF_SYNCIO, "SYNCIO" }, \
- { XBF_FUA, "FUA" }, \
- { XBF_FLUSH, "FLUSH" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
{ XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }
+ { _XBF_DELWRI_Q, "DELWRI_Q" }
/*
@@ -125,6 +109,10 @@
struct xfs_buf_ops {
char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -157,13 +145,13 @@
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
+ struct xfs_mount *b_mount;
xfs_buftarg_t *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
struct work_struct b_ioend_work;
- struct workqueue_struct *b_ioend_wq; /* I/O completion wq */
xfs_buf_iodone_t b_iodone; /* I/O completion function */
struct completion b_iowait; /* queue for I/O waiters */
- void *b_log_item;
+ struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
@@ -171,7 +159,6 @@
struct xfs_buf_map *b_maps; /* compound buffer map */
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
- int b_io_length; /* IO size in BBs */
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
@@ -205,21 +192,6 @@
xfs_daddr_t blkno, size_t numblks,
xfs_buf_flags_t flags);
-struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags);
-
-static inline struct xfs_buf *
-xfs_buf_alloc(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
-{
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return _xfs_buf_alloc(target, &map, 1, flags);
-}
-
struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
xfs_buf_flags_t flags);
@@ -235,11 +207,10 @@
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+ size_t numblks)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_get_map(target, &map, 1, flags);
+ return xfs_buf_get_map(target, &map, 1, 0);
}
static inline struct xfs_buf *
@@ -265,9 +236,6 @@
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
-int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
-
struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
int flags);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
@@ -301,10 +269,7 @@
return __xfs_buf_submit(bp, wait);
}
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
- xfs_buf_rw_t);
-#define xfs_buf_zero(bp, off, len) \
- xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
@@ -385,4 +350,14 @@
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+static inline int
+xfs_buftarg_dma_alignment(struct xfs_buftarg *bt)
+{
+ return queue_dma_alignment(bt->bt_bdev->bd_disk->queue);
+}
+
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
+
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 12d8455..d74fbd1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -5,19 +5,17 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
-#include "xfs_inode.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -520,7 +518,7 @@
/* has a previous flush failed due to IO errors? */
if ((bp->b_flags & XBF_WRITE_FAIL) &&
___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
- xfs_warn(bp->b_target->bt_mount,
+ xfs_warn(bp->b_mount,
"Failing async write on buffer block 0x%llx. Retrying async write.",
(long long)bp->b_bn);
}
@@ -594,7 +592,7 @@
* free the item.
*/
STATIC void
-xfs_buf_item_unlock(
+xfs_buf_item_release(
struct xfs_log_item *lip)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
@@ -605,9 +603,11 @@
#if defined(DEBUG) || defined(XFS_WARN)
bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
+ bool aborted = test_bit(XFS_LI_ABORTED,
+ &lip->li_flags);
#endif
- trace_xfs_buf_item_unlock(bip);
+ trace_xfs_buf_item_release(bip);
/*
* The bli dirty state should match whether the blf has logged segments
@@ -633,10 +633,18 @@
released = xfs_buf_item_put(bip);
if (hold || (stale && !released))
return;
- ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags));
+ ASSERT(!stale || aborted);
xfs_buf_relse(bp);
}
+STATIC void
+xfs_buf_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
+{
+ return xfs_buf_item_release(lip);
+}
+
/*
* This is called to find out where the oldest active copy of the
* buf log item in the on disk log resides now that the last log
@@ -669,25 +677,15 @@
return lsn;
}
-STATIC void
-xfs_buf_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all buf log items.
- */
static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_size = xfs_buf_item_size,
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
- .iop_unlock = xfs_buf_item_unlock,
+ .iop_release = xfs_buf_item_release,
+ .iop_committing = xfs_buf_item_committing,
.iop_committed = xfs_buf_item_committed,
.iop_push = xfs_buf_item_push,
- .iop_committing = xfs_buf_item_committing
};
STATIC int
@@ -704,7 +702,7 @@
}
bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
- KM_SLEEP);
+ 0);
if (!bip->bli_formats)
return -ENOMEM;
return 0;
@@ -741,7 +739,7 @@
* this buffer. If we do already have one, there is
* nothing to do here so return.
*/
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
if (bip) {
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
ASSERT(!bp->b_transp);
@@ -749,7 +747,7 @@
return 0;
}
- bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
+ bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
@@ -978,9 +976,9 @@
*/
void
xfs_buf_attach_iodone(
- xfs_buf_t *bp,
- void (*cb)(xfs_buf_t *, xfs_log_item_t *),
- xfs_log_item_t *lip)
+ struct xfs_buf *bp,
+ void (*cb)(struct xfs_buf *, struct xfs_log_item *),
+ struct xfs_log_item *lip)
{
ASSERT(xfs_buf_islocked(bp));
@@ -1233,9 +1231,23 @@
}
/*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
*
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
*/
bool
xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1255,16 @@
struct list_head *buffer_list)
{
struct xfs_log_item *lip;
+ bool ret;
+
+ ret = xfs_buf_delwri_queue(bp, buffer_list);
/*
- * Clear XFS_LI_FAILED flag from all items before resubmit
- *
- * XFS_LI_FAILED set/clear is protected by ail_lock, caller this
+ * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
* function already have it acquired
*/
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
xfs_clear_li_failed(lip);
- /* Add this buffer back to the delayed write list */
- return xfs_buf_delwri_queue(bp, buffer_list);
+ return ret;
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 90f65f8..4a054b1 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -39,7 +39,7 @@
* locked, and which 128 byte chunks of the buffer are dirty.
*/
struct xfs_buf_log_item {
- xfs_log_item_t bli_item; /* common item structure */
+ struct xfs_log_item bli_item; /* common item structure */
struct xfs_buf *bli_buf; /* real buffer pointer */
unsigned int bli_flags; /* misc flags */
unsigned int bli_recur; /* lock recursion count */
@@ -55,8 +55,8 @@
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
- void(*)(struct xfs_buf *, xfs_log_item_t *),
- xfs_log_item_t *);
+ void(*)(struct xfs_buf *, struct xfs_log_item *),
+ struct xfs_log_item *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 5142e64..283df89 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -6,17 +6,14 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07ed..8ec7aab 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -4,19 +4,17 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
-#include "xfs_discard.h"
#include "xfs_trace.h"
#include "xfs_log.h"
@@ -161,9 +159,19 @@
return -EPERM;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+
+ /*
+ * We haven't recovered the log, so we cannot use our bnobt-guided
+ * storage zapping commands.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return -EROFS;
+
if (copy_from_user(&range, urange, sizeof(range)))
return -EFAULT;
+ range.minlen = max_t(u64, granularity, range.minlen);
+ minlen = BTOBB(range.minlen);
/*
* Truncating down the len isn't actually quite correct, but using
* BBTOB would mean we trivially get overflows for values
@@ -178,7 +186,6 @@
start = BTOBB(range.start);
end = start + BTOBBT(range.len) - 1;
- minlen = BTOBB(max_t(u64, granularity, range.minlen));
if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 87e6dd5..aeb95e7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -14,16 +14,12 @@
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_alloc.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
@@ -277,7 +273,8 @@
/*
* Ensure that the given in-core dquot has a buffer on disk backing it, and
- * return the buffer. This is called when the bmapi finds a hole.
+ * return the buffer locked and held. This is called when the bmapi finds a
+ * hole.
*/
STATIC int
xfs_dquot_disk_alloc(
@@ -355,13 +352,14 @@
* If everything succeeds, the caller of this function is returned a
* buffer that is locked and held to the transaction. The caller
* is responsible for unlocking any buffer passed back, either
- * manually or by committing the transaction.
+ * manually or by committing the transaction. On error, the buffer is
+ * released and not passed back.
*/
xfs_trans_bhold(tp, bp);
error = xfs_defer_finish(tpp);
- tp = *tpp;
if (error) {
- xfs_buf_relse(bp);
+ xfs_trans_bhold_release(*tpp, bp);
+ xfs_trans_brelse(*tpp, bp);
return error;
}
*bpp = bp;
@@ -442,7 +440,7 @@
{
struct xfs_dquot *dqp;
- dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
+ dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
dqp->dq_flags = type;
dqp->q_core.d_id = cpu_to_be32(id);
@@ -521,7 +519,6 @@
struct xfs_buf **bpp)
{
struct xfs_trans *tp;
- struct xfs_buf *bp;
int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
@@ -529,7 +526,7 @@
if (error)
goto err;
- error = xfs_dquot_disk_alloc(&tp, dqp, &bp);
+ error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
if (error)
goto err_cancel;
@@ -539,10 +536,10 @@
* Buffer was held to the transaction, so we have to unlock it
* manually here because we're not passing it back.
*/
- xfs_buf_relse(bp);
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
goto err;
}
- *bpp = bp;
return 0;
err_cancel:
@@ -1242,7 +1239,7 @@
/*
* Iterate every dquot of a particular type. The caller must ensure that the
* particular quota type is active. iter_fn can return negative error codes,
- * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating.
+ * or -ECANCELED to indicate that it wants to stop iterating.
*/
int
xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 64bd864..4fe8570 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -34,7 +34,6 @@
uint dq_flags; /* various flags (XFS_DQ_*) */
struct list_head q_lru; /* global free list of dquots */
struct xfs_mount*q_mount; /* filesystem this relates to */
- struct xfs_trans*q_transp; /* trans this belongs to currently */
uint q_nrefs; /* # active refs from inodes */
xfs_daddr_t q_blkno; /* blkno of dquot buffer */
int q_bufoffset; /* off of dq in buffer (# dquots) */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 7dedd17..d60647d 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -5,13 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
@@ -94,18 +94,6 @@
wake_up(&dqp->q_pinwait);
}
-STATIC xfs_lsn_t
-xfs_qm_dquot_logitem_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- /*
- * We always re-log the entire dquot when it becomes dirty,
- * so, the latest copy _is_ the only one that matters.
- */
- return lsn;
-}
-
/*
* This is called to wait for the given dquot to be unpinned.
* Most of these pin/unpin routines are plagiarized from inode code.
@@ -209,14 +197,8 @@
return rval;
}
-/*
- * Unlock the dquot associated with the log item.
- * Clear the fields of the dquot and dquot log item that
- * are specific to the current transaction. If the
- * hold flags is set, do not unlock the dquot.
- */
STATIC void
-xfs_qm_dquot_logitem_unlock(
+xfs_qm_dquot_logitem_release(
struct xfs_log_item *lip)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
@@ -224,11 +206,6 @@
ASSERT(XFS_DQ_IS_LOCKED(dqp));
/*
- * Clear the transaction pointer in the dquot
- */
- dqp->q_transp = NULL;
-
- /*
* dquots are never 'held' from getting unlocked at the end of
* a transaction. Their locking and unlocking is hidden inside the
* transaction layer, within trans_commit. Hence, no LI_HOLD flag
@@ -237,30 +214,22 @@
xfs_dqunlock(dqp);
}
-/*
- * this needs to stamp an lsn into the dquot, I think.
- * rpc's that look at user dquot's would then have to
- * push on the dependency recorded in the dquot
- */
STATIC void
xfs_qm_dquot_logitem_committing(
struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+ xfs_lsn_t commit_lsn)
{
+ return xfs_qm_dquot_logitem_release(lip);
}
-/*
- * This is the ops vector for dquots
- */
static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
.iop_unpin = xfs_qm_dquot_logitem_unpin,
- .iop_unlock = xfs_qm_dquot_logitem_unlock,
- .iop_committed = xfs_qm_dquot_logitem_committed,
+ .iop_release = xfs_qm_dquot_logitem_release,
+ .iop_committing = xfs_qm_dquot_logitem_committing,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_committing = xfs_qm_dquot_logitem_committing,
.iop_error = xfs_dquot_item_error
};
@@ -320,26 +289,6 @@
}
/*
- * Pinning has no meaning for an quotaoff item, so just return.
- */
-STATIC void
-xfs_qm_qoff_logitem_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an quotaoff item, unpinning does
- * not either.
- */
-STATIC void
-xfs_qm_qoff_logitem_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-/*
* There isn't much you can do to push a quotaoff item. It is simply
* stuck waiting for the log to be flushed to disk.
*/
@@ -351,28 +300,6 @@
return XFS_ITEM_LOCKED;
}
-/*
- * Quotaoff items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-STATIC void
-xfs_qm_qoff_logitem_unlock(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * The quotaoff-start-item is logged only once and cannot be moved in the log,
- * so simply return the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
STATIC xfs_lsn_t
xfs_qm_qoffend_logitem_committed(
struct xfs_log_item *lip,
@@ -396,50 +323,17 @@
return (xfs_lsn_t)-1;
}
-/*
- * XXX rcc - don't know quite what to do with this. I think we can
- * just ignore it. The only time that isn't the case is if we allow
- * the client to somehow see that quotas have been turned off in which
- * we can't allow that to get back until the quotaoff hits the disk.
- * So how would that happen? Also, do we need different routines for
- * quotaoff start and quotaoff end? I suspect the answer is yes but
- * to be sure, I need to look at the recovery code and see how quota off
- * recovery is handled (do we roll forward or back or do something else).
- * If we roll forwards or backwards, then we need two separate routines,
- * one that does nothing and one that stamps in the lsn that matters
- * (truly makes the quotaoff irrevocable). If we do something else,
- * then maybe we don't need two.
- */
-STATIC void
-xfs_qm_qoff_logitem_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
-{
-}
-
static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
- .iop_pin = xfs_qm_qoff_logitem_pin,
- .iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_unlock = xfs_qm_qoff_logitem_unlock,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
- .iop_committing = xfs_qm_qoff_logitem_committing
};
-/*
- * This is the ops vector shared by all quotaoff-start log items.
- */
static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
- .iop_pin = xfs_qm_qoff_logitem_pin,
- .iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_unlock = xfs_qm_qoff_logitem_unlock,
- .iop_committed = xfs_qm_qoff_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
- .iop_committing = xfs_qm_qoff_logitem_committing
};
/*
@@ -453,7 +347,7 @@
{
struct xfs_qoff_logitem *qf;
- qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+ qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0);
xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index db9df71..1aed34c 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -12,13 +12,13 @@
struct xfs_qoff_logitem;
typedef struct xfs_dq_logitem {
- xfs_log_item_t qli_item; /* common portion */
+ struct xfs_log_item qli_item; /* common portion */
struct xfs_dquot *qli_dquot; /* dquot ptr */
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
} xfs_dq_logitem_t;
typedef struct xfs_qoff_logitem {
- xfs_log_item_t qql_item; /* common portion */
+ struct xfs_log_item qql_item; /* common portion */
struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
unsigned int qql_flags;
} xfs_qoff_logitem_t;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f54..849fd44 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_fs.h"
#include "xfs_log_format.h"
@@ -51,6 +52,7 @@
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
@@ -159,6 +161,7 @@
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +198,7 @@
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
@@ -209,7 +213,7 @@
struct xfs_mount *mp)
{
mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
- KM_SLEEP | KM_MAYFAIL);
+ KM_MAYFAIL);
if (!mp->m_errortag)
return -ENOMEM;
@@ -350,14 +354,15 @@
size_t bufsz,
xfs_failaddr_t failaddr)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
int sz;
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
fa, bp->b_ops->name, bp->b_bn, name);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e9..602aa7d 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+#define XFS_PTAG_VERIFIER_ERROR 0x00000100
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f2284ce..f1372f9 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -4,18 +4,16 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_export.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_pnfs.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 0ed6837..2183d87 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -33,7 +33,7 @@
struct rb_node **rbp;
struct rb_node *parent = NULL;
- new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
+ new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
new->agno = agno;
new->bno = bno;
new->length = len;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d9da66c..e44efc4 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -9,14 +9,18 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_shared.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_extfree_item.h"
#include "xfs_log.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trace.h"
kmem_zone_t *xfs_efi_zone;
@@ -107,15 +111,6 @@
/*
- * Pinning has no meaning for an efi item, so just return.
- */
-STATIC void
-xfs_efi_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an EFI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the EFI transaction has been successfully committed to make it
@@ -133,71 +128,22 @@
}
/*
- * Efi items have no locking or pushing. However, since EFIs are pulled from
- * the AIL when their corresponding EFDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the EFI out of
- * the AIL.
- */
-STATIC uint
-xfs_efi_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The EFI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an EFD isn't going to be
* constructed and thus we free the EFI here directly.
*/
STATIC void
-xfs_efi_item_unlock(
+xfs_efi_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_efi_release(EFI_ITEM(lip));
+ xfs_efi_release(EFI_ITEM(lip));
}
-/*
- * The EFI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_efi_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The EFI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_efi_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all efi log items.
- */
static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
- .iop_pin = xfs_efi_item_pin,
.iop_unpin = xfs_efi_item_unpin,
- .iop_unlock = xfs_efi_item_unlock,
- .iop_committed = xfs_efi_item_committed,
- .iop_push = xfs_efi_item_push,
- .iop_committing = xfs_efi_item_committing
+ .iop_release = xfs_efi_item_release,
};
@@ -217,9 +163,9 @@
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
size = (uint)(sizeof(xfs_efi_log_item_t) +
((nextents - 1) * sizeof(xfs_extent_t)));
- efip = kmem_zalloc(size, KM_SLEEP);
+ efip = kmem_zalloc(size, 0);
} else {
- efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP);
+ efip = kmem_zone_zalloc(xfs_efi_zone, 0);
}
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -349,137 +295,299 @@
}
/*
- * Pinning has no meaning for an efd item, so just return.
- */
-STATIC void
-xfs_efd_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an efd item, unpinning does
- * not either.
- */
-STATIC void
-xfs_efd_item_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an efd item. It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_efd_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The EFD is either committed or aborted if the transaction is cancelled. If
* the transaction is cancelled, drop our reference to the EFI and free the EFD.
*/
STATIC void
-xfs_efd_item_unlock(
+xfs_efd_item_release(
struct xfs_log_item *lip)
{
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_efi_release(efdp->efd_efip);
- xfs_efd_item_free(efdp);
- }
-}
-
-/*
- * When the efd item is committed to disk, all we need to do is delete our
- * reference to our partner efi item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from further
- * referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_efd_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
-
- /*
- * Drop the EFI reference regardless of whether the EFD has been
- * aborted. Once the EFD transaction is constructed, it is the sole
- * responsibility of the EFD to release the EFI (even if the EFI is
- * aborted due to log I/O error).
- */
xfs_efi_release(efdp->efd_efip);
xfs_efd_item_free(efdp);
-
- return (xfs_lsn_t)-1;
}
-/*
- * The EFD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_efd_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all efd log items.
- */
static const struct xfs_item_ops xfs_efd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
- .iop_pin = xfs_efd_item_pin,
- .iop_unpin = xfs_efd_item_unpin,
- .iop_unlock = xfs_efd_item_unlock,
- .iop_committed = xfs_efd_item_committed,
- .iop_push = xfs_efd_item_push,
- .iop_committing = xfs_efd_item_committing
+ .iop_release = xfs_efd_item_release,
};
/*
- * Allocate and initialize an efd item with the given number of extents.
+ * Allocate an "extent free done" log item that will hold nextents worth of
+ * extents. The caller must use all nextents extents, because we are not
+ * flexible about this at all.
*/
-struct xfs_efd_log_item *
-xfs_efd_init(
- struct xfs_mount *mp,
- struct xfs_efi_log_item *efip,
- uint nextents)
-
+static struct xfs_efd_log_item *
+xfs_trans_get_efd(
+ struct xfs_trans *tp,
+ struct xfs_efi_log_item *efip,
+ unsigned int nextents)
{
- struct xfs_efd_log_item *efdp;
- uint size;
+ struct xfs_efd_log_item *efdp;
ASSERT(nextents > 0);
+
if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(xfs_efd_log_item_t) +
- ((nextents - 1) * sizeof(xfs_extent_t)));
- efdp = kmem_zalloc(size, KM_SLEEP);
+ efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
+ (nextents - 1) * sizeof(struct xfs_extent),
+ 0);
} else {
- efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+ efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
}
- xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+ &xfs_efd_item_ops);
efdp->efd_efip = efip;
efdp->efd_format.efd_nextents = nextents;
efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+ xfs_trans_add_item(tp, &efdp->efd_item);
return efdp;
}
/*
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
+ */
+static int
+xfs_trans_free_extent(
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
+ start_block);
+ int error;
+
+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+
+ error = __xfs_free_extent(tp, start_block, ext_len,
+ oinfo, XFS_AG_RESV_NONE, skip_discard);
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = start_block;
+ extp->ext_len = ext_len;
+ efdp->efd_next_extent++;
+
+ return error;
+}
+
+/* Sort bmap items by AG. */
+static int
+xfs_extent_free_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_extent_free_item *ra;
+ struct xfs_extent_free_item *rb;
+
+ ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+ rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+ return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+}
+
+/* Get an EFI. */
+STATIC void *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_efi_log_item *efip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ ASSERT(efip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &efip->efi_item);
+ return efip;
+}
+
+/* Log a free extent to the intent item. */
+STATIC void
+xfs_extent_free_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_efi_log_item *efip = intent;
+ struct xfs_extent_free_item *free;
+ uint next_extent;
+ struct xfs_extent *extp;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+ ASSERT(next_extent < efip->efi_format.efi_nextents);
+ extp = &efip->efi_format.efi_extents[next_extent];
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+}
+
+/* Get an EFD so we can process all the free extents. */
+STATIC void *
+xfs_extent_free_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_efd(tp, intent, count);
+}
+
+/* Process a free extent. */
+STATIC int
+xfs_extent_free_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_extent_free_item *free;
+ int error;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ error = xfs_trans_free_extent(tp, done_item,
+ free->xefi_startblock,
+ free->xefi_blockcount,
+ &free->xefi_oinfo, free->xefi_skip_discard);
+ kmem_free(free);
+ return error;
+}
+
+/* Abort all pending EFIs. */
+STATIC void
+xfs_extent_free_abort_intent(
+ void *intent)
+{
+ xfs_efi_release(intent);
+}
+
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_extent_free_item *free;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ kmem_free(free);
+}
+
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
+/*
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
+ */
+STATIC int
+xfs_agfl_free_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efd_log_item *efdp = done_item;
+ struct xfs_extent_free_item *free;
+ struct xfs_extent *extp;
+ struct xfs_buf *agbp;
+ int error;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ uint next_extent;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(free->xefi_blockcount == 1);
+ agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+
+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ if (!error)
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp,
+ &free->xefi_oinfo);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+ efdp->efd_next_extent++;
+
+ kmem_free(free);
+ return error;
+}
+
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
+/*
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
@@ -494,7 +602,6 @@
int error = 0;
xfs_extent_t *extp;
xfs_fsblock_t startblock_fsb;
- struct xfs_owner_info oinfo;
ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
@@ -526,11 +633,11 @@
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
- xfs_rmap_any_owner_update(&oinfo);
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &efip->efi_format.efi_extents[i];
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len, &oinfo, false);
+ extp->ext_len,
+ &XFS_RMAP_OINFO_ANY_OWNER, false);
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2a6a895..16aaab0 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -51,7 +51,7 @@
* AIL, so at this point both the EFI and EFD are freed.
*/
typedef struct xfs_efi_log_item {
- xfs_log_item_t efi_item;
+ struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
unsigned long efi_flags; /* misc flags */
@@ -64,7 +64,7 @@
* have been freed.
*/
typedef struct xfs_efd_log_item {
- xfs_log_item_t efd_item;
+ struct xfs_log_item efd_item;
xfs_efi_log_item_t *efd_efip;
uint efd_next_extent;
xfs_efd_log_format_t efd_format;
@@ -79,8 +79,6 @@
extern struct kmem_zone *xfs_efd_zone;
xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
-xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
- uint);
int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 61a5ad2..1ffb179 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -10,14 +10,11 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
@@ -28,11 +25,10 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
-#include <linux/dcache.h>
#include <linux/falloc.h>
-#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
+#include <linux/fadvise.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -367,40 +363,30 @@
* lock above. Eventually we should look into a way to avoid
* the pointless lock roundtrip.
*/
- if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
- error = file_update_time(file);
- if (error)
- return error;
- }
-
- /*
- * If we're writing the file then make sure to clear the setuid and
- * setgid bits if the process is not being run by root. This keeps
- * people from modifying setuid and setgid binaries.
- */
- if (!IS_NOSEC(inode))
- return file_remove_privs(file);
- return 0;
+ return file_modified(file);
}
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
ssize_t size,
+ int error,
unsigned flags)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
loff_t offset = iocb->ki_pos;
- int error = 0;
+ unsigned int nofs_flag;
trace_xfs_end_io_direct_write(ip, offset, size);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (size <= 0)
- return size;
+ if (error)
+ return error;
+ if (!size)
+ return 0;
/*
* Capture amount written on completion as we can't reliably account
@@ -408,10 +394,17 @@
*/
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
if (flags & IOMAP_DIO_COW) {
error = xfs_reflink_end_cow(ip, offset, size);
if (error)
- return error;
+ goto out;
}
/*
@@ -420,8 +413,10 @@
* earlier allows a racing dio read to find unwritten extents before
* they are converted.
*/
- if (flags & IOMAP_DIO_UNWRITTEN)
- return xfs_iomap_write_unwritten(ip, offset, size, true);
+ if (flags & IOMAP_DIO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, offset, size, true);
+ goto out;
+ }
/*
* We need to update the in-core inode size here so that we don't end up
@@ -443,9 +438,15 @@
spin_unlock(&ip->i_flags_lock);
}
+out:
+ memalloc_nofs_restore(nofs_flag);
return error;
}
+static const struct iomap_dio_ops xfs_dio_write_ops = {
+ .end_io = xfs_dio_write_end_io,
+};
+
/*
* xfs_file_dio_aio_write - handle direct IO writes
*
@@ -507,7 +508,7 @@
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
@@ -517,6 +518,9 @@
}
if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* unaligned dio always waits, bail */
+ if (unaligned_io)
+ return -EAGAIN;
if (!xfs_ilock_nowait(ip, iolock))
return -EAGAIN;
} else {
@@ -529,25 +533,29 @@
count = iov_iter_count(from);
/*
- * If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to take the exclusive lock
- * for other reasons in xfs_file_aio_write_checks.
+ * If we are doing unaligned IO, we can't allow any other overlapping IO
+ * in-flight at the same time or we risk data corruption. Wait for all
+ * other IO to drain before we submit. If the IO is aligned, demote the
+ * iolock if we had to take the exclusive lock in
+ * xfs_file_aio_write_checks() for other reasons.
*/
if (unaligned_io) {
- /* If we are going to wait for other DIO to finish, bail */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (atomic_read(&inode->i_dio_count))
- return -EAGAIN;
- } else {
- inode_dio_wait(inode);
- }
+ inode_dio_wait(inode);
} else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
- ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+ ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
+
+ /*
+ * If unaligned, this is the only IO in-flight. If it has not yet
+ * completed, wait on it before we release the iolock to prevent
+ * subsequent overlapping IO.
+ */
+ if (ret == -EIOCBQUEUED && unaligned_io)
+ inode_dio_wait(inode);
out:
xfs_iunlock(ip, iolock);
@@ -872,14 +880,27 @@
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
}
+ } else {
+ /*
+ * If always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
@@ -920,27 +941,90 @@
}
STATIC int
-xfs_file_clone_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
+xfs_file_fadvise(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int advice)
{
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, false);
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int ret;
+ int lockflags = 0;
+
+ /*
+ * Operations creating pages in page cache need protection from hole
+ * punching and similar ops
+ */
+ if (advice == POSIX_FADV_WILLNEED) {
+ lockflags = XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lockflags);
+ }
+ ret = generic_fadvise(file, start, end, advice);
+ if (lockflags)
+ xfs_iunlock(ip, lockflags);
+ return ret;
}
-STATIC int
-xfs_file_dedupe_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len)
+STATIC loff_t
+xfs_file_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ loff_t len,
+ unsigned int remap_flags)
{
- return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
- len, true);
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ loff_t remapped = 0;
+ xfs_extlen_t cowextsize;
+ int ret;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
+ ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
+ &remapped);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Carry the cowextsize hint from src to dest if we're sharing the
+ * entire source file to the entire destination file, the source file
+ * has a cowextsize hint, and the destination file does not.
+ */
+ cowextsize = 0;
+ if (pos_in == 0 && len == i_size_read(inode_in) &&
+ (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ pos_out == 0 && len >= i_size_read(inode_out) &&
+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_d.di_cowextsize;
+
+ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+ remap_flags);
+
+out_unlock:
+ xfs_reflink_remap_unlock(file_in, file_out);
+ if (ret)
+ trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
+ return remapped > 0 ? remapped : ret;
}
STATIC int
@@ -1029,10 +1113,10 @@
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
@@ -1144,11 +1228,14 @@
struct file *filp,
struct vm_area_struct *vma)
{
+ struct dax_device *dax_dev;
+
+ dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
/*
- * We don't support synchronous mappings for non-DAX files. At least
- * until someone comes with a sensible use case.
+ * We don't support synchronous mappings for non-DAX files and
+ * for DAX files if underneath dax_device is not synchronous.
*/
- if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+ if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
file_accessed(filp);
@@ -1164,6 +1251,7 @@
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
@@ -1175,8 +1263,8 @@
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
- .clone_file_range = xfs_file_clone_range,
- .dedupe_file_range = xfs_file_dedupe_range,
+ .fadvise = xfs_file_fadvise,
+ .remap_file_range = xfs_file_remap_range,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 1825013..574a7a8 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -5,22 +5,19 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
-#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_ag_resv.h"
#include "xfs_trans.h"
-#include "xfs_shared.h"
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 3d76a9e..d082143 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -9,16 +9,12 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_error.h"
#include "xfs_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
-#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
@@ -254,7 +250,7 @@
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
}
/* Are we just counting mappings? */
@@ -263,14 +259,14 @@
info->head->fmh_entries++;
if (info->last)
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
info->head->fmh_entries++;
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
}
/*
@@ -280,7 +276,7 @@
*/
if (rec_daddr > info->next_daddr) {
if (info->head->fmh_entries >= info->head->fmh_count)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
fmr.fmr_device = info->dev;
fmr.fmr_physical = info->next_daddr;
@@ -299,7 +295,7 @@
/* Fill out the extent we found */
if (info->head->fmh_entries >= info->head->fmh_count)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return -ECANCELED;
trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
@@ -332,7 +328,7 @@
rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
if (info->next_daddr < rec_daddr)
info->next_daddr = rec_daddr;
- return XFS_BTREE_QUERY_RANGE_CONTINUE;
+ return 0;
}
/* Transform a rmapbt irec into a fsmap */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7c00b8b..3e61d0c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -11,15 +11,11 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_error.h"
-#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
-#include "xfs_rtalloc.h"
-#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
@@ -40,7 +36,6 @@
xfs_rfsblock_t new;
xfs_agnumber_t oagcount;
xfs_trans_t *tp;
- LIST_HEAD (buffer_list);
struct aghdr_init_data id = {};
nb = in->newblocks;
@@ -252,9 +247,9 @@
if (mp->m_sb.sb_imax_pct) {
uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
do_div(icount, 100);
- mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+ M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
} else
- mp->m_maxicount = 0;
+ M_IGEO(mp)->maxicount = 0;
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
@@ -290,7 +285,7 @@
* exported through ioctl XFS_IOC_FSCOUNTS
*/
-int
+void
xfs_fs_counts(
xfs_mount_t *mp,
xfs_fsop_counts_t *cnt)
@@ -303,7 +298,6 @@
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
spin_unlock(&mp->m_sb_lock);
- return 0;
}
/*
@@ -470,20 +464,13 @@
*/
void
xfs_do_force_shutdown(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
int flags,
char *fname,
int lnnum)
{
- int logerror;
+ bool logerror = flags & SHUTDOWN_LOG_IO_ERROR;
- logerror = flags & SHUTDOWN_LOG_IO_ERROR;
-
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_notice(mp,
- "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
- __func__, flags, lnnum, fname, __return_address);
- }
/*
* No need to duplicate efforts.
*/
@@ -499,27 +486,34 @@
if (xfs_log_force_umount(mp, logerror))
return;
+ if (flags & SHUTDOWN_FORCE_UMOUNT) {
+ xfs_alert(mp,
+"User initiated shutdown received. Shutting down filesystem");
+ return;
+ }
+
+ xfs_notice(mp,
+"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
+ __func__, flags, lnnum, fname, __return_address);
+
if (flags & SHUTDOWN_CORRUPT_INCORE) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
- "Corruption of in-memory data detected. Shutting down filesystem");
+"Corruption of in-memory data detected. Shutting down filesystem");
if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
xfs_stack_trace();
- } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
- }
+ } else if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
}
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_alert(mp,
- "Please umount the filesystem and rectify the problem(s)");
- }
+
+ xfs_alert(mp,
+ "Please unmount the filesystem and rectify the problem(s)");
}
/*
@@ -534,6 +528,7 @@
int error = 0;
int err2;
+ mp->m_finobt_nores = false;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
pag = xfs_perag_get(mp, agno);
err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index d023db0..92869f6 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -8,7 +8,7 @@
extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
-extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
+extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84..fa55ab8 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include "xfs_sysctl.h"
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
@@ -16,7 +15,7 @@
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 255 },
+ .panic_mask = { 0, 0, 256 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
@@ -41,4 +40,7 @@
#else
.bug_on_assert = false, /* assert failures WARN() */
#endif
+#ifdef DEBUG
+ .pwork_threads = -1, /* automatic thread detection */
+#endif
};
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
new file mode 100644
index 0000000..8e0cb05
--- /dev/null
+++ b/fs/xfs/xfs_health.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_health.h"
+
+/*
+ * Warn about metadata corruption that we detected but haven't fixed, and
+ * make sure we're not sitting on anything that would get in the way of
+ * recovery.
+ */
+void
+xfs_health_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ unsigned int sick = 0;
+ unsigned int checked = 0;
+ bool warn = false;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return;
+
+ /* Measure AG corruption levels. */
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ if (sick) {
+ trace_xfs_ag_unfixed_corruption(mp, agno, sick);
+ warn = true;
+ }
+ xfs_perag_put(pag);
+ }
+
+ /* Measure realtime volume corruption levels. */
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ if (sick) {
+ trace_xfs_rt_unfixed_corruption(mp, sick);
+ warn = true;
+ }
+
+ /*
+ * Measure fs corruption and keep the sample around for the warning.
+ * See the note below for why we exempt FS_COUNTERS.
+ */
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & ~XFS_SICK_FS_COUNTERS) {
+ trace_xfs_fs_unfixed_corruption(mp, sick);
+ warn = true;
+ }
+
+ if (warn) {
+ xfs_warn(mp,
+"Uncorrected metadata errors detected; please run xfs_repair.");
+
+ /*
+ * We discovered uncorrected metadata problems at some point
+ * during this filesystem mount and have advised the
+ * administrator to run repair once the unmount completes.
+ *
+ * However, we must be careful -- when FSCOUNTERS are flagged
+ * unhealthy, the unmount procedure omits writing the clean
+ * unmount record to the log so that the next mount will run
+ * recovery and recompute the summary counters. In other
+ * words, we leave a dirty log to get the counters fixed.
+ *
+ * Unfortunately, xfs_repair cannot recover dirty logs, so if
+ * there were filesystem problems, FSCOUNTERS was flagged, and
+ * the administrator takes our advice to run xfs_repair,
+ * they'll have to zap the log before repairing structures.
+ * We don't really want to encourage this, so we mark the
+ * FSCOUNTERS healthy so that a subsequent repair run won't see
+ * a dirty log.
+ */
+ if (sick & XFS_SICK_FS_COUNTERS)
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ }
+}
+
+/* Mark unhealthy per-fs metadata. */
+void
+xfs_fs_mark_sick(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ trace_xfs_fs_mark_sick(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick |= mask;
+ mp->m_fs_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark a per-fs metadata healed. */
+void
+xfs_fs_mark_healthy(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ trace_xfs_fs_mark_healthy(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick &= ~mask;
+ mp->m_fs_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which per-fs metadata are unhealthy. */
+void
+xfs_fs_measure_sickness(
+ struct xfs_mount *mp,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&mp->m_sb_lock);
+ *sick = mp->m_fs_sick;
+ *checked = mp->m_fs_checked;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy realtime metadata. */
+void
+xfs_rt_mark_sick(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ trace_xfs_rt_mark_sick(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick |= mask;
+ mp->m_rt_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark a realtime metadata healed. */
+void
+xfs_rt_mark_healthy(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ trace_xfs_rt_mark_healthy(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick &= ~mask;
+ mp->m_rt_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which realtime metadata are unhealthy. */
+void
+xfs_rt_measure_sickness(
+ struct xfs_mount *mp,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&mp->m_sb_lock);
+ *sick = mp->m_rt_sick;
+ *checked = mp->m_rt_checked;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy per-ag metadata. */
+void
+xfs_ag_mark_sick(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick |= mask;
+ pag->pag_checked |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata ok. */
+void
+xfs_ag_mark_healthy(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick &= ~mask;
+ pag->pag_checked |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Sample which per-ag metadata are unhealthy. */
+void
+xfs_ag_measure_sickness(
+ struct xfs_perag *pag,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&pag->pag_state_lock);
+ *sick = pag->pag_sick;
+ *checked = pag->pag_checked;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark the unhealthy parts of an inode. */
+void
+xfs_inode_mark_sick(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ trace_xfs_inode_mark_sick(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick |= mask;
+ ip->i_checked |= mask;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mark parts of an inode healed. */
+void
+xfs_inode_mark_healthy(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ trace_xfs_inode_mark_healthy(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick &= ~mask;
+ ip->i_checked |= mask;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Sample which parts of an inode are unhealthy. */
+void
+xfs_inode_measure_sickness(
+ struct xfs_inode *ip,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&ip->i_flags_lock);
+ *sick = ip->i_sick;
+ *checked = ip->i_checked;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mappings between internal sick masks and ioctl sick masks. */
+
+struct ioctl_sick_map {
+ unsigned int sick_mask;
+ unsigned int ioctl_mask;
+};
+
+static const struct ioctl_sick_map fs_map[] = {
+ { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS},
+ { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
+ { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA },
+ { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
+ { 0, 0 },
+};
+
+static const struct ioctl_sick_map rt_map[] = {
+ { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP },
+ { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY },
+ { 0, 0 },
+};
+
+static inline void
+xfgeo_health_tick(
+ struct xfs_fsop_geom *geo,
+ unsigned int sick,
+ unsigned int checked,
+ const struct ioctl_sick_map *m)
+{
+ if (checked & m->sick_mask)
+ geo->checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ geo->sick |= m->ioctl_mask;
+}
+
+/* Fill out fs geometry health info. */
+void
+xfs_fsop_geom_health(
+ struct xfs_mount *mp,
+ struct xfs_fsop_geom *geo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ geo->sick = 0;
+ geo->checked = 0;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ for (m = fs_map; m->sick_mask; m++)
+ xfgeo_health_tick(geo, sick, checked, m);
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ for (m = rt_map; m->sick_mask; m++)
+ xfgeo_health_tick(geo, sick, checked, m);
+}
+
+static const struct ioctl_sick_map ag_map[] = {
+ { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB },
+ { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF },
+ { XFS_SICK_AG_AGFL, XFS_AG_GEOM_SICK_AGFL },
+ { XFS_SICK_AG_AGI, XFS_AG_GEOM_SICK_AGI },
+ { XFS_SICK_AG_BNOBT, XFS_AG_GEOM_SICK_BNOBT },
+ { XFS_SICK_AG_CNTBT, XFS_AG_GEOM_SICK_CNTBT },
+ { XFS_SICK_AG_INOBT, XFS_AG_GEOM_SICK_INOBT },
+ { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT },
+ { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
+ { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
+ { 0, 0 },
+};
+
+/* Fill out ag geometry health info. */
+void
+xfs_ag_geom_health(
+ struct xfs_perag *pag,
+ struct xfs_ag_geometry *ageo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ ageo->ag_sick = 0;
+ ageo->ag_checked = 0;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ for (m = ag_map; m->sick_mask; m++) {
+ if (checked & m->sick_mask)
+ ageo->ag_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ ageo->ag_sick |= m->ioctl_mask;
+ }
+}
+
+static const struct ioctl_sick_map ino_map[] = {
+ { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE },
+ { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD },
+ { XFS_SICK_INO_BMBTA, XFS_BS_SICK_BMBTA },
+ { XFS_SICK_INO_BMBTC, XFS_BS_SICK_BMBTC },
+ { XFS_SICK_INO_DIR, XFS_BS_SICK_DIR },
+ { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR },
+ { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK },
+ { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT },
+ { 0, 0 },
+};
+
+/* Fill out bulkstat health info. */
+void
+xfs_bulkstat_health(
+ struct xfs_inode *ip,
+ struct xfs_bulkstat *bs)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ bs->bs_sick = 0;
+ bs->bs_checked = 0;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ for (m = ino_map; m->sick_mask; m++) {
+ if (checked & m->sick_mask)
+ bs->bs_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ bs->bs_sick |= m->ioctl_mask;
+ }
+}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 245483c..944add5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -5,13 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
@@ -23,8 +23,6 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
-#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/iversion.h>
/*
@@ -42,7 +40,7 @@
* KM_MAYFAIL and return NULL here on ENOMEM. Set the
* code up to do this anyway.
*/
- ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ ip = kmem_zone_alloc(xfs_inode_zone, 0);
if (!ip)
return NULL;
if (inode_init_always(mp->m_super, VFS_I(ip))) {
@@ -70,6 +68,11 @@
ip->i_flags = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_sick = 0;
+ ip->i_checked = 0;
+ INIT_WORK(&ip->i_ioend_work, xfs_end_io);
+ INIT_LIST_HEAD(&ip->i_ioend_list);
+ spin_lock_init(&ip->i_ioend_lock);
return ip;
}
@@ -446,6 +449,8 @@
ip->i_flags |= XFS_INEW;
xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
inode->i_state = I_NEW;
+ ip->i_sick = 0;
+ ip->i_checked = 0;
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
init_rwsem(&inode->i_rwsem);
@@ -1815,7 +1820,7 @@
/* Disable post-EOF and CoW block auto-reclamation. */
void
-xfs_icache_disable_reclaim(
+xfs_stop_block_reaping(
struct xfs_mount *mp)
{
cancel_delayed_work_sync(&mp->m_eofblocks_work);
@@ -1824,7 +1829,7 @@
/* Enable post-EOF and CoW block auto-reclamation. */
void
-xfs_icache_enable_reclaim(
+xfs_start_block_reaping(
struct xfs_mount *mp)
{
xfs_queue_eofblocks(mp);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 26c0626..48f1fd2 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -119,7 +119,7 @@
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
-void xfs_icache_disable_reclaim(struct xfs_mount *mp);
-void xfs_icache_enable_reclaim(struct xfs_mount *mp);
+void xfs_stop_block_reaping(struct xfs_mount *mp);
+void xfs_start_block_reaping(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 8381d34..3ebd1b7 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -6,14 +6,9 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
-#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_error.h"
#include "xfs_icreate_item.h"
#include "xfs_log.h"
@@ -56,80 +51,18 @@
sizeof(struct xfs_icreate_log));
}
-
-/* Pinning has no meaning for the create item, so just return. */
STATIC void
-xfs_icreate_item_pin(
+xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip));
}
-
-/* pinning has no meaning for the create item, so just return. */
-STATIC void
-xfs_icreate_item_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-STATIC void
-xfs_icreate_item_unlock(
- struct xfs_log_item *lip)
-{
- struct xfs_icreate_item *icp = ICR_ITEM(lip);
-
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- kmem_zone_free(xfs_icreate_zone, icp);
- return;
-}
-
-/*
- * Because we have ordered buffers being tracked in the AIL for the inode
- * creation, we don't need the create item after this. Hence we can free
- * the log item and return -1 to tell the caller we're done with the item.
- */
-STATIC xfs_lsn_t
-xfs_icreate_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_icreate_item *icp = ICR_ITEM(lip);
-
- kmem_zone_free(xfs_icreate_zone, icp);
- return (xfs_lsn_t)-1;
-}
-
-/* item can never get into the AIL */
-STATIC uint
-xfs_icreate_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- ASSERT(0);
- return XFS_ITEM_SUCCESS;
-}
-
-/* Ordered buffers do the dependency tracking here, so this does nothing. */
-STATIC void
-xfs_icreate_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all buf log items.
- */
static const struct xfs_item_ops xfs_icreate_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
.iop_size = xfs_icreate_item_size,
.iop_format = xfs_icreate_item_format,
- .iop_pin = xfs_icreate_item_pin,
- .iop_unpin = xfs_icreate_item_unpin,
- .iop_push = xfs_icreate_item_push,
- .iop_unlock = xfs_icreate_item_unlock,
- .iop_committed = xfs_icreate_item_committed,
- .iop_committing = xfs_icreate_item_committing,
+ .iop_release = xfs_icreate_item_release,
};
@@ -156,7 +89,7 @@
{
struct xfs_icreate_item *icp;
- icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
+ icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 05db954..18f4b26 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3,7 +3,6 @@
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/log2.h>
#include <linux/iversion.h>
#include "xfs.h"
@@ -16,10 +15,7 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
-#include "xfs_attr_sf.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
@@ -32,7 +28,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
@@ -40,7 +35,6 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
-#include "xfs_dir2_priv.h"
kmem_zone_t *xfs_inode_zone;
@@ -441,12 +435,12 @@
*/
static void
xfs_lock_inodes(
- xfs_inode_t **ips,
- int inodes,
- uint lock_mode)
+ struct xfs_inode **ips,
+ int inodes,
+ uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
- xfs_log_item_t *lp;
+ int attempts = 0, i, j, try_lock;
+ struct xfs_log_item *lp;
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
@@ -485,7 +479,7 @@
*/
if (!try_lock) {
for (j = (i - 1); j >= 0 && !try_lock; j--) {
- lp = (xfs_log_item_t *)ips[j]->i_itemp;
+ lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
try_lock++;
}
@@ -551,7 +545,7 @@
struct xfs_inode *temp;
uint mode_temp;
int attempts = 0;
- xfs_log_item_t *lp;
+ struct xfs_log_item *lp;
ASSERT(hweight32(ip0_mode) == 1);
ASSERT(hweight32(ip1_mode) == 1);
@@ -585,7 +579,7 @@
* the second lock. If we can't get it, we must release the first one
* and try again.
*/
- lp = (xfs_log_item_t *)ip0->i_itemp;
+ lp = &ip0->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
xfs_iunlock(ip0, ip0_mode);
@@ -1116,7 +1110,7 @@
/*
* Increment the link count on an inode & log the change.
*/
-static int
+static void
xfs_bumplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
@@ -1126,7 +1120,6 @@
ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return 0;
}
int
@@ -1235,9 +1228,7 @@
if (error)
goto out_trans_cancel;
- error = xfs_bumplink(tp, dp);
- if (error)
- goto out_trans_cancel;
+ xfs_bumplink(tp, dp);
}
/*
@@ -1332,7 +1323,7 @@
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1454,9 +1445,7 @@
xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
- error = xfs_bumplink(tp, sip);
- if (error)
- goto error_return;
+ xfs_bumplink(tp, sip);
/*
* If this is a synchronous mount, make sure that the
@@ -1754,7 +1743,7 @@
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- if (unlikely(mp->m_inotbt_nores)) {
+ if (unlikely(mp->m_finobt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
@@ -1907,86 +1896,510 @@
}
/*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk. The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+ struct rhash_head iu_rhash_head;
+ xfs_agino_t iu_agino; /* X */
+ xfs_agino_t iu_next_unlinked; /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const xfs_agino_t *key = arg->key;
+ const struct xfs_iunlink *iu = obj;
+
+ if (iu->iu_next_unlinked != *key)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
+ .key_len = sizeof(xfs_agino_t),
+ .key_offset = offsetof(struct xfs_iunlink,
+ iu_next_unlinked),
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_iunlink *iu;
+
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+ struct xfs_perag *pag,
+ struct xfs_iunlink *iu)
+{
+ int error;
+
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ /*
+ * Fail loudly if there already was an entry because that's a sign of
+ * corruption of in-memory data. Also fail loudly if we see an error
+ * code we didn't anticipate from the rhashtable code. Currently we
+ * only anticipate ENOMEM.
+ */
+ if (error) {
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+ kmem_free(iu);
+ }
+ /*
+ * Absorb any runtime errors that aren't a result of corruption because
+ * this is a cache and we can always fall back to bucket list scanning.
+ */
+ if (error != 0 && error != -EEXIST)
+ error = 0;
+ return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t this_agino)
+{
+ struct xfs_iunlink *iu;
+
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+ return 0;
+
+ iu = kmem_zalloc(sizeof(*iu), KM_NOFS);
+ iu->iu_agino = prev_agino;
+ iu->iu_next_unlinked = this_agino;
+
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agino_t next_unlinked)
+{
+ struct xfs_iunlink *iu;
+ int error;
+
+ /* Look up the old entry; if there wasn't one then exit. */
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ if (!iu)
+ return 0;
+
+ /*
+ * Remove the entry. This shouldn't ever return an error, but if we
+ * couldn't remove the old entry we don't want to add it again to the
+ * hash table, and if the entry disappeared on us then someone's
+ * violated the locking rules and we need to fail loudly. Either way
+ * we cannot remove the inode because internal state is or would have
+ * been corrupt.
+ */
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ if (error)
+ return error;
+
+ /* If there is no new next entry just free our item and return. */
+ if (next_unlinked == NULLAGINO) {
+ kmem_free(iu);
+ return 0;
+ }
+
+ /* Update the entry and re-add it to the hash table. */
+ iu->iu_next_unlinked = next_unlinked;
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+ struct xfs_perag *pag)
+{
+ return rhashtable_init(&pag->pagi_unlinked_hash,
+ &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+ void *ptr,
+ void *arg)
+{
+ struct xfs_iunlink *iu = ptr;
+ bool *freed_anything = arg;
+
+ *freed_anything = true;
+ kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+ struct xfs_perag *pag)
+{
+ bool freed_anything = false;
+
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+ xfs_iunlink_free_item, &freed_anything);
+
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino)
+ return -EFSCORRUPTED;
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_buf *ibp,
+ struct xfs_dinode *dip,
+ struct xfs_imap *imap,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = imap->im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_agnumber_t agno,
+ xfs_agino_t next_agino,
+ xfs_agino_t *old_next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ xfs_agino_t old_value;
+ int error;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ if (error)
+ return error;
+
+ /* Make sure the old pointer isn't garbage. */
+ old_value = be32_to_cpu(dip->di_next_unlinked);
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ *old_next_agino = old_value;
+ if (old_value == next_agino) {
+ if (next_agino != NULLAGINO)
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Ok, update the new pointer. */
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ ibp, dip, &ip->i_imap, next_agino);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
*
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = tp->t_mountp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agino_t agino;
- short bucket_index;
- int offset;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t next_agino;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- ASSERT(agino != 0);
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(agi->agi_unlinked[bucket_index]);
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(mp, agno, next_agino))
+ return -EFSCORRUPTED;
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
+ if (next_agino != NULLAGINO) {
+ struct xfs_perag *pag;
+ xfs_agino_t old_agino;
+
/*
- * There is already another inode in the bucket we need
- * to add ourselves to. Add us at the front of the list.
- * Here we put the head pointer into our next pointer,
- * and then we fall through to point the head at us.
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ &old_agino);
+ if (error)
+ return error;
+ ASSERT(old_agino == NULLAGINO);
+
+ /*
+ * agino has been unlinked, add a backref from the next inode
+ * back to agino.
+ */
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+
+ /* Point the head of the list to point to this inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
+
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
+
+ imap->im_blkno = 0;
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino. Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @pag, @head_agino, and @target_agino are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t head_agino,
+ xfs_agino_t target_agino,
+ xfs_agino_t *agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agino_t next_agino;
+ int error;
+
+ ASSERT(head_agino != target_agino);
+ *bpp = NULL;
+
+ /* See if our backref cache can find it faster. */
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+ if (*agino != NULLAGINO) {
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
if (error)
return error;
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+ return 0;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
+ /*
+ * If we get here the cache contents were corrupt, so drop the
+ * buffer and fall back to walking the bucket list.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ WARN_ON_ONCE(1);
}
- /*
- * Point the bucket head pointer at the inode being inserted.
- */
- ASSERT(agino != 0);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+ /* Otherwise, walk the entire bucket until we find it. */
+ next_agino = head_agino;
+ while (next_agino != target_agino) {
+ xfs_agino_t unlinked_agino;
+
+ if (*bpp)
+ xfs_trans_brelse(tp, *bpp);
+
+ *agino = next_agino;
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+ bpp);
+ if (error)
+ return error;
+
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+ /*
+ * Make sure this pointer is valid and isn't an obvious
+ * infinite loop.
+ */
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ next_agino == unlinked_agino) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp,
+ *dipp, sizeof(**dipp));
+ error = -EFSCORRUPTED;
+ return error;
+ }
+ next_agino = unlinked_agino;
+ }
+
return 0;
}
@@ -1995,181 +2408,106 @@
*/
STATIC int
xfs_iunlink_remove(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_ino_t next_ino;
- xfs_mount_t *mp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_agino_t agino;
- xfs_agino_t next_agino;
- xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip = NULL;
- short bucket_index;
- int offset, last_offset = 0;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ struct xfs_buf *last_ibp;
+ struct xfs_dinode *last_dip = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t next_agino;
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
- mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ trace_xfs_iunlink_remove(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
-
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the head pointer isn't garbage.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- if (!xfs_verify_agino(mp, agno, agino))
- return -EFSCORRUPTED;
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- if (!xfs_verify_agino(mp, agno,
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
}
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
- /*
- * We're at the head of the list. Get the inode's on-disk
- * buffer to see if there is anyone after us on the list.
- * Only modify our next pointer if it is not already NULLAGINO.
- * This saves us the overhead of dealing with the buffer when
- * there is no need to change it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+ /*
+ * Set our inode's next_unlinked pointer to NULL and then return
+ * the old pointer value so that we can update whatever was previous
+ * to us in the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ if (error)
+ return error;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the bucket head pointer at the next inode.
- */
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- } else {
- /*
- * We need to search the list for the inode being freed.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- last_ibp = NULL;
- while (next_agino != agino) {
- struct xfs_imap imap;
-
- if (last_ibp)
- xfs_trans_brelse(tp, last_ibp);
-
- imap.im_blkno = 0;
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
-
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
-
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
- &last_ibp, 0, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
-
- last_offset = imap.im_boffset;
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
- if (!xfs_verify_agino(mp, agno, next_agino)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- last_dip, sizeof(*last_dip));
- return -EFSCORRUPTED;
- }
- }
-
- /*
- * Now last_ibp points to the buffer previous to us on the
- * unlinked list. Pull us from the list.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the previous inode on the list to the next inode.
- */
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
- ASSERT(next_agino != 0);
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, last_dip);
-
- xfs_trans_inode_buf(tp, last_ibp);
- xfs_trans_log_buf(tp, last_ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, last_ibp);
+ /*
+ * If there was a backref pointing from the next inode back to this
+ * one, remove it because we've removed this inode from the list.
+ *
+ * Later, if this inode was in the middle of the list we'll update
+ * this inode's backref to point from the next inode.
+ */
+ if (next_agino != NULLAGINO) {
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_change_backref(pag, next_agino,
+ NULLAGINO);
+ if (error)
+ goto out;
}
- return 0;
+
+ if (head_agino == agino) {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+ if (error)
+ goto out;
+ } else {
+ struct xfs_imap imap;
+ xfs_agino_t prev_agino;
+
+ if (!pag)
+ pag = xfs_perag_get(mp, agno);
+
+ /* We need to search the list for the inode being freed. */
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp,
+ pag);
+ if (error)
+ goto out;
+
+ /* Point the previous inode on the list to the next inode. */
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ last_dip, &imap, next_agino);
+
+ /*
+ * Now we deal with the backref for this inode. If this inode
+ * pointed at a real inode, change the backref that pointed to
+ * us to point to our old next. If this inode was the end of
+ * the list, delete the backref that pointed to us. Note that
+ * change_backref takes care of deleting the backref if
+ * next_agino is NULLAGINO.
+ */
+ error = xfs_iunlink_change_backref(pag, agino, next_agino);
+ if (error)
+ goto out;
+ }
+
+out:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2184,8 +2522,6 @@
struct xfs_icluster *xic)
{
xfs_mount_t *mp = free_ip->i_mount;
- int blks_per_cluster;
- int inodes_per_cluster;
int nbufs;
int i, j;
int ioffset;
@@ -2195,15 +2531,14 @@
xfs_inode_log_item_t *iip;
struct xfs_log_item *lip;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_ino_t inum;
inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
- nbufs = mp->m_ialloc_blks / blks_per_cluster;
+ nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
- for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+ for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
/*
* The allocation bitmap tells us which inodes of the chunk were
* physically allocated. Skip the cluster if an inode falls into
@@ -2211,7 +2546,7 @@
*/
ioffset = inum - xic->first_ino;
if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
- ASSERT(ioffset % inodes_per_cluster == 0);
+ ASSERT(ioffset % igeo->inodes_per_cluster == 0);
continue;
}
@@ -2227,7 +2562,7 @@
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster,
+ mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED);
if (!bp)
@@ -2242,7 +2577,7 @@
* want it to fail. We can acheive this by adding a write
* verifier to the buffer.
*/
- bp->b_ops = &xfs_inode_buf_ops;
+ bp->b_ops = &xfs_inode_buf_ops;
/*
* Walk the inodes already attached to the buffer and mark them
@@ -2274,7 +2609,7 @@
* transaction stale above, which means there is no point in
* even trying to lock them.
*/
- for (i = 0; i < inodes_per_cluster; i++) {
+ for (i = 0; i < igeo->inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2752,9 +3087,7 @@
error = xfs_droplink(tp, dp2);
if (error)
goto out_trans_abort;
- error = xfs_bumplink(tp, dp1);
- if (error)
- goto out_trans_abort;
+ xfs_bumplink(tp, dp1);
}
/*
@@ -2778,9 +3111,7 @@
error = xfs_droplink(tp, dp1);
if (error)
goto out_trans_abort;
- error = xfs_bumplink(tp, dp2);
- if (error)
- goto out_trans_abort;
+ xfs_bumplink(tp, dp2);
}
/*
@@ -2837,11 +3168,9 @@
/*
* Prepare the tmpfile inode as if it were created through the VFS.
- * Otherwise, the link increment paths will complain about nlink 0->1.
- * Drop the link count as done by d_tmpfile(), complete the inode setup
- * and flag it as linkable.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
*/
- drop_nlink(VFS_I(tmpfile));
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
@@ -2953,7 +3282,8 @@
spaceres);
/*
- * Set up the target.
+ * Check for expected errors before we dirty the transaction
+ * so we can return an error without a transaction abort.
*/
if (target_ip == NULL) {
/*
@@ -2965,6 +3295,46 @@
if (error)
goto out_trans_cancel;
}
+ } else {
+ /*
+ * If target exists and it's a directory, check that whether
+ * it can be destroyed.
+ */
+ if (S_ISDIR(VFS_I(target_ip)->i_mode) &&
+ (!xfs_dir_isempty(target_ip) ||
+ (VFS_I(target_ip)->i_nlink > 2))) {
+ error = -EEXIST;
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
+ * Directory entry creation below may acquire the AGF. Remove
+ * the whiteout from the unlinked list first to preserve correct
+ * AGI/AGF locking order. This dirties the transaction so failures
+ * after this point will abort and log recovery will clean up the
+ * mess.
+ *
+ * For whiteouts, we need to bump the link count on the whiteout
+ * inode. After this point, we have a real link, clear the tmpfile
+ * state flag from the inode so it doesn't accidentally get misused
+ * in future.
+ */
+ if (wip) {
+ ASSERT(VFS_I(wip)->i_nlink == 0);
+ error = xfs_iunlink_remove(tp, wip);
+ if (error)
+ goto out_trans_cancel;
+
+ xfs_bumplink(tp, wip);
+ xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
+ VFS_I(wip)->i_state &= ~I_LINKABLE;
+ }
+
+ /*
+ * Set up the target.
+ */
+ if (target_ip == NULL) {
/*
* If target does not exist and the rename crosses
* directories, adjust the target directory link count
@@ -2979,28 +3349,10 @@
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
if (new_parent && src_is_directory) {
- error = xfs_bumplink(tp, target_dp);
- if (error)
- goto out_trans_cancel;
+ xfs_bumplink(tp, target_dp);
}
} else { /* target_ip != NULL */
/*
- * If target exists and it's a directory, check that both
- * target and source are directories and that target can be
- * destroyed, or that neither is a directory.
- */
- if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
- /*
- * Make sure target dir is empty.
- */
- if (!(xfs_dir_isempty(target_ip)) ||
- (VFS_I(target_ip)->i_nlink > 2)) {
- error = -EEXIST;
- goto out_trans_cancel;
- }
- }
-
- /*
* Link the source inode under the target name.
* If the source inode is a directory and we are moving
* it across directories, its ".." entry will be
@@ -3090,32 +3442,6 @@
if (error)
goto out_trans_cancel;
- /*
- * For whiteouts, we need to bump the link count on the whiteout inode.
- * This means that failures all the way up to this point leave the inode
- * on the unlinked list and so cleanup is a simple matter of dropping
- * the remaining reference to it. If we fail here after bumping the link
- * count, we're shutting down the filesystem so we'll never see the
- * intermediate state on disk.
- */
- if (wip) {
- ASSERT(VFS_I(wip)->i_nlink == 0);
- error = xfs_bumplink(tp, wip);
- if (error)
- goto out_trans_cancel;
- error = xfs_iunlink_remove(tp, wip);
- if (error)
- goto out_trans_cancel;
- xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
-
- /*
- * Now we have a real link, clear the "I'm a tmpfile" state
- * flag from the inode so it doesn't accidentally get misused in
- * future.
- */
- VFS_I(wip)->i_state &= ~I_LINKABLE;
- }
-
xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
if (new_parent)
@@ -3142,28 +3468,27 @@
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
unsigned long first_index, mask;
- unsigned long inodes_per_cluster;
int cilist_size;
struct xfs_inode **cilist;
struct xfs_inode *cip;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
int nr_found;
int clcount = 0;
int i;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
- cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
if (!cilist)
goto out_put;
- mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
+ mask = ~(igeo->inodes_per_cluster - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
- first_index, inodes_per_cluster);
+ first_index, igeo->inodes_per_cluster);
if (nr_found == 0)
goto out_free;
@@ -3271,7 +3596,6 @@
* inode buffer and shut down the filesystem.
*/
rcu_read_unlock();
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
/*
* We'll always have an inode attached to the buffer for completion
@@ -3281,11 +3605,14 @@
* xfs_buf_submit().
*/
ASSERT(bp->b_iodone);
+ bp->b_flags |= XBF_ASYNC;
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioerror(bp, -EIO);
xfs_buf_ioend(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
/* abort the corrupt inode, as it was not attached to the buffer */
xfs_iflush_abort(cip, false);
kmem_free(cilist);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be20145..558173f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -45,10 +45,18 @@
mrlock_t i_lock; /* inode lock */
mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
+
+ /*
+ * Bitsets of inode metadata that have been checked and/or are sick.
+ * Callers must hold i_flags_lock before accessing this field.
+ */
+ uint16_t i_checked;
+ uint16_t i_sick;
+
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
unsigned long i_flags; /* see defined flags below */
- unsigned int i_delayed_blks; /* count of delay alloc blks */
+ uint64_t i_delayed_blks; /* count of delay alloc blks */
struct xfs_icdinode i_d; /* most of ondisk inode */
@@ -57,6 +65,11 @@
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
+
+ /* pending io completions */
+ spinlock_t i_ioend_lock;
+ struct work_struct i_ioend_work;
+ struct list_head i_ioend_list;
} xfs_inode_t;
/* Convert from vfs inode to xfs inode */
@@ -500,4 +513,9 @@
bool xfs_inode_verify_forks(struct xfs_inode *ip);
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
+void xfs_end_io(struct work_struct *work);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fa1c4fe..bb8f076 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -5,6 +5,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -12,7 +13,6 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
@@ -565,7 +565,7 @@
* Unlock the inode associated with the inode log item.
*/
STATIC void
-xfs_inode_item_unlock(
+xfs_inode_item_release(
struct xfs_log_item *lip)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
@@ -621,23 +621,21 @@
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+ xfs_lsn_t commit_lsn)
{
- INODE_ITEM(lip)->ili_last_lsn = lsn;
+ INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
+ return xfs_inode_item_release(lip);
}
-/*
- * This is the ops vector shared by all buf log items.
- */
static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
.iop_unpin = xfs_inode_item_unpin,
- .iop_unlock = xfs_inode_item_unlock,
+ .iop_release = xfs_inode_item_release,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
- .iop_committing = xfs_inode_item_committing,
+ .iop_committing = xfs_inode_item_committing,
.iop_error = xfs_inode_item_error
};
@@ -653,7 +651,7 @@
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+ iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
iip->ili_inode = ip;
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 27081eb..07a60e7 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -14,7 +14,7 @@
struct xfs_mount;
typedef struct xfs_inode_log_item {
- xfs_log_item_t ili_item; /* common portion */
+ struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0ef5ece..d58f0d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -11,9 +11,8 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_ioctl.h"
-#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
+#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
#include "xfs_attr.h"
@@ -25,7 +24,6 @@
#include "xfs_export.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_acl.h"
#include "xfs_btree.h"
@@ -33,15 +31,11 @@
#include "xfs_fsmap.h"
#include "scrub/xfs_scrub.h"
#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_health.h"
-#include <linux/capability.h>
-#include <linux/cred.h>
-#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/exportfs.h>
/*
* xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -73,7 +67,7 @@
return -EBADF;
inode = file_inode(f.file);
} else {
- error = user_lpath((const char __user *)hreq->path, &path);
+ error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
return error;
inode = d_inode(path.dentry);
@@ -402,7 +396,7 @@
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+ kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
if (!kbuf)
goto out_dput;
@@ -440,11 +434,11 @@
if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
- kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+ kbuf = kmem_zalloc_large(*len, 0);
if (!kbuf)
return -ENOMEM;
- error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+ error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
if (error)
goto out_kfree;
@@ -604,14 +598,6 @@
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
- /*
- * Only allow the sys admin to reserve space unless
- * unwritten extents are enabled.
- */
- if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
return -EPERM;
@@ -727,16 +713,45 @@
return error;
}
+/* Return 0 on success or positive error */
+int
+xfs_fsbulkstat_one_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ struct xfs_bstat bs1;
+
+ xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
+ if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat));
+}
+
+int
+xfs_fsinumbers_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp)
+{
+ struct xfs_inogrp ig1;
+
+ xfs_inumbers_to_inogrp(&ig1, igrp);
+ if (copy_to_user(breq->ubuffer, &ig1, sizeof(struct xfs_inogrp)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_inogrp));
+}
+
STATIC int
-xfs_ioc_bulkstat(
+xfs_ioc_fsbulkstat(
xfs_mount_t *mp,
unsigned int cmd,
void __user *arg)
{
- xfs_fsop_bulkreq_t bulkreq;
- int count; /* # of records returned */
- xfs_ino_t inlast; /* last inode number */
- int done;
+ struct xfs_fsop_bulkreq bulkreq;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .ocount = 0,
+ };
+ xfs_ino_t lastino;
int error;
/* done = 1 if there are more stats to get and if bulkstat */
@@ -748,79 +763,291 @@
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
+ if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq)))
return -EFAULT;
- if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+ if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
return -EFAULT;
- if ((count = bulkreq.icount) <= 0)
+ if (bulkreq.icount <= 0)
return -EINVAL;
if (bulkreq.ubuffer == NULL)
return -EINVAL;
- if (cmd == XFS_IOC_FSINUMBERS)
- error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer, xfs_inumbers_fmt);
- else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
- error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
- sizeof(xfs_bstat_t), NULL, &done);
- else /* XFS_IOC_FSBULKSTAT */
- error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
- sizeof(xfs_bstat_t), bulkreq.ubuffer,
- &done);
+ breq.ubuffer = bulkreq.ubuffer;
+ breq.icount = bulkreq.icount;
+
+ /*
+ * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+ * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect
+ * that *lastip contains either zero or the number of the last inode to
+ * be examined by the previous call and return results starting with
+ * the next inode after that. The new bulk request back end functions
+ * take the inode to start with, so we have to compute the startino
+ * parameter from lastino to maintain correct function. lastino == 0
+ * is a special case because it has traditionally meant "first inode
+ * in filesystem".
+ */
+ if (cmd == XFS_IOC_FSINUMBERS) {
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_inumbers(&breq, xfs_fsinumbers_fmt);
+ lastino = breq.startino - 1;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) {
+ breq.startino = lastino;
+ breq.icount = 1;
+ error = xfs_bulkstat_one(&breq, xfs_fsbulkstat_one_fmt);
+ } else { /* XFS_IOC_FSBULKSTAT */
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_bulkstat(&breq, xfs_fsbulkstat_one_fmt);
+ lastino = breq.startino - 1;
+ }
if (error)
return error;
- if (bulkreq.ocount != NULL) {
- if (copy_to_user(bulkreq.lastip, &inlast,
- sizeof(xfs_ino_t)))
- return -EFAULT;
+ if (bulkreq.lastip != NULL &&
+ copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+ return -EFAULT;
- if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -EFAULT;
+ if (bulkreq.ocount != NULL &&
+ copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Return 0 on success or positive error */
+static int
+xfs_bulkstat_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat));
+}
+
+/*
+ * Check the incoming bulk request @hdr from userspace and initialize the
+ * internal @breq bulk request appropriately. Returns 0 if the bulk request
+ * should proceed; -ECANCELED if there's nothing to do; or the usual
+ * negative error code.
+ */
+static int
+xfs_bulk_ireq_setup(
+ struct xfs_mount *mp,
+ struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq,
+ void __user *ubuffer)
+{
+ if (hdr->icount == 0 ||
+ (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) ||
+ memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ breq->startino = hdr->ino;
+ breq->ubuffer = ubuffer;
+ breq->icount = hdr->icount;
+ breq->ocount = 0;
+ breq->flags = 0;
+
+ /*
+ * The @ino parameter is a special value, so we must look it up here.
+ * We're not allowed to have IREQ_AGNO, and we only return one inode
+ * worth of data.
+ */
+ if (hdr->flags & XFS_BULK_IREQ_SPECIAL) {
+ if (hdr->flags & XFS_BULK_IREQ_AGNO)
+ return -EINVAL;
+
+ switch (hdr->ino) {
+ case XFS_BULK_IREQ_SPECIAL_ROOT:
+ hdr->ino = mp->m_sb.sb_rootino;
+ break;
+ default:
+ return -EINVAL;
+ }
+ breq->icount = 1;
}
+ /*
+ * The IREQ_AGNO flag means that we only want results from a given AG.
+ * If @hdr->ino is zero, we start iterating in that AG. If @hdr->ino is
+ * beyond the specified AG then we return no results.
+ */
+ if (hdr->flags & XFS_BULK_IREQ_AGNO) {
+ if (hdr->agno >= mp->m_sb.sb_agcount)
+ return -EINVAL;
+
+ if (breq->startino == 0)
+ breq->startino = XFS_AGINO_TO_INO(mp, hdr->agno, 0);
+ else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
+ return -EINVAL;
+
+ breq->flags |= XFS_IBULK_SAME_AG;
+
+ /* Asking for an inode past the end of the AG? We're done! */
+ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
+ return -ECANCELED;
+ } else if (hdr->agno)
+ return -EINVAL;
+
+ /* Asking for an inode past the end of the FS? We're done! */
+ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
+ return -ECANCELED;
+
+ return 0;
+}
+
+/*
+ * Update the userspace bulk request @hdr to reflect the end state of the
+ * internal bulk request @breq.
+ */
+static void
+xfs_bulk_ireq_teardown(
+ struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq)
+{
+ hdr->ino = breq->startino;
+ hdr->ocount = breq->ocount;
+}
+
+/* Handle the v5 bulkstat ioctl. */
+STATIC int
+xfs_ioc_bulkstat(
+ struct xfs_mount *mp,
+ unsigned int cmd,
+ struct xfs_bulkstat_req __user *arg)
+{
+ struct xfs_bulk_ireq hdr;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
+ if (error == -ECANCELED)
+ goto out_teardown;
+ if (error < 0)
+ return error;
+
+ error = xfs_bulkstat(&breq, xfs_bulkstat_fmt);
+ if (error)
+ return error;
+
+out_teardown:
+ xfs_bulk_ireq_teardown(&hdr, &breq);
+ if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+
return 0;
}
STATIC int
-xfs_ioc_fsgeometry_v1(
- xfs_mount_t *mp,
- void __user *arg)
+xfs_inumbers_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp)
{
- xfs_fsop_geom_t fsgeo;
- int error;
+ if (copy_to_user(breq->ubuffer, igrp, sizeof(struct xfs_inumbers)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers));
+}
- error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
+/* Handle the v5 inumbers ioctl. */
+STATIC int
+xfs_ioc_inumbers(
+ struct xfs_mount *mp,
+ unsigned int cmd,
+ struct xfs_inumbers_req __user *arg)
+{
+ struct xfs_bulk_ireq hdr;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
+ if (error == -ECANCELED)
+ goto out_teardown;
+ if (error < 0)
+ return error;
+
+ error = xfs_inumbers(&breq, xfs_inumbers_fmt);
if (error)
return error;
- /*
- * Caller should have passed an argument of type
- * xfs_fsop_geom_v1_t. This is a proper subset of the
- * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
- */
- if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
+out_teardown:
+ xfs_bulk_ireq_teardown(&hdr, &breq);
+ if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
return -EFAULT;
+
return 0;
}
STATIC int
xfs_ioc_fsgeometry(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
+ void __user *arg,
+ int struct_version)
+{
+ struct xfs_fsop_geom fsgeo;
+ size_t len;
+
+ xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
+
+ if (struct_version <= 3)
+ len = sizeof(struct xfs_fsop_geom_v1);
+ else if (struct_version == 4)
+ len = sizeof(struct xfs_fsop_geom_v4);
+ else {
+ xfs_fsop_geom_health(mp, &fsgeo);
+ len = sizeof(fsgeo);
+ }
+
+ if (copy_to_user(arg, &fsgeo, len))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_ioc_ag_geometry(
+ struct xfs_mount *mp,
void __user *arg)
{
- xfs_fsop_geom_t fsgeo;
+ struct xfs_ag_geometry ageo;
int error;
- error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4);
+ if (copy_from_user(&ageo, arg, sizeof(ageo)))
+ return -EFAULT;
+ if (ageo.ag_flags)
+ return -EINVAL;
+ if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved)))
+ return -EINVAL;
+
+ error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
if (error)
return error;
- if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+ if (copy_to_user(arg, &ageo, sizeof(ageo)))
return -EFAULT;
return 0;
}
@@ -879,6 +1106,34 @@
return flags;
}
+static void
+xfs_fill_fsxattr(
+ struct xfs_inode *ip,
+ bool attr,
+ struct fsxattr *fa)
+{
+ simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
+ fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+ fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
+ ip->i_mount->m_sb.sb_blocklog;
+ fa->fsx_projid = xfs_get_projid(ip);
+
+ if (attr) {
+ if (ip->i_afp) {
+ if (ip->i_afp->if_flags & XFS_IFEXTENTS)
+ fa->fsx_nextents = xfs_iext_count(ip->i_afp);
+ else
+ fa->fsx_nextents = ip->i_d.di_anextents;
+ } else
+ fa->fsx_nextents = 0;
+ } else {
+ if (ip->i_df.if_flags & XFS_IFEXTENTS)
+ fa->fsx_nextents = xfs_iext_count(&ip->i_df);
+ else
+ fa->fsx_nextents = ip->i_d.di_nextents;
+ }
+}
+
STATIC int
xfs_ioc_fsgetxattr(
xfs_inode_t *ip,
@@ -887,29 +1142,8 @@
{
struct fsxattr fa;
- memset(&fa, 0, sizeof(struct fsxattr));
-
xfs_ilock(ip, XFS_ILOCK_SHARED);
- fa.fsx_xflags = xfs_ip2xflags(ip);
- fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
- fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
- ip->i_mount->m_sb.sb_blocklog;
- fa.fsx_projid = xfs_get_projid(ip);
-
- if (attr) {
- if (ip->i_afp) {
- if (ip->i_afp->if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = xfs_iext_count(ip->i_afp);
- else
- fa.fsx_nextents = ip->i_d.di_anextents;
- } else
- fa.fsx_nextents = 0;
- } else {
- if (ip->i_df.if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = xfs_iext_count(&ip->i_df);
- else
- fa.fsx_nextents = ip->i_d.di_nextents;
- }
+ xfs_fill_fsxattr(ip, attr, &fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (copy_to_user(arg, &fa, sizeof(fa)))
@@ -1035,15 +1269,6 @@
if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
return -EINVAL;
- /*
- * Can't modify an immutable/append-only file unless
- * we have appropriate permission.
- */
- if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
- (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (di_flags2 && ip->i_d.di_version < 3)
@@ -1088,8 +1313,7 @@
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (S_ISREG(inode->i_mode) &&
- !bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+ if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
sb->s_blocksize))
return -EINVAL;
}
@@ -1150,7 +1374,7 @@
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
if (error)
- return ERR_PTR(error);
+ goto out_unlock;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
@@ -1202,39 +1426,31 @@
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(VFS_I(ip)->i_mode))
- return -EINVAL;
+ xfs_extlen_t size;
+ xfs_fsblock_t extsize_fsb;
if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
- if (fa->fsx_extsize != 0) {
- xfs_extlen_t size;
- xfs_fsblock_t extsize_fsb;
+ if (fa->fsx_extsize == 0)
+ return 0;
- extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
- if (extsize_fsb > MAXEXTLEN)
+ extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+ if (extsize_fsb > MAXEXTLEN)
+ return -EINVAL;
+
+ if (XFS_IS_REALTIME_INODE(ip) ||
+ (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
+ size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+ } else {
+ size = mp->m_sb.sb_blocksize;
+ if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
return -EINVAL;
+ }
- if (XFS_IS_REALTIME_INODE(ip) ||
- (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
- size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
- } else {
- size = mp->m_sb.sb_blocksize;
- if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
- return -EINVAL;
- }
-
- if (fa->fsx_extsize % size)
- return -EINVAL;
- } else
- fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+ if (fa->fsx_extsize % size)
+ return -EINVAL;
return 0;
}
@@ -1260,6 +1476,8 @@
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t size;
+ xfs_fsblock_t cowextsize_fsb;
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
@@ -1268,25 +1486,19 @@
ip->i_d.di_version != 3)
return -EINVAL;
- if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
+ if (fa->fsx_cowextsize == 0)
+ return 0;
+
+ cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+ if (cowextsize_fsb > MAXEXTLEN)
return -EINVAL;
- if (fa->fsx_cowextsize != 0) {
- xfs_extlen_t size;
- xfs_fsblock_t cowextsize_fsb;
+ size = mp->m_sb.sb_blocksize;
+ if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+ return -EINVAL;
- cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
- if (cowextsize_fsb > MAXEXTLEN)
- return -EINVAL;
-
- size = mp->m_sb.sb_blocksize;
- if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
- return -EINVAL;
-
- if (fa->fsx_cowextsize % size)
- return -EINVAL;
- } else
- fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+ if (fa->fsx_cowextsize % size)
+ return -EINVAL;
return 0;
}
@@ -1300,21 +1512,6 @@
if (fa->fsx_projid > (uint16_t)-1 &&
!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
return -EINVAL;
-
- /*
- * Project Quota ID state is only allowed to change from within the init
- * namespace. Enforce that restriction only if we are trying to change
- * the quota ID state. Everything else is allowed in user namespaces.
- */
- if (current_user_ns() == &init_user_ns)
- return 0;
-
- if (xfs_get_projid(ip) != fa->fsx_projid)
- return -EINVAL;
- if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
- (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
- return -EINVAL;
-
return 0;
}
@@ -1323,6 +1520,7 @@
xfs_inode_t *ip,
struct fsxattr *fa)
{
+ struct fsxattr old_fa;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_dquot *udqp = NULL;
@@ -1370,7 +1568,6 @@
goto error_free_dquots;
}
-
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
xfs_get_projid(ip) != fa->fsx_projid) {
code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
@@ -1379,6 +1576,11 @@
goto error_trans_cancel;
}
+ xfs_fill_fsxattr(ip, false, &old_fa);
+ code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+ if (code)
+ goto error_trans_cancel;
+
code = xfs_ioctl_setattr_check_extsize(ip, fa);
if (code)
goto error_trans_cancel;
@@ -1489,6 +1691,7 @@
{
struct xfs_trans *tp;
struct fsxattr fa;
+ struct fsxattr old_fa;
unsigned int flags;
int join_flags = 0;
int error;
@@ -1524,6 +1727,13 @@
goto out_drop_write;
}
+ xfs_fill_fsxattr(ip, false, &old_fa);
+ error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa);
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_drop_write;
+ }
+
error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
if (error) {
xfs_trans_cancel(tp);
@@ -1616,7 +1826,7 @@
error = 0;
out_free_buf:
kmem_free(buf);
- return 0;
+ return error;
}
struct getfsmap_info {
@@ -1674,7 +1884,7 @@
info.mp = ip->i_mount;
info.data = arg;
error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ if (error == -ECANCELED) {
error = 0;
aborted = true;
} else if (error)
@@ -1942,13 +2152,22 @@
case XFS_IOC_FSBULKSTAT_SINGLE:
case XFS_IOC_FSBULKSTAT:
case XFS_IOC_FSINUMBERS:
+ return xfs_ioc_fsbulkstat(mp, cmd, arg);
+
+ case XFS_IOC_BULKSTAT:
return xfs_ioc_bulkstat(mp, cmd, arg);
+ case XFS_IOC_INUMBERS:
+ return xfs_ioc_inumbers(mp, cmd, arg);
case XFS_IOC_FSGEOMETRY_V1:
- return xfs_ioc_fsgeometry_v1(mp, arg);
-
+ return xfs_ioc_fsgeometry(mp, arg, 3);
+ case XFS_IOC_FSGEOMETRY_V4:
+ return xfs_ioc_fsgeometry(mp, arg, 4);
case XFS_IOC_FSGEOMETRY:
- return xfs_ioc_fsgeometry(mp, arg);
+ return xfs_ioc_fsgeometry(mp, arg, 5);
+
+ case XFS_IOC_AG_GEOMETRY:
+ return xfs_ioc_ag_geometry(mp, arg);
case XFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
@@ -2039,9 +2258,7 @@
case XFS_IOC_FSCOUNTS: {
xfs_fsop_counts_t out;
- error = xfs_fs_counts(mp, &out);
- if (error)
- return error;
+ xfs_fs_counts(mp, &out);
if (copy_to_user(arg, &out, sizeof(out)))
return -EFAULT;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 4b17f67..654c0bb 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -77,4 +77,12 @@
uint evmask,
uint16_t state);
+struct xfs_ibulk;
+struct xfs_bstat;
+struct xfs_inogrp;
+
+int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat);
+int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
+
#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index fba115f..1e08bf7 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -3,23 +3,19 @@
* Copyright (c) 2004-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/compat.h>
-#include <linux/ioctl.h>
#include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
#include <linux/fsmap.h>
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_iwalk.h"
#include "xfs_itable.h"
-#include "xfs_error.h"
#include "xfs_fsops.h"
-#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
@@ -52,12 +48,9 @@
struct xfs_mount *mp,
compat_xfs_fsop_geom_v1_t __user *arg32)
{
- xfs_fsop_geom_t fsgeo;
- int error;
+ struct xfs_fsop_geom fsgeo;
- error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
- if (error)
- return error;
+ xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
/* The 32-bit variant simply has some padding at the end */
if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
return -EFAULT;
@@ -87,27 +80,26 @@
}
STATIC int
-xfs_inumbers_fmt_compat(
- void __user *ubuffer,
- const struct xfs_inogrp *buffer,
- long count,
- long *written)
+xfs_fsinumbers_fmt_compat(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *ig)
{
- compat_xfs_inogrp_t __user *p32 = ubuffer;
- long i;
+ struct compat_xfs_inogrp __user *p32 = breq->ubuffer;
+ struct xfs_inogrp ig1;
+ struct xfs_inogrp *igrp = &ig1;
- for (i = 0; i < count; i++) {
- if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
- put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
- put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
- return -EFAULT;
- }
- *written = count * sizeof(*p32);
- return 0;
+ xfs_inumbers_to_inogrp(&ig1, ig);
+
+ if (put_user(igrp->xi_startino, &p32->xi_startino) ||
+ put_user(igrp->xi_alloccount, &p32->xi_alloccount) ||
+ put_user(igrp->xi_allocmask, &p32->xi_allocmask))
+ return -EFAULT;
+
+ return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_inogrp));
}
#else
-#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
+#define xfs_fsinumbers_fmt_compat xfs_fsinumbers_fmt
#endif /* BROKEN_X86_ALIGNMENT */
STATIC int
@@ -124,11 +116,14 @@
return 0;
}
-/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+/*
+ * struct xfs_bstat has differing alignment on intel, & bstime_t sizes
+ * everywhere
+ */
STATIC int
xfs_ioctl32_bstat_copyin(
- xfs_bstat_t *bstat,
- compat_xfs_bstat_t __user *bstat32)
+ struct xfs_bstat *bstat,
+ struct compat_xfs_bstat __user *bstat32)
{
if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
get_user(bstat->bs_mode, &bstat32->bs_mode) ||
@@ -174,16 +169,15 @@
/* Return 0 on success or positive error (to xfs_bulkstat()) */
STATIC int
-xfs_bulkstat_one_fmt_compat(
- void __user *ubuffer,
- int ubsize,
- int *ubused,
- const xfs_bstat_t *buffer)
+xfs_fsbulkstat_one_fmt_compat(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
{
- compat_xfs_bstat_t __user *p32 = ubuffer;
+ struct compat_xfs_bstat __user *p32 = breq->ubuffer;
+ struct xfs_bstat bs1;
+ struct xfs_bstat *buffer = &bs1;
- if (ubsize < sizeof(*p32))
- return -ENOMEM;
+ xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
if (put_user(buffer->bs_ino, &p32->bs_ino) ||
put_user(buffer->bs_mode, &p32->bs_mode) ||
@@ -208,39 +202,50 @@
put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
put_user(buffer->bs_aextents, &p32->bs_aextents))
return -EFAULT;
- if (ubused)
- *ubused = sizeof(*p32);
- return 0;
-}
-STATIC int
-xfs_bulkstat_one_compat(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- int *ubused, /* bytes used by me */
- int *stat) /* BULKSTAT_RV_... */
-{
- return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
- xfs_bulkstat_one_fmt_compat,
- ubused, stat);
+ return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_bstat));
}
/* copied from xfs_ioctl.c */
STATIC int
-xfs_compat_ioc_bulkstat(
+xfs_compat_ioc_fsbulkstat(
xfs_mount_t *mp,
unsigned int cmd,
- compat_xfs_fsop_bulkreq_t __user *p32)
+ struct compat_xfs_fsop_bulkreq __user *p32)
{
u32 addr;
- xfs_fsop_bulkreq_t bulkreq;
- int count; /* # of records returned */
- xfs_ino_t inlast; /* last inode number */
- int done;
+ struct xfs_fsop_bulkreq bulkreq;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .ocount = 0,
+ };
+ xfs_ino_t lastino;
int error;
+ /*
+ * Output structure handling functions. Depending on the command,
+ * either the xfs_bstat and xfs_inogrp structures are written out
+ * to userpace memory via bulkreq.ubuffer. Normally the compat
+ * functions and structure size are the correct ones to use ...
+ */
+ inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat;
+ bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat;
+
+#ifdef CONFIG_X86_X32
+ if (in_x32_syscall()) {
+ /*
+ * ... but on x32 the input xfs_fsop_bulkreq has pointers
+ * which must be handled in the "compat" (32-bit) way, while
+ * the xfs_bstat and xfs_inogrp structures follow native 64-
+ * bit layout convention. So adjust accordingly, otherwise
+ * the data written out in compat layout will not match what
+ * x32 userspace expects.
+ */
+ inumbers_func = xfs_fsinumbers_fmt;
+ bs_one_func = xfs_fsbulkstat_one_fmt;
+ }
+#endif
+
/* done = 1 if there are more stats to get and if bulkstat */
/* should be called again (unused here, but used in dmapi) */
@@ -261,40 +266,55 @@
return -EFAULT;
bulkreq.ocount = compat_ptr(addr);
- if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+ if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
return -EFAULT;
- if ((count = bulkreq.icount) <= 0)
+ if (bulkreq.icount <= 0)
return -EINVAL;
if (bulkreq.ubuffer == NULL)
return -EINVAL;
- if (cmd == XFS_IOC_FSINUMBERS_32) {
- error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer, xfs_inumbers_fmt_compat);
- } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
- int res;
+ breq.ubuffer = bulkreq.ubuffer;
+ breq.icount = bulkreq.icount;
- error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
- sizeof(compat_xfs_bstat_t), NULL, &res);
+ /*
+ * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+ * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect
+ * that *lastip contains either zero or the number of the last inode to
+ * be examined by the previous call and return results starting with
+ * the next inode after that. The new bulk request back end functions
+ * take the inode to start with, so we have to compute the startino
+ * parameter from lastino to maintain correct function. lastino == 0
+ * is a special case because it has traditionally meant "first inode
+ * in filesystem".
+ */
+ if (cmd == XFS_IOC_FSINUMBERS_32) {
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_inumbers(&breq, inumbers_func);
+ lastino = breq.startino - 1;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+ breq.startino = lastino;
+ breq.icount = 1;
+ error = xfs_bulkstat_one(&breq, bs_one_func);
+ lastino = breq.startino;
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
- error = xfs_bulkstat(mp, &inlast, &count,
- xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
- bulkreq.ubuffer, &done);
- } else
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_bulkstat(&breq, bs_one_func);
+ lastino = breq.startino - 1;
+ } else {
error = -EINVAL;
+ }
if (error)
return error;
- if (bulkreq.ocount != NULL) {
- if (copy_to_user(bulkreq.lastip, &inlast,
- sizeof(xfs_ino_t)))
- return -EFAULT;
+ if (bulkreq.lastip != NULL &&
+ copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+ return -EFAULT;
- if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -EFAULT;
- }
+ if (bulkreq.ocount != NULL &&
+ copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+ return -EFAULT;
return 0;
}
@@ -336,6 +356,7 @@
{
int error;
attrlist_cursor_kern_t *cursor;
+ compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
compat_xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
char *kbuf;
@@ -360,7 +381,7 @@
return PTR_ERR(dentry);
error = -ENOMEM;
- kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+ kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
if (!kbuf)
goto out_dput;
@@ -370,6 +391,11 @@
if (error)
goto out_kfree;
+ if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
+ error = -EFAULT;
+ goto out_kfree;
+ }
+
if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
error = -EFAULT;
@@ -521,48 +547,13 @@
struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- void __user *arg = (void __user *)p;
+ void __user *arg = compat_ptr(p);
int error;
trace_xfs_file_compat_ioctl(ip);
switch (cmd) {
- /* No size or alignment issues on any arch */
- case XFS_IOC_DIOINFO:
- case XFS_IOC_FSGEOMETRY:
- case XFS_IOC_FSGETXATTR:
- case XFS_IOC_FSSETXATTR:
- case XFS_IOC_FSGETXATTRA:
- case XFS_IOC_FSSETDM:
- case XFS_IOC_GETBMAP:
- case XFS_IOC_GETBMAPA:
- case XFS_IOC_GETBMAPX:
- case XFS_IOC_FSCOUNTS:
- case XFS_IOC_SET_RESBLKS:
- case XFS_IOC_GET_RESBLKS:
- case XFS_IOC_FSGROWFSLOG:
- case XFS_IOC_GOINGDOWN:
- case XFS_IOC_ERROR_INJECTION:
- case XFS_IOC_ERROR_CLEARALL:
- case FS_IOC_GETFSMAP:
- case XFS_IOC_SCRUB_METADATA:
- return xfs_file_ioctl(filp, cmd, p);
-#ifndef BROKEN_X86_ALIGNMENT
- /* These are handled fine if no alignment issues */
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- case XFS_IOC_FSGEOMETRY_V1:
- case XFS_IOC_FSGROWFSDATA:
- case XFS_IOC_FSGROWFSRT:
- case XFS_IOC_ZERO_RANGE:
- return xfs_file_ioctl(filp, cmd, p);
-#else
+#if defined(BROKEN_X86_ALIGNMENT)
case XFS_IOC_ALLOCSP_32:
case XFS_IOC_FREESP_32:
case XFS_IOC_ALLOCSP64_32:
@@ -631,7 +622,7 @@
case XFS_IOC_FSBULKSTAT_32:
case XFS_IOC_FSBULKSTAT_SINGLE_32:
case XFS_IOC_FSINUMBERS_32:
- return xfs_compat_ioc_bulkstat(mp, cmd, arg);
+ return xfs_compat_ioc_fsbulkstat(mp, cmd, arg);
case XFS_IOC_FD_TO_HANDLE_32:
case XFS_IOC_PATH_TO_HANDLE_32:
case XFS_IOC_PATH_TO_FSHANDLE_32: {
@@ -663,6 +654,7 @@
case XFS_IOC_FSSETDM_BY_HANDLE_32:
return xfs_compat_fssetdm_by_handle(filp, arg);
default:
- return -ENOIOCTLCMD;
+ /* try the native version */
+ return xfs_file_ioctl(filp, cmd, (unsigned long)arg);
}
}
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index d28fa82..7985344 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -36,7 +36,7 @@
__s32 tv_nsec; /* and nanoseconds */
} compat_xfs_bstime_t;
-typedef struct compat_xfs_bstat {
+struct compat_xfs_bstat {
__u64 bs_ino; /* inode number */
__u16 bs_mode; /* type and mode */
__u16 bs_nlink; /* number of links */
@@ -61,14 +61,14 @@
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
-} __compat_packed compat_xfs_bstat_t;
+} __compat_packed;
-typedef struct compat_xfs_fsop_bulkreq {
+struct compat_xfs_fsop_bulkreq {
compat_uptr_t lastip; /* last inode # pointer */
__s32 icount; /* count of entries in buffer */
compat_uptr_t ubuffer; /* user buffer for inode desc. */
compat_uptr_t ocount; /* output count pointer */
-} compat_xfs_fsop_bulkreq_t;
+};
#define XFS_IOC_FSBULKSTAT_32 \
_IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
@@ -106,7 +106,7 @@
xfs_off_t sx_offset; /* offset into file */
xfs_off_t sx_length; /* leng from offset */
char sx_pad[16]; /* pad space, unused */
- compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */
+ struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */
} __compat_packed compat_xfs_swapext_t;
#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
@@ -201,11 +201,11 @@
#define XFS_IOC_FSGEOMETRY_V1_32 \
_IOR('X', 100, struct compat_xfs_fsop_geom_v1)
-typedef struct compat_xfs_inogrp {
+struct compat_xfs_inogrp {
__u64 xi_startino; /* starting inode number */
__s32 xi_alloccount; /* # bits set in allocmask */
__u64 xi_allocmask; /* mask of allocated inodes */
-} __attribute__((packed)) compat_xfs_inogrp_t;
+} __attribute__((packed));
/* These growfs input structures have padding on the end, so must translate */
typedef struct compat_xfs_growfs_data {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6320aca..f780e22 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -4,7 +4,6 @@
* Copyright (c) 2016-2018 Christoph Hellwig.
* All Rights Reserved.
*/
-#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
@@ -12,7 +11,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
@@ -25,7 +23,6 @@
#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
-#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
@@ -35,18 +32,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-void
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool shared)
{
struct xfs_mount *mp = ip->i_mount;
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
@@ -60,6 +79,28 @@
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
+ return 0;
+}
+
+static void
+xfs_hole_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
+ iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+ iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
}
xfs_extlen_t
@@ -123,23 +164,6 @@
return 0;
}
-STATIC int
-xfs_alert_fsblock_zero(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *imap)
-{
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)imap->br_startblock,
- (unsigned long long)imap->br_startoff,
- (unsigned long long)imap->br_blockcount,
- imap->br_state);
- return -EFSCORRUPTED;
-}
-
int
xfs_iomap_write_direct(
xfs_inode_t *ip,
@@ -273,7 +297,7 @@
goto out_unlock;
}
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
@@ -368,12 +392,13 @@
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_inode *ip,
+ int whichfork,
loff_t offset,
loff_t count,
struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmbt_irec prev;
int shift = 0;
@@ -502,19 +527,21 @@
struct inode *inode,
loff_t offset,
loff_t count,
+ unsigned flags,
struct iomap *iomap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
xfs_fileoff_t end_fsb;
- int error = 0, eof = 0;
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int whichfork = XFS_DATA_FORK;
+ int error = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -532,48 +559,109 @@
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
}
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
- if (!eof && got.br_startoff <= offset_fsb) {
- if (xfs_is_reflink_inode(ip)) {
- bool shared;
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
- maxbytes_fsb);
- xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got, &shared);
- if (error)
- goto out_unlock;
+ /*
+ * Search the data fork fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
+ if (eof)
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
+
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * block adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ whichfork = XFS_COW_FORK;
+ goto done;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
+ /*
+ * For reflink files we may need a delalloc reservation when
+ * overwriting shared extents. This includes zeroing of
+ * existing extents that contain data.
+ */
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
}
- trace_xfs_iomap_found(ip, offset, count, 0, &got);
- goto done;
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ whichfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done where somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the values needs to be less than 32-bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (xfs_is_always_cow_inode(ip))
+ whichfork = XFS_COW_FORK;
}
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
- * to keep the chunks of work done where somewhat symmetric with the
- * work writeback does. This is a completely arbitrary number pulled
- * out of thin air as a best guess for initial testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
- &icur);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+ count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -594,9 +682,11 @@
}
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
- eof);
+ error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap,
+ whichfork == XFS_DATA_FORK ? &icur : &ccur,
+ whichfork == XFS_DATA_FORK ? eof : cow_eof);
switch (error) {
case 0:
break;
@@ -618,190 +708,26 @@
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap);
done:
- if (isnullstartblock(got.br_startblock))
- got.br_startblock = DELAYSTARTBLOCK;
-
- if (!got.br_startblock) {
- error = xfs_alert_fsblock_zero(ip, &got);
- if (error)
+ if (whichfork == XFS_COW_FORK) {
+ if (imap.br_startoff > offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
goto out_unlock;
+ }
+ /* ensure we only report blocks we have a reservation for */
+ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+ shared = true;
}
-
- xfs_bmbt_to_iomap(ip, iomap, &got);
-
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- int whichfork,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- unsigned int *cow_seq)
-{
- xfs_mount_t *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_filblks_t count_fsb;
- xfs_trans_t *tp;
- int nimaps;
- int error = 0;
- int flags = XFS_BMAPI_DELALLOC;
- int nres;
-
- if (whichfork == XFS_COW_FORK)
- flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
- /*
- * Make sure that the dquots are there.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = imap->br_blockcount;
- map_start_fsb = imap->br_startoff;
-
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
- nimaps = 0;
- while (nimaps == 0) {
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- /*
- * We have already reserved space for the extent and any
- * indirect blocks when creating the delalloc extent,
- * there is no need to reserve space in this transaction
- * again.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * it is possible that the extents have changed since
- * we did the read call as we dropped the ilock for a
- * while. We have to be careful about truncates or hole
- * punchs here - we are not allowed to allocate
- * non-delalloc blocks here.
- *
- * The only protection against truncation is the pages
- * for the range we are being asked to convert are
- * locked and hence a truncate will block on them
- * first.
- *
- * As a result, if we go beyond the range we really
- * need and hit an delalloc extent boundary followed by
- * a hole while we have excess blocks in the map, we
- * will fill the hole incorrectly and overrun the
- * transaction reservation.
- *
- * Using a single map prevents this as we are forced to
- * check each map we look for overlap with the desired
- * range and abort as soon as we find it. Also, given
- * that we only return a single map, having one beyond
- * what we can return is probably a bit silly.
- *
- * We also need to check that we don't go beyond EOF;
- * this is a truncate optimisation as a truncate sets
- * the new file size before block on the pages we
- * currently have locked under writeback. Because they
- * are about to be tossed, we don't need to write them
- * back....
- */
- nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(ip, &last_block,
- XFS_DATA_FORK);
- if (error)
- goto trans_cancel;
-
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = -EAGAIN;
- goto trans_cancel;
- }
- }
-
- /*
- * From this point onwards we overwrite the imap
- * pointer that the caller gave to us.
- */
- error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, flags, nres, imap,
- &nimaps);
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
-
- if (whichfork == XFS_COW_FORK)
- *cow_seq = READ_ONCE(ifp->if_seq);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
- * covers at least part of the callers request
- */
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, imap);
-
- if ((offset_fsb >= imap->br_startoff) &&
- (offset_fsb < (imap->br_startoff +
- imap->br_blockcount))) {
- XFS_STATS_INC(mp, xs_xstrat_quick);
- return 0;
- }
-
- /*
- * So far we have not mapped the requested part of the
- * file, just surrounding data, try again.
- */
- count_fsb -= imap->br_blockcount;
- map_start_fsb = imap->br_startoff + imap->br_blockcount;
- }
-
-trans_cancel:
- xfs_trans_cancel(tp);
-error0:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
int
xfs_iomap_write_unwritten(
xfs_inode_t *ip,
@@ -850,7 +776,7 @@
* complete here and might deadlock on the iolock.
*/
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
@@ -888,7 +814,7 @@
if (error)
return error;
- if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
return xfs_alert_fsblock_zero(ip, &imap);
if ((numblks_fsb = imap.br_blockcount) == 0) {
@@ -946,7 +872,7 @@
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && is_write) {
+ if (xfs_is_cow_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
@@ -980,7 +906,7 @@
* check, so if we got ILOCK_SHARED for a write and but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
@@ -1003,16 +929,17 @@
struct xfs_bmbt_irec imap;
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
- bool shared = false, trimmed = false;
+ bool shared = false;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
- return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
+ return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+ iomap);
}
/*
@@ -1038,8 +965,7 @@
if (flags & IOMAP_REPORT) {
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
- &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
if (error)
goto out_unlock;
}
@@ -1052,23 +978,33 @@
* Break shared extents if necessary. Checks for non-blocking IO have
* been done up front, so we don't need to do them here.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
+ struct xfs_bmbt_irec cmap;
+ bool directio = (flags & IOMAP_DIRECT);
+
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
!needs_cow_for_zeroing(&imap, nimaps))
goto out_found;
- if (flags & IOMAP_DIRECT) {
- /* may drop and re-acquire the ilock */
- error = xfs_reflink_allocate_cow(ip, &imap, &shared,
- &lockmode);
- if (error)
- goto out_unlock;
- } else {
- error = xfs_reflink_reserve_cow(ip, &imap, &shared);
- if (error)
- goto out_unlock;
- }
+ /* may drop and re-acquire the ilock */
+ cmap = imap;
+ error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+ directio);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For buffered writes we need to report the address of the
+ * previous block (if there was any) so that the higher level
+ * write code can perform read-modify-write operations; we
+ * won't need the CoW fork mapping until writeback. For direct
+ * I/O, which must be block aligned, we need to report the
+ * newly allocated address. If the data fork has a hole, copy
+ * the COW fork mapping to avoid allocating to the data fork.
+ */
+ if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ imap = cmap;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1110,23 +1046,15 @@
return error;
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
- & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
-
- xfs_bmbt_to_iomap(ip, iomap, &imap);
-
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
- return 0;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
goto out_finish;
out_unlock:
@@ -1212,6 +1140,92 @@
};
static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
xfs_xattr_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1244,12 +1258,10 @@
out_unlock:
xfs_iunlock(ip, lockmode);
- if (!error) {
- ASSERT(nimaps);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- }
-
- return error;
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c617054..5c2f6aa 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
- struct xfs_bmbt_irec *, unsigned int *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *, bool shared);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
@@ -42,6 +40,7 @@
}
extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7..fe285d1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -10,30 +10,20 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
-#include "xfs_trans_space.h"
#include "xfs_iomap.h"
-#include "xfs_defer.h"
-#include <linux/capability.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/iomap.h>
-#include <linux/slab.h>
#include <linux/iversion.h>
/*
@@ -191,9 +181,18 @@
xfs_setup_iops(ip);
- if (tmpfile)
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
d_tmpfile(dentry, inode);
- else
+ } else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
@@ -522,6 +521,10 @@
}
}
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +532,10 @@
if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
@@ -786,6 +793,7 @@
out_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e9508ba..884950a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -14,45 +14,66 @@
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_health.h"
/*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
+ * Bulk Stat
+ * =========
+ *
+ * Use the inode walking functions to fill out struct xfs_bulkstat for every
+ * allocated inode, then pass the stat information to some externally provided
+ * iteration function.
*/
-int
+
+struct xfs_bstat_chunk {
+ bulkstat_one_fmt_pf formatter;
+ struct xfs_ibulk *breq;
+ struct xfs_bulkstat *buf;
+};
+
+/*
+ * Fill out the bulkstat info for a single inode and report it somewhere.
+ *
+ * bc->breq->lastino is effectively the inode cursor as we walk through the
+ * filesystem. Therefore, we update it any time we need to move the cursor
+ * forward, regardless of whether or not we're sending any bstat information
+ * back to userspace. If the inode is internal metadata or, has been freed
+ * out from under us, we just simply keep going.
+ *
+ * However, if any other type of error happens we want to stop right where we
+ * are so that userspace will call back with exact number of the bad inode and
+ * we can send back an error code.
+ *
+ * Note that if the formatter tells us there's no space left in the buffer we
+ * move the cursor forward and abort the walk.
+ */
+STATIC int
xfs_bulkstat_one_int(
- struct xfs_mount *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
- int *ubused, /* bytes used by me */
- int *stat) /* BULKSTAT_RV_... */
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ struct xfs_bstat_chunk *bc)
{
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
- struct xfs_bstat *buf; /* return buffer */
- int error = 0; /* error value */
+ struct xfs_bulkstat *buf = bc->buf;
+ int error = -EINVAL;
- *stat = BULKSTAT_RV_NOTHING;
+ if (xfs_internal_inum(mp, ino))
+ goto out_advance;
- if (!buffer || xfs_internal_inum(mp, ino))
- return -EINVAL;
-
- buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
- if (!buf)
- return -ENOMEM;
-
- error = xfs_iget(mp, NULL, ino,
+ error = xfs_iget(mp, tp, ino,
(XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
XFS_ILOCK_SHARED, &ip);
+ if (error == -ENOENT || error == -EINVAL)
+ goto out_advance;
if (error)
- goto out_free;
+ goto out;
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
@@ -63,36 +84,35 @@
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_projid_lo = dic->di_projid_lo;
- buf->bs_projid_hi = dic->di_projid_hi;
+ buf->bs_projectid = xfs_get_projid(ip);
buf->bs_ino = ino;
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
buf->bs_nlink = inode->i_nlink;
- buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
- buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
- buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_atime = inode->i_atime.tv_sec;
+ buf->bs_atime_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime = inode->i_mtime.tv_sec;
+ buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime = inode->i_ctime.tv_sec;
+ buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_btime = dic->di_crtime.t_sec;
+ buf->bs_btime_nsec = dic->di_crtime.t_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
buf->bs_xflags = xfs_ip2xflags(ip);
- buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+ buf->bs_extsize_blks = dic->di_extsize;
buf->bs_extents = dic->di_nextents;
- memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
- buf->bs_dmevmask = dic->di_dmevmask;
- buf->bs_dmstate = dic->di_dmstate;
+ xfs_bulkstat_health(ip, buf);
buf->bs_aextents = dic->di_anextents;
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+ buf->bs_version = XFS_BULKSTAT_VERSION_V5;
if (dic->di_version == 3) {
if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
- buf->bs_cowextsize = dic->di_cowextsize <<
- mp->m_sb.sb_blocklog;
+ buf->bs_cowextsize_blks = dic->di_cowextsize;
}
switch (dic->di_format) {
@@ -116,387 +136,121 @@
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_irele(ip);
- error = formatter(buffer, ubsize, ubused, buf);
- if (!error)
- *stat = BULKSTAT_RV_DIDONE;
+ error = bc->formatter(bc->breq, buf);
+ if (error == -ECANCELED)
+ goto out_advance;
+ if (error)
+ goto out;
- out_free:
- kmem_free(buf);
+out_advance:
+ /*
+ * Advance the cursor to the inode that comes after the one we just
+ * looked at. We want the caller to move along if the bulkstat
+ * information was copied successfully; if we tried to grab the inode
+ * but it's no longer allocated; or if it's internal metadata.
+ */
+ bc->breq->startino = ino + 1;
+out:
return error;
}
-/* Return 0 on success or positive error */
-STATIC int
-xfs_bulkstat_one_fmt(
- void __user *ubuffer,
- int ubsize,
- int *ubused,
- const xfs_bstat_t *buffer)
-{
- if (ubsize < sizeof(*buffer))
- return -ENOMEM;
- if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
- return -EFAULT;
- if (ubused)
- *ubused = sizeof(*buffer);
- return 0;
-}
-
+/* Bulkstat a single inode. */
int
xfs_bulkstat_one(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- int *ubused, /* bytes used by me */
- int *stat) /* BULKSTAT_RV_... */
+ struct xfs_ibulk *breq,
+ bulkstat_one_fmt_pf formatter)
{
- return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
- xfs_bulkstat_one_fmt, ubused, stat);
-}
+ struct xfs_bstat_chunk bc = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ int error;
-/*
- * Loop over all clusters in a chunk for a given incore inode allocation btree
- * record. Do a readahead if there are any allocated inodes in that cluster.
- */
-STATIC void
-xfs_bulkstat_ichunk_ra(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- struct xfs_inobt_rec_incore *irec)
-{
- xfs_agblock_t agbno;
- struct blk_plug plug;
- int blks_per_cluster;
- int inodes_per_cluster;
- int i; /* inode chunk index */
+ ASSERT(breq->icount == 1);
- agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+ KM_MAYFAIL);
+ if (!bc.buf)
+ return -ENOMEM;
- blk_start_plug(&plug);
- for (i = 0; i < XFS_INODES_PER_CHUNK;
- i += inodes_per_cluster, agbno += blks_per_cluster) {
- if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
- &xfs_inode_buf_ops);
- }
- }
- blk_finish_plug(&plug);
-}
+ error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc);
-/*
- * Lookup the inode chunk that the given inode lives in and then get the record
- * if we found the chunk. If the inode was not the last in the chunk and there
- * are some left allocated, update the data for the pointed-to record as well as
- * return the count of grabbed inodes.
- */
-STATIC int
-xfs_bulkstat_grab_ichunk(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t agino, /* starting inode of chunk */
- int *icount,/* return # of inodes grabbed */
- struct xfs_inobt_rec_incore *irec) /* btree record */
-{
- int idx; /* index into inode chunk */
- int stat;
- int error = 0;
-
- /* Lookup the inode chunk that this inode lives in */
- error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
- if (error)
- return error;
- if (!stat) {
- *icount = 0;
- return error;
- }
-
- /* Get the record, should always work */
- error = xfs_inobt_get_rec(cur, irec, &stat);
- if (error)
- return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
-
- /* Check if the record contains the inode in request */
- if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
- *icount = 0;
- return 0;
- }
-
- idx = agino - irec->ir_startino + 1;
- if (idx < XFS_INODES_PER_CHUNK &&
- (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
- int i;
-
- /* We got a right chunk with some left inodes allocated at it.
- * Grab the chunk record. Mark all the uninteresting inodes
- * free -- because they're before our start point.
- */
- for (i = 0; i < idx; i++) {
- if (XFS_INOBT_MASK(i) & ~irec->ir_free)
- irec->ir_freecount++;
- }
-
- irec->ir_free |= xfs_inobt_maskn(0, idx);
- *icount = irec->ir_count - irec->ir_freecount;
- }
-
- return 0;
-}
-
-#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
-
-struct xfs_bulkstat_agichunk {
- char __user **ac_ubuffer;/* pointer into user's buffer */
- int ac_ubleft; /* bytes left in user's buffer */
- int ac_ubelem; /* spaces used in user's buffer */
-};
-
-/*
- * Process inodes in chunk with a pointer to a formatter function
- * that will iget the inode and fill in the appropriate structure.
- */
-static int
-xfs_bulkstat_ag_ichunk(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- struct xfs_inobt_rec_incore *irbp,
- bulkstat_one_pf formatter,
- size_t statstruct_size,
- struct xfs_bulkstat_agichunk *acp,
- xfs_agino_t *last_agino)
-{
- char __user **ubufp = acp->ac_ubuffer;
- int chunkidx;
- int error = 0;
- xfs_agino_t agino = irbp->ir_startino;
-
- for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
- chunkidx++, agino++) {
- int fmterror;
- int ubused;
-
- /* inode won't fit in buffer, we are done */
- if (acp->ac_ubleft < statstruct_size)
- break;
-
- /* Skip if this inode is free */
- if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
- continue;
-
- /* Get the inode and fill in a single buffer */
- ubused = statstruct_size;
- error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
- *ubufp, acp->ac_ubleft, &ubused, &fmterror);
-
- if (fmterror == BULKSTAT_RV_GIVEUP ||
- (error && error != -ENOENT && error != -EINVAL)) {
- acp->ac_ubleft = 0;
- ASSERT(error);
- break;
- }
-
- /* be careful not to leak error if at end of chunk */
- if (fmterror == BULKSTAT_RV_NOTHING || error) {
- error = 0;
- continue;
- }
-
- *ubufp += ubused;
- acp->ac_ubleft -= ubused;
- acp->ac_ubelem++;
- }
+ kmem_free(bc.buf);
/*
- * Post-update *last_agino. At this point, agino will always point one
- * inode past the last inode we processed successfully. Hence we
- * substract that inode when setting the *last_agino cursor so that we
- * return the correct cookie to userspace. On the next bulkstat call,
- * the inode under the lastino cookie will be skipped as we have already
- * processed it here.
+ * If we reported one inode to userspace then we abort because we hit
+ * the end of the buffer. Don't leak that back to userspace.
*/
- *last_agino = agino - 1;
+ if (error == -ECANCELED)
+ error = 0;
return error;
}
-/*
- * Return stat information in bulk (by-inode) for the filesystem.
- */
-int /* error status */
-xfs_bulkstat(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *lastinop, /* last inode returned */
- int *ubcountp, /* size of buffer/count returned */
- bulkstat_one_pf formatter, /* func that'd fill a single buf */
- size_t statstruct_size, /* sizeof struct filling */
- char __user *ubuffer, /* buffer with inode stats */
- int *done) /* 1 if there are more stats to get */
+static int
+xfs_bulkstat_iwalk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ void *data)
{
- xfs_buf_t *agbp; /* agi header buffer */
- xfs_agino_t agino; /* inode # in allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
- xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
- int nirbuf; /* size of irbuf */
- int ubcount; /* size of user's buffer */
- struct xfs_bulkstat_agichunk ac;
- int error = 0;
+ int error;
- /*
- * Get the last inode value, see if there's nothing to do.
- */
- agno = XFS_INO_TO_AGNO(mp, *lastinop);
- agino = XFS_INO_TO_AGINO(mp, *lastinop);
- if (agno >= mp->m_sb.sb_agcount ||
- *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
- *done = 1;
- *ubcountp = 0;
+ error = xfs_bulkstat_one_int(mp, tp, ino, data);
+ /* bulkstat just skips over missing inodes */
+ if (error == -ENOENT || error == -EINVAL)
return 0;
- }
+ return error;
+}
- ubcount = *ubcountp; /* statstruct's */
- ac.ac_ubuffer = &ubuffer;
- ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
- ac.ac_ubelem = 0;
+/*
+ * Check the incoming lastino parameter.
+ *
+ * We allow any inode value that could map to physical space inside the
+ * filesystem because if there are no inodes there, bulkstat moves on to the
+ * next chunk. In other words, the magic agino value of zero takes us to the
+ * first chunk in the AG, and an agino value past the end of the AG takes us to
+ * the first chunk in the next AG.
+ *
+ * Therefore we can end early if the requested inode is beyond the end of the
+ * filesystem or doesn't map properly.
+ */
+static inline bool
+xfs_bulkstat_already_done(
+ struct xfs_mount *mp,
+ xfs_ino_t startino)
+{
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, startino);
- *ubcountp = 0;
- *done = 0;
+ return agno >= mp->m_sb.sb_agcount ||
+ startino != XFS_AGINO_TO_INO(mp, agno, agino);
+}
- irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP);
- if (!irbuf)
+/* Return stat information in bulk (by-inode) for the filesystem. */
+int
+xfs_bulkstat(
+ struct xfs_ibulk *breq,
+ bulkstat_one_fmt_pf formatter)
+{
+ struct xfs_bstat_chunk bc = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ int error;
+
+ if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+ return 0;
+
+ bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+ KM_MAYFAIL);
+ if (!bc.buf)
return -ENOMEM;
- nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf);
- /*
- * Loop over the allocation groups, starting from the last
- * inode returned; 0 means start of the allocation group.
- */
- while (agno < mp->m_sb.sb_agcount) {
- struct xfs_inobt_rec_incore *irbp = irbuf;
- struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf;
- bool end_of_ag = false;
- int icount = 0;
- int stat;
+ error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags,
+ xfs_bulkstat_iwalk, breq->icount, &bc);
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
- if (error)
- break;
- /*
- * Allocate and initialize a btree cursor for ialloc btree.
- */
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
- XFS_BTNUM_INO);
- if (agino > 0) {
- /*
- * In the middle of an allocation group, we need to get
- * the remainder of the chunk we're in.
- */
- struct xfs_inobt_rec_incore r;
-
- error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
- if (error)
- goto del_cursor;
- if (icount) {
- irbp->ir_startino = r.ir_startino;
- irbp->ir_holemask = r.ir_holemask;
- irbp->ir_count = r.ir_count;
- irbp->ir_freecount = r.ir_freecount;
- irbp->ir_free = r.ir_free;
- irbp++;
- }
- /* Increment to the next record */
- error = xfs_btree_increment(cur, 0, &stat);
- } else {
- /* Start of ag. Lookup the first inode chunk */
- error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
- }
- if (error || stat == 0) {
- end_of_ag = true;
- goto del_cursor;
- }
-
- /*
- * Loop through inode btree records in this ag,
- * until we run out of inodes or space in the buffer.
- */
- while (irbp < irbufend && icount < ubcount) {
- struct xfs_inobt_rec_incore r;
-
- error = xfs_inobt_get_rec(cur, &r, &stat);
- if (error || stat == 0) {
- end_of_ag = true;
- goto del_cursor;
- }
-
- /*
- * If this chunk has any allocated inodes, save it.
- * Also start read-ahead now for this chunk.
- */
- if (r.ir_freecount < r.ir_count) {
- xfs_bulkstat_ichunk_ra(mp, agno, &r);
- irbp->ir_startino = r.ir_startino;
- irbp->ir_holemask = r.ir_holemask;
- irbp->ir_count = r.ir_count;
- irbp->ir_freecount = r.ir_freecount;
- irbp->ir_free = r.ir_free;
- irbp++;
- icount += r.ir_count - r.ir_freecount;
- }
- error = xfs_btree_increment(cur, 0, &stat);
- if (error || stat == 0) {
- end_of_ag = true;
- goto del_cursor;
- }
- cond_resched();
- }
-
- /*
- * Drop the btree buffers and the agi buffer as we can't hold any
- * of the locks these represent when calling iget. If there is a
- * pending error, then we are done.
- */
-del_cursor:
- xfs_btree_del_cursor(cur, error);
- xfs_buf_relse(agbp);
- if (error)
- break;
- /*
- * Now format all the good inodes into the user's buffer. The
- * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
- * for the next loop iteration.
- */
- irbufend = irbp;
- for (irbp = irbuf;
- irbp < irbufend && ac.ac_ubleft >= statstruct_size;
- irbp++) {
- error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
- formatter, statstruct_size, &ac,
- &agino);
- if (error)
- break;
-
- cond_resched();
- }
-
- /*
- * If we've run out of space or had a formatting error, we
- * are now done
- */
- if (ac.ac_ubleft < statstruct_size || error)
- break;
-
- if (end_of_ag) {
- agno++;
- agino = 0;
- }
- }
- /*
- * Done, we're either out of filesystem or space to put the data.
- */
- kmem_free(irbuf);
- *ubcountp = ac.ac_ubelem;
+ kmem_free(bc.buf);
/*
* We found some inodes, so clear the error status and return them.
@@ -505,135 +259,139 @@
* triggered again and propagated to userspace as there will be no
* formatted inodes in the buffer.
*/
- if (ac.ac_ubelem)
+ if (breq->ocount > 0)
error = 0;
- /*
- * If we ran out of filesystem, lastino will point off the end of
- * the filesystem so the next call will return immediately.
- */
- *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
- if (agno >= mp->m_sb.sb_agcount)
- *done = 1;
-
return error;
}
-int
-xfs_inumbers_fmt(
- void __user *ubuffer, /* buffer to write to */
- const struct xfs_inogrp *buffer, /* buffer to read from */
- long count, /* # of elements to read */
- long *written) /* # of bytes written */
+/* Convert bulkstat (v5) to bstat (v1). */
+void
+xfs_bulkstat_to_bstat(
+ struct xfs_mount *mp,
+ struct xfs_bstat *bs1,
+ const struct xfs_bulkstat *bstat)
{
- if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
- return -EFAULT;
- *written = count * sizeof(*buffer);
- return 0;
+ /* memset is needed here because of padding holes in the structure. */
+ memset(bs1, 0, sizeof(struct xfs_bstat));
+ bs1->bs_ino = bstat->bs_ino;
+ bs1->bs_mode = bstat->bs_mode;
+ bs1->bs_nlink = bstat->bs_nlink;
+ bs1->bs_uid = bstat->bs_uid;
+ bs1->bs_gid = bstat->bs_gid;
+ bs1->bs_rdev = bstat->bs_rdev;
+ bs1->bs_blksize = bstat->bs_blksize;
+ bs1->bs_size = bstat->bs_size;
+ bs1->bs_atime.tv_sec = bstat->bs_atime;
+ bs1->bs_mtime.tv_sec = bstat->bs_mtime;
+ bs1->bs_ctime.tv_sec = bstat->bs_ctime;
+ bs1->bs_atime.tv_nsec = bstat->bs_atime_nsec;
+ bs1->bs_mtime.tv_nsec = bstat->bs_mtime_nsec;
+ bs1->bs_ctime.tv_nsec = bstat->bs_ctime_nsec;
+ bs1->bs_blocks = bstat->bs_blocks;
+ bs1->bs_xflags = bstat->bs_xflags;
+ bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
+ bs1->bs_extents = bstat->bs_extents;
+ bs1->bs_gen = bstat->bs_gen;
+ bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF;
+ bs1->bs_forkoff = bstat->bs_forkoff;
+ bs1->bs_projid_hi = bstat->bs_projectid >> 16;
+ bs1->bs_sick = bstat->bs_sick;
+ bs1->bs_checked = bstat->bs_checked;
+ bs1->bs_cowextsize = XFS_FSB_TO_B(mp, bstat->bs_cowextsize_blks);
+ bs1->bs_dmevmask = 0;
+ bs1->bs_dmstate = 0;
+ bs1->bs_aextents = bstat->bs_aextents;
+}
+
+struct xfs_inumbers_chunk {
+ inumbers_fmt_pf formatter;
+ struct xfs_ibulk *breq;
+};
+
+/*
+ * INUMBERS
+ * ========
+ * This is how we export inode btree records to userspace, so that XFS tools
+ * can figure out where inodes are allocated.
+ */
+
+/*
+ * Format the inode group structure and report it somewhere.
+ *
+ * Similar to xfs_bulkstat_one_int, lastino is the inode cursor as we walk
+ * through the filesystem so we move it forward unless there was a runtime
+ * error. If the formatter tells us the buffer is now full we also move the
+ * cursor forward and abort the walk.
+ */
+STATIC int
+xfs_inumbers_walk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *irec,
+ void *data)
+{
+ struct xfs_inumbers inogrp = {
+ .xi_startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino),
+ .xi_alloccount = irec->ir_count - irec->ir_freecount,
+ .xi_allocmask = ~irec->ir_free,
+ .xi_version = XFS_INUMBERS_VERSION_V5,
+ };
+ struct xfs_inumbers_chunk *ic = data;
+ int error;
+
+ error = ic->formatter(ic->breq, &inogrp);
+ if (error && error != -ECANCELED)
+ return error;
+
+ ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
+ XFS_INODES_PER_CHUNK;
+ return error;
}
/*
* Return inode number table for the filesystem.
*/
-int /* error status */
+int
xfs_inumbers(
- struct xfs_mount *mp,/* mount point for filesystem */
- xfs_ino_t *lastino,/* last inode returned */
- int *count,/* size of buffer/count returned */
- void __user *ubuffer,/* buffer with inode descriptions */
+ struct xfs_ibulk *breq,
inumbers_fmt_pf formatter)
{
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino);
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino);
- struct xfs_btree_cur *cur = NULL;
- struct xfs_buf *agbp = NULL;
- struct xfs_inogrp *buffer;
- int bcount;
- int left = *count;
- int bufidx = 0;
+ struct xfs_inumbers_chunk ic = {
+ .formatter = formatter,
+ .breq = breq,
+ };
int error = 0;
- *count = 0;
- if (agno >= mp->m_sb.sb_agcount ||
- *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
- return error;
+ if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+ return 0;
- bcount = min(left, (int)(PAGE_SIZE / sizeof(*buffer)));
- buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP);
- do {
- struct xfs_inobt_rec_incore r;
- int stat;
+ error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags,
+ xfs_inumbers_walk, breq->icount, &ic);
- if (!agbp) {
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
- if (error)
- break;
-
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
- XFS_BTNUM_INO);
- error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
- &stat);
- if (error)
- break;
- if (!stat)
- goto next_ag;
- }
-
- error = xfs_inobt_get_rec(cur, &r, &stat);
- if (error)
- break;
- if (!stat)
- goto next_ag;
-
- agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
- buffer[bufidx].xi_startino =
- XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
- buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
- buffer[bufidx].xi_allocmask = ~r.ir_free;
- if (++bufidx == bcount) {
- long written;
-
- error = formatter(ubuffer, buffer, bufidx, &written);
- if (error)
- break;
- ubuffer += written;
- *count += bufidx;
- bufidx = 0;
- }
- if (!--left)
- break;
-
- error = xfs_btree_increment(cur, 0, &stat);
- if (error)
- break;
- if (stat)
- continue;
-
-next_ag:
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- cur = NULL;
- xfs_buf_relse(agbp);
- agbp = NULL;
- agino = 0;
- agno++;
- } while (agno < mp->m_sb.sb_agcount);
-
- if (!error) {
- if (bufidx) {
- long written;
-
- error = formatter(ubuffer, buffer, bufidx, &written);
- if (!error)
- *count += bufidx;
- }
- *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
- }
-
- kmem_free(buffer);
- if (cur)
- xfs_btree_del_cursor(cur, error);
- if (agbp)
- xfs_buf_relse(agbp);
+ /*
+ * We found some inode groups, so clear the error status and return
+ * them. The lastino pointer will point directly at the inode that
+ * triggered any error that occurred, so on the next call the error
+ * will be triggered again and propagated to userspace as there will be
+ * no formatted inode groups in the buffer.
+ */
+ if (breq->ocount > 0)
+ error = 0;
return error;
}
+
+/* Convert an inumbers (v5) struct to a inogrp (v1) struct. */
+void
+xfs_inumbers_to_inogrp(
+ struct xfs_inogrp *ig1,
+ const struct xfs_inumbers *ig)
+{
+ /* memset is needed here because of padding holes in the structure. */
+ memset(ig1, 0, sizeof(struct xfs_inogrp));
+ ig1->xi_startino = ig->xi_startino;
+ ig1->xi_alloccount = ig->xi_alloccount;
+ ig1->xi_allocmask = ig->xi_allocmask;
+}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 8a82228..96a1e2a 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -5,83 +5,60 @@
#ifndef __XFS_ITABLE_H__
#define __XFS_ITABLE_H__
-/*
- * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
- * structures (by the dmi library). This is a pointer to a formatter function
- * that will iget the inode and fill in the appropriate structure.
- * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
- */
-typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
- xfs_ino_t ino,
- void __user *buffer,
- int ubsize,
- int *ubused,
- int *stat);
+/* In-memory representation of a userspace request for batch inode data. */
+struct xfs_ibulk {
+ struct xfs_mount *mp;
+ void __user *ubuffer; /* user output buffer */
+ xfs_ino_t startino; /* start with this inode */
+ unsigned int icount; /* number of elements in ubuffer */
+ unsigned int ocount; /* number of records returned */
+ unsigned int flags; /* see XFS_IBULK_FLAG_* */
+};
+
+/* Only iterate within the same AG as startino */
+#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
/*
- * Values for stat return value.
+ * Advance the user buffer pointer by one record of the given size. If the
+ * buffer is now full, return the appropriate error code.
*/
-#define BULKSTAT_RV_NOTHING 0
-#define BULKSTAT_RV_DIDONE 1
-#define BULKSTAT_RV_GIVEUP 2
+static inline int
+xfs_ibulk_advance(
+ struct xfs_ibulk *breq,
+ size_t bytes)
+{
+ char __user *b = breq->ubuffer;
+
+ breq->ubuffer = b + bytes;
+ breq->ocount++;
+ return breq->ocount == breq->icount ? -ECANCELED : 0;
+}
/*
* Return stat information in bulk (by-inode) for the filesystem.
*/
-int /* error status */
-xfs_bulkstat(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *lastino, /* last inode returned */
- int *count, /* size of buffer/count returned */
- bulkstat_one_pf formatter, /* func that'd fill a single buf */
- size_t statstruct_size,/* sizeof struct that we're filling */
- char __user *ubuffer,/* buffer with inode stats */
- int *done); /* 1 if there are more stats to get */
-typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
- void __user *ubuffer, /* buffer to write to */
- int ubsize, /* remaining user buffer sz */
- int *ubused, /* bytes used by formatter */
- const xfs_bstat_t *buffer); /* buffer to read from */
+/*
+ * Return codes for the formatter function are 0 to continue iterating, and
+ * non-zero to stop iterating. Any non-zero value will be passed up to the
+ * bulkstat/inumbers caller. The special value -ECANCELED can be used to stop
+ * iteration, as neither bulkstat nor inumbers will ever generate that error
+ * code on their own.
+ */
-int
-xfs_bulkstat_one_int(
- xfs_mount_t *mp,
- xfs_ino_t ino,
- void __user *buffer,
- int ubsize,
- bulkstat_one_fmt_pf formatter,
- int *ubused,
- int *stat);
+typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat);
-int
-xfs_bulkstat_one(
- xfs_mount_t *mp,
- xfs_ino_t ino,
- void __user *buffer,
- int ubsize,
- int *ubused,
- int *stat);
+int xfs_bulkstat_one(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+int xfs_bulkstat(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+void xfs_bulkstat_to_bstat(struct xfs_mount *mp, struct xfs_bstat *bs1,
+ const struct xfs_bulkstat *bstat);
-typedef int (*inumbers_fmt_pf)(
- void __user *ubuffer, /* buffer to write to */
- const xfs_inogrp_t *buffer, /* buffer to read from */
- long count, /* # of elements to read */
- long *written); /* # of bytes written */
+typedef int (*inumbers_fmt_pf)(struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp);
-int
-xfs_inumbers_fmt(
- void __user *ubuffer, /* buffer to write to */
- const xfs_inogrp_t *buffer, /* buffer to read from */
- long count, /* # of elements to read */
- long *written); /* # of bytes written */
-
-int /* error status */
-xfs_inumbers(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *last, /* last inode returned */
- int *count, /* size of buffer/count returned */
- void __user *buffer, /* buffer with inode info */
- inumbers_fmt_pf formatter);
+int xfs_inumbers(struct xfs_ibulk *breq, inumbers_fmt_pf formatter);
+void xfs_inumbers_to_inogrp(struct xfs_inogrp *ig1,
+ const struct xfs_inumbers *ig);
#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
new file mode 100644
index 0000000..aa375cf
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+#include "xfs_pwork.h"
+
+/*
+ * Walking Inodes in the Filesystem
+ * ================================
+ *
+ * This iterator function walks a subset of filesystem inodes in increasing
+ * order from @startino until there are no more inodes. For each allocated
+ * inode it finds, it calls a walk function with the relevant inode number and
+ * a pointer to caller-provided data. The walk function can return the usual
+ * negative error code to stop the iteration; 0 to continue the iteration; or
+ * -ECANCELED to stop the iteration. This return value is returned to the
+ * caller.
+ *
+ * Internally, we allow the walk function to do anything, which means that we
+ * cannot maintain the inobt cursor or our lock on the AGI buffer. We
+ * therefore cache the inobt records in kernel memory and only call the walk
+ * function when our memory buffer is full. @nr_recs is the number of records
+ * that we've cached, and @sz_recs is the size of our cache.
+ *
+ * It is the responsibility of the walk function to ensure it accesses
+ * allocated inodes, as the inobt records may be stale by the time they are
+ * acted upon.
+ */
+
+struct xfs_iwalk_ag {
+ /* parallel work control data; will be null if single threaded */
+ struct xfs_pwork pwork;
+
+ struct xfs_mount *mp;
+ struct xfs_trans *tp;
+
+ /* Where do we start the traversal? */
+ xfs_ino_t startino;
+
+ /* Array of inobt records we cache. */
+ struct xfs_inobt_rec_incore *recs;
+
+ /* Number of entries allocated for the @recs array. */
+ unsigned int sz_recs;
+
+ /* Number of entries in the @recs array that are in use. */
+ unsigned int nr_recs;
+
+ /* Inode walk function and data pointer. */
+ xfs_iwalk_fn iwalk_fn;
+ xfs_inobt_walk_fn inobt_walk_fn;
+ void *data;
+
+ /*
+ * Make it look like the inodes up to startino are free so that
+ * bulkstat can start its inode iteration at the correct place without
+ * needing to special case everywhere.
+ */
+ unsigned int trim_start:1;
+
+ /* Skip empty inobt records? */
+ unsigned int skip_empty:1;
+};
+
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record. Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_iwalk_ichunk_ra(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t agbno;
+ struct blk_plug plug;
+ int i; /* inode chunk index */
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+
+ blk_start_plug(&plug);
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
+ xfs_inofree_t imask;
+
+ imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
+ if (imask & ~irec->ir_free) {
+ xfs_btree_reada_bufs(mp, agno, agbno,
+ igeo->blocks_per_cluster,
+ &xfs_inode_buf_ops);
+ }
+ agbno += igeo->blocks_per_cluster;
+ }
+ blk_finish_plug(&plug);
+}
+
+/*
+ * Set the bits in @irec's free mask that correspond to the inodes before
+ * @agino so that we skip them. This is how we restart an inode walk that was
+ * interrupted in the middle of an inode record.
+ */
+STATIC void
+xfs_iwalk_adjust_start(
+ xfs_agino_t agino, /* starting inode of chunk */
+ struct xfs_inobt_rec_incore *irec) /* btree record */
+{
+ int idx; /* index into inode chunk */
+ int i;
+
+ idx = agino - irec->ir_startino;
+
+ /*
+ * We got a right chunk with some left inodes allocated at it. Grab
+ * the chunk record. Mark all the uninteresting inodes free because
+ * they're before our start point.
+ */
+ for (i = 0; i < idx; i++) {
+ if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+ irec->ir_freecount++;
+ }
+
+ irec->ir_free |= xfs_inobt_maskn(0, idx);
+}
+
+/* Allocate memory for a walk. */
+STATIC int
+xfs_iwalk_alloc(
+ struct xfs_iwalk_ag *iwag)
+{
+ size_t size;
+
+ ASSERT(iwag->recs == NULL);
+ iwag->nr_recs = 0;
+
+ /* Allocate a prefetch buffer for inobt records. */
+ size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
+ iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+ if (iwag->recs == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Free memory we allocated for a walk. */
+STATIC void
+xfs_iwalk_free(
+ struct xfs_iwalk_ag *iwag)
+{
+ kmem_free(iwag->recs);
+ iwag->recs = NULL;
+}
+
+/* For each inuse inode in each cached inobt record, call our function. */
+STATIC int
+xfs_iwalk_ag_recs(
+ struct xfs_iwalk_ag *iwag)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ xfs_ino_t ino;
+ unsigned int i, j;
+ xfs_agnumber_t agno;
+ int error;
+
+ agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+ for (i = 0; i < iwag->nr_recs; i++) {
+ struct xfs_inobt_rec_incore *irec = &iwag->recs[i];
+
+ trace_xfs_iwalk_ag_rec(mp, agno, irec);
+
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ return 0;
+
+ if (iwag->inobt_walk_fn) {
+ error = iwag->inobt_walk_fn(mp, tp, agno, irec,
+ iwag->data);
+ if (error)
+ return error;
+ }
+
+ if (!iwag->iwalk_fn)
+ continue;
+
+ for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ return 0;
+
+ /* Skip if this inode is free */
+ if (XFS_INOBT_MASK(j) & irec->ir_free)
+ continue;
+
+ /* Otherwise call our function. */
+ ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+ error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+ if (error)
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/* Delete cursor and let go of AGI. */
+static inline void
+xfs_iwalk_del_inobt(
+ struct xfs_trans *tp,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int error)
+{
+ if (*curpp) {
+ xfs_btree_del_cursor(*curpp, error);
+ *curpp = NULL;
+ }
+ if (*agi_bpp) {
+ xfs_trans_brelse(tp, *agi_bpp);
+ *agi_bpp = NULL;
+ }
+}
+
+/*
+ * Set ourselves up for walking inobt records starting from a given point in
+ * the filesystem.
+ *
+ * If caller passed in a nonzero start inode number, load the record from the
+ * inobt and make the record look like all the inodes before agino are free so
+ * that we skip them, and then move the cursor to the next inobt record. This
+ * is how we support starting an iwalk in the middle of an inode chunk.
+ *
+ * If the caller passed in a start number of zero, move the cursor to the first
+ * inobt record.
+ *
+ * The caller is responsible for cleaning up the cursor and buffer pointer
+ * regardless of the error status.
+ */
+STATIC int
+xfs_iwalk_ag_start(
+ struct xfs_iwalk_ag *iwag,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int *has_more)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_inobt_rec_incore *irec;
+ int error;
+
+ /* Set up a fresh cursor and empty the inobt cache. */
+ iwag->nr_recs = 0;
+ error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ if (error)
+ return error;
+
+ /* Starting at the beginning of the AG? That's easy! */
+ if (agino == 0)
+ return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
+
+ /*
+ * Otherwise, we have to grab the inobt record where we left off, stuff
+ * the record into our cache, and then see if there are more records.
+ * We require a lookup cache of at least two elements so that the
+ * caller doesn't have to deal with tearing down the cursor to walk the
+ * records.
+ */
+ error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
+ if (error)
+ return error;
+
+ /*
+ * If the LE lookup at @agino yields no records, jump ahead to the
+ * inobt cursor increment to see if there are more records to process.
+ */
+ if (!*has_more)
+ goto out_advance;
+
+ /* Get the record, should always work */
+ irec = &iwag->recs[iwag->nr_recs];
+ error = xfs_inobt_get_rec(*curpp, irec, has_more);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+
+ /*
+ * If the LE lookup yielded an inobt record before the cursor position,
+ * skip it and see if there's another one after it.
+ */
+ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ goto out_advance;
+
+ /*
+ * If agino fell in the middle of the inode record, make it look like
+ * the inodes up to agino are free so that we don't return them again.
+ */
+ if (iwag->trim_start)
+ xfs_iwalk_adjust_start(agino, irec);
+
+ /*
+ * The prefetch calculation is supposed to give us a large enough inobt
+ * record cache that grab_ichunk can stage a partial first record and
+ * the loop body can cache a record without having to check for cache
+ * space until after it reads an inobt record.
+ */
+ iwag->nr_recs++;
+ ASSERT(iwag->nr_recs < iwag->sz_recs);
+
+out_advance:
+ return xfs_btree_increment(*curpp, 0, has_more);
+}
+
+/*
+ * The inobt record cache is full, so preserve the inobt cursor state and
+ * run callbacks on the cached inobt records. When we're done, restore the
+ * cursor state to wherever the cursor would have been had the cache not been
+ * full (and therefore we could've just incremented the cursor) if *@has_more
+ * is true. On exit, *@has_more will indicate whether or not the caller should
+ * try for more inode records.
+ */
+STATIC int
+xfs_iwalk_run_callbacks(
+ struct xfs_iwalk_ag *iwag,
+ xfs_agnumber_t agno,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int *has_more)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_inobt_rec_incore *irec;
+ xfs_agino_t restart;
+ int error;
+
+ ASSERT(iwag->nr_recs > 0);
+
+ /* Delete cursor but remember the last record we cached... */
+ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+ irec = &iwag->recs[iwag->nr_recs - 1];
+ restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+ error = xfs_iwalk_ag_recs(iwag);
+ if (error)
+ return error;
+
+ /* ...empty the cache... */
+ iwag->nr_recs = 0;
+
+ if (!has_more)
+ return 0;
+
+ /* ...and recreate the cursor just past where we left off. */
+ error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ if (error)
+ return error;
+
+ return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+}
+
+/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
+STATIC int
+xfs_iwalk_ag(
+ struct xfs_iwalk_ag *iwag)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int has_more;
+ int error = 0;
+
+ /* Set up our cursor at the right place in the inode btree. */
+ agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+ agino = XFS_INO_TO_AGINO(mp, iwag->startino);
+ error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+
+ while (!error && has_more) {
+ struct xfs_inobt_rec_incore *irec;
+
+ cond_resched();
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ goto out;
+
+ /* Fetch the inobt record. */
+ irec = &iwag->recs[iwag->nr_recs];
+ error = xfs_inobt_get_rec(cur, irec, &has_more);
+ if (error || !has_more)
+ break;
+
+ /* No allocated inodes in this chunk; skip it. */
+ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+ if (error)
+ break;
+ continue;
+ }
+
+ /*
+ * Start readahead for this inode chunk in anticipation of
+ * walking the inodes.
+ */
+ if (iwag->iwalk_fn)
+ xfs_iwalk_ichunk_ra(mp, agno, irec);
+
+ /*
+ * If there's space in the buffer for more records, increment
+ * the btree cursor and grab more.
+ */
+ if (++iwag->nr_recs < iwag->sz_recs) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+ if (error || !has_more)
+ break;
+ continue;
+ }
+
+ /*
+ * Otherwise, we need to save cursor state and run the callback
+ * function on the cached records. The run_callbacks function
+ * is supposed to return a cursor pointing to the record where
+ * we would be if we had been able to increment like above.
+ */
+ ASSERT(has_more);
+ error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
+ &has_more);
+ }
+
+ if (iwag->nr_recs == 0 || error)
+ goto out;
+
+ /* Walk the unprocessed records in the cache. */
+ error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
+
+out:
+ xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+ return error;
+}
+
+/*
+ * We experimentally determined that the reduction in ioctl call overhead
+ * diminishes when userspace asks for more than 2048 inodes, so we'll cap
+ * prefetch at this point.
+ */
+#define IWALK_MAX_INODE_PREFETCH (2048U)
+
+/*
+ * Given the number of inodes to prefetch, set the number of inobt records that
+ * we cache in memory, which controls the number of inodes we try to read
+ * ahead. Set the maximum if @inodes == 0.
+ */
+static inline unsigned int
+xfs_iwalk_prefetch(
+ unsigned int inodes)
+{
+ unsigned int inobt_records;
+
+ /*
+ * If the caller didn't tell us the number of inodes they wanted,
+ * assume the maximum prefetch possible for best performance.
+ * Otherwise, cap prefetch at that maximum so that we don't start an
+ * absurd amount of prefetch.
+ */
+ if (inodes == 0)
+ inodes = IWALK_MAX_INODE_PREFETCH;
+ inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);
+
+ /* Round the inode count up to a full chunk. */
+ inodes = round_up(inodes, XFS_INODES_PER_CHUNK);
+
+ /*
+ * In order to convert the number of inodes to prefetch into an
+ * estimate of the number of inobt records to cache, we require a
+ * conversion factor that reflects our expectations of the average
+ * loading factor of an inode chunk. Based on data gathered, most
+ * (but not all) filesystems manage to keep the inode chunks totally
+ * full, so we'll underestimate slightly so that our readahead will
+ * still deliver the performance we want on aging filesystems:
+ *
+ * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
+ *
+ * The funny math is to avoid integer division.
+ */
+ inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
+
+ /*
+ * Allocate enough space to prefetch at least two inobt records so that
+ * we can cache both the record where the iwalk started and the next
+ * record. This simplifies the AG inode walk loop setup code.
+ */
+ return max(inobt_records, 2U);
+}
+
+/*
+ * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn
+ * will be called for each allocated inode, being passed the inode's number and
+ * @data. @max_prefetch controls how many inobt records' worth of inodes we
+ * try to readahead.
+ */
+int
+xfs_iwalk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records,
+ void *data)
+{
+ struct xfs_iwalk_ag iwag = {
+ .mp = mp,
+ .tp = tp,
+ .iwalk_fn = iwalk_fn,
+ .data = data,
+ .startino = startino,
+ .sz_recs = xfs_iwalk_prefetch(inode_records),
+ .trim_start = 1,
+ .skip_empty = 1,
+ .pwork = XFS_PWORK_SINGLE_THREADED,
+ };
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(&iwag);
+ if (error)
+ return error;
+
+ for (; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_iwalk_ag(&iwag);
+ if (error)
+ break;
+ iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ }
+
+ xfs_iwalk_free(&iwag);
+ return error;
+}
+
+/* Run per-thread iwalk work. */
+static int
+xfs_iwalk_ag_work(
+ struct xfs_mount *mp,
+ struct xfs_pwork *pwork)
+{
+ struct xfs_iwalk_ag *iwag;
+ int error = 0;
+
+ iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
+ if (xfs_pwork_want_abort(pwork))
+ goto out;
+
+ error = xfs_iwalk_alloc(iwag);
+ if (error)
+ goto out;
+
+ error = xfs_iwalk_ag(iwag);
+ xfs_iwalk_free(iwag);
+out:
+ kmem_free(iwag);
+ return error;
+}
+
+/*
+ * Walk all the inodes in the filesystem using multiple threads to process each
+ * AG.
+ */
+int
+xfs_iwalk_threaded(
+ struct xfs_mount *mp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records,
+ bool polled,
+ void *data)
+{
+ struct xfs_pwork_ctl pctl;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ unsigned int nr_threads;
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
+ error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
+ nr_threads);
+ if (error)
+ return error;
+
+ for (; agno < mp->m_sb.sb_agcount; agno++) {
+ struct xfs_iwalk_ag *iwag;
+
+ if (xfs_pwork_ctl_want_abort(&pctl))
+ break;
+
+ iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+ iwag->mp = mp;
+ iwag->iwalk_fn = iwalk_fn;
+ iwag->data = data;
+ iwag->startino = startino;
+ iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ xfs_pwork_queue(&pctl, &iwag->pwork);
+ startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ }
+
+ if (polled)
+ xfs_pwork_poll(&pctl);
+ return xfs_pwork_destroy(&pctl);
+}
+
+/*
+ * Allow callers to cache up to a page's worth of inobt records. This reflects
+ * the existing inumbers prefetching behavior. Since the inobt walk does not
+ * itself do anything with the inobt records, we can set a fairly high limit
+ * here.
+ */
+#define MAX_INOBT_WALK_PREFETCH \
+ (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
+
+/*
+ * Given the number of records that the user wanted, set the number of inobt
+ * records that we buffer in memory. Set the maximum if @inobt_records == 0.
+ */
+static inline unsigned int
+xfs_inobt_walk_prefetch(
+ unsigned int inobt_records)
+{
+ /*
+ * If the caller didn't tell us the number of inobt records they
+ * wanted, assume the maximum prefetch possible for best performance.
+ */
+ if (inobt_records == 0)
+ inobt_records = MAX_INOBT_WALK_PREFETCH;
+
+ /*
+ * Allocate enough space to prefetch at least two inobt records so that
+ * we can cache both the record where the iwalk started and the next
+ * record. This simplifies the AG inode walk loop setup code.
+ */
+ inobt_records = max(inobt_records, 2U);
+
+ /*
+ * Cap prefetch at that maximum so that we don't use an absurd amount
+ * of memory.
+ */
+ return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
+}
+
+/*
+ * Walk all inode btree records in the filesystem starting from @startino. The
+ * @inobt_walk_fn will be called for each btree record, being passed the incore
+ * record and @data. @max_prefetch controls how many inobt records we try to
+ * cache ahead of time.
+ */
+int
+xfs_inobt_walk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_inobt_walk_fn inobt_walk_fn,
+ unsigned int inobt_records,
+ void *data)
+{
+ struct xfs_iwalk_ag iwag = {
+ .mp = mp,
+ .tp = tp,
+ .inobt_walk_fn = inobt_walk_fn,
+ .data = data,
+ .startino = startino,
+ .sz_recs = xfs_inobt_walk_prefetch(inobt_records),
+ .pwork = XFS_PWORK_SINGLE_THREADED,
+ };
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(&iwag);
+ if (error)
+ return error;
+
+ for (; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_iwalk_ag(&iwag);
+ if (error)
+ break;
+ iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ }
+
+ xfs_iwalk_free(&iwag);
+ return error;
+}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
new file mode 100644
index 0000000..37a795f
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_IWALK_H__
+#define __XFS_IWALK_H__
+
+/*
+ * Return codes for the inode/inobt walk function are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iwalk or inobt_walk caller. The special value -ECANCELED can be used to
+ * stop iteration, as neither iwalk nor inobt_walk will ever generate that
+ * error code on their own.
+ */
+
+/* Walk all inodes in the filesystem starting from @startino. */
+typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t ino, void *data);
+
+int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
+ unsigned int flags, xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records, void *data);
+int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
+ unsigned int flags, xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records, bool poll, void *data);
+
+/* Only iterate inodes within the same AG as @startino. */
+#define XFS_IWALK_SAME_AG (0x1)
+
+#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
+
+/* Walk all inode btree records in the filesystem starting from @startino. */
+typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *irec,
+ void *data);
+
+int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t startino, unsigned int flags,
+ xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records,
+ void *data);
+
+/* Only iterate inobt records within the same AG as @startino. */
+#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG)
+
+#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG)
+
+#endif /* __XFS_IWALK_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index edbd5a2..ca15105 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -110,8 +110,6 @@
#define current_restore_flags_nested(sp, f) \
(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
-#define spinlock_destroy(lock)
-
#define NBBY 8 /* number of bits per byte */
/*
@@ -221,6 +219,9 @@
return x;
}
+int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+ char *data, unsigned int op);
+
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c3b610b..641d07f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -16,13 +16,10 @@
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_log_recover.h"
-#include "xfs_inode.h"
#include "xfs_trace.h"
-#include "xfs_fsops.h"
-#include "xfs_cksum.h"
#include "xfs_sysfs.h"
#include "xfs_sb.h"
+#include "xfs_health.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -44,21 +41,14 @@
xlog_space_left(
struct xlog *log,
atomic64_t *head);
-STATIC int
-xlog_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
STATIC void
xlog_dealloc_log(
struct xlog *log);
/* local state machine functions */
-STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
-STATIC void
-xlog_state_do_callback(
- struct xlog *log,
- int aborted,
- struct xlog_in_core *iclog);
+STATIC void xlog_state_done_syncing(
+ struct xlog_in_core *iclog,
+ bool aborted);
STATIC int
xlog_state_get_iclog_space(
struct xlog *log,
@@ -106,8 +96,7 @@
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- int count,
- bool syncing);
+ int count);
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -116,7 +105,7 @@
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
-#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b,c)
#endif
@@ -225,15 +214,42 @@
{
struct xlog_ticket *tic;
int need_bytes;
+ bool woken_task = false;
list_for_each_entry(tic, &head->waiters, t_queue) {
+
+ /*
+ * There is a chance that the size of the CIL checkpoints in
+ * progress at the last AIL push target calculation resulted in
+ * limiting the target to the log head (l_last_sync_lsn) at the
+ * time. This may not reflect where the log head is now as the
+ * CIL checkpoints may have completed.
+ *
+ * Hence when we are woken here, it may be that the head of the
+ * log that has moved rather than the tail. As the tail didn't
+ * move, there still won't be space available for the
+ * reservation we require. However, if the AIL has already
+ * pushed to the target defined by the old log head location, we
+ * will hang here waiting for something else to update the AIL
+ * push target.
+ *
+ * Therefore, if there isn't space to wake the first waiter on
+ * the grant head, we need to push the AIL again to ensure the
+ * target reflects both the current log tail and log head
+ * position before we wait for the tail to move again.
+ */
+
need_bytes = xlog_ticket_reservation(log, head, tic);
- if (*free_bytes < need_bytes)
+ if (*free_bytes < need_bytes) {
+ if (!woken_task)
+ xlog_grant_push_ail(log, need_bytes);
return false;
+ }
*free_bytes -= need_bytes;
trace_xfs_log_grant_wake_up(log, tic);
wake_up_process(tic->t_task);
+ woken_task = true;
}
return true;
@@ -439,11 +455,7 @@
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
- KM_SLEEP | KM_MAYFAIL);
- if (!tic)
- return -ENOMEM;
-
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -540,32 +552,6 @@
return lsn;
}
-/*
- * Attaches a new iclog I/O completion callback routine during
- * transaction commit. If the log is in error state, a non-zero
- * return code is handed back and the caller is responsible for
- * executing the callback at an appropriate time.
- */
-int
-xfs_log_notify(
- struct xlog_in_core *iclog,
- xfs_log_callback_t *cb)
-{
- int abortflg;
-
- spin_lock(&iclog->ic_callback_lock);
- abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
- if (!abortflg) {
- ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
- (iclog->ic_state == XLOG_STATE_WANT_SYNC));
- cb->cb_next = NULL;
- *(iclog->ic_callback_tail) = cb;
- iclog->ic_callback_tail = &(cb->cb_next);
- }
- spin_unlock(&iclog->ic_callback_lock);
- return abortflg;
-}
-
int
xfs_log_release_iclog(
struct xfs_mount *mp,
@@ -806,16 +792,12 @@
* The mount has failed. Cancel the recovery if it hasn't completed and destroy
* the log.
*/
-int
+void
xfs_log_mount_cancel(
struct xfs_mount *mp)
{
- int error;
-
- error = xlog_recover_cancel(mp->m_log);
+ xlog_recover_cancel(mp->m_log);
xfs_log_unmount(mp);
-
- return error;
}
/*
@@ -861,7 +843,7 @@
* recalculated during log recovery at next mount. Refer to
* xlog_check_unmount_rec for more details.
*/
- if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp,
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
@@ -931,7 +913,7 @@
* Or, if we are doing a forced umount (typically because of IO errors).
*/
if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+ xfs_readonly_buftarg(log->l_targ)) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
}
@@ -1243,53 +1225,49 @@
}
-/*
- * Log function which is called when an io completes.
- *
- * The log manager needs its own routine, in order to control what
- * happens with the buffer after the write completes.
- */
static void
-xlog_iodone(xfs_buf_t *bp)
+xlog_ioend_work(
+ struct work_struct *work)
{
- struct xlog_in_core *iclog = bp->b_log_item;
- struct xlog *l = iclog->ic_log;
- int aborted = 0;
+ struct xlog_in_core *iclog =
+ container_of(work, struct xlog_in_core, ic_end_io_work);
+ struct xlog *log = iclog->ic_log;
+ bool aborted = false;
+ int error;
+
+ error = blk_status_to_errno(iclog->ic_bio.bi_status);
+#ifdef DEBUG
+ /* treat writes with injected CRC errors as failed */
+ if (iclog->ic_fail_crc)
+ error = -EIO;
+#endif
/*
- * Race to shutdown the filesystem if we see an error or the iclog is in
- * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
- * CRC errors into log recovery.
+ * Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) ||
- iclog->ic_state & XLOG_STATE_IOABORT) {
- if (iclog->ic_state & XLOG_STATE_IOABORT)
- iclog->ic_state &= ~XLOG_STATE_IOABORT;
-
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_stale(bp);
- xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ xfs_alert(log->l_mp, "log I/O error %d", error);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
/*
* This flag will be propagated to the trans-committed
* callback routines to let them know that the log-commit
* didn't succeed.
*/
- aborted = XFS_LI_ABORTED;
+ aborted = true;
} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
- aborted = XFS_LI_ABORTED;
+ aborted = true;
}
- /* log I/O is always issued ASYNC */
- ASSERT(bp->b_flags & XBF_ASYNC);
xlog_state_done_syncing(iclog, aborted);
+ bio_uninit(&iclog->ic_bio);
/*
- * drop the buffer lock now that we are done. Nothing references
- * the buffer after this, so an unmount waiting on this lock can now
- * tear it down safely. As such, it is unsafe to reference the buffer
- * (bp) after the unlock as we could race with it being freed.
+ * Drop the lock to signal that we are done. Nothing references the
+ * iclog after this, so an unmount waiting on this lock can now tear it
+ * down safely. As such, it is unsafe to reference the iclog after the
+ * unlock as we could race with it being freed.
*/
- xfs_buf_unlock(bp);
+ up(&iclog->ic_sema);
}
/*
@@ -1300,65 +1278,26 @@
* If the filesystem blocksize is too large, we may need to choose a
* larger size since the directory code currently logs entire blocks.
*/
-
STATIC void
xlog_get_iclog_buffer_size(
struct xfs_mount *mp,
struct xlog *log)
{
- int size;
- int xhdrs;
-
if (mp->m_logbufs <= 0)
- log->l_iclog_bufs = XLOG_MAX_ICLOGS;
- else
- log->l_iclog_bufs = mp->m_logbufs;
+ mp->m_logbufs = XLOG_MAX_ICLOGS;
+ if (mp->m_logbsize <= 0)
+ mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
+
+ log->l_iclog_bufs = mp->m_logbufs;
+ log->l_iclog_size = mp->m_logbsize;
/*
- * Buffer size passed in from mount system call.
+ * # headers = size / 32k - one header holds cycles from 32k of data.
*/
- if (mp->m_logbsize > 0) {
- size = log->l_iclog_size = mp->m_logbsize;
- log->l_iclog_size_log = 0;
- while (size != 1) {
- log->l_iclog_size_log++;
- size >>= 1;
- }
-
- if (xfs_sb_version_haslogv2(&mp->m_sb)) {
- /* # headers = size / 32k
- * one header holds cycles from 32k of data
- */
-
- xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
- if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- log->l_iclog_hsize = xhdrs << BBSHIFT;
- log->l_iclog_heads = xhdrs;
- } else {
- ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
- log->l_iclog_hsize = BBSIZE;
- log->l_iclog_heads = 1;
- }
- goto done;
- }
-
- /* All machines use 32kB buffers by default. */
- log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
- log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-
- /* the default log size is 16k or 32k which is one header sector */
- log->l_iclog_hsize = BBSIZE;
- log->l_iclog_heads = 1;
-
-done:
- /* are we being asked to make the sizes selected above visible? */
- if (mp->m_logbufs == 0)
- mp->m_logbufs = log->l_iclog_bufs;
- if (mp->m_logbsize == 0)
- mp->m_logbsize = log->l_iclog_size;
-} /* xlog_get_iclog_buffer_size */
-
+ log->l_iclog_heads =
+ DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
+ log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
+}
void
xfs_log_work_queue(
@@ -1421,7 +1360,6 @@
xlog_rec_header_t *head;
xlog_in_core_t **iclogp;
xlog_in_core_t *iclog, *prev_iclog=NULL;
- xfs_buf_t *bp;
int i;
int error = -ENOMEM;
uint log2_size = 0;
@@ -1479,30 +1417,6 @@
xlog_get_iclog_buffer_size(mp, log);
- /*
- * Use a NULL block for the extra log buffer used during splits so that
- * it will trigger errors if we ever try to do IO on it without first
- * having set it up properly.
- */
- error = -ENOMEM;
- bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
- BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
- if (!bp)
- goto out_free_log;
-
- /*
- * The iclogbuf buffer locks are held over IO but we are not going to do
- * IO yet. Hence unlock the buffer so that the log IO path can grab it
- * when appropriately.
- */
- ASSERT(xfs_buf_islocked(bp));
- xfs_buf_unlock(bp);
-
- /* use high priority wq for log I/O completion */
- bp->b_ioend_wq = mp->m_log_workqueue;
- bp->b_iodone = xlog_iodone;
- log->l_xbuf = bp;
-
spin_lock_init(&log->l_icloglock);
init_waitqueue_head(&log->l_flush_wait);
@@ -1515,29 +1429,23 @@
* xlog_in_core_t in xfs_log_priv.h for details.
*/
ASSERT(log->l_iclog_size >= 4096);
- for (i=0; i < log->l_iclog_bufs; i++) {
- *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
- if (!*iclogp)
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp);
+ size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
+ sizeof(struct bio_vec);
+
+ iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ if (!iclog)
goto out_free_iclog;
- iclog = *iclogp;
+ *iclogp = iclog;
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- BTOBB(log->l_iclog_size),
- XBF_NO_IOACCT);
- if (!bp)
+ iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask,
+ KM_MAYFAIL | KM_ZERO);
+ if (!iclog->ic_data)
goto out_free_iclog;
-
- ASSERT(xfs_buf_islocked(bp));
- xfs_buf_unlock(bp);
-
- /* use high priority wq for log I/O completion */
- bp->b_ioend_wq = mp->m_log_workqueue;
- bp->b_iodone = xlog_iodone;
- iclog->ic_bp = bp;
- iclog->ic_data = bp->b_addr;
#ifdef DEBUG
log->l_iclog_bak[i] = &iclog->ic_header;
#endif
@@ -1551,36 +1459,43 @@
head->h_fmt = cpu_to_be32(XLOG_FMT);
memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
- iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
+ iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
spin_lock_init(&iclog->ic_callback_lock);
- iclog->ic_callback_tail = &(iclog->ic_callback);
+ INIT_LIST_HEAD(&iclog->ic_callbacks);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
+ INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
+ sema_init(&iclog->ic_sema, 1);
iclogp = &iclog->ic_next;
}
*iclogp = log->l_iclog; /* complete ring */
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
+ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
+ mp->m_fsname);
+ if (!log->l_ioend_workqueue)
+ goto out_free_iclog;
+
error = xlog_cil_init(log);
if (error)
- goto out_free_iclog;
+ goto out_destroy_workqueue;
return log;
+out_destroy_workqueue:
+ destroy_workqueue(log->l_ioend_workqueue);
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- if (iclog->ic_bp)
- xfs_buf_free(iclog->ic_bp);
+ kmem_free(iclog->ic_data);
kmem_free(iclog);
}
- spinlock_destroy(&log->l_icloglock);
- xfs_buf_free(log->l_xbuf);
out_free_log:
kmem_free(log);
out:
@@ -1765,42 +1680,155 @@
return xfs_end_cksum(crc);
}
-/*
- * The bdstrat callback function for log bufs. This gives us a central
- * place to trap bufs in case we get hit by a log I/O error and need to
- * shutdown. Actually, in practice, even when we didn't get a log error,
- * we transition the iclogs to IOERROR state *after* flushing all existing
- * iclogs to disk. This is because we don't want anymore new transactions to be
- * started or completed afterwards.
- *
- * We lock the iclogbufs here so that we can serialise against IO completion
- * during unmount. We might be processing a shutdown triggered during unmount,
- * and that can occur asynchronously to the unmount thread, and hence we need to
- * ensure that completes before tearing down the iclogbufs. Hence we need to
- * hold the buffer lock across the log IO to acheive that.
- */
-STATIC int
-xlog_bdstrat(
- struct xfs_buf *bp)
+static void
+xlog_bio_end_io(
+ struct bio *bio)
{
- struct xlog_in_core *iclog = bp->b_log_item;
+ struct xlog_in_core *iclog = bio->bi_private;
- xfs_buf_lock(bp);
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- xfs_buf_ioerror(bp, -EIO);
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ queue_work(iclog->ic_log->l_ioend_workqueue,
+ &iclog->ic_end_io_work);
+}
+
+static void
+xlog_map_iclog_data(
+ struct bio *bio,
+ void *data,
+ size_t count)
+{
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ size_t len = min_t(size_t, count, PAGE_SIZE - off);
+
+ WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+
+ data += len;
+ count -= len;
+ } while (count);
+}
+
+STATIC void
+xlog_write_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint64_t bno,
+ unsigned int count,
+ bool need_flush)
+{
+ ASSERT(bno < log->l_logBBsize);
+
+ /*
+ * We lock the iclogbufs here so that we can serialise against I/O
+ * completion during unmount. We might be processing a shutdown
+ * triggered during unmount, and that can occur asynchronously to the
+ * unmount thread, and hence we need to ensure that completes before
+ * tearing down the iclogbufs. Hence we need to hold the buffer lock
+ * across the log IO to archieve that.
+ */
+ down(&iclog->ic_sema);
+ if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here. Similarly, IO completion will unlock the
- * buffer, so we don't do it here.
+ * doing it here. We kick of the state machine and unlock
+ * the buffer manually, the code needs to be kept in sync
+ * with the I/O completion path.
*/
- return 0;
+ xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
+ up(&iclog->ic_sema);
+ return;
}
- xfs_buf_submit(bp);
- return 0;
+ iclog->ic_io_size = count;
+
+ bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
+ bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+ if (need_flush)
+ iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+
+ xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
+ if (is_vmalloc_addr(iclog->ic_data))
+ flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);
+
+ /*
+ * If this log buffer would straddle the end of the log we will have
+ * to split it up into two bios, so that we can continue at the start.
+ */
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ struct bio *split;
+
+ split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
+ GFP_NOIO, &fs_bio_set);
+ bio_chain(split, &iclog->ic_bio);
+ submit_bio(split);
+
+ /* restart at logical offset zero for the remainder */
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
+ }
+
+ submit_bio(&iclog->ic_bio);
+}
+
+/*
+ * We need to bump cycle number for the part of the iclog that is
+ * written to the start of the log. Watch out for the header magic
+ * number case, though.
+ */
+static void
+xlog_split_iclog(
+ struct xlog *log,
+ void *data,
+ uint64_t bno,
+ unsigned int count)
+{
+ unsigned int split_offset = BBTOB(log->l_logBBsize - bno);
+ unsigned int i;
+
+ for (i = split_offset; i < count; i += BBSIZE) {
+ uint32_t cycle = get_unaligned_be32(data + i);
+
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ put_unaligned_be32(cycle, data + i);
+ }
+}
+
+static int
+xlog_calc_iclog_size(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint32_t *roundoff)
+{
+ uint32_t count_init, count;
+ bool use_lsunit;
+
+ use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+ log->l_mp->m_sb.sb_logsunit > 1;
+
+ /* Add for LR header */
+ count_init = log->l_iclog_hsize + iclog->ic_offset;
+
+ /* Round out the log write size */
+ if (use_lsunit) {
+ /* we have a v2 stripe unit to use */
+ count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+ } else {
+ count = BBTOB(BTOBB(count_init));
+ }
+
+ ASSERT(count >= count_init);
+ *roundoff = count - count_init;
+
+ if (use_lsunit)
+ ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
+ else
+ ASSERT(*roundoff < BBTOB(1));
+ return count;
}
/*
@@ -1823,46 +1851,23 @@
* log will require grabbing the lock though.
*
* The entire log manager uses a logical block numbering scheme. Only
- * log_sync (and then only bwrite()) know about the fact that the log may
- * not start with block zero on a given device. The log block start offset
- * is added immediately before calling bwrite().
+ * xlog_write_iclog knows about the fact that the log may not start with
+ * block zero on a given device.
*/
-
-STATIC int
+STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog)
{
- xfs_buf_t *bp;
- int i;
- uint count; /* byte count of bwrite */
- uint count_init; /* initial count before roundup */
- int roundoff; /* roundoff to BB or stripe */
- int split = 0; /* split write into two regions */
- int error;
- int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
- int size;
+ unsigned int count; /* byte count of bwrite */
+ unsigned int roundoff; /* roundoff to BB or stripe */
+ uint64_t bno;
+ unsigned int size;
+ bool need_flush = true, split = false;
- XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
- /* Add for LR header */
- count_init = log->l_iclog_hsize + iclog->ic_offset;
-
- /* Round out the log write size */
- if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
- /* we have a v2 stripe unit to use */
- count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
- } else {
- count = BBTOB(BTOBB(count_init));
- }
- roundoff = count - count_init;
- ASSERT(roundoff >= 0);
- ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
- roundoff < log->l_mp->m_sb.sb_logsunit)
- ||
- (log->l_mp->m_sb.sb_logsunit <= 1 &&
- roundoff < BBTOB(1)));
+ count = xlog_calc_iclog_size(log, iclog, &roundoff);
/* move grant heads by roundoff in sync */
xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
@@ -1873,41 +1878,19 @@
/* real byte length */
size = iclog->ic_offset;
- if (v2)
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
size += roundoff;
iclog->ic_header.h_len = cpu_to_be32(size);
- bp = iclog->ic_bp;
- XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
-
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
+ bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
+
/* Do we need to split this write into 2 parts? */
- if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
- char *dptr;
-
- split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
- count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
- iclog->ic_bwritecnt = 2;
-
- /*
- * Bump the cycle numbers at the start of each block in the
- * part of the iclog that ends up in the buffer that gets
- * written to the start of the log.
- *
- * Watch out for the header magic number case, though.
- */
- dptr = (char *)&iclog->ic_header + count;
- for (i = 0; i < split; i += BBSIZE) {
- uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
- if (++cycle == XLOG_HEADER_MAGIC_NUM)
- cycle++;
- *(__be32 *)dptr = cpu_to_be32(cycle);
-
- dptr += BBSIZE;
- }
- } else {
- iclog->ic_bwritecnt = 1;
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ xlog_split_iclog(log, &iclog->ic_header, bno, count);
+ split = true;
}
/* calculcate the checksum */
@@ -1920,18 +1903,15 @@
* write on I/O completion and shutdown the fs. The subsequent mount
* detects the bad CRC and attempts to recover.
*/
+#ifdef DEBUG
if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
- iclog->ic_state |= XLOG_STATE_IOABORT;
+ iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
be64_to_cpu(iclog->ic_header.h_lsn));
}
-
- bp->b_io_length = BTOBB(count);
- bp->b_log_item = iclog;
- bp->b_flags &= ~XBF_FLUSH;
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
+#endif
/*
* Flush the data device before flushing the log to make sure all meta
@@ -1941,50 +1921,14 @@
* synchronously here; for an internal log we can simply use the block
* layer state machine for preflushes.
*/
- if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+ if (log->l_targ != log->l_mp->m_ddev_targp || split) {
xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- else
- bp->b_flags |= XBF_FLUSH;
-
- ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
- ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
- xlog_verify_iclog(log, iclog, count, true);
-
- /* account for log which doesn't start at block #0 */
- XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-
- /*
- * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
- * is shutting down.
- */
- error = xlog_bdstrat(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_sync");
- return error;
+ need_flush = false;
}
- if (split) {
- bp = iclog->ic_log->l_xbuf;
- XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
- xfs_buf_associate_memory(bp,
- (char *)&iclog->ic_header + count, split);
- bp->b_log_item = iclog;
- bp->b_flags &= ~XBF_FLUSH;
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
- ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
- ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
- /* account for internal log which doesn't start at block #0 */
- XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
- error = xlog_bdstrat(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
- return error;
- }
- }
- return 0;
-} /* xlog_sync */
+ xlog_verify_iclog(log, iclog, count);
+ xlog_write_iclog(log, iclog, bno, count, need_flush);
+}
/*
* Deallocate a log structure
@@ -2004,31 +1948,21 @@
*/
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
- xfs_buf_lock(iclog->ic_bp);
- xfs_buf_unlock(iclog->ic_bp);
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
iclog = iclog->ic_next;
}
- /*
- * Always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it. Also, cycle the lock
- * first to ensure we've completed IO on it.
- */
- xfs_buf_lock(log->l_xbuf);
- xfs_buf_unlock(log->l_xbuf);
- xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
- xfs_buf_free(log->l_xbuf);
-
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
- xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
+ kmem_free(iclog->ic_data);
kmem_free(iclog);
iclog = next_iclog;
}
- spinlock_destroy(&log->l_icloglock);
log->l_mp->m_log = NULL;
+ destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
} /* xlog_dealloc_log */
@@ -2068,7 +2002,7 @@
/* match with XLOG_REG_TYPE_* in xfs_log.h */
#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+ static char *res_type_str[] = {
REG_TYPE_STR(BFORMAT, "bformat"),
REG_TYPE_STR(BCHUNK, "bchunk"),
REG_TYPE_STR(EFI_FORMAT, "efi_format"),
@@ -2088,8 +2022,15 @@
REG_TYPE_STR(UNMOUNT, "unmount"),
REG_TYPE_STR(COMMIT, "commit"),
REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create")
+ REG_TYPE_STR(ICREATE, "inode create"),
+ REG_TYPE_STR(RUI_FORMAT, "rui_format"),
+ REG_TYPE_STR(RUD_FORMAT, "rud_format"),
+ REG_TYPE_STR(CUI_FORMAT, "cui_format"),
+ REG_TYPE_STR(CUD_FORMAT, "cud_format"),
+ REG_TYPE_STR(BUI_FORMAT, "bui_format"),
+ REG_TYPE_STR(BUD_FORMAT, "bud_format"),
};
+ BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
#undef REG_TYPE_STR
xfs_warn(mp, "ticket reservation summary:");
@@ -2582,27 +2523,41 @@
*****************************************************************************
*/
-/* Clean iclogs starting from the head. This ordering must be
- * maintained, so an iclog doesn't become ACTIVE beyond one that
- * is SYNCING. This is also required to maintain the notion that we use
- * a ordered wait queue to hold off would be writers to the log when every
- * iclog is trying to sync to disk.
+/*
+ * An iclog has just finished IO completion processing, so we need to update
+ * the iclog state and propagate that up into the overall log state. Hence we
+ * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
+ * starting from the head, and then wake up any threads that are waiting for the
+ * iclog to be marked clean.
*
- * State Change: DIRTY -> ACTIVE
+ * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
+ * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
+ * maintain the notion that we use a ordered wait queue to hold off would be
+ * writers to the log when every iclog is trying to sync to disk.
+ *
+ * Caller must hold the icloglock before calling us.
+ *
+ * State Change: !IOERROR -> DIRTY -> ACTIVE
*/
STATIC void
-xlog_state_clean_log(
- struct xlog *log)
+xlog_state_clean_iclog(
+ struct xlog *log,
+ struct xlog_in_core *dirty_iclog)
{
- xlog_in_core_t *iclog;
- int changed = 0;
+ struct xlog_in_core *iclog;
+ int changed = 0;
+ /* Prepare the completed iclog. */
+ if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+
+ /* Walk all the iclogs to update the ordered active state. */
iclog = log->l_iclog;
do {
if (iclog->ic_state == XLOG_STATE_DIRTY) {
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_offset = 0;
- ASSERT(iclog->ic_callback == NULL);
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
/*
* If the number of ops in this iclog indicate it just
* contains the dummy transaction, we can
@@ -2634,7 +2589,13 @@
iclog = iclog->ic_next;
} while (iclog != log->l_iclog);
- /* log is locked when we are called */
+
+ /*
+ * Wake up threads waiting in xfs_log_force() for the dirty iclog
+ * to be cleaned.
+ */
+ wake_up_all(&dirty_iclog->ic_force_wait);
+
/*
* Change state for the dummy log recording.
* We usually go to NEED. But we go to NEED2 if the changed indicates
@@ -2668,56 +2629,226 @@
ASSERT(0);
}
}
-} /* xlog_state_clean_log */
+}
STATIC xfs_lsn_t
xlog_get_lowest_lsn(
- struct xlog *log)
+ struct xlog *log)
{
- xlog_in_core_t *lsn_log;
- xfs_lsn_t lowest_lsn, lsn;
+ struct xlog_in_core *iclog = log->l_iclog;
+ xfs_lsn_t lowest_lsn = 0, lsn;
- lsn_log = log->l_iclog;
- lowest_lsn = 0;
do {
- if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
- lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
- if ((lsn && !lowest_lsn) ||
- (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
+ if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ continue;
+
+ lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0)
lowest_lsn = lsn;
- }
- }
- lsn_log = lsn_log->ic_next;
- } while (lsn_log != log->l_iclog);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
return lowest_lsn;
}
+/*
+ * Completion of a iclog IO does not imply that a transaction has completed, as
+ * transactions can be large enough to span many iclogs. We cannot change the
+ * tail of the log half way through a transaction as this may be the only
+ * transaction in the log and moving the tail to point to the middle of it
+ * will prevent recovery from finding the start of the transaction. Hence we
+ * should only update the last_sync_lsn if this iclog contains transaction
+ * completion callbacks on it.
+ *
+ * We have to do this before we drop the icloglock to ensure we are the only one
+ * that can update it.
+ *
+ * If we are moving the last_sync_lsn forwards, we also need to ensure we kick
+ * the reservation grant head pushing. This is due to the fact that the push
+ * target is bound by the current last_sync_lsn value. Hence if we have a large
+ * amount of log space bound up in this committing transaction then the
+ * last_sync_lsn value may be the limiting factor preventing tail pushing from
+ * freeing space in the log. Hence once we've updated the last_sync_lsn we
+ * should push the AIL to ensure the push target (and hence the grant head) is
+ * no longer bound by the old log head location and can move forwards and make
+ * progress again.
+ */
+static void
+xlog_state_set_callback(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ xfs_lsn_t header_lsn)
+{
+ iclog->ic_state = XLOG_STATE_CALLBACK;
+
+ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
+ header_lsn) <= 0);
+
+ if (list_empty_careful(&iclog->ic_callbacks))
+ return;
+
+ atomic64_set(&log->l_last_sync_lsn, header_lsn);
+ xlog_grant_push_ail(log, 0);
+}
+
+/*
+ * Return true if we need to stop processing, false to continue to the next
+ * iclog. The caller will need to run callbacks if the iclog is returned in the
+ * XLOG_STATE_CALLBACK state.
+ */
+static bool
+xlog_state_iodone_process_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ struct xlog_in_core *completed_iclog,
+ bool *ioerror)
+{
+ xfs_lsn_t lowest_lsn;
+ xfs_lsn_t header_lsn;
+
+ /* Skip all iclogs in the ACTIVE & DIRTY states */
+ if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ return false;
+
+ /*
+ * Between marking a filesystem SHUTDOWN and stopping the log, we do
+ * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
+ * want things to go smoothly in case of just a SHUTDOWN w/o a
+ * LOG_IO_ERROR.
+ */
+ if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ *ioerror = true;
+ return false;
+ }
+
+ /*
+ * Can only perform callbacks in order. Since this iclog is not in the
+ * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
+ * up. If we set our iclog to DO_CALLBACK, we will not process it when
+ * we retry since a previous iclog is in the CALLBACK and the state
+ * cannot change since we are holding the l_icloglock.
+ */
+ if (!(iclog->ic_state &
+ (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
+ if (completed_iclog &&
+ (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
+ completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
+ }
+ return true;
+ }
+
+ /*
+ * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
+ * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
+ * by the above if and are going to clean (i.e. we aren't doing their
+ * callbacks) see the above if.
+ *
+ * We will do one more check here to see if we have chased our tail
+ * around. If this is not the lowest lsn iclog, then we will leave it
+ * for another completion to process.
+ */
+ header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+ return false;
+
+ xlog_state_set_callback(log, iclog, header_lsn);
+ return false;
+
+}
+
+/*
+ * Keep processing entries in the iclog callback list until we come around and
+ * it is empty. We need to atomically see that the list is empty and change the
+ * state to DIRTY so that we don't miss any more callbacks being added.
+ *
+ * This function is called with the icloglock held and returns with it held. We
+ * drop it while running callbacks, however, as holding it over thousands of
+ * callbacks is unnecessary and causes excessive contention if we do.
+ */
+static void
+xlog_state_do_iclog_callbacks(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ bool aborted)
+{
+ spin_unlock(&log->l_icloglock);
+ spin_lock(&iclog->ic_callback_lock);
+ while (!list_empty(&iclog->ic_callbacks)) {
+ LIST_HEAD(tmp);
+
+ list_splice_init(&iclog->ic_callbacks, &tmp);
+
+ spin_unlock(&iclog->ic_callback_lock);
+ xlog_cil_process_committed(&tmp, aborted);
+ spin_lock(&iclog->ic_callback_lock);
+ }
+
+ /*
+ * Pick up the icloglock while still holding the callback lock so we
+ * serialise against anyone trying to add more callbacks to this iclog
+ * now we've finished processing.
+ */
+ spin_lock(&log->l_icloglock);
+ spin_unlock(&iclog->ic_callback_lock);
+}
+
+#ifdef DEBUG
+/*
+ * Make one last gasp attempt to see if iclogs are being left in limbo. If the
+ * above loop finds an iclog earlier than the current iclog and in one of the
+ * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
+ * are deferred to the completion of the earlier iclog. Walk the iclogs in order
+ * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
+ * one of the syncing states.
+ *
+ * Note that SYNCING|IOERROR is a valid state so we cannot just check for
+ * ic_state == SYNCING.
+ */
+static void
+xlog_state_callback_check_state(
+ struct xlog *log)
+{
+ struct xlog_in_core *first_iclog = log->l_iclog;
+ struct xlog_in_core *iclog = first_iclog;
+
+ do {
+ ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+ /*
+ * Terminate the loop if iclogs are found in states
+ * which will cause other threads to clean up iclogs.
+ *
+ * SYNCING - i/o completion will go through logs
+ * DONE_SYNC - interrupt thread should be waiting for
+ * l_icloglock
+ * IOERROR - give up hope all ye who enter here
+ */
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state & XLOG_STATE_SYNCING ||
+ iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR )
+ break;
+ iclog = iclog->ic_next;
+ } while (first_iclog != iclog);
+}
+#else
+#define xlog_state_callback_check_state(l) ((void)0)
+#endif
STATIC void
xlog_state_do_callback(
struct xlog *log,
- int aborted,
+ bool aborted,
struct xlog_in_core *ciclog)
{
- xlog_in_core_t *iclog;
- xlog_in_core_t *first_iclog; /* used to know when we've
- * processed all iclogs once */
- xfs_log_callback_t *cb, *cb_next;
- int flushcnt = 0;
- xfs_lsn_t lowest_lsn;
- int ioerrors; /* counter: iclogs with errors */
- int loopdidcallbacks; /* flag: inner loop did callbacks*/
- int funcdidcallbacks; /* flag: function did callbacks */
- int repeats; /* for issuing console warnings if
- * looping too many times */
- int wake = 0;
+ struct xlog_in_core *iclog;
+ struct xlog_in_core *first_iclog;
+ bool did_callbacks = false;
+ bool cycled_icloglock;
+ bool ioerror;
+ int flushcnt = 0;
+ int repeats = 0;
spin_lock(&log->l_icloglock);
- first_iclog = iclog = log->l_iclog;
- ioerrors = 0;
- funcdidcallbacks = 0;
- repeats = 0;
-
do {
/*
* Scan all iclogs starting with the one pointed to by the
@@ -2729,143 +2860,34 @@
*/
first_iclog = log->l_iclog;
iclog = log->l_iclog;
- loopdidcallbacks = 0;
+ cycled_icloglock = false;
+ ioerror = false;
repeats++;
do {
+ if (xlog_state_iodone_process_iclog(log, iclog,
+ ciclog, &ioerror))
+ break;
- /* skip all iclogs in the ACTIVE & DIRTY states */
- if (iclog->ic_state &
- (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+ if (!(iclog->ic_state &
+ (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
iclog = iclog->ic_next;
continue;
}
/*
- * Between marking a filesystem SHUTDOWN and stopping
- * the log, we do flush all iclogs to disk (if there
- * wasn't a log I/O error). So, we do want things to
- * go smoothly in case of just a SHUTDOWN w/o a
- * LOG_IO_ERROR.
+ * Running callbacks will drop the icloglock which means
+ * we'll have to run at least one more complete loop.
*/
- if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
- /*
- * Can only perform callbacks in order. Since
- * this iclog is not in the DONE_SYNC/
- * DO_CALLBACK state, we skip the rest and
- * just try to clean up. If we set our iclog
- * to DO_CALLBACK, we will not process it when
- * we retry since a previous iclog is in the
- * CALLBACK and the state cannot change since
- * we are holding the l_icloglock.
- */
- if (!(iclog->ic_state &
- (XLOG_STATE_DONE_SYNC |
- XLOG_STATE_DO_CALLBACK))) {
- if (ciclog && (ciclog->ic_state ==
- XLOG_STATE_DONE_SYNC)) {
- ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
- }
- break;
- }
- /*
- * We now have an iclog that is in either the
- * DO_CALLBACK or DONE_SYNC states. The other
- * states (WANT_SYNC, SYNCING, or CALLBACK were
- * caught by the above if and are going to
- * clean (i.e. we aren't doing their callbacks)
- * see the above if.
- */
+ cycled_icloglock = true;
+ xlog_state_do_iclog_callbacks(log, iclog, aborted);
- /*
- * We will do one more check here to see if we
- * have chased our tail around.
- */
-
- lowest_lsn = xlog_get_lowest_lsn(log);
- if (lowest_lsn &&
- XFS_LSN_CMP(lowest_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn)) < 0) {
- iclog = iclog->ic_next;
- continue; /* Leave this iclog for
- * another thread */
- }
-
- iclog->ic_state = XLOG_STATE_CALLBACK;
-
-
- /*
- * Completion of a iclog IO does not imply that
- * a transaction has completed, as transactions
- * can be large enough to span many iclogs. We
- * cannot change the tail of the log half way
- * through a transaction as this may be the only
- * transaction in the log and moving th etail to
- * point to the middle of it will prevent
- * recovery from finding the start of the
- * transaction. Hence we should only update the
- * last_sync_lsn if this iclog contains
- * transaction completion callbacks on it.
- *
- * We have to do this before we drop the
- * icloglock to ensure we are the only one that
- * can update it.
- */
- ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
- be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- if (iclog->ic_callback)
- atomic64_set(&log->l_last_sync_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn));
-
- } else
- ioerrors++;
-
- spin_unlock(&log->l_icloglock);
-
- /*
- * Keep processing entries in the callback list until
- * we come around and it is empty. We need to
- * atomically see that the list is empty and change the
- * state to DIRTY so that we don't miss any more
- * callbacks being added.
- */
- spin_lock(&iclog->ic_callback_lock);
- cb = iclog->ic_callback;
- while (cb) {
- iclog->ic_callback_tail = &(iclog->ic_callback);
- iclog->ic_callback = NULL;
- spin_unlock(&iclog->ic_callback_lock);
-
- /* perform callbacks in the order given */
- for (; cb; cb = cb_next) {
- cb_next = cb->cb_next;
- cb->cb_func(cb->cb_arg, aborted);
- }
- spin_lock(&iclog->ic_callback_lock);
- cb = iclog->ic_callback;
- }
-
- loopdidcallbacks++;
- funcdidcallbacks++;
-
- spin_lock(&log->l_icloglock);
- ASSERT(iclog->ic_callback == NULL);
- spin_unlock(&iclog->ic_callback_lock);
- if (!(iclog->ic_state & XLOG_STATE_IOERROR))
- iclog->ic_state = XLOG_STATE_DIRTY;
-
- /*
- * Transition from DIRTY to ACTIVE if applicable.
- * NOP if STATE_IOERROR.
- */
- xlog_state_clean_log(log);
-
- /* wake up threads waiting in xfs_log_force() */
- wake_up_all(&iclog->ic_force_wait);
-
+ xlog_state_clean_iclog(log, iclog);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
+ did_callbacks |= cycled_icloglock;
+
if (repeats > 5000) {
flushcnt += repeats;
repeats = 0;
@@ -2873,50 +2895,15 @@
"%s: possible infinite loop (%d iterations)",
__func__, flushcnt);
}
- } while (!ioerrors && loopdidcallbacks);
+ } while (!ioerror && cycled_icloglock);
-#ifdef DEBUG
- /*
- * Make one last gasp attempt to see if iclogs are being left in limbo.
- * If the above loop finds an iclog earlier than the current iclog and
- * in one of the syncing states, the current iclog is put into
- * DO_CALLBACK and the callbacks are deferred to the completion of the
- * earlier iclog. Walk the iclogs in order and make sure that no iclog
- * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
- * states.
- *
- * Note that SYNCING|IOABORT is a valid state so we cannot just check
- * for ic_state == SYNCING.
- */
- if (funcdidcallbacks) {
- first_iclog = iclog = log->l_iclog;
- do {
- ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
- /*
- * Terminate the loop if iclogs are found in states
- * which will cause other threads to clean up iclogs.
- *
- * SYNCING - i/o completion will go through logs
- * DONE_SYNC - interrupt thread should be waiting for
- * l_icloglock
- * IOERROR - give up hope all ye who enter here
- */
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state & XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_DONE_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR )
- break;
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
- }
-#endif
+ if (did_callbacks)
+ xlog_state_callback_check_state(log);
if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
- wake = 1;
- spin_unlock(&log->l_icloglock);
-
- if (wake)
wake_up_all(&log->l_flush_wait);
+
+ spin_unlock(&log->l_icloglock);
}
@@ -2935,18 +2922,16 @@
*/
STATIC void
xlog_state_done_syncing(
- xlog_in_core_t *iclog,
- int aborted)
+ struct xlog_in_core *iclog,
+ bool aborted)
{
- struct xlog *log = iclog->ic_log;
+ struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
iclog->ic_state == XLOG_STATE_IOERROR);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
- ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
-
/*
* If we got an error, either on the first buffer, or in the case of
@@ -2954,13 +2939,8 @@
* and none should ever be attempted to be written to disk
* again.
*/
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- if (--iclog->ic_bwritecnt == 1) {
- spin_unlock(&log->l_icloglock);
- return;
- }
+ if (iclog->ic_state != XLOG_STATE_IOERROR)
iclog->ic_state = XLOG_STATE_DONE_SYNC;
- }
/*
* Someone could be sleeping prior to writing out the next
@@ -3229,7 +3209,7 @@
* flags after this point.
*/
if (sync)
- return xlog_sync(log, iclog);
+ xlog_sync(log, iclog);
return 0;
} /* xlog_state_release_iclog */
@@ -3820,8 +3800,7 @@
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- int count,
- bool syncing)
+ int count)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3865,7 +3844,7 @@
/* clientid is only 1 byte */
p = &ophead->oh_clientid;
field_offset = p - base_ptr;
- if (!syncing || (field_offset & 0x1ff)) {
+ if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
@@ -3888,7 +3867,7 @@
/* check length */
p = &ophead->oh_len;
field_offset = p - base_ptr;
- if (!syncing || (field_offset & 0x1ff)) {
+ if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((uintptr_t)&ophead->oh_len -
@@ -4024,8 +4003,10 @@
* item committed callback functions will do this again under lock to
* avoid races.
*/
+ spin_lock(&log->l_cilp->xc_push_lock);
wake_up_all(&log->l_cilp->xc_commit_wait);
- xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+ spin_unlock(&log->l_cilp->xc_push_lock);
+ xlog_state_do_callback(log, true, NULL);
#ifdef XFSERRORDEBUG
{
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 73a64bf..84e0680 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -6,6 +6,8 @@
#ifndef __XFS_LOG_H__
#define __XFS_LOG_H__
+struct xfs_cil_ctx;
+
struct xfs_log_vec {
struct xfs_log_vec *lv_next; /* next lv in build list */
int lv_niovecs; /* number of iovecs in lv */
@@ -72,16 +74,6 @@
}
/*
- * Structure used to pass callback function and the function's argument
- * to the log manager.
- */
-typedef struct xfs_log_callback {
- struct xfs_log_callback *cb_next;
- void (*cb_func)(void *, int);
- void *cb_arg;
-} xfs_log_callback_t;
-
-/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
*/
@@ -125,12 +117,10 @@
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
-int xfs_log_mount_cancel(struct xfs_mount *);
+void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_notify(struct xlog_in_core *iclog,
- struct xfs_log_callback *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
@@ -148,6 +138,7 @@
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, bool regrant);
+void xlog_cil_process_committed(struct list_head *list, bool aborted);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d3884e0..ef652ab 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -10,10 +10,7 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
-#include "xfs_discard.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
@@ -41,7 +38,7 @@
struct xlog_ticket *tic;
tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
- KM_SLEEP|KM_NOFS);
+ KM_NOFS);
/*
* set the current reservation to zero so we know to steal the basic
@@ -189,7 +186,7 @@
*/
kmem_free(lip->li_lv_shadow);
- lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS);
+ lv = kmem_alloc_large(buf_size, KM_NOFS);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
lv->lv_item = lip;
@@ -246,7 +243,8 @@
* shadow buffer, so update the the pointer to it appropriately.
*/
if (!old_lv) {
- lv->lv_item->li_ops->iop_pin(lv->lv_item);
+ if (lv->lv_item->li_ops->iop_pin)
+ lv->lv_item->li_ops->iop_pin(lv->lv_item);
lv->lv_item->li_lv_shadow = NULL;
} else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
@@ -576,12 +574,24 @@
*/
static void
xlog_cil_committed(
- void *args,
- int abort)
+ struct xfs_cil_ctx *ctx,
+ bool abort)
{
- struct xfs_cil_ctx *ctx = args;
struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+ /*
+ * If the I/O failed, we're aborting the commit and already shutdown.
+ * Wake any commit waiters before aborting the log items so we don't
+ * block async log pushers on callbacks. Async log pushers explicitly do
+ * not wait on log force completion because they may be holding locks
+ * required to unpin items.
+ */
+ if (abort) {
+ spin_lock(&ctx->cil->xc_push_lock);
+ wake_up_all(&ctx->cil->xc_commit_wait);
+ spin_unlock(&ctx->cil->xc_push_lock);
+ }
+
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort);
@@ -589,15 +599,7 @@
xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
- /*
- * If we are aborting the commit, wake up anyone waiting on the
- * committing list. If we don't, then a shutdown we can leave processes
- * waiting in xlog_cil_force_lsn() waiting on a sequence commit that
- * will never happen because we aborted it.
- */
spin_lock(&ctx->cil->xc_push_lock);
- if (abort)
- wake_up_all(&ctx->cil->xc_commit_wait);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
@@ -609,6 +611,20 @@
kmem_free(ctx);
}
+void
+xlog_cil_process_committed(
+ struct list_head *list,
+ bool aborted)
+{
+ struct xfs_cil_ctx *ctx;
+
+ while ((ctx = list_first_entry_or_null(list,
+ struct xfs_cil_ctx, iclog_entry))) {
+ list_del(&ctx->iclog_entry);
+ xlog_cil_committed(ctx, aborted);
+ }
+}
+
/*
* Push the Committed Item List to the log. If @push_seq flag is zero, then it
* is a background flush and so we can chose to ignore it. Otherwise, if the
@@ -644,7 +660,7 @@
if (!cil)
return 0;
- new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+ new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
down_write(&cil->xc_ctx_lock);
@@ -830,12 +846,15 @@
if (commit_lsn == -1)
goto out_abort;
- /* attach all the transactions w/ busy extents to iclog */
- ctx->log_cb.cb_func = xlog_cil_committed;
- ctx->log_cb.cb_arg = ctx;
- error = xfs_log_notify(commit_iclog, &ctx->log_cb);
- if (error)
+ spin_lock(&commit_iclog->ic_callback_lock);
+ if (commit_iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&commit_iclog->ic_callback_lock);
goto out_abort;
+ }
+ ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
+ commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
+ spin_unlock(&commit_iclog->ic_callback_lock);
/*
* now the checkpoint commit is complete and we've attached the
@@ -859,7 +878,7 @@
out_abort_free_ticket:
xfs_log_ticket_put(tic);
out_abort:
- xlog_cil_committed(ctx, XFS_LI_ABORTED);
+ xlog_cil_committed(ctx, true);
return -EIO;
}
@@ -979,6 +998,7 @@
{
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
+ struct xfs_log_item *lip, *next;
xfs_lsn_t xc_commit_lsn;
/*
@@ -1003,7 +1023,7 @@
/*
* Once all the items of the transaction have been copied to the CIL,
- * the items can be unlocked and freed.
+ * the items can be unlocked and possibly freed.
*
* This needs to be done before we drop the CIL context lock because we
* have to update state in the log items and unlock them before they go
@@ -1012,8 +1032,12 @@
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, xc_commit_lsn, false);
-
+ trace_xfs_trans_commit_items(tp, _RET_IP_);
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ xfs_trans_del_item(lip);
+ if (lip->li_ops->iop_committing)
+ lip->li_ops->iop_committing(lip, xc_commit_lsn);
+ }
xlog_cil_push_background(log);
up_read(&cil->xc_ctx_lock);
@@ -1155,11 +1179,11 @@
struct xfs_cil *cil;
struct xfs_cil_ctx *ctx;
- cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+ cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
if (!cil)
return -ENOMEM;
- ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+ ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
if (!ctx) {
kmem_free(cil);
return -ENOMEM;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b5f82cb..b880c23 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -10,7 +10,6 @@
struct xlog;
struct xlog_ticket;
struct xfs_mount;
-struct xfs_log_callback;
/*
* Flags for log structure
@@ -50,7 +49,6 @@
#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */
#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
@@ -179,11 +177,10 @@
* the iclog.
* - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
* - ic_next is the pointer to the next iclog in the ring.
- * - ic_bp is a pointer to the buffer used to write this incore log to disk.
* - ic_log is a pointer back to the global log structure.
- * - ic_callback is a linked list of callback function/argument pairs to be
- * called after an iclog finishes writing.
- * - ic_size is the full size of the header plus data.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
+ * - ic_io_size is the size of the currently pending log buffer write, which
+ * might be smaller than ic_size
* - ic_offset is the current number of bytes written to in this iclog.
* - ic_refcnt is bumped when someone is writing to the log.
* - ic_state is the state of the iclog.
@@ -193,7 +190,7 @@
* structure cacheline aligned. The following fields can be contended on
* by independent processes:
*
- * - ic_callback_*
+ * - ic_callbacks
* - ic_refcnt
* - fields protected by the global l_icloglock
*
@@ -206,23 +203,28 @@
wait_queue_head_t ic_write_wait;
struct xlog_in_core *ic_next;
struct xlog_in_core *ic_prev;
- struct xfs_buf *ic_bp;
struct xlog *ic_log;
- int ic_size;
- int ic_offset;
- int ic_bwritecnt;
+ u32 ic_size;
+ u32 ic_io_size;
+ u32 ic_offset;
unsigned short ic_state;
char *ic_datap; /* pointer to iclog data */
/* Callback structures need their own cacheline */
spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
- struct xfs_log_callback *ic_callback;
- struct xfs_log_callback **ic_callback_tail;
+ struct list_head ic_callbacks;
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp;
xlog_in_core_2_t *ic_data;
#define ic_header ic_data->hic_header
+#ifdef DEBUG
+ bool ic_fail_crc : 1;
+#endif
+ struct semaphore ic_sema;
+ struct work_struct ic_end_io_work;
+ struct bio ic_bio;
+ struct bio_vec ic_bvec[];
} xlog_in_core_t;
/*
@@ -243,7 +245,7 @@
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
- struct xfs_log_callback log_cb; /* completion callback hook. */
+ struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
};
@@ -350,9 +352,8 @@
struct xfs_mount *l_mp; /* mount point */
struct xfs_ail *l_ailp; /* AIL log is working with */
struct xfs_cil *l_cilp; /* CIL log is working with */
- struct xfs_buf *l_xbuf; /* extra buffer for log
- * wrapping */
struct xfs_buftarg *l_targ; /* buftarg of log */
+ struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
struct delayed_work l_work; /* background flush work */
uint l_flags;
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
@@ -361,7 +362,6 @@
int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
int l_iclog_size; /* size of log in bytes */
- int l_iclog_size_log; /* log power size of log */
int l_iclog_bufs; /* number of iclog buffers */
xfs_daddr_t l_logBBstart; /* start block of log */
int l_logsize; /* size of log in bytes */
@@ -418,7 +418,7 @@
extern int
xlog_recover_finish(
struct xlog *log);
-extern int
+extern void
xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fc9e90..c1a514f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -13,8 +13,6 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_log.h"
@@ -26,7 +24,6 @@
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_quota.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
@@ -79,7 +76,7 @@
* are valid, false otherwise.
*/
static inline bool
-xlog_verify_bp(
+xlog_verify_bno(
struct xlog *log,
xfs_daddr_t blk_no,
int bbcount)
@@ -92,22 +89,21 @@
}
/*
- * Allocate a buffer to hold log data. The buffer needs to be able
- * to map to a range of nbblks basic blocks at any valid (basic
- * block) offset within the log.
+ * Allocate a buffer to hold log data. The buffer needs to be able to map to
+ * a range of nbblks basic blocks at any valid offset within the log.
*/
-STATIC xfs_buf_t *
-xlog_get_bp(
+static char *
+xlog_alloc_buffer(
struct xlog *log,
int nbblks)
{
- struct xfs_buf *bp;
+ int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
/*
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
*/
- if (!xlog_verify_bp(log, 0, nbblks)) {
+ if (!xlog_verify_bno(log, 0, nbblks)) {
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
@@ -115,69 +111,48 @@
}
/*
- * We do log I/O in units of log sectors (a power-of-2
- * multiple of the basic block size), so we round up the
- * requested size to accommodate the basic blocks required
- * for complete log sectors.
+ * We do log I/O in units of log sectors (a power-of-2 multiple of the
+ * basic block size), so we round up the requested size to accommodate
+ * the basic blocks required for complete log sectors.
*
- * In addition, the buffer may be used for a non-sector-
- * aligned block offset, in which case an I/O of the
- * requested size could extend beyond the end of the
- * buffer. If the requested size is only 1 basic block it
- * will never straddle a sector boundary, so this won't be
- * an issue. Nor will this be a problem if the log I/O is
- * done in basic blocks (sector size 1). But otherwise we
- * extend the buffer by one extra log sector to ensure
- * there's space to accommodate this possibility.
+ * In addition, the buffer may be used for a non-sector-aligned block
+ * offset, in which case an I/O of the requested size could extend
+ * beyond the end of the buffer. If the requested size is only 1 basic
+ * block it will never straddle a sector boundary, so this won't be an
+ * issue. Nor will this be a problem if the log I/O is done in basic
+ * blocks (sector size 1). But otherwise we extend the buffer by one
+ * extra log sector to ensure there's space to accommodate this
+ * possibility.
*/
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
-
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
- if (bp)
- xfs_buf_unlock(bp);
- return bp;
-}
-
-STATIC void
-xlog_put_bp(
- xfs_buf_t *bp)
-{
- xfs_buf_free(bp);
+ return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
}
/*
* Return the address of the start of the given block number's data
* in a log buffer. The buffer covers a log sector-aligned region.
*/
-STATIC char *
+static inline unsigned int
xlog_align(
struct xlog *log,
- xfs_daddr_t blk_no,
- int nbblks,
- struct xfs_buf *bp)
+ xfs_daddr_t blk_no)
{
- xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
-
- ASSERT(offset + nbblks <= bp->b_length);
- return bp->b_addr + BBTOB(offset);
+ return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
}
-
-/*
- * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
- */
-STATIC int
-xlog_bread_noalign(
- struct xlog *log,
- xfs_daddr_t blk_no,
- int nbblks,
- struct xfs_buf *bp)
+static int
+xlog_do_io(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ unsigned int nbblks,
+ char *data,
+ unsigned int op)
{
- int error;
+ int error;
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
+ if (!xlog_verify_bno(log, blk_no, nbblks)) {
xfs_warn(log->l_mp,
"Invalid log block/length (0x%llx, 0x%x) for buffer",
blk_no, nbblks);
@@ -187,107 +162,53 @@
blk_no = round_down(blk_no, log->l_sectBBsize);
nbblks = round_up(nbblks, log->l_sectBBsize);
-
ASSERT(nbblks > 0);
- ASSERT(nbblks <= bp->b_length);
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- bp->b_flags |= XBF_READ;
- bp->b_io_length = nbblks;
- bp->b_error = 0;
-
- error = xfs_buf_submit(bp);
- if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
- xfs_buf_ioerror_alert(bp, __func__);
+ error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
+ BBTOB(nbblks), data, op);
+ if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_alert(log->l_mp,
+ "log recovery %s I/O error at daddr 0x%llx len %d error %d",
+ op == REQ_OP_WRITE ? "write" : "read",
+ blk_no, nbblks, error);
+ }
return error;
}
STATIC int
+xlog_bread_noalign(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ int nbblks,
+ char *data)
+{
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+}
+
+STATIC int
xlog_bread(
struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- struct xfs_buf *bp,
+ char *data,
char **offset)
{
int error;
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
- if (error)
- return error;
-
- *offset = xlog_align(log, blk_no, nbblks, bp);
- return 0;
+ error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+ if (!error)
+ *offset = data + xlog_align(log, blk_no);
+ return error;
}
-/*
- * Read at an offset into the buffer. Returns with the buffer in it's original
- * state regardless of the result of the read.
- */
-STATIC int
-xlog_bread_offset(
- struct xlog *log,
- xfs_daddr_t blk_no, /* block to read from */
- int nbblks, /* blocks to read */
- struct xfs_buf *bp,
- char *offset)
-{
- char *orig_offset = bp->b_addr;
- int orig_len = BBTOB(bp->b_length);
- int error, error2;
-
- error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
- if (error)
- return error;
-
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
-
- /* must reset buffer pointer even on error */
- error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
- if (error)
- return error;
- return error2;
-}
-
-/*
- * Write out the buffer at the given block for the given number of blocks.
- * The buffer is kept locked across the write and is returned locked.
- * This can only be used for synchronous log writes.
- */
STATIC int
xlog_bwrite(
struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- struct xfs_buf *bp)
+ char *data)
{
- int error;
-
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
- xfs_warn(log->l_mp,
- "Invalid log block/length (0x%llx, 0x%x) for buffer",
- blk_no, nbblks);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
- return -EFSCORRUPTED;
- }
-
- blk_no = round_down(blk_no, log->l_sectBBsize);
- nbblks = round_up(nbblks, log->l_sectBBsize);
-
- ASSERT(nbblks > 0);
- ASSERT(nbblks <= bp->b_length);
-
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- xfs_buf_hold(bp);
- xfs_buf_lock(bp);
- bp->b_io_length = nbblks;
- bp->b_error = 0;
-
- error = xfs_bwrite(bp);
- if (error)
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
- return error;
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
}
#ifdef DEBUG
@@ -377,10 +298,9 @@
* We're not going to bother about retrying
* this during recovery. One strike!
*/
- if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
xfs_buf_ioerror_alert(bp, __func__);
- xfs_force_shutdown(bp->b_target->bt_mount,
- SHUTDOWN_META_IO_ERROR);
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
}
}
@@ -405,7 +325,7 @@
STATIC int
xlog_find_cycle_start(
struct xlog *log,
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t first_blk,
xfs_daddr_t *last_blk,
uint cycle)
@@ -419,7 +339,7 @@
end_blk = *last_blk;
mid_blk = BLK_AVG(first_blk, end_blk);
while (mid_blk != first_blk && mid_blk != end_blk) {
- error = xlog_bread(log, mid_blk, 1, bp, &offset);
+ error = xlog_bread(log, mid_blk, 1, buffer, &offset);
if (error)
return error;
mid_cycle = xlog_get_cycle(offset);
@@ -455,7 +375,7 @@
{
xfs_daddr_t i, j;
uint cycle;
- xfs_buf_t *bp;
+ char *buffer;
xfs_daddr_t bufblks;
char *buf = NULL;
int error = 0;
@@ -469,7 +389,7 @@
bufblks = 1 << ffs(nbblks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
- while (!(bp = xlog_get_bp(log, bufblks))) {
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
return -ENOMEM;
@@ -480,7 +400,7 @@
bcount = min(bufblks, (start_blk + nbblks - i));
- error = xlog_bread(log, i, bcount, bp, &buf);
+ error = xlog_bread(log, i, bcount, buffer, &buf);
if (error)
goto out;
@@ -498,7 +418,7 @@
*new_blk = -1;
out:
- xlog_put_bp(bp);
+ kmem_free(buffer);
return error;
}
@@ -522,7 +442,7 @@
int extra_bblks)
{
xfs_daddr_t i;
- xfs_buf_t *bp;
+ char *buffer;
char *offset = NULL;
xlog_rec_header_t *head = NULL;
int error = 0;
@@ -532,12 +452,14 @@
ASSERT(start_blk != 0 || *last_blk != start_blk);
- if (!(bp = xlog_get_bp(log, num_blks))) {
- if (!(bp = xlog_get_bp(log, 1)))
+ buffer = xlog_alloc_buffer(log, num_blks);
+ if (!buffer) {
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
smallmem = 1;
} else {
- error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+ error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
if (error)
goto out;
offset += ((num_blks - 1) << BBSHIFT);
@@ -554,7 +476,7 @@
}
if (smallmem) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out;
}
@@ -607,7 +529,7 @@
*last_blk = i;
out:
- xlog_put_bp(bp);
+ kmem_free(buffer);
return error;
}
@@ -629,7 +551,7 @@
struct xlog *log,
xfs_daddr_t *return_head_blk)
{
- xfs_buf_t *bp;
+ char *buffer;
char *offset;
xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
int num_scan_bblks;
@@ -659,20 +581,20 @@
}
first_blk = 0; /* get cycle # of 1st block */
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
- error = xlog_bread(log, 0, 1, bp, &offset);
+ error = xlog_bread(log, 0, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
first_half_cycle = xlog_get_cycle(offset);
last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
- error = xlog_bread(log, last_blk, 1, bp, &offset);
+ error = xlog_bread(log, last_blk, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
last_half_cycle = xlog_get_cycle(offset);
ASSERT(last_half_cycle != 0);
@@ -740,9 +662,10 @@
* ^ we want to locate this spot
*/
stop_on_cycle = last_half_cycle;
- if ((error = xlog_find_cycle_start(log, bp, first_blk,
- &head_blk, last_half_cycle)))
- goto bp_err;
+ error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
+ last_half_cycle);
+ if (error)
+ goto out_free_buffer;
}
/*
@@ -762,7 +685,7 @@
if ((error = xlog_find_verify_cycle(log,
start_blk, num_scan_bblks,
stop_on_cycle, &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1)
head_blk = new_blk;
} else { /* need to read 2 parts of log */
@@ -799,7 +722,7 @@
if ((error = xlog_find_verify_cycle(log, start_blk,
num_scan_bblks - (int)head_blk,
(stop_on_cycle - 1), &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1) {
head_blk = new_blk;
goto validate_head;
@@ -815,7 +738,7 @@
if ((error = xlog_find_verify_cycle(log,
start_blk, (int)head_blk,
stop_on_cycle, &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1)
head_blk = new_blk;
}
@@ -834,13 +757,13 @@
if (error == 1)
error = -EIO;
if (error)
- goto bp_err;
+ goto out_free_buffer;
} else {
start_blk = 0;
ASSERT(head_blk <= INT_MAX);
error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
if (error < 0)
- goto bp_err;
+ goto out_free_buffer;
if (error == 1) {
/* We hit the beginning of the log during our search */
start_blk = log_bbnum - (num_scan_bblks - head_blk);
@@ -853,14 +776,14 @@
if (error == 1)
error = -EIO;
if (error)
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != log_bbnum)
head_blk = new_blk;
} else if (error)
- goto bp_err;
+ goto out_free_buffer;
}
- xlog_put_bp(bp);
+ kmem_free(buffer);
if (head_blk == log_bbnum)
*return_head_blk = 0;
else
@@ -873,9 +796,8 @@
*/
return 0;
- bp_err:
- xlog_put_bp(bp);
-
+out_free_buffer:
+ kmem_free(buffer);
if (error)
xfs_warn(log->l_mp, "failed to find log head");
return error;
@@ -895,7 +817,7 @@
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int count,
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t *rblk,
struct xlog_rec_header **rhead,
bool *wrapped)
@@ -914,7 +836,7 @@
*/
end_blk = head_blk > tail_blk ? tail_blk : 0;
for (i = (int) head_blk - 1; i >= end_blk; i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -933,7 +855,7 @@
*/
if (tail_blk >= head_blk && found != count) {
for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -969,7 +891,7 @@
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int count,
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t *rblk,
struct xlog_rec_header **rhead,
bool *wrapped)
@@ -988,7 +910,7 @@
*/
end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
for (i = (int) tail_blk; i <= end_blk; i++) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -1006,7 +928,7 @@
*/
if (tail_blk > head_blk && found != count) {
for (i = 0; i < (int) head_blk; i++) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -1069,22 +991,22 @@
int hsize)
{
struct xlog_rec_header *thead;
- struct xfs_buf *bp;
+ char *buffer;
xfs_daddr_t first_bad;
int error = 0;
bool wrapped;
xfs_daddr_t tmp_tail;
xfs_daddr_t orig_tail = *tail_blk;
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
/*
* Make sure the tail points to a record (returns positive count on
* success).
*/
- error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
&tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
@@ -1113,8 +1035,8 @@
break;
/* skip to the next record; returns positive count on success */
- error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
- &tmp_tail, &thead, &wrapped);
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
+ buffer, &tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
@@ -1129,7 +1051,7 @@
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
- xlog_put_bp(bp);
+ kmem_free(buffer);
return error;
}
@@ -1151,13 +1073,13 @@
struct xlog *log,
xfs_daddr_t *head_blk, /* in/out: unverified head */
xfs_daddr_t *tail_blk, /* out: tail block */
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t *rhead_blk, /* start blk of last record */
struct xlog_rec_header **rhead, /* ptr to last record */
bool *wrapped) /* last rec. wraps phys. log */
{
struct xlog_rec_header *tmp_rhead;
- struct xfs_buf *tmp_bp;
+ char *tmp_buffer;
xfs_daddr_t first_bad;
xfs_daddr_t tmp_rhead_blk;
int found;
@@ -1168,15 +1090,15 @@
* Check the head of the log for torn writes. Search backwards from the
* head until we hit the tail or the maximum number of log record I/Os
* that could have been in flight at one time. Use a temporary buffer so
- * we don't trash the rhead/bp pointers from the caller.
+ * we don't trash the rhead/buffer pointers from the caller.
*/
- tmp_bp = xlog_get_bp(log, 1);
- if (!tmp_bp)
+ tmp_buffer = xlog_alloc_buffer(log, 1);
+ if (!tmp_buffer)
return -ENOMEM;
error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
- XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
- &tmp_rhead, &tmp_wrapped);
- xlog_put_bp(tmp_bp);
+ XLOG_MAX_ICLOGS, tmp_buffer,
+ &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
+ kmem_free(tmp_buffer);
if (error < 0)
return error;
@@ -1205,8 +1127,8 @@
* (i.e., the records with invalid CRC) if the cycle number
* matches the the current cycle.
*/
- found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
- rhead_blk, rhead, wrapped);
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
+ buffer, rhead_blk, rhead, wrapped);
if (found < 0)
return found;
if (found == 0) /* XXX: right thing to do here? */
@@ -1266,7 +1188,7 @@
xfs_daddr_t *tail_blk,
struct xlog_rec_header *rhead,
xfs_daddr_t rhead_blk,
- struct xfs_buf *bp,
+ char *buffer,
bool *clean)
{
struct xlog_op_header *op_head;
@@ -1309,7 +1231,7 @@
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
- error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
if (error)
return error;
@@ -1388,7 +1310,7 @@
{
xlog_rec_header_t *rhead;
char *offset = NULL;
- xfs_buf_t *bp;
+ char *buffer;
int error;
xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
@@ -1402,11 +1324,11 @@
return error;
ASSERT(*head_blk < INT_MAX);
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
if (*head_blk == 0) { /* special case */
- error = xlog_bread(log, 0, 1, bp, &offset);
+ error = xlog_bread(log, 0, 1, buffer, &offset);
if (error)
goto done;
@@ -1422,7 +1344,7 @@
* block. This wraps all the way back around to the head so something is
* seriously wrong if we can't find it.
*/
- error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
&rhead_blk, &rhead, &wrapped);
if (error < 0)
return error;
@@ -1443,7 +1365,7 @@
* state to determine whether recovery is necessary.
*/
error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
- rhead_blk, bp, &clean);
+ rhead_blk, buffer, &clean);
if (error)
goto done;
@@ -1460,7 +1382,7 @@
if (!clean) {
xfs_daddr_t orig_head = *head_blk;
- error = xlog_verify_head(log, head_blk, tail_blk, bp,
+ error = xlog_verify_head(log, head_blk, tail_blk, buffer,
&rhead_blk, &rhead, &wrapped);
if (error)
goto done;
@@ -1471,7 +1393,7 @@
wrapped);
tail_lsn = atomic64_read(&log->l_tail_lsn);
error = xlog_check_unmount_rec(log, head_blk, tail_blk,
- rhead, rhead_blk, bp,
+ rhead, rhead_blk, buffer,
&clean);
if (error)
goto done;
@@ -1505,11 +1427,11 @@
* But... if the -device- itself is readonly, just skip this.
* We can't recover this device anyway, so it won't matter.
*/
- if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
+ if (!xfs_readonly_buftarg(log->l_targ))
error = xlog_clear_stale_blocks(log, tail_lsn);
done:
- xlog_put_bp(bp);
+ kmem_free(buffer);
if (error)
xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1537,7 +1459,7 @@
struct xlog *log,
xfs_daddr_t *blk_no)
{
- xfs_buf_t *bp;
+ char *buffer;
char *offset;
uint first_cycle, last_cycle;
xfs_daddr_t new_blk, last_blk, start_blk;
@@ -1547,35 +1469,36 @@
*blk_no = 0;
/* check totally zeroed log */
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
- error = xlog_bread(log, 0, 1, bp, &offset);
+ error = xlog_bread(log, 0, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
first_cycle = xlog_get_cycle(offset);
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
- xlog_put_bp(bp);
+ kmem_free(buffer);
return 1;
}
/* check partially zeroed log */
- error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+ error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
last_cycle = xlog_get_cycle(offset);
if (last_cycle != 0) { /* log completely written to */
- xlog_put_bp(bp);
+ kmem_free(buffer);
return 0;
}
/* we have a partially zeroed log */
last_blk = log_bbnum-1;
- if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
- goto bp_err;
+ error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
+ if (error)
+ goto out_free_buffer;
/*
* Validate the answer. Because there is no way to guarantee that
@@ -1598,7 +1521,7 @@
*/
if ((error = xlog_find_verify_cycle(log, start_blk,
(int)num_scan_bblks, 0, &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1)
last_blk = new_blk;
@@ -1610,11 +1533,11 @@
if (error == 1)
error = -EIO;
if (error)
- goto bp_err;
+ goto out_free_buffer;
*blk_no = last_blk;
-bp_err:
- xlog_put_bp(bp);
+out_free_buffer:
+ kmem_free(buffer);
if (error)
return error;
return 1;
@@ -1657,7 +1580,7 @@
int tail_block)
{
char *offset;
- xfs_buf_t *bp;
+ char *buffer;
int balign, ealign;
int sectbb = log->l_sectBBsize;
int end_block = start_block + blocks;
@@ -1674,7 +1597,7 @@
bufblks = 1 << ffs(blocks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
- while (!(bp = xlog_get_bp(log, bufblks))) {
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
return -ENOMEM;
@@ -1686,9 +1609,9 @@
*/
balign = round_down(start_block, sectbb);
if (balign != start_block) {
- error = xlog_bread_noalign(log, start_block, 1, bp);
+ error = xlog_bread_noalign(log, start_block, 1, buffer);
if (error)
- goto out_put_bp;
+ goto out_free_buffer;
j = start_block - balign;
}
@@ -1705,29 +1628,28 @@
*/
ealign = round_down(end_block, sectbb);
if (j == 0 && (start_block + endcount > ealign)) {
- offset = bp->b_addr + BBTOB(ealign - start_block);
- error = xlog_bread_offset(log, ealign, sectbb,
- bp, offset);
+ error = xlog_bread_noalign(log, ealign, sectbb,
+ buffer + BBTOB(ealign - start_block));
if (error)
break;
}
- offset = xlog_align(log, start_block, endcount, bp);
+ offset = buffer + xlog_align(log, start_block);
for (; j < endcount; j++) {
xlog_add_record(log, offset, cycle, i+j,
tail_cycle, tail_block);
offset += BBSIZE;
}
- error = xlog_bwrite(log, start_block, endcount, bp);
+ error = xlog_bwrite(log, start_block, endcount, buffer);
if (error)
break;
start_block += endcount;
j = 0;
}
- out_put_bp:
- xlog_put_bp(bp);
+out_free_buffer:
+ kmem_free(buffer);
return error;
}
@@ -2040,7 +1962,7 @@
}
}
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
bcp->bc_blkno = buf_f->blf_blkno;
bcp->bc_len = buf_f->blf_len;
bcp->bc_refcount = 1;
@@ -2162,7 +2084,7 @@
if (xfs_sb_version_hascrc(&mp->m_sb))
bp->b_ops = &xfs_inode_buf_ops;
- inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
+ inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
offsetof(xfs_dinode_t, di_next_unlinked);
@@ -2204,8 +2126,7 @@
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <=
- BBTOB(bp->b_io_length));
+ ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
/*
* The current logged region contains a copy of the
@@ -2439,17 +2360,21 @@
case XFS_BLFT_BTREE_BUF:
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_allocbt_buf_ops;
+ bp->b_ops = &xfs_cntbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
- case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
- case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
@@ -2666,7 +2591,7 @@
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(BBTOB(bp->b_io_length) >=
+ ASSERT(BBTOB(bp->b_length) >=
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
@@ -2878,23 +2803,22 @@
*
* Also make sure that only inode buffers with good sizes stay in
* the buffer cache. The kernel moves inodes in buffers of 1 block
- * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
+ * or inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
* running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size)
- * for *our* value of mp->m_inode_cluster_size, then we need to keep
+ * the inode buffer size isn't max(blocksize, inode_cluster_size)
+ * for *our* value of inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
* overlap with future reads of those inodes.
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize,
- (uint32_t)log->l_mp->m_inode_cluster_size))) {
+ (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
}
@@ -3008,7 +2932,7 @@
if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
in_f = item->ri_buf[0].i_addr;
} else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), KM_SLEEP);
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
need_free = 1;
error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
if (error)
@@ -3045,7 +2969,7 @@
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
@@ -3256,7 +3180,7 @@
/* re-generate the checksum. */
xfs_dinode_calc_crc(log->l_mp, dip);
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
@@ -3395,7 +3319,7 @@
}
ASSERT(dq_f->qlf_size == 2);
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
@@ -3459,7 +3383,7 @@
{
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
uint64_t efi_id;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
@@ -3845,12 +3769,12 @@
{
struct xfs_mount *mp = log->l_mp;
struct xfs_icreate_log *icl;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_agnumber_t agno;
xfs_agblock_t agbno;
unsigned int count;
unsigned int isize;
xfs_agblock_t length;
- int blks_per_cluster;
int bb_per_cluster;
int cancel_count;
int nbufs;
@@ -3895,10 +3819,10 @@
/*
* The inode chunk is either full or sparse and we only support
- * m_ialloc_min_blks sized sparse allocations at this time.
+ * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
*/
- if (length != mp->m_ialloc_blks &&
- length != mp->m_ialloc_min_blks) {
+ if (length != igeo->ialloc_blks &&
+ length != igeo->ialloc_min_blks) {
xfs_warn(log->l_mp,
"%s: unsupported chunk length", __FUNCTION__);
return -EINVAL;
@@ -3918,14 +3842,13 @@
* buffers for cancellation so we don't overwrite anything written after
* a cancellation.
*/
- blks_per_cluster = xfs_icluster_size_fsb(mp);
- bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
- nbufs = length / blks_per_cluster;
+ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ nbufs = length / igeo->blocks_per_cluster;
for (i = 0, cancel_count = 0; i < nbufs; i++) {
xfs_daddr_t daddr;
daddr = XFS_AGB_TO_DADDR(mp, agno,
- agbno + i * blks_per_cluster);
+ agbno + i * igeo->blocks_per_cluster);
if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
cancel_count++;
}
@@ -4240,7 +4163,7 @@
{
xlog_recover_item_t *item;
- item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
+ item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -4280,7 +4203,7 @@
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
+ ptr = kmem_realloc(old_ptr, len + old_len, 0);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4340,7 +4263,7 @@
return 0;
}
- ptr = kmem_alloc(len, KM_SLEEP);
+ ptr = kmem_alloc(len, 0);
memcpy(ptr, dp, len);
in_f = (struct xfs_inode_log_format *)ptr;
@@ -4368,7 +4291,7 @@
item->ri_total = in_f->ilf_size;
item->ri_buf =
kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
- KM_SLEEP);
+ 0);
}
ASSERT(item->ri_total > item->ri_cnt);
/* Description region is ri_buf[0] */
@@ -4502,7 +4425,7 @@
* This is a new transaction so allocate a new recovery container to
* hold the recovery ops that will follow.
*/
- trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
+ trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
trans->r_log_tid = tid;
trans->r_lsn = be64_to_cpu(rhead->h_lsn);
INIT_LIST_HEAD(&trans->r_itemq);
@@ -4954,12 +4877,11 @@
* A cancel occurs when the mount has failed and we're bailing out.
* Release all pending log intent items so they don't pin the AIL.
*/
-STATIC int
+STATIC void
xlog_recover_cancel_intents(
struct xlog *log)
{
struct xfs_log_item *lip;
- int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
@@ -4999,7 +4921,6 @@
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- return error;
}
/*
@@ -5103,16 +5024,27 @@
}
/*
- * xlog_iunlink_recover
+ * Recover AGI unlinked lists
*
- * This is called during recovery to process any inodes which
- * we unlinked but not freed when the system crashed. These
- * inodes will be on the lists in the AGI blocks. What we do
- * here is scan all the AGIs and fully truncate and free any
- * inodes found on the lists. Each inode is removed from the
- * lists when it has been fully truncated and is freed. The
- * freeing of the inode and its removal from the list must be
- * atomic.
+ * This is called during recovery to process any inodes which we unlinked but
+ * not freed when the system crashed. These inodes will be on the lists in the
+ * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
+ * any inodes found on the lists. Each inode is removed from the lists when it
+ * has been fully truncated and is freed. The freeing of the inode and its
+ * removal from the list must be atomic.
+ *
+ * If everything we touch in the agi processing loop is already in memory, this
+ * loop can hold the cpu for a long time. It runs without lock contention,
+ * memory allocation contention, the need wait for IO, etc, and so will run
+ * until we either run out of inodes to process, run low on memory or we run out
+ * of log space.
+ *
+ * This behaviour is bad for latency on single CPU and non-preemptible kernels,
+ * and can prevent other filesytem work (such as CIL pushes) from running. This
+ * can lead to deadlocks if the recovery process runs out of log reservation
+ * space. Hence we need to yield the CPU when there is other kernel work
+ * scheduled on this CPU to ensure other scheduled work can run without undue
+ * latency.
*/
STATIC void
xlog_recover_process_iunlinks(
@@ -5159,13 +5091,14 @@
while (agino != NULLAGINO) {
agino = xlog_recover_process_one_iunlink(mp,
agno, agino, bucket);
+ cond_resched();
}
}
xfs_buf_rele(agibp);
}
}
-STATIC int
+STATIC void
xlog_unpack_data(
struct xlog_rec_header *rhead,
char *dp,
@@ -5188,8 +5121,6 @@
dp += BBSIZE;
}
}
-
- return 0;
}
/*
@@ -5204,11 +5135,9 @@
int pass,
struct list_head *buffer_list)
{
- int error;
__le32 old_crc = rhead->h_crc;
__le32 crc;
-
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
/*
@@ -5247,9 +5176,7 @@
return -EFSCORRUPTED;
}
- error = xlog_unpack_data(rhead, dp, log);
- if (error)
- return error;
+ xlog_unpack_data(rhead, dp, log);
return xlog_recover_process_data(log, rhash, rhead, dp, pass,
buffer_list);
@@ -5311,7 +5238,7 @@
xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
- xfs_buf_t *hbp, *dbp;
+ char *hbp, *dbp;
int error = 0, h_size, h_len;
int error2 = 0;
int bblks, split_bblks;
@@ -5336,7 +5263,7 @@
* iclog header and extract the header size from it. Get a
* new hbp that is the correct size.
*/
- hbp = xlog_get_bp(log, 1);
+ hbp = xlog_alloc_buffer(log, 1);
if (!hbp)
return -ENOMEM;
@@ -5378,23 +5305,23 @@
hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
if (h_size % XLOG_HEADER_CYCLE_SIZE)
hblks++;
- xlog_put_bp(hbp);
- hbp = xlog_get_bp(log, hblks);
+ kmem_free(hbp);
+ hbp = xlog_alloc_buffer(log, hblks);
} else {
hblks = 1;
}
} else {
ASSERT(log->l_sectBBsize == 1);
hblks = 1;
- hbp = xlog_get_bp(log, 1);
+ hbp = xlog_alloc_buffer(log, 1);
h_size = XLOG_BIG_RECORD_BSIZE;
}
if (!hbp)
return -ENOMEM;
- dbp = xlog_get_bp(log, BTOBB(h_size));
+ dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
- xlog_put_bp(hbp);
+ kmem_free(hbp);
return -ENOMEM;
}
@@ -5409,7 +5336,7 @@
/*
* Check for header wrapping around physical end-of-log
*/
- offset = hbp->b_addr;
+ offset = hbp;
split_hblks = 0;
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
@@ -5445,8 +5372,8 @@
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- error = xlog_bread_offset(log, 0,
- wrapped_hblks, hbp,
+ error = xlog_bread_noalign(log, 0,
+ wrapped_hblks,
offset + BBTOB(split_hblks));
if (error)
goto bread_err2;
@@ -5477,7 +5404,7 @@
} else {
/* This log record is split across the
* physical end of log */
- offset = dbp->b_addr;
+ offset = dbp;
split_bblks = 0;
if (blk_no != log->l_logBBsize) {
/* some data is before the physical
@@ -5506,8 +5433,8 @@
* _first_, then the log start (LR header end)
* - order is important.
*/
- error = xlog_bread_offset(log, 0,
- bblks - split_bblks, dbp,
+ error = xlog_bread_noalign(log, 0,
+ bblks - split_bblks,
offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
@@ -5555,9 +5482,9 @@
}
bread_err2:
- xlog_put_bp(dbp);
+ kmem_free(dbp);
bread_err1:
- xlog_put_bp(hbp);
+ kmem_free(hbp);
/*
* Submit buffers that have been added from the last record processed,
@@ -5614,7 +5541,7 @@
*/
log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
sizeof(struct list_head),
- KM_SLEEP);
+ 0);
for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
@@ -5691,7 +5618,7 @@
* Now that we've finished replaying all buffer and inode
* updates, re-read in the superblock and reverify it.
*/
- bp = xfs_getsb(mp, 0);
+ bp = xfs_getsb(mp);
bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
ASSERT(!(bp->b_flags & XBF_WRITE));
bp->b_flags |= XBF_READ;
@@ -5864,16 +5791,12 @@
return 0;
}
-int
+void
xlog_recover_cancel(
struct xlog *log)
{
- int error = 0;
-
if (log->l_flags & XLOG_RECOVERY_NEEDED)
- error = xlog_recover_cancel_intents(log);
-
- return error;
+ xlog_recover_cancel_intents(log);
}
#if defined(DEBUG)
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 576c375..9804efe 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -6,8 +6,8 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_error.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
@@ -107,5 +107,5 @@
void
xfs_hex_dump(void *p, int length)
{
- print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+ print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 02d1509..ba5b6f3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -12,9 +12,6 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_ialloc.h"
@@ -27,13 +24,13 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_sysfs.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_reflink.h"
#include "xfs_extent_busy.h"
+#include "xfs_health.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -85,7 +82,7 @@
if (hole < 0) {
xfs_uuid_table = kmem_realloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- KM_SLEEP);
+ 0);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
@@ -149,6 +146,7 @@
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -216,7 +214,7 @@
spin_lock(&mp->m_perag_lock);
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
- BUG();
+ WARN_ON_ONCE(1);
spin_unlock(&mp->m_perag_lock);
radix_tree_preload_end();
error = -EEXIST;
@@ -227,6 +225,10 @@
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
+ spin_lock_init(&pag->pag_state_lock);
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +251,7 @@
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
@@ -423,30 +426,6 @@
}
/*
- * Set the maximum inode count for this filesystem
- */
-STATIC void
-xfs_set_maxicount(xfs_mount_t *mp)
-{
- xfs_sb_t *sbp = &(mp->m_sb);
- uint64_t icount;
-
- if (sbp->sb_imax_pct) {
- /*
- * Make sure the maximum inode count is a multiple
- * of the units we allocate inodes in.
- */
- icount = sbp->sb_dblocks * sbp->sb_imax_pct;
- do_div(icount, 100);
- do_div(icount, mp->m_ialloc_blks);
- mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
- sbp->sb_inopblog;
- } else {
- mp->m_maxicount = 0;
- }
-}
-
-/*
* Set the default minimum read and write sizes unless
* already specified in a mount option.
* We use smaller I/O sizes when the file system
@@ -502,29 +481,6 @@
}
}
-
-/*
- * Set whether we're using inode alignment.
- */
-STATIC void
-xfs_set_inoalignment(xfs_mount_t *mp)
-{
- if (xfs_sb_version_hasalign(&mp->m_sb) &&
- mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
- mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
- else
- mp->m_inoalign_mask = 0;
- /*
- * If we are using stripe alignment, check whether
- * the stripe unit is a multiple of the inode alignment
- */
- if (mp->m_dalign && mp->m_inoalign_mask &&
- !(mp->m_dalign & mp->m_inoalign_mask))
- mp->m_sinoalign = mp->m_dalign;
- else
- mp->m_sinoalign = 0;
-}
-
/*
* Check that the data (and log if separate) is an ok size.
*/
@@ -639,7 +595,7 @@
(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
!xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
- mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
/*
* We can safely re-initialise incore superblock counters from the
@@ -654,7 +610,7 @@
*/
if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
- !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY))
+ !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
return 0;
return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
@@ -676,6 +632,7 @@
{
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
@@ -742,12 +699,10 @@
xfs_alloc_compute_maxlevels(mp);
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
- xfs_ialloc_compute_maxlevels(mp);
+ xfs_ialloc_setup_geometry(mp);
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
- xfs_set_maxicount(mp);
-
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
@@ -781,46 +736,22 @@
xfs_set_low_space_thresholds(mp);
/*
- * Set the inode cluster size.
- * This may still be overridden by the file system
- * block size if it is larger than the chosen cluster size.
- *
- * For v5 filesystems, scale the cluster size with the inode size to
- * keep a constant ratio of inode per cluster buffer, but only if mkfs
- * has set the inode alignment value appropriately for larger cluster
- * sizes.
- */
- mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- int new_size = mp->m_inode_cluster_size;
-
- new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
- if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
- mp->m_inode_cluster_size = new_size;
- }
-
- /*
* If enabled, sparse inode chunk alignment is expected to match the
* cluster size. Full inode chunk alignment must match the chunk size,
* but that is checked on sb read verification...
*/
if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
mp->m_sb.sb_spino_align !=
- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+ XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
xfs_warn(mp,
"Sparse inode block alignment (%u) must match cluster size (%llu).",
mp->m_sb.sb_spino_align,
- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+ XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
error = -EINVAL;
goto out_remove_uuid;
}
/*
- * Set inode alignment fields
- */
- xfs_set_inoalignment(mp);
-
- /*
* Check that the data (and log if separate) is an ok size.
*/
error = xfs_check_sizes(mp);
@@ -1059,6 +990,7 @@
*/
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_health_unmount(mp);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
@@ -1095,7 +1027,7 @@
uint64_t resblks;
int error;
- xfs_icache_disable_reclaim(mp);
+ xfs_stop_block_reaping(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
@@ -1141,6 +1073,7 @@
*/
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_health_unmount(mp);
xfs_qm_unmount(mp);
@@ -1372,24 +1305,14 @@
* xfs_getsb() is called to obtain the buffer for the superblock.
* The buffer is returned locked and read in from disk.
* The buffer should be released with a call to xfs_brelse().
- *
- * If the flags parameter is BUF_TRYLOCK, then we'll only return
- * the superblock buffer if it can be locked without sleeping.
- * If it can't then we'll return NULL.
*/
struct xfs_buf *
xfs_getsb(
- struct xfs_mount *mp,
- int flags)
+ struct xfs_mount *mp)
{
struct xfs_buf *bp = mp->m_sb_bp;
- if (!xfs_buf_trylock(bp)) {
- if (flags & XBF_TRYLOCK)
- return NULL;
- xfs_buf_lock(bp);
- }
-
+ xfs_buf_lock(bp);
xfs_buf_hold(bp);
ASSERT(bp->b_flags & XBF_DONE);
return bp;
@@ -1436,7 +1359,26 @@
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
return;
- spin_lock(&mp->m_sb_lock);
- mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
- spin_unlock(&mp->m_sb_lock);
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+}
+
+/*
+ * Update the in-core delayed block counter.
+ *
+ * We prefer to update the counter without having to take a spinlock for every
+ * counter update (i.e. batching). Each change to delayed allocation
+ * reservations can change can easily exceed the default percpu counter
+ * batching, so we use a larger batch factor here.
+ *
+ * Note that we don't currently have any callers requiring fast summation
+ * (e.g. percpu_counter_read) so we can use a big batch value here.
+ */
+#define XFS_DELALLOC_BATCH (4096)
+void
+xfs_mod_delalloc(
+ struct xfs_mount *mp,
+ int64_t delta)
+{
+ percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+ XFS_DELALLOC_BATCH);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7964513..fdb60e0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,20 @@
typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
+
+ /*
+ * Bitsets of per-fs metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_fs_checked;
+ uint8_t m_fs_sick;
+ /*
+ * Bitsets of rt metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access this field.
+ */
+ uint8_t m_rt_checked;
+ uint8_t m_rt_sick;
+
struct xfs_ail *m_ail; /* fs active log item list */
struct xfs_sb m_sb; /* copy of fs superblock */
@@ -67,6 +81,12 @@
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
+ /*
+ * Count of data device blocks reserved for delayed allocations,
+ * including indlen blocks. Does not include allocated CoW staging
+ * extents or anything related to the rt device.
+ */
+ struct percpu_counter m_delalloc_blks;
struct xfs_buf *m_sb_bp; /* buffer for superblock */
char *m_fsname; /* filesystem name */
@@ -85,10 +105,18 @@
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
+ struct xfs_ino_geometry m_ino_geo; /* inode geometry */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
uint m_rsumlevels; /* rt summary levels */
uint m_rsumsize; /* size of rt summary, bytes */
+ /*
+ * Optional cache of rt summary level per bitmap block with the
+ * invariant that m_rsum_cache[bbno] <= the minimum i for which
+ * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+ * inode lock.
+ */
+ uint8_t *m_rsum_cache;
struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
@@ -99,8 +127,6 @@
uint8_t m_blkbit_log; /* blocklog + NBBY */
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
- uint8_t m_agino_log; /* #bits for agino in inum */
- uint m_inode_cluster_size;/* min inode buf size */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -108,15 +134,12 @@
uint m_alloc_mnr[2]; /* min alloc btree records */
uint m_bmap_dmxr[2]; /* max bmap btree records */
uint m_bmap_dmnr[2]; /* min bmap btree records */
- uint m_inobt_mxr[2]; /* max inobt btree records */
- uint m_inobt_mnr[2]; /* min inobt btree records */
uint m_rmap_mxr[2]; /* max rmap btree records */
uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
- uint m_in_maxlevels; /* max inobt btree levels. */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
@@ -127,21 +150,14 @@
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint64_t m_flags; /* global mount flags */
- bool m_inotbt_nores; /* no per-AG finobt resv. */
- int m_ialloc_inos; /* inodes in inode allocation */
- int m_ialloc_blks; /* blocks in inode allocation */
- int m_ialloc_min_blks;/* min blocks in sparse inode
- * allocation */
- int m_inoalign_mask;/* mask sb_inoalignmt if used */
+ bool m_finobt_nores; /* no per-AG finobt resv. */
uint m_qflags; /* quota status flags */
struct xfs_trans_resv m_resv; /* precomputed res values */
- uint64_t m_maxicount; /* maximum inode count */
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
- int m_sinoalign; /* stripe unit inode alignment */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
@@ -164,11 +180,9 @@
struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
- struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
struct workqueue_struct *m_sync_workqueue;
@@ -183,6 +197,7 @@
*/
uint32_t m_generation;
+ bool m_always_cow;
bool m_fail_unmount;
#ifdef DEBUG
/*
@@ -195,6 +210,8 @@
#endif
} xfs_mount_t;
+#define M_IGEO(mp) (&(mp)->m_ino_geo)
+
/*
* Flags for m_flags.
*/
@@ -202,7 +219,6 @@
must be synchronous except
for space allocations */
#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */
-#define XFS_MOUNT_BAD_SUMMARY (1ULL << 2) /* summary counters are bad */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
@@ -311,13 +327,6 @@
}
/* per-AG block reservation data structures*/
-enum xfs_ag_resv_type {
- XFS_AG_RESV_NONE = 0,
- XFS_AG_RESV_AGFL,
- XFS_AG_RESV_METADATA,
- XFS_AG_RESV_RMAPBT,
-};
-
struct xfs_ag_resv {
/* number of blocks originally reserved here */
xfs_extlen_t ar_orig_reserved;
@@ -357,6 +366,15 @@
xfs_agino_t pagl_pagino;
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold pag_state_lock before accessing this field.
+ */
+ uint16_t pag_checked;
+ uint16_t pag_sick;
+ spinlock_t pag_state_lock;
+
spinlock_t pagb_lock; /* lock for pagb_tree */
struct rb_root pagb_tree; /* ordered tree of busy extents */
unsigned int pagb_gen; /* generation count for pagb_tree */
@@ -385,6 +403,13 @@
/* reference count */
uint8_t pagf_refcount_level;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
@@ -419,7 +444,7 @@
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
+extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
@@ -435,5 +460,6 @@
struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
int error_class, int error);
void xfs_force_summary_recalc(struct xfs_mount *mp);
+void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 7473881..a06661d 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,12 +333,12 @@
if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
return -EINVAL;
- if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
+ if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
return -ENOMEM;
/* An extra list is needed to avoid reaping up to a grp_time early. */
mru->grp_count = grp_count + 1;
- mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+ mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
if (!mru->lists) {
err = -ENOMEM;
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d2..b6701b4 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,32 @@
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
+
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat, 192);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c359..a339bd5 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -2,23 +2,16 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
-#include <linux/iomap.h>
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_log.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_error.h"
#include "xfs_iomap.h"
-#include "xfs_shared.h"
-#include "xfs_bit.h"
-#include "xfs_pnfs.h"
/*
* Ensure that we do not have any outstanding pNFS layouts that can be used by
@@ -39,7 +32,7 @@
struct xfs_inode *ip = XFS_I(inode);
int error;
- while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
+ while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
xfs_iunlock(ip, *iolock);
*did_unlock = true;
error = break_layout(inode, true);
@@ -185,7 +178,7 @@
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
new file mode 100644
index 0000000..4bcc3e6
--- /dev/null
+++ b/fs/xfs/xfs_pwork.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_sysctl.h"
+#include "xfs_pwork.h"
+#include <linux/nmi.h>
+
+/*
+ * Parallel Work Queue
+ * ===================
+ *
+ * Abstract away the details of running a large and "obviously" parallelizable
+ * task across multiple CPUs. Callers initialize the pwork control object with
+ * a desired level of parallelization and a work function. Next, they embed
+ * struct xfs_pwork in whatever structure they use to pass work context to a
+ * worker thread and queue that pwork. The work function will be passed the
+ * pwork item when it is run (from process context) and any returned error will
+ * be recorded in xfs_pwork_ctl.error. Work functions should check for errors
+ * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not
+ * stop workqueue item processing.
+ *
+ * This is the rough equivalent of the xfsprogs workqueue code, though we can't
+ * reuse that name here.
+ */
+
+/* Invoke our caller's function. */
+static void
+xfs_pwork_work(
+ struct work_struct *work)
+{
+ struct xfs_pwork *pwork;
+ struct xfs_pwork_ctl *pctl;
+ int error;
+
+ pwork = container_of(work, struct xfs_pwork, work);
+ pctl = pwork->pctl;
+ error = pctl->work_fn(pctl->mp, pwork);
+ if (error && !pctl->error)
+ pctl->error = error;
+ if (atomic_dec_and_test(&pctl->nr_work))
+ wake_up(&pctl->poll_wait);
+}
+
+/*
+ * Set up control data for parallel work. @work_fn is the function that will
+ * be called. @tag will be written into the kernel threads. @nr_threads is
+ * the level of parallelism desired, or 0 for no limit.
+ */
+int
+xfs_pwork_init(
+ struct xfs_mount *mp,
+ struct xfs_pwork_ctl *pctl,
+ xfs_pwork_work_fn work_fn,
+ const char *tag,
+ unsigned int nr_threads)
+{
+#ifdef DEBUG
+ if (xfs_globals.pwork_threads >= 0)
+ nr_threads = xfs_globals.pwork_threads;
+#endif
+ trace_xfs_pwork_init(mp, nr_threads, current->pid);
+
+ pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+ current->pid);
+ if (!pctl->wq)
+ return -ENOMEM;
+ pctl->work_fn = work_fn;
+ pctl->error = 0;
+ pctl->mp = mp;
+ atomic_set(&pctl->nr_work, 0);
+ init_waitqueue_head(&pctl->poll_wait);
+
+ return 0;
+}
+
+/* Queue some parallel work. */
+void
+xfs_pwork_queue(
+ struct xfs_pwork_ctl *pctl,
+ struct xfs_pwork *pwork)
+{
+ INIT_WORK(&pwork->work, xfs_pwork_work);
+ pwork->pctl = pctl;
+ atomic_inc(&pctl->nr_work);
+ queue_work(pctl->wq, &pwork->work);
+}
+
+/* Wait for the work to finish and tear down the control structure. */
+int
+xfs_pwork_destroy(
+ struct xfs_pwork_ctl *pctl)
+{
+ destroy_workqueue(pctl->wq);
+ pctl->wq = NULL;
+ return pctl->error;
+}
+
+/*
+ * Wait for the work to finish by polling completion status and touch the soft
+ * lockup watchdog. This is for callers such as mount which hold locks.
+ */
+void
+xfs_pwork_poll(
+ struct xfs_pwork_ctl *pctl)
+{
+ while (wait_event_timeout(pctl->poll_wait,
+ atomic_read(&pctl->nr_work) == 0, HZ) == 0)
+ touch_softlockup_watchdog();
+}
+
+/*
+ * Return the amount of parallelism that the data device can handle, or 0 for
+ * no limit.
+ */
+unsigned int
+xfs_pwork_guess_datadev_parallelism(
+ struct xfs_mount *mp)
+{
+ struct xfs_buftarg *btp = mp->m_ddev_targp;
+
+ /*
+ * For now we'll go with the most conservative setting possible,
+ * which is two threads for an SSD and 1 thread everywhere else.
+ */
+ return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
+}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
new file mode 100644
index 0000000..8133124
--- /dev/null
+++ b/fs/xfs/xfs_pwork.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_PWORK_H__
+#define __XFS_PWORK_H__
+
+struct xfs_pwork;
+struct xfs_mount;
+
+typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
+
+/*
+ * Parallel work coordination structure.
+ */
+struct xfs_pwork_ctl {
+ struct workqueue_struct *wq;
+ struct xfs_mount *mp;
+ xfs_pwork_work_fn work_fn;
+ struct wait_queue_head poll_wait;
+ atomic_t nr_work;
+ int error;
+};
+
+/*
+ * Embed this parallel work control item inside your own work structure,
+ * then queue work with it.
+ */
+struct xfs_pwork {
+ struct work_struct work;
+ struct xfs_pwork_ctl *pctl;
+};
+
+#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL }
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_ctl_want_abort(
+ struct xfs_pwork_ctl *pctl)
+{
+ return pctl && pctl->error;
+}
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_want_abort(
+ struct xfs_pwork *pwork)
+{
+ return xfs_pwork_ctl_want_abort(pwork->pctl);
+}
+
+int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
+ xfs_pwork_work_fn work_fn, const char *tag,
+ unsigned int nr_threads);
+void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
+int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
+void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
+unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
+
+#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 52ed790..ecd8ce1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -13,19 +13,15 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
+#include "xfs_iwalk.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
#include "xfs_bmap_util.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_cksum.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -646,7 +642,7 @@
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+ qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -982,7 +978,7 @@
if (qip->i_d.di_nblocks == 0)
return 0;
- map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+ map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
@@ -1118,17 +1114,15 @@
/* ARGSUSED */
STATIC int
xfs_qm_dqusage_adjust(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* not used */
- int ubsize, /* not used */
- int *ubused, /* not used */
- int *res) /* result code value */
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ void *data)
{
- xfs_inode_t *ip;
- xfs_qcnt_t nblks;
- xfs_filblks_t rtblks = 0; /* total rt blks */
- int error;
+ struct xfs_inode *ip;
+ xfs_qcnt_t nblks;
+ xfs_filblks_t rtblks = 0; /* total rt blks */
+ int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1136,20 +1130,18 @@
* rootino must have its resources accounted for, not so with the quota
* inodes.
*/
- if (xfs_is_quota_inode(&mp->m_sb, ino)) {
- *res = BULKSTAT_RV_NOTHING;
- return -EINVAL;
- }
+ if (xfs_is_quota_inode(&mp->m_sb, ino))
+ return 0;
/*
* We don't _need_ to take the ilock EXCL here because quotacheck runs
* at mount time and therefore nobody will be racing chown/chproj.
*/
- error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
- if (error) {
- *res = BULKSTAT_RV_NOTHING;
+ error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
+ if (error == -EINVAL || error == -ENOENT)
+ return 0;
+ if (error)
return error;
- }
ASSERT(ip->i_delayed_blks == 0);
@@ -1157,7 +1149,7 @@
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
goto error0;
}
@@ -1200,13 +1192,8 @@
goto error0;
}
- xfs_irele(ip);
- *res = BULKSTAT_RV_DIDONE;
- return 0;
-
error0:
xfs_irele(ip);
- *res = BULKSTAT_RV_GIVEUP;
return error;
}
@@ -1270,18 +1257,13 @@
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error, error2;
- xfs_ino_t lastino;
- size_t structsz;
+ int error, error2;
uint flags;
LIST_HEAD (buffer_list);
struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
- count = INT_MAX;
- structsz = 1;
- lastino = 0;
flags = 0;
ASSERT(uip || gip || pip);
@@ -1318,18 +1300,10 @@
flags |= XFS_PQUOTA_CHKD;
}
- do {
- /*
- * Iterate thru all the inodes in the file system,
- * adjusting the corresponding dquot counters in core.
- */
- error = xfs_bulkstat(mp, &lastino, &count,
- xfs_qm_dqusage_adjust,
- structsz, NULL, &done);
- if (error)
- break;
-
- } while (!done);
+ error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+ NULL);
+ if (error)
+ goto error_return;
/*
* We've made all the changes that we need to make incore. Flush them
@@ -1812,7 +1786,8 @@
uint flags)
{
struct xfs_mount *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
+ uint64_t delblks;
+ unsigned int blkflags, prjflags = 0;
struct xfs_dquot *udq_unres = NULL;
struct xfs_dquot *gdq_unres = NULL;
struct xfs_dquot *pdq_unres = NULL;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 3ccf0fb..b41b750 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -113,12 +113,8 @@
return NULL;
}
-extern void xfs_trans_mod_dquot(struct xfs_trans *,
- struct xfs_dquot *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
- struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, struct xfs_dquot *,
- long, long, uint);
+extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
+ uint field, int64_t delta);
extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 73a1d77..5d72e88 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -5,13 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_qm.h"
@@ -40,7 +40,7 @@
statp->f_files = limit;
statp->f_ffree =
(statp->f_files > dqp->q_res_icount) ?
- (statp->f_ffree - dqp->q_res_icount) : 0;
+ (statp->f_files - dqp->q_res_icount) : 0;
}
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index b319089..da7ad03 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
-#include <linux/capability.h>
#include "xfs.h"
#include "xfs_fs.h"
@@ -12,17 +11,13 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_defer.h"
STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 55b7982..efe42ae 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -56,32 +56,35 @@
* The structure kept inside the xfs_trans_t keep track of dquot changes
* within a transaction and apply them later.
*/
-typedef struct xfs_dqtrx {
+struct xfs_dqtrx {
struct xfs_dquot *qt_dquot; /* the dquot this refers to */
- ulong qt_blk_res; /* blks reserved on a dquot */
- ulong qt_ino_res; /* inode reserved on a dquot */
- ulong qt_ino_res_used; /* inodes used from the reservation */
- long qt_bcount_delta; /* dquot blk count changes */
- long qt_delbcnt_delta; /* delayed dquot blk count changes */
- long qt_icount_delta; /* dquot inode count changes */
- ulong qt_rtblk_res; /* # blks reserved on a dquot */
- ulong qt_rtblk_res_used;/* # blks used from reservation */
- long qt_rtbcount_delta;/* dquot realtime blk changes */
- long qt_delrtb_delta; /* delayed RT blk count changes */
-} xfs_dqtrx_t;
+
+ uint64_t qt_blk_res; /* blks reserved on a dquot */
+ int64_t qt_bcount_delta; /* dquot blk count changes */
+ int64_t qt_delbcnt_delta; /* delayed dquot blk count changes */
+
+ uint64_t qt_rtblk_res; /* # blks reserved on a dquot */
+ uint64_t qt_rtblk_res_used;/* # blks used from reservation */
+ int64_t qt_rtbcount_delta;/* dquot realtime blk changes */
+ int64_t qt_delrtb_delta; /* delayed RT blk count changes */
+
+ uint64_t qt_ino_res; /* inode reserved on a dquot */
+ uint64_t qt_ino_res_used; /* inodes used from the reservation */
+ int64_t qt_icount_delta; /* dquot inode count changes */
+};
#ifdef CONFIG_XFS_QUOTA
extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
- uint, long);
+ uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
- struct xfs_inode *, long, long, uint);
+ struct xfs_inode *, int64_t, long, uint);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
+ struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
@@ -121,14 +124,14 @@
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
- struct xfs_inode *ip, long nblks, long ninos, uint flags)
+ struct xfs_inode *ip, int64_t nblks, long ninos, uint flags)
{
return 0;
}
static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
struct xfs_mount *mp, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
- long nblks, long nions, uint flags)
+ int64_t nblks, long nions, uint flags)
{
return 0;
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a7c0c65..cd6c721 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -11,10 +12,8 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_qm.h"
-#include <linux/quota.h>
static void
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fce38b5..2328268 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -14,7 +14,6 @@
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_refcount_item.h"
#include "xfs_log.h"
#include "xfs_refcount.h"
@@ -95,15 +94,6 @@
}
/*
- * Pinning has no meaning for an cui item, so just return.
- */
-STATIC void
-xfs_cui_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an CUI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the CUI transaction has been successfully committed to make it
@@ -122,71 +112,22 @@
}
/*
- * CUI items have no locking or pushing. However, since CUIs are pulled from
- * the AIL when their corresponding CUDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the CUI out of
- * the AIL.
- */
-STATIC uint
-xfs_cui_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The CUI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an CUD isn't going to be
* constructed and thus we free the CUI here directly.
*/
STATIC void
-xfs_cui_item_unlock(
+xfs_cui_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_cui_release(CUI_ITEM(lip));
+ xfs_cui_release(CUI_ITEM(lip));
}
-/*
- * The CUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_cui_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The CUI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_cui_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all cui log items.
- */
static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
- .iop_pin = xfs_cui_item_pin,
.iop_unpin = xfs_cui_item_unpin,
- .iop_unlock = xfs_cui_item_unlock,
- .iop_committed = xfs_cui_item_committed,
- .iop_push = xfs_cui_item_push,
- .iop_committing = xfs_cui_item_committing,
+ .iop_release = xfs_cui_item_release,
};
/*
@@ -203,9 +144,9 @@
ASSERT(nextents > 0);
if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
- KM_SLEEP);
+ 0);
else
- cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+ cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
cuip->cui_format.cui_nextents = nextents;
@@ -254,127 +195,251 @@
}
/*
- * Pinning has no meaning for an cud item, so just return.
- */
-STATIC void
-xfs_cud_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an cud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_cud_item_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an cud item. It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_cud_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The CUD is either committed or aborted if the transaction is cancelled. If
* the transaction is cancelled, drop our reference to the CUI and free the
* CUD.
*/
STATIC void
-xfs_cud_item_unlock(
+xfs_cud_item_release(
struct xfs_log_item *lip)
{
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_cui_release(cudp->cud_cuip);
- kmem_zone_free(xfs_cud_zone, cudp);
- }
-}
-
-/*
- * When the cud item is committed to disk, all we need to do is delete our
- * reference to our partner cui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_cud_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
-
- /*
- * Drop the CUI reference regardless of whether the CUD has been
- * aborted. Once the CUD transaction is constructed, it is the sole
- * responsibility of the CUD to release the CUI (even if the CUI is
- * aborted due to log I/O error).
- */
xfs_cui_release(cudp->cud_cuip);
kmem_zone_free(xfs_cud_zone, cudp);
-
- return (xfs_lsn_t)-1;
}
-/*
- * The CUD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_cud_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all cud log items.
- */
static const struct xfs_item_ops xfs_cud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
.iop_size = xfs_cud_item_size,
.iop_format = xfs_cud_item_format,
- .iop_pin = xfs_cud_item_pin,
- .iop_unpin = xfs_cud_item_unpin,
- .iop_unlock = xfs_cud_item_unlock,
- .iop_committed = xfs_cud_item_committed,
- .iop_push = xfs_cud_item_push,
- .iop_committing = xfs_cud_item_committing,
+ .iop_release = xfs_cud_item_release,
};
-/*
- * Allocate and initialize an cud item with the given number of extents.
- */
-struct xfs_cud_log_item *
-xfs_cud_init(
- struct xfs_mount *mp,
+static struct xfs_cud_log_item *
+xfs_trans_get_cud(
+ struct xfs_trans *tp,
struct xfs_cui_log_item *cuip)
-
{
- struct xfs_cud_log_item *cudp;
+ struct xfs_cud_log_item *cudp;
- cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
- xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops);
+ cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+ &xfs_cud_item_ops);
cudp->cud_cuip = cuip;
cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+ xfs_trans_add_item(tp, &cudp->cud_item);
return cudp;
}
/*
+ * Finish an refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
+ */
+static int
+xfs_trans_log_finish_refcount_update(
+ struct xfs_trans *tp,
+ struct xfs_cud_log_item *cudp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
+{
+ int error;
+
+ error = xfs_refcount_finish_one(tp, type, startblock,
+ blockcount, new_fsb, new_len, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the CUI and frees the CUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_refcount_intent *ra;
+ struct xfs_refcount_intent *rb;
+
+ ra = container_of(a, struct xfs_refcount_intent, ri_list);
+ rb = container_of(b, struct xfs_refcount_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Get an CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_cui_log_item *cuip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ ASSERT(cuip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ return cuip;
+}
+
+/* Set the phys extent flags for this reverse mapping. */
+static void
+xfs_trans_set_refcount_flags(
+ struct xfs_phys_extent *refc,
+ enum xfs_refcount_intent_type type)
+{
+ refc->pe_flags = 0;
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ refc->pe_flags |= type;
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
+{
+ struct xfs_cui_log_item *cuip = intent;
+ struct xfs_refcount_intent *refc;
+ uint next_extent;
+ struct xfs_phys_extent *ext;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+ ASSERT(next_extent < cuip->cui_format.cui_nextents);
+ ext = &cuip->cui_format.cui_extents[next_extent];
+ ext->pe_startblock = refc->ri_startblock;
+ ext->pe_len = refc->ri_blockcount;
+ xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
+/* Get an CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_cud(tp, intent);
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_refcount_intent *refc;
+ xfs_fsblock_t new_fsb;
+ xfs_extlen_t new_aglen;
+ int error;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ error = xfs_trans_log_finish_refcount_update(tp, done_item,
+ refc->ri_type,
+ refc->ri_startblock,
+ refc->ri_blockcount,
+ &new_fsb, &new_aglen,
+ (struct xfs_btree_cur **)state);
+ /* Did we run out of reservation? Requeue what we didn't finish. */
+ if (!error && new_aglen > 0) {
+ ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+ refc->ri_type == XFS_REFCOUNT_DECREASE);
+ refc->ri_startblock = new_fsb;
+ refc->ri_blockcount = new_aglen;
+ return -EAGAIN;
+ }
+ kmem_free(refc);
+ return error;
+}
+
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+ struct xfs_trans *tp,
+ void *state,
+ int error)
+{
+ struct xfs_btree_cur *rcur = state;
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+ void *intent)
+{
+ xfs_cui_release(intent);
+}
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_refcount_intent *refc;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ kmem_free(refc);
+}
+
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_refcount_update_diff_items,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .log_item = xfs_refcount_update_log_item,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_update_finish_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+};
+
+/*
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
*/
@@ -490,26 +555,24 @@
irec.br_blockcount = new_len;
switch (type) {
case XFS_REFCOUNT_INCREASE:
- error = xfs_refcount_increase_extent(tp, &irec);
+ xfs_refcount_increase_extent(tp, &irec);
break;
case XFS_REFCOUNT_DECREASE:
- error = xfs_refcount_decrease_extent(tp, &irec);
+ xfs_refcount_decrease_extent(tp, &irec);
break;
case XFS_REFCOUNT_ALLOC_COW:
- error = xfs_refcount_alloc_cow_extent(tp,
+ xfs_refcount_alloc_cow_extent(tp,
irec.br_startblock,
irec.br_blockcount);
break;
case XFS_REFCOUNT_FREE_COW:
- error = xfs_refcount_free_cow_extent(tp,
+ xfs_refcount_free_cow_extent(tp,
irec.br_startblock,
irec.br_blockcount);
break;
default:
ASSERT(0);
}
- if (error)
- goto abort_error;
requeue_only = true;
}
}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index 3896dcc..e47530f 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -78,8 +78,6 @@
extern struct kmem_zone *xfs_cud_zone;
struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
-struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
- struct xfs_cui_log_item *);
void xfs_cui_item_free(struct xfs_cui_log_item *);
void xfs_cui_release(struct xfs_cui_log_item *);
int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 42ea7ba..0f08153 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -11,21 +11,12 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_ioctl.h"
#include "xfs_trace.h"
-#include "xfs_log.h"
#include "xfs_icache.h"
-#include "xfs_pnfs.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
@@ -33,11 +24,9 @@
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
-#include "xfs_quota_defs.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
-#include "xfs_rmap_btree.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"
@@ -182,8 +171,7 @@
xfs_reflink_trim_around_shared(
struct xfs_inode *ip,
struct xfs_bmbt_irec *irec,
- bool *shared,
- bool *trimmed)
+ bool *shared)
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -193,7 +181,7 @@
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -209,7 +197,7 @@
if (error)
return error;
- *shared = *trimmed = false;
+ *shared = false;
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
@@ -222,8 +210,6 @@
*/
irec->br_blockcount = flen;
*shared = true;
- if (flen != aglen)
- *trimmed = true;
return 0;
} else {
/*
@@ -233,99 +219,63 @@
* start of the shared region.
*/
irec->br_blockcount = fbno - agbno;
- *trimmed = true;
return 0;
}
}
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork. In this case *shared is set to true, else to false.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got;
- int error = 0;
- bool eof = false, trimmed;
- struct xfs_iext_cursor icur;
-
- /*
- * Search the COW fork extent list first. This serves two purposes:
- * first this implement the speculative preallocation using cowextisze,
- * so that we also unshared block adjacent to shared blocks instead
- * of just the shared blocks themselves. Second the lookup in the
- * extent list is generally faster than going out to the shared extent
- * tree.
- */
-
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
- eof = true;
- if (!eof && got.br_startoff <= imap->br_startoff) {
- trace_xfs_reflink_cow_found(ip, imap);
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
*shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
- if (error)
- return error;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!*shared)
- return 0;
-
- /*
- * Fork all the shared blocks from our write offset until the end of
- * the extent.
- */
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- return error;
-
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &icur, eof);
- if (error == -ENOSPC || error == -EDQUOT)
- trace_xfs_reflink_cow_enospc(ip, imap);
- if (error)
- return error;
-
- trace_xfs_reflink_cow_alloc(ip, &got);
- return 0;
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- xfs_fileoff_t offset_fsb,
- xfs_filblks_t count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
{
- int nimaps = 1;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
- if (imap->br_state == XFS_EXT_NORM)
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
- xfs_trim_extent(imap, offset_fsb, count_fsb);
- trace_xfs_reflink_convert_cow(ip, imap);
- if (imap->br_blockcount == 0)
- return 0;
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
- &nimaps);
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
}
/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -339,15 +289,12 @@
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
- struct xfs_bmbt_irec imap;
- int nimaps = 1, error = 0;
+ int error;
ASSERT(count != 0);
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -368,7 +315,6 @@
xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got;
- bool trimmed;
*found = false;
@@ -376,9 +322,13 @@
* If we don't find an overlapping extent, trim the range we need to
* allocate to fit the hole we found.
*/
- if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
- got.br_startoff > offset_fsb)
- return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = offset_fsb + count_fsb;
+ if (got.br_startoff > offset_fsb) {
+ xfs_trim_extent(imap, imap->br_startoff,
+ got.br_startoff - imap->br_startoff);
+ return xfs_inode_need_cow(ip, imap, shared);
+ }
*shared = true;
if (isnullstartblock(got.br_startblock)) {
@@ -399,7 +349,8 @@
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
- uint *lockmode)
+ uint *lockmode,
+ bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -411,7 +362,10 @@
xfs_extlen_t resblks = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
@@ -473,7 +427,16 @@
if (nimaps == 0)
return -ENOSPC;
convert:
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+ xfs_trim_extent(imap, offset_fsb, count_fsb);
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ return 0;
+ trace_xfs_reflink_convert_cow(ip, imap);
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -532,10 +495,8 @@
ASSERT((*tpp)->t_firstblock == NULLFSBLOCK);
/* Free the CoW orphan record. */
- error = xfs_refcount_free_cow_extent(*tpp,
- del.br_startblock, del.br_blockcount);
- if (error)
- break;
+ xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
+ del.br_blockcount);
xfs_bmap_add_free(*tpp, del.br_startblock,
del.br_blockcount, NULL);
@@ -588,7 +549,7 @@
int error;
trace_xfs_reflink_cancel_cow_range(ip, offset, count);
- ASSERT(xfs_is_reflink_inode(ip));
+ ASSERT(ip->i_cowfp);
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (count == NULLFILEOFF)
@@ -598,7 +559,7 @@
/* Start a rolling transaction to remove the mappings */
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- 0, 0, XFS_TRANS_NOFS, &tp);
+ 0, 0, 0, &tp);
if (error)
goto out;
@@ -625,54 +586,47 @@
}
/*
- * Remap parts of a file's data fork after a successful CoW.
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
*/
-int
-xfs_reflink_end_cow(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t count)
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t *end_fsb)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got, del;
- struct xfs_trans *tp;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t end_fsb;
- int error;
- unsigned int resblks;
- xfs_filblks_t rlen;
- struct xfs_iext_cursor icur;
-
- trace_xfs_reflink_end_cow(ip, offset, count);
+ struct xfs_bmbt_irec got, del;
+ struct xfs_iext_cursor icur;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ xfs_filblks_t rlen;
+ unsigned int resblks;
+ int error;
/* No COW extents? That's easy! */
- if (ifp->if_bytes == 0)
+ if (ifp->if_bytes == 0) {
+ *end_fsb = offset_fsb;
return 0;
+ }
- offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
- end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
/*
- * Start a rolling transaction to switch the mappings. We're
- * unlikely ever to have to remap 16T worth of single-block
- * extents, so just cap the worst case extent count to 2^32-1.
- * Stick a warning in just in case, and avoid 64-bit division.
+ * Lock the inode. We have to ijoin without automatic unlock because
+ * the lead transaction is the refcountbt record deletion; the data
+ * fork update follows as a deferred log item.
*/
- BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
- if (end_fsb - offset_fsb > UINT_MAX) {
- error = -EFSCORRUPTED;
- xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
- ASSERT(0);
- goto out;
- }
- resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
- (unsigned int)(end_fsb - offset_fsb),
- XFS_DATA_FORK);
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
- if (error)
- goto out;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -681,80 +635,126 @@
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
+ if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
+ got.br_startoff + got.br_blockcount <= offset_fsb) {
+ *end_fsb = offset_fsb;
+ goto out_cancel;
+ }
+
+ /*
+ * Structure copy @got into @del, then trim @del to the range that we
+ * were asked to remap. We preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
+ */
+ del = got;
+ xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
+
+ ASSERT(del.br_blockcount > 0);
+
+ /*
+ * Only remap real extents that contain data. With AIO, speculative
+ * preallocations can leak into the range we are called upon, and we
+ * need to skip them.
+ */
+ if (!xfs_bmap_is_real_extent(&got)) {
+ *end_fsb = del.br_startoff;
+ goto out_cancel;
+ }
+
+ /* Unmap the old blocks in the data fork. */
+ rlen = del.br_blockcount;
+ error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ if (error)
goto out_cancel;
- /* Walk backwards until we're out of the I/O range... */
- while (got.br_startoff + got.br_blockcount > offset_fsb) {
- del = got;
- xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
+ /* Trim the extent to whatever got unmapped. */
+ xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
+ trace_xfs_reflink_cow_remap(ip, &del);
- /* Extent delete may have bumped ext forward */
- if (!del.br_blockcount)
- goto prev_extent;
+ /* Free the CoW orphan record. */
+ xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
- /*
- * Only remap real extent that contain data. With AIO
- * speculatively preallocations can leak into the range we
- * are called upon, and we need to skip them.
- */
- if (!xfs_bmap_is_real_extent(&got))
- goto prev_extent;
+ /* Map the new blocks into the data fork. */
+ xfs_bmap_map_extent(tp, ip, &del);
- /* Unmap the old blocks in the data fork. */
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
- if (error)
- goto out_cancel;
+ /* Charge this new data fork mapping to the on-disk quota. */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
+ (long)del.br_blockcount);
- /* Trim the extent to whatever got unmapped. */
- if (rlen) {
- xfs_trim_extent(&del, del.br_startoff + rlen,
- del.br_blockcount - rlen);
- }
- trace_xfs_reflink_cow_remap(ip, &del);
-
- /* Free the CoW orphan record. */
- error = xfs_refcount_free_cow_extent(tp, del.br_startblock,
- del.br_blockcount);
- if (error)
- goto out_cancel;
-
- /* Map the new blocks into the data fork. */
- error = xfs_bmap_map_extent(tp, ip, &del);
- if (error)
- goto out_cancel;
-
- /* Charge this new data fork mapping to the on-disk quota. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
- (long)del.br_blockcount);
-
- /* Remove the mapping from the CoW fork. */
- xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
-
- error = xfs_defer_finish(&tp);
- if (error)
- goto out_cancel;
- if (!xfs_iext_get_extent(ifp, &icur, &got))
- break;
- continue;
-prev_extent:
- if (!xfs_iext_prev_extent(ifp, &icur, &got))
- break;
- }
+ /* Remove the mapping from the CoW fork. */
+ xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
- goto out;
+ return error;
+
+ /* Update the caller about how much progress we made. */
+ *end_fsb = del.br_startoff;
return 0;
out_cancel:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
- trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ return error;
+}
+
+/*
+ * Remap parts of a file's data fork after a successful CoW.
+ */
+int
+xfs_reflink_end_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+ end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+ /*
+ * Walk backwards until we're out of the I/O range. The loop function
+ * repeatedly cycles the ILOCK to allocate one transaction per remapped
+ * extent.
+ *
+ * If we're being called by writeback then the the pages will still
+ * have PageWriteback set, which prevents races with reflink remapping
+ * and truncate. Reflink remapping prevents races with writeback by
+ * taking the iolock and mmaplock before flushing the pages and
+ * remapping, which means there won't be any further writeback or page
+ * cache dirtying until the reflink completes.
+ *
+ * We should never have two threads issuing writeback for the same file
+ * region. There are also have post-eof checks in the writeback
+ * preparation code so that we don't bother writing out pages that are
+ * about to be truncated.
+ *
+ * If we're being called as part of directio write completion, the dio
+ * count is still elevated, which reflink and truncate will wait for.
+ * Reflink remapping takes the iolock and mmaplock and waits for
+ * pending dio to finish, which should prevent any directio until the
+ * remap completes. Multiple concurrent directio writes to the same
+ * region are handled by end_cow processing only occurring for the
+ * threads which succeed; the outcome of multiple overlapping direct
+ * writes is not well defined anyway.
+ *
+ * It's possible that a buffered write and a direct write could collide
+ * here (the buffered write stumbles in after the dio flushes and
+ * invalidates the page cache and immediately queues writeback), but we
+ * have never supported this 100%. If either disk write succeeds the
+ * blocks will be remapped.
+ */
+ while (end_fsb > offset_fsb && !error)
+ error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+
+ if (error)
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
return error;
}
@@ -916,18 +916,18 @@
/*
* Update destination inode size & cowextsize hint, if necessary.
*/
-STATIC int
+int
xfs_reflink_update_dest(
struct xfs_inode *dest,
xfs_off_t newlen,
xfs_extlen_t cowextsize,
- bool is_dedupe)
+ unsigned int remap_flags)
{
struct xfs_mount *mp = dest->i_mount;
struct xfs_trans *tp;
int error;
- if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+ if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
return 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -948,10 +948,6 @@
dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
}
- if (!is_dedupe) {
- xfs_trans_ichgtime(tp, dest,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
@@ -1067,14 +1063,10 @@
uirec.br_blockcount, uirec.br_startblock);
/* Update the refcount tree */
- error = xfs_refcount_increase_extent(tp, &uirec);
- if (error)
- goto out_cancel;
+ xfs_refcount_increase_extent(tp, &uirec);
/* Map the new blocks into the data fork. */
- error = xfs_bmap_map_extent(tp, ip, &uirec);
- if (error)
- goto out_cancel;
+ xfs_bmap_map_extent(tp, ip, &uirec);
/* Update quota accounting. */
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
@@ -1115,19 +1107,28 @@
/*
* Iteratively remap one file's extents (and holes) to another's.
*/
-STATIC int
+int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
- xfs_fileoff_t srcoff,
+ loff_t pos_in,
struct xfs_inode *dest,
- xfs_fileoff_t destoff,
- xfs_filblks_t len,
- xfs_off_t new_isize)
+ loff_t pos_out,
+ loff_t remap_len,
+ loff_t *remapped)
{
struct xfs_bmbt_irec imap;
+ xfs_fileoff_t srcoff;
+ xfs_fileoff_t destoff;
+ xfs_filblks_t len;
+ xfs_filblks_t range_len;
+ xfs_filblks_t remapped_len = 0;
+ xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- xfs_filblks_t range_len;
+
+ destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
+ srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
+ len = XFS_B_TO_FSB(src->i_mount, remap_len);
/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
while (len) {
@@ -1142,10 +1143,10 @@
error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
xfs_iunlock(src, lock_mode);
if (error)
- goto err;
+ break;
ASSERT(nimaps == 1);
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
&imap);
/* Translate imap into the destination file. */
@@ -1156,32 +1157,33 @@
error = xfs_reflink_remap_extent(dest, &imap, destoff,
new_isize);
if (error)
- goto err;
+ break;
if (fatal_signal_pending(current)) {
error = -EINTR;
- goto err;
+ break;
}
/* Advance drange/srange */
srcoff += range_len;
destoff += range_len;
len -= range_len;
+ remapped_len += range_len;
}
- return 0;
-
-err:
- trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+ *remapped = min_t(loff_t, remap_len,
+ XFS_FSB_TO_B(src->i_mount, remapped_len));
return error;
}
/*
- * Grab the exclusive iolock for a data copy from src to dest, making
- * sure to abide vfs locking order (lowest pointer value goes first) and
- * breaking the pnfs layout leases on dest before proceeding. The loop
- * is needed because we cannot call the blocking break_layout() with the
- * src iolock held, and therefore have to back out both locks.
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
*/
static int
xfs_iolock_two_inodes_and_break_layout(
@@ -1190,38 +1192,49 @@
{
int error;
-retry:
- if (src < dest) {
- inode_lock_shared(src);
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
- } else {
- /* src >= dest */
- inode_lock(dest);
- }
+ if (src > dest)
+ swap(src, dest);
- error = break_layout(dest, false);
- if (error == -EWOULDBLOCK) {
- inode_unlock(dest);
- if (src < dest)
- inode_unlock_shared(src);
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
error = break_layout(dest, true);
if (error)
return error;
- goto retry;
}
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
if (error) {
- inode_unlock(dest);
- if (src < dest)
- inode_unlock_shared(src);
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
return error;
}
- if (src > dest)
- inode_lock_shared_nested(src, I_MUTEX_NONDIR2);
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
return 0;
}
/* Unlock both inodes after they've been prepped for a range clone. */
-STATIC void
+void
xfs_reflink_remap_unlock(
struct file *file_in,
struct file *file_out)
@@ -1234,10 +1247,10 @@
xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+ xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
inode_unlock(inode_out);
if (!same_inode)
- inode_unlock_shared(inode_in);
+ inode_unlock(inode_in);
}
/*
@@ -1289,21 +1302,20 @@
* stale data in the destination file. Hence we reject these clone attempts with
* -EINVAL in this case.
*/
-STATIC int
+int
xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- u64 *len,
- bool is_dedupe)
+ loff_t *len,
+ unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
bool same_inode = (inode_in == inode_out);
- u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
/* Lock both files against IO */
@@ -1313,7 +1325,7 @@
if (same_inode)
xfs_ilock(src, XFS_MMAPLOCK_EXCL);
else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_SHARED, dest,
+ xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
@@ -1326,29 +1338,11 @@
if (IS_DAX(inode_in) || IS_DAX(inode_out))
goto out_unlock;
- ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- len, is_dedupe);
- if (ret <= 0)
+ ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
+ len, remap_flags);
+ if (ret < 0 || *len == 0)
goto out_unlock;
- /*
- * If the dedupe data matches, chop off the partial EOF block
- * from the source file so we don't try to dedupe the partial
- * EOF block.
- */
- if (is_dedupe) {
- *len &= ~blkmask;
- } else if (*len & blkmask) {
- /*
- * The user is attempting to share a partial EOF block,
- * if it's inside the destination EOF then reject it.
- */
- if (pos_out + *len < i_size_read(inode_out)) {
- ret = -EINVAL;
- goto out_unlock;
- }
- }
-
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
@@ -1367,32 +1361,19 @@
if (ret)
goto out_unlock;
- /* Zap any page cache for the destination file's range. */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + *len) - 1);
-
- /* If we're altering the file contents... */
- if (!is_dedupe) {
- /*
- * ...update the timestamps (which will grab the ilock again
- * from xfs_fs_dirty_inode, so we have to call it before we
- * take the ilock).
- */
- if (!(file_out->f_mode & FMODE_NOCMTIME)) {
- ret = file_update_time(file_out);
- if (ret)
- goto out_unlock;
- }
-
- /*
- * ...clear the security bits if the process is not being run
- * by root. This keeps people from modifying setuid and setgid
- * binaries.
- */
- ret = file_remove_privs(file_out);
- if (ret)
- goto out_unlock;
+ /*
+ * If pos_out > EOF, we may have dirtied blocks between EOF and
+ * pos_out. In that case, we need to extend the flush and unmap to cover
+ * from EOF to the end of the copy length.
+ */
+ if (pos_out > XFS_ISIZE(dest)) {
+ loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
+ ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
+ } else {
+ ret = xfs_flush_unmap_range(dest, pos_out, *len);
}
+ if (ret)
+ goto out_unlock;
return 1;
out_unlock:
@@ -1401,72 +1382,6 @@
}
/*
- * Link a range of blocks from one file to another.
- */
-int
-xfs_reflink_remap_range(
- struct file *file_in,
- loff_t pos_in,
- struct file *file_out,
- loff_t pos_out,
- u64 len,
- bool is_dedupe)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- struct xfs_mount *mp = src->i_mount;
- xfs_fileoff_t sfsbno, dfsbno;
- xfs_filblks_t fsblen;
- xfs_extlen_t cowextsize;
- ssize_t ret;
-
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return -EOPNOTSUPP;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- /* Prepare and then clone file data. */
- ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
- &len, is_dedupe);
- if (ret <= 0)
- return ret;
-
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
- dfsbno = XFS_B_TO_FSBT(mp, pos_out);
- sfsbno = XFS_B_TO_FSBT(mp, pos_in);
- fsblen = XFS_B_TO_FSB(mp, len);
- ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
- pos_out + len);
- if (ret)
- goto out_unlock;
-
- /*
- * Carry the cowextsize hint from src to dest if we're sharing the
- * entire source file to the entire destination file, the source file
- * has a cowextsize hint, and the destination file does not.
- */
- cowextsize = 0;
- if (pos_in == 0 && len == i_size_read(inode_in) &&
- (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- pos_out == 0 && len >= i_size_read(inode_out) &&
- !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
- cowextsize = src->i_d.di_cowextsize;
-
- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
- is_dedupe);
-
-out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
- if (ret)
- trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
- return ret;
-}
-
-/*
* The user wants to preemptively CoW all shared blocks in this file,
* which enables us to turn off the reflink flag. Iterate all
* extents which are not prealloc/delalloc to see which ranges are
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c585ad9..28a43b7 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow &&
+ xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+ struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+ struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+ bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
@@ -27,13 +39,24 @@
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
-extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe);
+extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t len,
+ unsigned int remap_flags);
extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp,
struct xfs_inode *ip, bool *has_shared);
extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
struct xfs_trans **tpp);
extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
+extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, loff_t *len,
+ unsigned int remap_flags);
+extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in,
+ struct xfs_inode *dest, loff_t pos_out, loff_t remap_len,
+ loff_t *remapped);
+extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
+ xfs_extlen_t cowextsize, unsigned int remap_flags);
+extern void xfs_reflink_remap_unlock(struct file *file_in,
+ struct file *file_out);
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 127dc9c..8939e0e 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -14,7 +14,6 @@
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_rmap_item.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
@@ -94,15 +93,6 @@
}
/*
- * Pinning has no meaning for an rui item, so just return.
- */
-STATIC void
-xfs_rui_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an RUI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the RUI transaction has been successfully committed to make it
@@ -121,71 +111,22 @@
}
/*
- * RUI items have no locking or pushing. However, since RUIs are pulled from
- * the AIL when their corresponding RUDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the RUI out of
- * the AIL.
- */
-STATIC uint
-xfs_rui_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The RUI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an RUD isn't going to be
* constructed and thus we free the RUI here directly.
*/
STATIC void
-xfs_rui_item_unlock(
+xfs_rui_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_rui_release(RUI_ITEM(lip));
+ xfs_rui_release(RUI_ITEM(lip));
}
-/*
- * The RUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_rui_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The RUI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_rui_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all rui log items.
- */
static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
- .iop_pin = xfs_rui_item_pin,
.iop_unpin = xfs_rui_item_unpin,
- .iop_unlock = xfs_rui_item_unlock,
- .iop_committed = xfs_rui_item_committed,
- .iop_push = xfs_rui_item_push,
- .iop_committing = xfs_rui_item_committing,
+ .iop_release = xfs_rui_item_release,
};
/*
@@ -201,9 +142,9 @@
ASSERT(nextents > 0);
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP);
+ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP);
+ ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
@@ -275,127 +216,272 @@
}
/*
- * Pinning has no meaning for an rud item, so just return.
- */
-STATIC void
-xfs_rud_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an rud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_rud_item_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-/*
- * There isn't much you can do to push on an rud item. It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_rud_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The RUD is either committed or aborted if the transaction is cancelled. If
* the transaction is cancelled, drop our reference to the RUI and free the
* RUD.
*/
STATIC void
-xfs_rud_item_unlock(
+xfs_rud_item_release(
struct xfs_log_item *lip)
{
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_rui_release(rudp->rud_ruip);
- kmem_zone_free(xfs_rud_zone, rudp);
+ xfs_rui_release(rudp->rud_ruip);
+ kmem_zone_free(xfs_rud_zone, rudp);
+}
+
+static const struct xfs_item_ops xfs_rud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .iop_size = xfs_rud_item_size,
+ .iop_format = xfs_rud_item_format,
+ .iop_release = xfs_rud_item_release,
+};
+
+static struct xfs_rud_log_item *
+xfs_trans_get_rud(
+ struct xfs_trans *tp,
+ struct xfs_rui_log_item *ruip)
+{
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+ &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip;
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+ xfs_trans_add_item(tp, &rudp->rud_item);
+ return rudp;
+}
+
+/* Set the map extent flags for this reverse mapping. */
+static void
+xfs_trans_set_rmap_flags(
+ struct xfs_map_extent *rmap,
+ enum xfs_rmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ rmap->me_flags = 0;
+ if (state == XFS_EXT_UNWRITTEN)
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+ switch (type) {
+ case XFS_RMAP_MAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+ break;
+ case XFS_RMAP_MAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+ break;
+ case XFS_RMAP_UNMAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+ break;
+ case XFS_RMAP_UNMAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_CONVERT:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+ break;
+ case XFS_RMAP_CONVERT_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_ALLOC:
+ rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+ break;
+ case XFS_RMAP_FREE:
+ rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+ break;
+ default:
+ ASSERT(0);
}
}
/*
- * When the rud item is committed to disk, all we need to do is delete our
- * reference to our partner rui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
+ * Finish an rmap update and log it to the RUD. Note that the transaction is
+ * marked dirty regardless of whether the rmap update succeeds or fails to
+ * support the RUI/RUD lifecycle rules.
*/
-STATIC xfs_lsn_t
-xfs_rud_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+static int
+xfs_trans_log_finish_rmap_update(
+ struct xfs_trans *tp,
+ struct xfs_rud_log_item *rudp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
{
- struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+ int error;
+
+ error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
+ startblock, blockcount, state, pcur);
/*
- * Drop the RUI reference regardless of whether the RUD has been
- * aborted. Once the RUD transaction is constructed, it is the sole
- * responsibility of the RUD to release the RUI (even if the RUI is
- * aborted due to log I/O error).
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the RUI and frees the RUD
+ * 2.) shuts down the filesystem
*/
- xfs_rui_release(rudp->rud_ruip);
- kmem_zone_free(xfs_rud_zone, rudp);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
- return (xfs_lsn_t)-1;
+ return error;
}
-/*
- * The RUD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
+/* Sort rmap intents by AG. */
+static int
+xfs_rmap_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_mount *mp = priv;
+ struct xfs_rmap_intent *ra;
+ struct xfs_rmap_intent *rb;
+
+ ra = container_of(a, struct xfs_rmap_intent, ri_list);
+ rb = container_of(b, struct xfs_rmap_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+}
+
+/* Get an RUI. */
+STATIC void *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_rui_log_item *ruip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ ASSERT(ruip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ return ruip;
+}
+
+/* Log rmap updates in the intent item. */
STATIC void
-xfs_rud_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+xfs_rmap_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
{
+ struct xfs_rui_log_item *ruip = intent;
+ struct xfs_rmap_intent *rmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
+ ASSERT(next_extent < ruip->rui_format.rui_nextents);
+ map = &ruip->rui_format.rui_extents[next_extent];
+ map->me_owner = rmap->ri_owner;
+ map->me_startblock = rmap->ri_bmap.br_startblock;
+ map->me_startoff = rmap->ri_bmap.br_startoff;
+ map->me_len = rmap->ri_bmap.br_blockcount;
+ xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
+ rmap->ri_bmap.br_state);
}
-/*
- * This is the ops vector shared by all rud log items.
- */
-static const struct xfs_item_ops xfs_rud_item_ops = {
- .iop_size = xfs_rud_item_size,
- .iop_format = xfs_rud_item_format,
- .iop_pin = xfs_rud_item_pin,
- .iop_unpin = xfs_rud_item_unpin,
- .iop_unlock = xfs_rud_item_unlock,
- .iop_committed = xfs_rud_item_committed,
- .iop_push = xfs_rud_item_push,
- .iop_committing = xfs_rud_item_committing,
+/* Get an RUD so we can process all the deferred rmap updates. */
+STATIC void *
+xfs_rmap_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_rud(tp, intent);
+}
+
+/* Process a deferred rmap update. */
+STATIC int
+xfs_rmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_rmap_intent *rmap;
+ int error;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ error = xfs_trans_log_finish_rmap_update(tp, done_item,
+ rmap->ri_type,
+ rmap->ri_owner, rmap->ri_whichfork,
+ rmap->ri_bmap.br_startoff,
+ rmap->ri_bmap.br_startblock,
+ rmap->ri_bmap.br_blockcount,
+ rmap->ri_bmap.br_state,
+ (struct xfs_btree_cur **)state);
+ kmem_free(rmap);
+ return error;
+}
+
+/* Clean up after processing deferred rmaps. */
+STATIC void
+xfs_rmap_update_finish_cleanup(
+ struct xfs_trans *tp,
+ void *state,
+ int error)
+{
+ struct xfs_btree_cur *rcur = state;
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending RUIs. */
+STATIC void
+xfs_rmap_update_abort_intent(
+ void *intent)
+{
+ xfs_rui_release(intent);
+}
+
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_rmap_intent *rmap;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ kmem_free(rmap);
+}
+
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_rmap_update_diff_items,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .log_item = xfs_rmap_update_log_item,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_update_finish_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
};
/*
- * Allocate and initialize an rud item with the given number of extents.
- */
-struct xfs_rud_log_item *
-xfs_rud_init(
- struct xfs_mount *mp,
- struct xfs_rui_log_item *ruip)
-
-{
- struct xfs_rud_log_item *rudp;
-
- rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
- xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops);
- rudp->rud_ruip = ruip;
- rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
-
- return rudp;
-}
-
-/*
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
*/
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 7e482ba..8708e4a 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -78,8 +78,6 @@
extern struct kmem_zone *xfs_rud_zone;
struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
-struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *,
- struct xfs_rui_log_item *);
int xfs_rui_copy_format(struct xfs_log_iovec *buf,
struct xfs_rui_log_format *dst_rui_fmt);
void xfs_rui_item_free(struct xfs_rui_log_item *);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 926ed31..4a48a8c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -11,17 +11,11 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
@@ -64,8 +58,12 @@
int log; /* loop counter, log2 of ext. size */
xfs_suminfo_t sum; /* summary data */
+ /* There are no extents at levels < m_rsum_cache[bbno]. */
+ if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno])
+ low = mp->m_rsum_cache[bbno];
+
/*
- * Loop over logs of extent sizes. Order is irrelevant.
+ * Loop over logs of extent sizes.
*/
for (log = low; log <= high; log++) {
/*
@@ -80,13 +78,17 @@
*/
if (sum) {
*stat = 1;
- return 0;
+ goto out;
}
}
/*
* Found nothing, return failure.
*/
*stat = 0;
+out:
+ /* There were no extents at levels < log. */
+ if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
return 0;
}
@@ -853,6 +855,21 @@
return error;
}
+static void
+xfs_alloc_rsum_cache(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */
+{
+ /*
+ * The rsum cache is initialized to all zeroes, which is trivially a
+ * lower bound on the minimum level with any free extents. We can
+ * continue without the cache if it couldn't be allocated.
+ */
+ mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
+ if (!mp->m_rsum_cache)
+ xfs_warn(mp, "could not allocate realtime summary cache");
+}
+
/*
* Visible (exported) functions.
*/
@@ -881,6 +898,7 @@
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
xfs_fsblock_t sumbno; /* summary block number */
+ uint8_t *rsum_cache; /* old summary cache */
sbp = &mp->m_sb;
/*
@@ -937,10 +955,15 @@
error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
if (error)
return error;
+
+ rsum_cache = mp->m_rsum_cache;
+ if (nrbmblocks != sbp->sb_rbmblocks)
+ xfs_alloc_rsum_cache(mp, nrbmblocks);
+
/*
* Allocate a new (fake) mount/sb.
*/
- nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+ nmp = kmem_alloc(sizeof(*nmp), 0);
/*
* Loop over the bitmap blocks.
* We will do everything one bitmap block at a time.
@@ -1062,6 +1085,20 @@
*/
kmem_free(nmp);
+ /*
+ * If we had to allocate a new rsum_cache, we either need to free the
+ * old one (if we succeeded) or free the new one and restore the old one
+ * (if there was an error).
+ */
+ if (rsum_cache != mp->m_rsum_cache) {
+ if (error) {
+ kmem_free(mp->m_rsum_cache);
+ mp->m_rsum_cache = rsum_cache;
+ } else {
+ kmem_free(rsum_cache);
+ }
+ }
+
return error;
}
@@ -1187,8 +1224,8 @@
}
/*
- * Get the bitmap and summary inodes into the mount structure
- * at mount time.
+ * Get the bitmap and summary inodes and the summary cache into the mount
+ * structure at mount time.
*/
int /* error */
xfs_rtmount_inodes(
@@ -1198,19 +1235,18 @@
xfs_sb_t *sbp;
sbp = &mp->m_sb;
- if (sbp->sb_rbmino == NULLFSINO)
- return 0;
error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
if (error)
return error;
ASSERT(mp->m_rbmip != NULL);
- ASSERT(sbp->sb_rsumino != NULLFSINO);
+
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
if (error) {
xfs_irele(mp->m_rbmip);
return error;
}
ASSERT(mp->m_rsumip != NULL);
+ xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
return 0;
}
@@ -1218,6 +1254,7 @@
xfs_rtunmount_inodes(
struct xfs_mount *mp)
{
+ kmem_free(mp->m_rsum_cache);
if (mp->m_rbmip)
xfs_irele(mp->m_rbmip);
if (mp->m_rsumip)
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 4e44231..113883c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/proc_fs.h>
struct xstats xfsstats;
@@ -29,30 +28,30 @@
char *desc;
int endpoint;
} xstats[] = {
- { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
- { "abt", XFSSTAT_END_ALLOC_BTREE },
- { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
- { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
- { "dir", XFSSTAT_END_DIRECTORY_OPS },
- { "trans", XFSSTAT_END_TRANSACTIONS },
- { "ig", XFSSTAT_END_INODE_OPS },
- { "log", XFSSTAT_END_LOG_OPS },
- { "push_ail", XFSSTAT_END_TAIL_PUSHING },
- { "xstrat", XFSSTAT_END_WRITE_CONVERT },
- { "rw", XFSSTAT_END_READ_WRITE_OPS },
- { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
- { "icluster", XFSSTAT_END_INODE_CLUSTER },
- { "vnodes", XFSSTAT_END_VNODE_OPS },
- { "buf", XFSSTAT_END_BUF },
- { "abtb2", XFSSTAT_END_ABTB_V2 },
- { "abtc2", XFSSTAT_END_ABTC_V2 },
- { "bmbt2", XFSSTAT_END_BMBT_V2 },
- { "ibt2", XFSSTAT_END_IBT_V2 },
- { "fibt2", XFSSTAT_END_FIBT_V2 },
- { "rmapbt", XFSSTAT_END_RMAP_V2 },
- { "refcntbt", XFSSTAT_END_REFCOUNT },
+ { "extent_alloc", xfsstats_offset(xs_abt_lookup) },
+ { "abt", xfsstats_offset(xs_blk_mapr) },
+ { "blk_map", xfsstats_offset(xs_bmbt_lookup) },
+ { "bmbt", xfsstats_offset(xs_dir_lookup) },
+ { "dir", xfsstats_offset(xs_trans_sync) },
+ { "trans", xfsstats_offset(xs_ig_attempts) },
+ { "ig", xfsstats_offset(xs_log_writes) },
+ { "log", xfsstats_offset(xs_try_logspace)},
+ { "push_ail", xfsstats_offset(xs_xstrat_quick)},
+ { "xstrat", xfsstats_offset(xs_write_calls) },
+ { "rw", xfsstats_offset(xs_attr_get) },
+ { "attr", xfsstats_offset(xs_iflush_count)},
+ { "icluster", xfsstats_offset(vn_active) },
+ { "vnodes", xfsstats_offset(xb_get) },
+ { "buf", xfsstats_offset(xs_abtb_2) },
+ { "abtb2", xfsstats_offset(xs_abtc_2) },
+ { "abtc2", xfsstats_offset(xs_bmbt_2) },
+ { "bmbt2", xfsstats_offset(xs_ibt_2) },
+ { "ibt2", xfsstats_offset(xs_fibt_2) },
+ { "fibt2", xfsstats_offset(xs_rmap_2) },
+ { "rmapbt", xfsstats_offset(xs_refcbt_2) },
+ { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
- { "qm", XFSSTAT_END_QM },
+ { "qm", xfsstats_offset(xs_xstrat_bytes)},
};
/* Loop over all stats groups */
@@ -104,6 +103,10 @@
#ifdef CONFIG_PROC_FS
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
+
+#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims)
+#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot)
+
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
@@ -119,7 +122,7 @@
int j;
seq_printf(m, "qm");
- for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+ for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 130db07..34d704f 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -41,17 +41,14 @@
* XFS global statistics
*/
struct __xfsstats {
-# define XFSSTAT_END_EXTENT_ALLOC 4
uint32_t xs_allocx;
uint32_t xs_allocb;
uint32_t xs_freex;
uint32_t xs_freeb;
-# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
uint32_t xs_abt_lookup;
uint32_t xs_abt_compare;
uint32_t xs_abt_insrec;
uint32_t xs_abt_delrec;
-# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
uint32_t xs_blk_mapr;
uint32_t xs_blk_mapw;
uint32_t xs_blk_unmap;
@@ -59,21 +56,17 @@
uint32_t xs_del_exlist;
uint32_t xs_look_exlist;
uint32_t xs_cmp_exlist;
-# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
uint32_t xs_bmbt_lookup;
uint32_t xs_bmbt_compare;
uint32_t xs_bmbt_insrec;
uint32_t xs_bmbt_delrec;
-# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
uint32_t xs_dir_lookup;
uint32_t xs_dir_create;
uint32_t xs_dir_remove;
uint32_t xs_dir_getdents;
-# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
uint32_t xs_trans_sync;
uint32_t xs_trans_async;
uint32_t xs_trans_empty;
-# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
uint32_t xs_ig_attempts;
uint32_t xs_ig_found;
uint32_t xs_ig_frecycle;
@@ -81,13 +74,11 @@
uint32_t xs_ig_dup;
uint32_t xs_ig_reclaims;
uint32_t xs_ig_attrchg;
-# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
uint32_t xs_log_writes;
uint32_t xs_log_blocks;
uint32_t xs_log_noiclogs;
uint32_t xs_log_force;
uint32_t xs_log_force_sleep;
-# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
uint32_t xs_try_logspace;
uint32_t xs_sleep_logspace;
uint32_t xs_push_ail;
@@ -98,22 +89,17 @@
uint32_t xs_push_ail_flushing;
uint32_t xs_push_ail_restarts;
uint32_t xs_push_ail_flush;
-# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
uint32_t xs_xstrat_quick;
uint32_t xs_xstrat_split;
-# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
uint32_t xs_write_calls;
uint32_t xs_read_calls;
-# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
uint32_t xs_attr_get;
uint32_t xs_attr_set;
uint32_t xs_attr_remove;
uint32_t xs_attr_list;
-# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
uint32_t xs_iflush_count;
uint32_t xs_icluster_flushcnt;
uint32_t xs_icluster_flushinode;
-# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
uint32_t vn_active; /* # vnodes not on free lists */
uint32_t vn_alloc; /* # times vn_alloc called */
uint32_t vn_get; /* # times vn_get called */
@@ -122,7 +108,6 @@
uint32_t vn_reclaim; /* # times vn_reclaim called */
uint32_t vn_remove; /* # times vn_remove called */
uint32_t vn_free; /* # times vn_free called */
-#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
uint32_t xb_get;
uint32_t xb_create;
uint32_t xb_get_locked;
@@ -133,28 +118,19 @@
uint32_t xb_page_found;
uint32_t xb_get_read;
/* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX)
uint32_t xs_abtb_2[__XBTS_MAX];
-#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
uint32_t xs_abtc_2[__XBTS_MAX];
-#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
uint32_t xs_bmbt_2[__XBTS_MAX];
-#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
uint32_t xs_ibt_2[__XBTS_MAX];
-#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
uint32_t xs_fibt_2[__XBTS_MAX];
-#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
uint32_t xs_rmap_2[__XBTS_MAX];
-#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
uint32_t xs_refcbt_2[__XBTS_MAX];
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
uint32_t xs_qm_dqcachemisses;
uint32_t xs_qm_dqcachehits;
uint32_t xs_qm_dqwants;
-#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
uint32_t xs_qm_dquot;
uint32_t xs_qm_dquot_unused;
/* Extra precision counters */
@@ -163,10 +139,12 @@
uint64_t xs_read_bytes;
};
+#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+
struct xfsstats {
union {
struct __xfsstats s;
- uint32_t a[XFSSTAT_END_XQMSTAT];
+ uint32_t a[xfsstats_offset(xs_qm_dquot)];
};
};
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 207ee30..8d1df9f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -11,18 +11,15 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_fsops.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
@@ -39,15 +36,7 @@
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
-#include <linux/namei.h>
-#include <linux/dax.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/mount.h>
-#include <linux/mempool.h>
-#include <linux/writeback.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
+#include <linux/magic.h>
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
@@ -64,7 +53,7 @@
enum {
Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
- Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
+ Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
@@ -85,7 +74,6 @@
{Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
{Opt_swidth, "swidth=%u"}, /* data volume stripe width */
{Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
- {Opt_mtpt, "mtpt"}, /* filesystem mount point */
{Opt_grpid, "grpid"}, /* group-ID from parent directory */
{Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
{Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
@@ -234,9 +222,6 @@
if (!mp->m_logname)
return -ENOMEM;
break;
- case Opt_mtpt:
- xfs_warn(mp, "%s option not allowed on this system", p);
- return -EINVAL;
case Opt_rtdev:
kfree(mp->m_rtname);
mp->m_rtname = match_strdup(args);
@@ -446,7 +431,7 @@
char *str;
};
-STATIC int
+STATIC void
xfs_showargs(
struct xfs_mount *mp,
struct seq_file *m)
@@ -525,9 +510,8 @@
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
-
- return 0;
}
+
static uint64_t
xfs_max_file_offset(
unsigned int blockshift)
@@ -537,26 +521,18 @@
/* Figure out maximum filesize, on Linux this can depend on
* the filesystem blocksize (on 32 bit platforms).
- * __block_write_begin does this in an [unsigned] long...
+ * __block_write_begin does this in an [unsigned] long long...
* page->index << (PAGE_SHIFT - bbits)
* So, for page sized blocks (4K on 32 bit platforms),
* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
* (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
* but for smaller blocksizes it is less (bbits = log2 bsize).
- * Note1: get_block_t takes a long (implicit cast from above)
- * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
- * can optionally convert the [unsigned] long from above into
- * an [unsigned] long long.
*/
#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBDAF)
ASSERT(sizeof(sector_t) == 8);
pagefactor = PAGE_SIZE;
bitshift = BITS_PER_LONG;
-# else
- pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
-# endif
#endif
return (((uint64_t)pagefactor) << bitshift) - 1;
@@ -593,7 +569,7 @@
* Calculate how much should be reserved for inodes to meet
* the max inode percentage. Used only for inode32.
*/
- if (mp->m_maxicount) {
+ if (M_IGEO(mp)->maxicount) {
uint64_t icount;
icount = sbp->sb_dblocks * sbp->sb_imax_pct;
@@ -606,7 +582,7 @@
}
/* Get the last possible inode in the filesystem */
- agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1);
ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
/*
@@ -836,18 +812,14 @@
if (!mp->m_buf_workqueue)
goto out;
- mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
- if (!mp->m_data_workqueue)
- goto out_destroy_buf;
-
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_unwritten_workqueue)
- goto out_destroy_data_iodone_queue;
+ goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
+ 0, mp->m_fsname);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
@@ -856,16 +828,10 @@
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
- mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
- mp->m_fsname);
- if (!mp->m_log_workqueue)
- goto out_destroy_reclaim;
-
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
- goto out_destroy_log;
+ goto out_destroy_reclaim;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
mp->m_fsname);
@@ -876,16 +842,12 @@
out_destroy_eofb:
destroy_workqueue(mp->m_eofblocks_workqueue);
-out_destroy_log:
- destroy_workqueue(mp->m_log_workqueue);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
out_destroy_cil:
destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
-out_destroy_data_iodone_queue:
- destroy_workqueue(mp->m_data_workqueue);
out_destroy_buf:
destroy_workqueue(mp->m_buf_workqueue);
out:
@@ -898,10 +860,8 @@
{
destroy_workqueue(mp->m_sync_workqueue);
destroy_workqueue(mp->m_eofblocks_workqueue);
- destroy_workqueue(mp->m_log_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
- destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
destroy_workqueue(mp->m_buf_workqueue);
}
@@ -933,6 +893,32 @@
return NULL;
}
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+
+ if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+ return;
+ do {
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_warn(ip->i_mount,
+ "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+ ip->i_ino,
+ whichfork == XFS_DATA_FORK ? "data" : "cow",
+ got.br_startoff, got.br_blockcount);
+ }
+ } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork) do { } while (0)
+#endif
+
/*
* Now that the generic code is guaranteed not to be accessing
* the linux inode, we can inactivate and reclaim the inode.
@@ -951,7 +937,12 @@
xfs_inactive(ip);
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
+ xfs_check_delalloc(ip, XFS_DATA_FORK);
+ xfs_check_delalloc(ip, XFS_COW_FORK);
+ ASSERT(0);
+ }
+
XFS_STATS_INC(ip->i_mount, vn_reclaim);
/*
@@ -1097,7 +1088,7 @@
xfs_extlen_t lsize;
int64_t ffree;
- statp->f_type = XFS_SB_MAGIC;
+ statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
@@ -1117,12 +1108,12 @@
statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
statp->f_bavail = statp->f_bfree;
- fakeinos = statp->f_bfree << sbp->sb_inopblog;
+ fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
- if (mp->m_maxicount)
+ if (M_IGEO(mp)->maxicount)
statp->f_files = min_t(typeof(statp->f_files),
statp->f_files,
- mp->m_maxicount);
+ M_IGEO(mp)->maxicount);
/* If sb_icount overshot maxicount, report actual allocation */
statp->f_files = max_t(typeof(statp->f_files),
@@ -1343,7 +1334,7 @@
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_icache_enable_reclaim(mp);
+ xfs_start_block_reaping(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1357,7 +1348,7 @@
* Cancel background eofb scanning so it cannot race with the
* final log force+buftarg wait and deadlock the remount.
*/
- xfs_icache_disable_reclaim(mp);
+ xfs_stop_block_reaping(mp);
/* Get rid of any leftover CoW reservations... */
error = xfs_icache_free_cowblocks(mp, NULL);
@@ -1401,7 +1392,7 @@
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_icache_disable_reclaim(mp);
+ xfs_stop_block_reaping(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
return xfs_sync_sb(mp, true);
@@ -1415,7 +1406,7 @@
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_icache_enable_reclaim(mp);
+ xfs_start_block_reaping(mp);
return 0;
}
@@ -1424,7 +1415,8 @@
struct seq_file *m,
struct dentry *root)
{
- return xfs_showargs(XFS_M(root->d_sb), m);
+ xfs_showargs(XFS_M(root->d_sb), m);
+ return 0;
}
/*
@@ -1513,8 +1505,14 @@
if (error)
goto free_ifree;
+ error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
+ if (error)
+ goto free_fdblocks;
+
return 0;
+free_fdblocks:
+ percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1538,6 +1536,9 @@
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
+ ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ percpu_counter_sum(&mp->m_delalloc_blks) == 0);
+ percpu_counter_destroy(&mp->m_delalloc_blks);
}
static struct xfs_mount *
@@ -1561,6 +1562,13 @@
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
return mp;
}
@@ -1650,12 +1658,16 @@
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
*/
- sb->s_magic = XFS_SB_MAGIC;
+ sb->s_magic = XFS_SUPER_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
+ sb->s_time_min = S32_MIN;
+ sb->s_time_max = S32_MAX;
+ sb->s_iflags |= SB_I_CGROUPWB;
+
set_posix_acl_flag(sb);
/* version 5 superblocks support inode version counters. */
@@ -1696,11 +1708,18 @@
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
"reflink not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
@@ -2053,11 +2072,6 @@
printk(KERN_INFO XFS_VERSION_STRING " with "
XFS_BUILD_OPTIONS " enabled\n");
- xfs_extent_free_init_defer_op();
- xfs_rmap_update_init_defer_op();
- xfs_refcount_update_init_defer_op();
- xfs_bmap_update_init_defer_op();
-
xfs_dir_startup();
error = xfs_init_zones();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 21cb49a..763e43d 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -38,6 +38,18 @@
# define XFS_SCRUB_STRING
#endif
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+# define XFS_REPAIR_STRING "repair, "
+#else
+# define XFS_REPAIR_STRING
+#endif
+
+#ifdef CONFIG_XFS_WARN
+# define XFS_WARN_STRING "verbose warnings, "
+#else
+# define XFS_WARN_STRING
+#endif
+
#ifdef DEBUG
# define XFS_DBG_STRING "debug"
#else
@@ -49,6 +61,8 @@
XFS_SECURITY_STRING \
XFS_REALTIME_STRING \
XFS_SCRUB_STRING \
+ XFS_REPAIR_STRING \
+ XFS_WARN_STRING \
XFS_DBG_STRING /* DBG must be last */
struct xfs_inode;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index a3e98c6..ed66fd2 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -12,23 +12,14 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_defer.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
-#include "xfs_bmap_util.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
-#include "xfs_symlink.h"
#include "xfs_trans.h"
-#include "xfs_log.h"
/* ----- Kernel only functions below ----- */
int
@@ -192,6 +183,7 @@
pathlen = strlen(target_path);
if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */
return -ENAMETOOLONG;
+ ASSERT(pathlen > 0);
udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
@@ -378,6 +370,12 @@
/*
* Free a symlink that has blocks associated with it.
+ *
+ * Note: zero length symlinks are not allowed to exist. When we set the size to
+ * zero, also change it to a regular file so that it does not get written to
+ * disk as a zero length symlink. The inode is on the unlinked list already, so
+ * userspace cannot find this inode anymore, so this change is not user visible
+ * but allows us to catch corrupt zero-length symlinks in the verifiers.
*/
STATIC int
xfs_inactive_symlink_rmt(
@@ -412,13 +410,14 @@
xfs_trans_ijoin(tp, ip, 0);
/*
- * Lock the inode, fix the size, and join it to the transaction.
- * Hold it so in the normal path, we still have it locked for
- * the second transaction. In the error paths we need it
+ * Lock the inode, fix the size, turn it into a regular file and join it
+ * to the transaction. Hold it so in the normal path, we still have it
+ * locked for the second transaction. In the error paths we need it
* held so the cancel won't rele it, see below.
*/
size = (int)ip->i_d.di_size;
ip->i_d.di_size = 0;
+ VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
* Find the block(s) so we can inval and unmap them.
@@ -494,17 +493,10 @@
return -EIO;
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /*
- * Zero length symlinks _can_ exist.
- */
pathlen = (int)ip->i_d.di_size;
- if (!pathlen) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return 0;
- }
+ ASSERT(pathlen);
- if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
+ if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) {
xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
__func__, (unsigned long long)ip->i_ino, pathlen);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -512,12 +504,12 @@
return -EFSCORRUPTED;
}
+ /*
+ * Inline fork state gets removed by xfs_difree() so we have nothing to
+ * do here in that case.
+ */
if (ip->i_df.if_flags & XFS_IFINLINE) {
- if (ip->i_df.if_bytes > 0)
- xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
- XFS_DATA_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- ASSERT(ip->i_df.if_bytes == 0);
return 0;
}
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 0cc034d..31b3bdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -4,10 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
#include "xfs_error.h"
-#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 1684881..8abf464 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -82,9 +82,13 @@
extern xfs_param_t xfs_params;
struct xfs_globals {
+#ifdef DEBUG
+ int pwork_threads; /* parallel workqueue threads */
+#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994..f1bc88f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -10,9 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sysfs.h"
-#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_stats.h"
#include "xfs_mount.h"
struct xfs_sysfs_attr {
@@ -65,19 +63,6 @@
.store = xfs_sysfs_object_store,
};
-/*
- * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
- * that is identified by the fsname under sysfs.
- */
-
-static inline struct xfs_mount *
-to_mp(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
-
- return container_of(kobj, struct xfs_mount, m_kobj);
-}
-
static struct attribute *xfs_mp_attrs[] = {
NULL,
};
@@ -183,10 +168,74 @@
}
XFS_SYSFS_ATTR_RW(mount_delay);
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
+#ifdef DEBUG
+/*
+ * Override how many threads the parallel work queue is allowed to create.
+ * This has to be a debug-only global (instead of an errortag) because one of
+ * the main users of parallel workqueues is mount time quotacheck.
+ */
+STATIC ssize_t
+pwork_threads_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < -1 || val > num_possible_cpus())
+ return -EINVAL;
+
+ xfs_globals.pwork_threads = val;
+
+ return count;
+}
+
+STATIC ssize_t
+pwork_threads_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+}
+XFS_SYSFS_ATTR_RW(pwork_threads);
+#endif /* DEBUG */
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
+#ifdef DEBUG
+ ATTR_LIST(pwork_threads),
+#endif
NULL,
};
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index cb6489c..bc85b89 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -15,24 +15,16 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_da_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
#include "xfs_trans.h"
-#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
-#include "xfs_iomap.h"
-#include "xfs_aops.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_log_recover.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3043e5e..eaae275 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -23,6 +23,7 @@
struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
+struct xlog_rec_header;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
@@ -30,6 +31,10 @@
struct xfs_refcount_irec;
struct xfs_fsmap;
struct xfs_rmap_irec;
+struct xfs_icreate_log;
+struct xfs_owner_info;
+struct xfs_trans_res;
+struct xfs_inobt_rec_incore;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -280,7 +285,10 @@
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
- __entry->bno = bp->b_bn;
+ if (bp->b_bn == XFS_BUF_DADDR_NULL)
+ __entry->bno = bp->b_maps[0].bm_bn;
+ else
+ __entry->bno = bp->b_bn;
__entry->nblks = bp->b_length;
__entry->hold = atomic_read(&bp->b_hold);
__entry->pincount = atomic_read(&bp->b_pin_count);
@@ -472,7 +480,7 @@
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
@@ -637,6 +645,16 @@
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
+/*
+ * ftrace's __print_symbolic requires that all enum values be wrapped in the
+ * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
+ * ring buffer. Somehow this was only worth mentioning in the ftrace sample
+ * code.
+ */
+TRACE_DEFINE_ENUM(PE_SIZE_PTE);
+TRACE_DEFINE_ENUM(PE_SIZE_PMD);
+TRACE_DEFINE_ENUM(PE_SIZE_PUD);
+
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
@@ -1207,15 +1225,15 @@
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int type, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, type, irec),
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, type)
+ __field(int, whichfork)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -1226,33 +1244,33 @@
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->type = type;
+ __entry->whichfork = whichfork;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
)
-#define DEFINE_IOMAP_EVENT(name) \
+#define DEFINE_IMAP_EVENT(name) \
DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int type, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1882,11 +1900,11 @@
{ 0, "target" }, \
{ 1, "temp" }
-#define XFS_INODE_FORMAT_STR \
- { 0, "invalid" }, \
- { 1, "local" }, \
- { 2, "extent" }, \
- { 3, "btree" }
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE);
+TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID);
DECLARE_EVENT_CLASS(xfs_swap_extent_class,
TP_PROTO(struct xfs_inode *ip, int which),
@@ -2175,6 +2193,14 @@
DEFINE_DISCARD_EVENT(xfs_discard_busy);
/* btree cursor events */
+TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
+TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
+
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
@@ -2194,9 +2220,9 @@
__entry->ptr = cur->bc_ptrs[level];
__entry->daddr = bp ? bp->b_bn : -1;
),
- TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx",
+ TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->btnum,
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
__entry->level,
__entry->nlevels,
__entry->ptr,
@@ -2273,7 +2299,7 @@
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type->type;
+ __entry->type = dfp->dfp_type;
__entry->intent = dfp->dfp_intent;
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
@@ -2402,7 +2428,7 @@
DECLARE_EVENT_CLASS(xfs_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten,
- struct xfs_owner_info *oinfo),
+ const struct xfs_owner_info *oinfo),
TP_ARGS(mp, agno, agbno, len, unwritten, oinfo),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -2437,7 +2463,7 @@
DEFINE_EVENT(xfs_rmap_class, name, \
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \
- struct xfs_owner_info *oinfo), \
+ const struct xfs_owner_info *oinfo), \
TP_ARGS(mp, agno, agbno, len, unwritten, oinfo))
/* simple AG-based error/%ip tracepoint class */
@@ -2607,10 +2633,9 @@
#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
/* ag btree lookup tracepoint class */
-#define XFS_AG_BTREE_CMP_FORMAT_STR \
- { XFS_LOOKUP_EQ, "eq" }, \
- { XFS_LOOKUP_LE, "le" }, \
- { XFS_LOOKUP_GE, "ge" }
+TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi);
+TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi);
DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_lookup_t dir),
@@ -3052,7 +3077,7 @@
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
TRACE_EVENT(xfs_reflink_remap_blocks_loop,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
@@ -3176,13 +3201,10 @@
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3343,8 +3365,250 @@
DEFINE_TRANS_EVENT(xfs_trans_free);
DEFINE_TRANS_EVENT(xfs_trans_roll);
DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_commit_items);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno %u agino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
+DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
+ TP_PROTO(struct xfs_mount *mp, unsigned int flags),
+ TP_ARGS(mp, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags)
+);
+#define DEFINE_FS_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_fs_corrupt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
+ TP_ARGS(mp, flags))
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
+ TP_ARGS(mp, agno, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno %u flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->flags)
+);
+#define DEFINE_AG_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_ag_corrupt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ unsigned int flags), \
+ TP_ARGS(mp, agno, flags))
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags),
+ TP_ARGS(ip, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino, __entry->flags)
+);
+#define DEFINE_INODE_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_inode_corrupt_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
+ TP_ARGS(ip, flags))
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+
+TRACE_EVENT(xfs_iwalk_ag,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino),
+ TP_ARGS(mp, agno, startino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->startino)
+)
+
+TRACE_EVENT(xfs_iwalk_ag_rec,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = irec->ir_startino;
+ __entry->freemask = irec->ir_free;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->startino, __entry->freemask)
+)
+
+TRACE_EVENT(xfs_pwork_init,
+ TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
+ TP_ARGS(mp, nr_threads, pid),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, nr_threads)
+ __field(pid_t, pid)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr_threads = nr_threads;
+ __entry->pid = pid;
+ ),
+ TP_printk("dev %d:%d nr_threads %u pid %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_threads, __entry->pid)
+)
+
+DECLARE_EVENT_CLASS(xfs_kmem_class,
+ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
+ TP_ARGS(size, flags, caller_ip),
+ TP_STRUCT__entry(
+ __field(ssize_t, size)
+ __field(int, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->size = size;
+ __entry->flags = flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("size %zd flags 0x%x caller %pS",
+ __entry->size,
+ __entry->flags,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_KMEM_EVENT(name) \
+DEFINE_EVENT(xfs_kmem_class, name, \
+ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
+ TP_ARGS(size, flags, caller_ip))
+DEFINE_KMEM_EVENT(kmem_alloc);
+DEFINE_KMEM_EVENT(kmem_alloc_io);
+DEFINE_KMEM_EVENT(kmem_alloc_large);
+DEFINE_KMEM_EVENT(kmem_realloc);
+DEFINE_KMEM_EVENT(kmem_zone_alloc);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 912b42f..f4795fd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -11,7 +11,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_inode.h"
#include "xfs_extent_busy.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
@@ -91,7 +90,7 @@
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+ ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
/*
* Initialize the new transaction structure.
@@ -264,9 +263,7 @@
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
- tp = kmem_zone_zalloc(xfs_trans_zone,
- (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
-
+ tp = kmem_zone_zalloc(xfs_trans_zone, 0);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
@@ -452,7 +449,7 @@
xfs_buf_t *bp;
int whole = 0;
- bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+ bp = xfs_trans_getsb(tp, tp->t_mountp);
sbp = XFS_BUF_TO_SBP(bp);
/*
@@ -767,10 +764,9 @@
}
/* Detach and unlock all of the items in a transaction */
-void
+static void
xfs_trans_free_items(
struct xfs_trans *tp,
- xfs_lsn_t commit_lsn,
bool abort)
{
struct xfs_log_item *lip, *next;
@@ -779,11 +775,10 @@
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
- if (commit_lsn != NULLCOMMITLSN)
- lip->li_ops->iop_committing(lip, commit_lsn);
if (abort)
set_bit(XFS_LI_ABORTED, &lip->li_flags);
- lip->li_ops->iop_unlock(lip);
+ if (lip->li_ops->iop_release)
+ lip->li_ops->iop_release(lip);
}
}
@@ -804,7 +799,8 @@
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
- lip->li_ops->iop_unpin(lip, 0);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
}
}
@@ -815,7 +811,7 @@
*
* If we are called with the aborted flag set, it is because a log write during
* a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through iop_commited and iop_unlock, which
+ * checkpoint have already gone through iop_committed and iop_committing, which
* means that checkpoint commit abort handling is treated exactly the same
* as an iclog write error even though we haven't started any IO yet. Hence in
* this case all we need to do is iop_committed processing, followed by an
@@ -833,7 +829,7 @@
struct xfs_ail *ailp,
struct xfs_log_vec *log_vector,
xfs_lsn_t commit_lsn,
- int aborted)
+ bool aborted)
{
#define LOG_ITEM_BATCH_SIZE 32
struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
@@ -852,7 +848,16 @@
if (aborted)
set_bit(XFS_LI_ABORTED, &lip->li_flags);
- item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+
+ if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
+ lip->li_ops->iop_release(lip);
+ continue;
+ }
+
+ if (lip->li_ops->iop_committed)
+ item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+ else
+ item_lsn = commit_lsn;
/* item_lsn of -1 means the item needs no further processing */
if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -864,7 +869,8 @@
*/
if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount));
- lip->li_ops->iop_unpin(lip, 1);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 1);
continue;
}
@@ -882,7 +888,8 @@
xfs_trans_ail_update(ailp, lip, item_lsn);
else
spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_unpin(lip, 0);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
continue;
}
@@ -998,7 +1005,7 @@
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
+ xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
XFS_STATS_INC(mp, xs_trans_empty);
@@ -1060,7 +1067,7 @@
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
+ xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c3d278e..64d7f17 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -27,7 +27,7 @@
struct xfs_bui_log_item;
struct xfs_bud_log_item;
-typedef struct xfs_log_item {
+struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
@@ -48,7 +48,7 @@
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_lsn_t li_seq; /* CIL commit seq */
-} xfs_log_item_t;
+};
/*
* li_flags use the (set/test/clear)_bit atomic interfaces because updates can
@@ -67,17 +67,24 @@
{ (1 << XFS_LI_DIRTY), "DIRTY" }
struct xfs_item_ops {
- void (*iop_size)(xfs_log_item_t *, int *, int *);
- void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
- void (*iop_pin)(xfs_log_item_t *);
- void (*iop_unpin)(xfs_log_item_t *, int remove);
+ unsigned flags;
+ void (*iop_size)(struct xfs_log_item *, int *, int *);
+ void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
+ void (*iop_pin)(struct xfs_log_item *);
+ void (*iop_unpin)(struct xfs_log_item *, int remove);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
- void (*iop_unlock)(xfs_log_item_t *);
- xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
- void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
- void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
+ void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
+ void (*iop_release)(struct xfs_log_item *);
+ xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+ void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
};
+/*
+ * Release the log item as soon as committed. This is for items just logging
+ * intents that never need to be written back in place.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
@@ -203,7 +210,7 @@
flags, bpp, ops);
}
-struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
@@ -220,16 +227,9 @@
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
uint);
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
-void xfs_extent_free_init_defer_op(void);
-struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *,
- struct xfs_efi_log_item *,
- uint);
-int xfs_trans_free_extent(struct xfs_trans *,
- struct xfs_efd_log_item *, xfs_fsblock_t,
- xfs_extlen_t, struct xfs_owner_info *,
- bool);
int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **);
int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
@@ -244,40 +244,4 @@
extern kmem_zone_t *xfs_trans_zone;
-/* rmap updates */
-enum xfs_rmap_intent_type;
-
-void xfs_rmap_update_init_defer_op(void);
-struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip);
-int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type,
- uint64_t owner, int whichfork, xfs_fileoff_t startoff,
- xfs_fsblock_t startblock, xfs_filblks_t blockcount,
- xfs_exntst_t state, struct xfs_btree_cur **pcur);
-
-/* refcount updates */
-enum xfs_refcount_intent_type;
-
-void xfs_refcount_update_init_defer_op(void);
-struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip);
-int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
-
-/* mapping updates */
-enum xfs_bmap_intent_type;
-
-void xfs_bmap_update_init_defer_op(void);
-struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
- struct xfs_bui_log_item *buip);
-int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
- struct xfs_bud_log_item *rudp, enum xfs_bmap_intent_type type,
- struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff,
- xfs_fsblock_t startblock, xfs_filblks_t *blockcount,
- xfs_exntst_t state);
-
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 55326f9..6ccfd75 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -6,6 +6,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -74,29 +75,29 @@
* Return a pointer to the last item in the AIL. If the AIL is empty, then
* return NULL.
*/
-static xfs_log_item_t *
+static struct xfs_log_item *
xfs_ail_max(
struct xfs_ail *ailp)
{
if (list_empty(&ailp->ail_head))
return NULL;
- return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail);
+ return list_entry(ailp->ail_head.prev, struct xfs_log_item, li_ail);
}
/*
* Return a pointer to the item which follows the given item in the AIL. If
* the given item is the last item in the list, then return NULL.
*/
-static xfs_log_item_t *
+static struct xfs_log_item *
xfs_ail_next(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
if (lip->li_ail.next == &ailp->ail_head)
return NULL;
- return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
+ return list_first_entry(&lip->li_ail, struct xfs_log_item, li_ail);
}
/*
@@ -109,10 +110,10 @@
*/
xfs_lsn_t
xfs_ail_min_lsn(
- struct xfs_ail *ailp)
+ struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- xfs_log_item_t *lip;
+ xfs_lsn_t lsn = 0;
+ struct xfs_log_item *lip;
spin_lock(&ailp->ail_lock);
lip = xfs_ail_min(ailp);
@@ -128,10 +129,10 @@
*/
static xfs_lsn_t
xfs_ail_max_lsn(
- struct xfs_ail *ailp)
+ struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- xfs_log_item_t *lip;
+ xfs_lsn_t lsn = 0;
+ struct xfs_log_item *lip;
spin_lock(&ailp->ail_lock);
lip = xfs_ail_max(ailp);
@@ -216,13 +217,13 @@
* ascending traversal. Pass a @lsn of zero to initialise the cursor to the
* first item in the AIL. Returns NULL if the list is empty.
*/
-xfs_log_item_t *
+struct xfs_log_item *
xfs_trans_ail_cursor_first(
struct xfs_ail *ailp,
struct xfs_ail_cursor *cur,
xfs_lsn_t lsn)
{
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
xfs_trans_ail_cursor_init(ailp, cur);
@@ -248,7 +249,7 @@
struct xfs_ail *ailp,
xfs_lsn_t lsn)
{
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) {
if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
@@ -327,8 +328,8 @@
*/
static void
xfs_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
xfs_ail_check(ailp, lip);
list_del(&lip->li_ail);
@@ -347,6 +348,14 @@
if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
+ /*
+ * Consider the item pinned if a push callback is not defined so the
+ * caller will force the log. This should only happen for intent items
+ * as they are unpinned once the associated done item is committed to
+ * the on-disk log.
+ */
+ if (!lip->li_ops->iop_push)
+ return XFS_ITEM_PINNED;
return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
}
@@ -356,7 +365,7 @@
{
xfs_mount_t *mp = ailp->ail_mount;
struct xfs_ail_cursor cur;
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
xfs_lsn_t lsn;
xfs_lsn_t target;
long tout;
@@ -531,17 +540,33 @@
set_current_state(TASK_INTERRUPTIBLE);
/*
- * Check kthread_should_stop() after we set the task state
- * to guarantee that we either see the stop bit and exit or
- * the task state is reset to runnable such that it's not
- * scheduled out indefinitely and detects the stop bit at
- * next iteration.
- *
+ * Check kthread_should_stop() after we set the task state to
+ * guarantee that we either see the stop bit and exit or the
+ * task state is reset to runnable such that it's not scheduled
+ * out indefinitely and detects the stop bit at next iteration.
* A memory barrier is included in above task state set to
* serialize again kthread_stop().
*/
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
+
+ /*
+ * The caller forces out the AIL before stopping the
+ * thread in the common case, which means the delwri
+ * queue is drained. In the shutdown case, the queue may
+ * still hold relogged buffers that haven't been
+ * submitted because they were pinned since added to the
+ * queue.
+ *
+ * Log I/O error processing stales the underlying buffer
+ * and clears the delwri state, expecting the buf to be
+ * removed on the next submission attempt. That won't
+ * happen if we're shutting down, so this is the last
+ * opportunity to release such buffers from the queue.
+ */
+ ASSERT(list_empty(&ailp->ail_buf_list) ||
+ XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
@@ -595,10 +620,10 @@
*/
void
xfs_ail_push(
- struct xfs_ail *ailp,
- xfs_lsn_t threshold_lsn)
+ struct xfs_ail *ailp,
+ xfs_lsn_t threshold_lsn)
{
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) ||
@@ -683,7 +708,7 @@
int nr_items,
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
- xfs_log_item_t *mlip;
+ struct xfs_log_item *mlip;
int mlip_changed = 0;
int i;
LIST_HEAD(tmp);
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
deleted file mode 100644
index 741c558..0000000
--- a/fs/xfs/xfs_trans_bmap.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_bmap_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_inode.h"
-
-/*
- * This routine is called to allocate a "bmap update done"
- * log item.
- */
-struct xfs_bud_log_item *
-xfs_trans_get_bud(
- struct xfs_trans *tp,
- struct xfs_bui_log_item *buip)
-{
- struct xfs_bud_log_item *budp;
-
- budp = xfs_bud_init(tp->t_mountp, buip);
- xfs_trans_add_item(tp, &budp->bud_item);
- return budp;
-}
-
-/*
- * Finish an bmap update and log it to the BUD. Note that the
- * transaction is marked dirty regardless of whether the bmap update
- * succeeds or fails to support the BUI/BUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_bmap_update(
- struct xfs_trans *tp,
- struct xfs_bud_log_item *budp,
- enum xfs_bmap_intent_type type,
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount,
- xfs_exntst_t state)
-{
- int error;
-
- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
- startblock, blockcount, state);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the BUI and frees the BUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
- return error;
-}
-
-/* Sort bmap intents by inode. */
-static int
-xfs_bmap_update_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_bmap_intent *ba;
- struct xfs_bmap_intent *bb;
-
- ba = container_of(a, struct xfs_bmap_intent, bi_list);
- bb = container_of(b, struct xfs_bmap_intent, bi_list);
- return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
-}
-
-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_bui_log_item *buip;
-
- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- ASSERT(tp != NULL);
-
- buip = xfs_bui_init(tp->t_mountp);
- ASSERT(buip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &buip->bui_item);
- return buip;
-}
-
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
- struct xfs_map_extent *bmap,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- bmap->me_flags = 0;
- switch (type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- bmap->me_flags = type;
- break;
- default:
- ASSERT(0);
- }
- if (state == XFS_EXT_UNWRITTEN)
- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
-/* Log bmap updates in the intent item. */
-STATIC void
-xfs_bmap_update_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_bui_log_item *buip = intent;
- struct xfs_bmap_intent *bmap;
- uint next_extent;
- struct xfs_map_extent *map;
-
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
- ASSERT(next_extent < buip->bui_format.bui_nextents);
- map = &buip->bui_format.bui_extents[next_extent];
- map->me_owner = bmap->bi_owner->i_ino;
- map->me_startblock = bmap->bi_bmap.br_startblock;
- map->me_startoff = bmap->bi_bmap.br_startoff;
- map->me_len = bmap->bi_bmap.br_blockcount;
- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
- bmap->bi_bmap.br_state);
-}
-
-/* Get an BUD so we can process all the deferred rmap updates. */
-STATIC void *
-xfs_bmap_update_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_bud(tp, intent);
-}
-
-/* Process a deferred rmap update. */
-STATIC int
-xfs_bmap_update_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_bmap_intent *bmap;
- xfs_filblks_t count;
- int error;
-
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- count = bmap->bi_bmap.br_blockcount;
- error = xfs_trans_log_finish_bmap_update(tp, done_item,
- bmap->bi_type,
- bmap->bi_owner, bmap->bi_whichfork,
- bmap->bi_bmap.br_startoff,
- bmap->bi_bmap.br_startblock,
- &count,
- bmap->bi_bmap.br_state);
- if (!error && count > 0) {
- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
- bmap->bi_bmap.br_blockcount = count;
- return -EAGAIN;
- }
- kmem_free(bmap);
- return error;
-}
-
-/* Abort all pending BUIs. */
-STATIC void
-xfs_bmap_update_abort_intent(
- void *intent)
-{
- xfs_bui_release(intent);
-}
-
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_bmap_intent *bmap;
-
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_free(bmap);
-}
-
-static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_BMAP,
- .max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_bmap_update_diff_items,
- .create_intent = xfs_bmap_update_create_intent,
- .abort_intent = xfs_bmap_update_abort_intent,
- .log_item = xfs_bmap_update_log_item,
- .create_done = xfs_bmap_update_create_done,
- .finish_item = xfs_bmap_update_finish_item,
- .cancel_item = xfs_bmap_update_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_bmap_update_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 286a287..b5b3a78 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -10,11 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
/*
@@ -174,8 +172,7 @@
xfs_buf_t *
xfs_trans_getsb(
xfs_trans_t *tp,
- struct xfs_mount *mp,
- int flags)
+ struct xfs_mount *mp)
{
xfs_buf_t *bp;
struct xfs_buf_log_item *bip;
@@ -185,7 +182,7 @@
* if tp is NULL.
*/
if (tp == NULL)
- return xfs_getsb(mp, flags);
+ return xfs_getsb(mp);
/*
* If the superblock buffer already has this transaction
@@ -203,7 +200,7 @@
return bp;
}
- bp = xfs_getsb(mp, flags);
+ bp = xfs_getsb(mp);
if (bp == NULL)
return NULL;
@@ -264,11 +261,39 @@
return -EIO;
}
+ /*
+ * Check if the caller is trying to read a buffer that is
+ * already attached to the transaction yet has no buffer ops
+ * assigned. Ops are usually attached when the buffer is
+ * attached to the transaction, or by the read caller if
+ * special circumstances. That didn't happen, which is not
+ * how this is supposed to go.
+ *
+ * If the buffer passes verification we'll let this go, but if
+ * not we have to shut down. Let the transaction cleanup code
+ * release this buffer when it kills the tranaction.
+ */
+ ASSERT(bp->b_ops != NULL);
+ error = xfs_buf_reverify(bp, ops);
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __func__);
+
+ if (tp->t_flags & XFS_TRANS_DIRTY)
+ xfs_force_shutdown(tp->t_mountp,
+ SHUTDOWN_META_IO_ERROR);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
bip = bp->b_log_item;
bip->bli_recur++;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
trace_xfs_trans_read_buf_recur(bip);
+ ASSERT(bp->b_ops != NULL || ops == NULL);
*bpp = bp;
return 0;
}
@@ -316,11 +341,25 @@
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_read_buf(bp->b_log_item);
}
+ ASSERT(bp->b_ops != NULL || ops == NULL);
*bpp = bp;
return 0;
}
+/* Has this buffer been dirtied by anyone? */
+bool
+xfs_trans_buf_is_dirty(
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!bip)
+ return false;
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
/*
* Release a buffer previously joined to the transaction. If the buffer is
* modified within this transaction, decrement the recursion count but do not
@@ -386,7 +425,7 @@
/*
* Mark the buffer as not needing to be unlocked when the buf item's
- * iop_unlock() routine is called. The buffer must already be locked
+ * iop_committing() routine is called. The buffer must already be locked
* and associated with the given transaction.
*/
/* ARGSUSED */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c23257a..1645746 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -11,7 +11,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
@@ -29,7 +28,6 @@
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- ASSERT(dqp->q_transp != tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(dqp->q_logitem.qli_dquot == dqp);
@@ -37,15 +35,8 @@
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
-
- /*
- * Initialize d_transp so we can later determine if this dquot is
- * associated with this transaction.
- */
- dqp->q_transp = tp;
}
-
/*
* This is called to mark the dquot as needing
* to be logged when the transaction is committed. The dquot must
@@ -61,7 +52,6 @@
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- ASSERT(dqp->q_transp == tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
tp->t_flags |= XFS_TRANS_DIRTY;
@@ -74,13 +64,13 @@
*/
void
xfs_trans_dup_dqinfo(
- xfs_trans_t *otp,
- xfs_trans_t *ntp)
+ struct xfs_trans *otp,
+ struct xfs_trans *ntp)
{
- xfs_dqtrx_t *oq, *nq;
- int i, j;
- xfs_dqtrx_t *oqa, *nqa;
- ulong blk_res_used;
+ struct xfs_dqtrx *oq, *nq;
+ int i, j;
+ struct xfs_dqtrx *oqa, *nqa;
+ uint64_t blk_res_used;
if (!otp->t_dqinfo)
return;
@@ -137,7 +127,7 @@
xfs_trans_t *tp,
xfs_inode_t *ip,
uint field,
- long delta)
+ int64_t delta)
{
xfs_mount_t *mp = tp->t_mountp;
@@ -191,12 +181,12 @@
*/
void
xfs_trans_mod_dquot(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp,
- uint field,
- long delta)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp,
+ uint field,
+ int64_t delta)
{
- xfs_dqtrx_t *qtrx;
+ struct xfs_dqtrx *qtrx;
ASSERT(tp);
ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
@@ -219,14 +209,14 @@
* regular disk blk reservation
*/
case XFS_TRANS_DQ_RES_BLKS:
- qtrx->qt_blk_res += (ulong)delta;
+ qtrx->qt_blk_res += delta;
break;
/*
* inode reservation
*/
case XFS_TRANS_DQ_RES_INOS:
- qtrx->qt_ino_res += (ulong)delta;
+ qtrx->qt_ino_res += delta;
break;
/*
@@ -245,7 +235,7 @@
*/
case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
- qtrx->qt_ino_res_used += (ulong)delta;
+ qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
}
qtrx->qt_icount_delta += delta;
@@ -255,7 +245,7 @@
* rtblk reservation
*/
case XFS_TRANS_DQ_RES_RTBLKS:
- qtrx->qt_rtblk_res += (ulong)delta;
+ qtrx->qt_rtblk_res += delta;
break;
/*
@@ -263,7 +253,7 @@
*/
case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
- qtrx->qt_rtblk_res_used += (ulong)delta;
+ qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
}
qtrx->qt_rtbcount_delta += delta;
@@ -288,8 +278,8 @@
*/
STATIC void
xfs_trans_dqlockedjoin(
- xfs_trans_t *tp,
- xfs_dqtrx_t *q)
+ struct xfs_trans *tp,
+ struct xfs_dqtrx *q)
{
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
@@ -320,8 +310,8 @@
struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
struct xfs_disk_dquot *d;
- long totalbdelta;
- long totalrtbdelta;
+ int64_t totalbdelta;
+ int64_t totalrtbdelta;
if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
@@ -347,7 +337,6 @@
break;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(dqp->q_transp == tp);
/*
* adjust the actual number of blocks used
@@ -413,7 +402,7 @@
* reservation that a transaction structure knows of.
*/
if (qtrx->qt_blk_res != 0) {
- ulong blk_res_used = 0;
+ uint64_t blk_res_used = 0;
if (qtrx->qt_bcount_delta > 0)
blk_res_used = qtrx->qt_bcount_delta;
@@ -501,7 +490,7 @@
{
int i, j;
xfs_dquot_t *dqp;
- xfs_dqtrx_t *qtrx, *qa;
+ struct xfs_dqtrx *qtrx, *qa;
bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
@@ -585,7 +574,7 @@
xfs_trans_t *tp,
xfs_mount_t *mp,
xfs_dquot_t *dqp,
- long nblks,
+ int64_t nblks,
long ninos,
uint flags)
{
@@ -745,7 +734,7 @@
struct xfs_dquot *udqp,
struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp,
- long nblks,
+ int64_t nblks,
long ninos,
uint flags)
{
@@ -804,7 +793,7 @@
xfs_trans_reserve_quota_nblks(
struct xfs_trans *tp,
struct xfs_inode *ip,
- long nblks,
+ int64_t nblks,
long ninos,
uint flags)
{
@@ -874,7 +863,7 @@
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
+ tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
}
void
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
deleted file mode 100644
index 855c0b6..0000000
--- a/fs/xfs/xfs_trans_extfree.c
+++ /dev/null
@@ -1,295 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_extfree_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-
-/*
- * This routine is called to allocate an "extent free done"
- * log item that will hold nextents worth of extents. The
- * caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-struct xfs_efd_log_item *
-xfs_trans_get_efd(struct xfs_trans *tp,
- struct xfs_efi_log_item *efip,
- uint nextents)
-{
- struct xfs_efd_log_item *efdp;
-
- ASSERT(tp != NULL);
- ASSERT(nextents > 0);
-
- efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
- ASSERT(efdp != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efdp->efd_item);
- return efdp;
-}
-
-/*
- * Free an extent and log it to the EFD. Note that the transaction is marked
- * dirty regardless of whether the extent free succeeds or fails to support the
- * EFI/EFD lifecycle rules.
- */
-int
-xfs_trans_free_extent(
- struct xfs_trans *tp,
- struct xfs_efd_log_item *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len,
- struct xfs_owner_info *oinfo,
- bool skip_discard)
-{
- struct xfs_mount *mp = tp->t_mountp;
- uint next_extent;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block);
- struct xfs_extent *extp;
- int error;
-
- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
-
- error = __xfs_free_extent(tp, start_block, ext_len,
- oinfo, XFS_AG_RESV_NONE, skip_discard);
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = start_block;
- extp->ext_len = ext_len;
- efdp->efd_next_extent++;
-
- return error;
-}
-
-/* Sort bmap items by AG. */
-static int
-xfs_extent_free_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_mount *mp = priv;
- struct xfs_extent_free_item *ra;
- struct xfs_extent_free_item *rb;
-
- ra = container_of(a, struct xfs_extent_free_item, xefi_list);
- rb = container_of(b, struct xfs_extent_free_item, xefi_list);
- return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
-}
-
-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_efi_log_item *efip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- efip = xfs_efi_init(tp->t_mountp, count);
- ASSERT(efip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efip->efi_item);
- return efip;
-}
-
-/* Log a free extent to the intent item. */
-STATIC void
-xfs_extent_free_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_efi_log_item *efip = intent;
- struct xfs_extent_free_item *free;
- uint next_extent;
- struct xfs_extent *extp;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
- ASSERT(next_extent < efip->efi_format.efi_nextents);
- extp = &efip->efi_format.efi_extents[next_extent];
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
-}
-
-/* Get an EFD so we can process all the free extents. */
-STATIC void *
-xfs_extent_free_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_efd(tp, intent, count);
-}
-
-/* Process a free extent. */
-STATIC int
-xfs_extent_free_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_extent_free_item *free;
- int error;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- error = xfs_trans_free_extent(tp, done_item,
- free->xefi_startblock,
- free->xefi_blockcount,
- &free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
- return error;
-}
-
-/* Abort all pending EFIs. */
-STATIC void
-xfs_extent_free_abort_intent(
- void *intent)
-{
- xfs_efi_release(intent);
-}
-
-/* Cancel a free extent. */
-STATIC void
-xfs_extent_free_cancel_item(
- struct list_head *item)
-{
- struct xfs_extent_free_item *free;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
-}
-
-static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_FREE,
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_extent_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
-/*
- * AGFL blocks are accounted differently in the reserve pools and are not
- * inserted into the busy extent list.
- */
-STATIC int
-xfs_agfl_free_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_efd_log_item *efdp = done_item;
- struct xfs_extent_free_item *free;
- struct xfs_extent *extp;
- struct xfs_buf *agbp;
- int error;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- uint next_extent;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- ASSERT(free->xefi_blockcount == 1);
- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
-
- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
-
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
- if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp,
- &free->xefi_oinfo);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
- efdp->efd_next_extent++;
-
- kmem_free(free);
- return error;
-}
-
-
-/* sub-type with special handling for AGFL deferred frees */
-static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_AGFL_FREE,
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_agfl_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_extent_free_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_extent_free_defer_type);
- xfs_defer_init_op_type(&xfs_agfl_free_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 091eae9..2e073c1 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -16,12 +16,10 @@
void xfs_trans_init(struct xfs_mount *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
-void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
- bool abort);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
- xfs_lsn_t commit_lsn, int aborted);
+ xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
*
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
deleted file mode 100644
index 523c556..0000000
--- a/fs/xfs/xfs_trans_refcount.c
+++ /dev/null
@@ -1,248 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_refcount_item.h"
-#include "xfs_alloc.h"
-#include "xfs_refcount.h"
-
-/*
- * This routine is called to allocate a "refcount update done"
- * log item.
- */
-struct xfs_cud_log_item *
-xfs_trans_get_cud(
- struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip)
-{
- struct xfs_cud_log_item *cudp;
-
- cudp = xfs_cud_init(tp->t_mountp, cuip);
- xfs_trans_add_item(tp, &cudp->cud_item);
- return cudp;
-}
-
-/*
- * Finish an refcount update and log it to the CUD. Note that the
- * transaction is marked dirty regardless of whether the refcount
- * update succeeds or fails to support the CUI/CUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_refcount_update(
- struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- enum xfs_refcount_intent_type type,
- xfs_fsblock_t startblock,
- xfs_extlen_t blockcount,
- xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_refcount_finish_one(tp, type, startblock,
- blockcount, new_fsb, new_len, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the CUI and frees the CUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
- return error;
-}
-
-/* Sort refcount intents by AG. */
-static int
-xfs_refcount_update_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_mount *mp = priv;
- struct xfs_refcount_intent *ra;
- struct xfs_refcount_intent *rb;
-
- ra = container_of(a, struct xfs_refcount_intent, ri_list);
- rb = container_of(b, struct xfs_refcount_intent, ri_list);
- return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
-}
-
-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_cui_log_item *cuip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- cuip = xfs_cui_init(tp->t_mountp, count);
- ASSERT(cuip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &cuip->cui_item);
- return cuip;
-}
-
-/* Set the phys extent flags for this reverse mapping. */
-static void
-xfs_trans_set_refcount_flags(
- struct xfs_phys_extent *refc,
- enum xfs_refcount_intent_type type)
-{
- refc->pe_flags = 0;
- switch (type) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- refc->pe_flags |= type;
- break;
- default:
- ASSERT(0);
- }
-}
-
-/* Log refcount updates in the intent item. */
-STATIC void
-xfs_refcount_update_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_cui_log_item *cuip = intent;
- struct xfs_refcount_intent *refc;
- uint next_extent;
- struct xfs_phys_extent *ext;
-
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
- ASSERT(next_extent < cuip->cui_format.cui_nextents);
- ext = &cuip->cui_format.cui_extents[next_extent];
- ext->pe_startblock = refc->ri_startblock;
- ext->pe_len = refc->ri_blockcount;
- xfs_trans_set_refcount_flags(ext, refc->ri_type);
-}
-
-/* Get an CUD so we can process all the deferred refcount updates. */
-STATIC void *
-xfs_refcount_update_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_cud(tp, intent);
-}
-
-/* Process a deferred refcount update. */
-STATIC int
-xfs_refcount_update_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_refcount_intent *refc;
- xfs_fsblock_t new_fsb;
- xfs_extlen_t new_aglen;
- int error;
-
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, done_item,
- refc->ri_type,
- refc->ri_startblock,
- refc->ri_blockcount,
- &new_fsb, &new_aglen,
- (struct xfs_btree_cur **)state);
- /* Did we run out of reservation? Requeue what we didn't finish. */
- if (!error && new_aglen > 0) {
- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
- refc->ri_type == XFS_REFCOUNT_DECREASE);
- refc->ri_startblock = new_fsb;
- refc->ri_blockcount = new_aglen;
- return -EAGAIN;
- }
- kmem_free(refc);
- return error;
-}
-
-/* Clean up after processing deferred refcounts. */
-STATIC void
-xfs_refcount_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
-}
-
-/* Abort all pending CUIs. */
-STATIC void
-xfs_refcount_update_abort_intent(
- void *intent)
-{
- xfs_cui_release(intent);
-}
-
-/* Cancel a deferred refcount update. */
-STATIC void
-xfs_refcount_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_refcount_intent *refc;
-
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_free(refc);
-}
-
-static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_REFCOUNT,
- .max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_refcount_update_diff_items,
- .create_intent = xfs_refcount_update_create_intent,
- .abort_intent = xfs_refcount_update_abort_intent,
- .log_item = xfs_refcount_update_log_item,
- .create_done = xfs_refcount_update_create_done,
- .finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_update_finish_cleanup,
- .cancel_item = xfs_refcount_update_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_refcount_update_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
-}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
deleted file mode 100644
index 05b00e4..0000000
--- a/fs/xfs/xfs_trans_rmap.c
+++ /dev/null
@@ -1,265 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_rmap_item.h"
-#include "xfs_alloc.h"
-#include "xfs_rmap.h"
-
-/* Set the map extent flags for this reverse mapping. */
-static void
-xfs_trans_set_rmap_flags(
- struct xfs_map_extent *rmap,
- enum xfs_rmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- rmap->me_flags = 0;
- if (state == XFS_EXT_UNWRITTEN)
- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
- switch (type) {
- case XFS_RMAP_MAP:
- rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
- break;
- case XFS_RMAP_MAP_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
- break;
- case XFS_RMAP_UNMAP:
- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
- break;
- case XFS_RMAP_UNMAP_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
- break;
- case XFS_RMAP_CONVERT:
- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
- break;
- case XFS_RMAP_CONVERT_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
- break;
- case XFS_RMAP_ALLOC:
- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
- break;
- case XFS_RMAP_FREE:
- rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
- break;
- default:
- ASSERT(0);
- }
-}
-
-struct xfs_rud_log_item *
-xfs_trans_get_rud(
- struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip)
-{
- struct xfs_rud_log_item *rudp;
-
- rudp = xfs_rud_init(tp->t_mountp, ruip);
- xfs_trans_add_item(tp, &rudp->rud_item);
- return rudp;
-}
-
-/*
- * Finish an rmap update and log it to the RUD. Note that the transaction is
- * marked dirty regardless of whether the rmap update succeeds or fails to
- * support the RUI/RUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_rmap_update(
- struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp,
- enum xfs_rmap_intent_type type,
- uint64_t owner,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
- startblock, blockcount, state, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the RUI and frees the RUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
- return error;
-}
-
-/* Sort rmap intents by AG. */
-static int
-xfs_rmap_update_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_mount *mp = priv;
- struct xfs_rmap_intent *ra;
- struct xfs_rmap_intent *rb;
-
- ra = container_of(a, struct xfs_rmap_intent, ri_list);
- rb = container_of(b, struct xfs_rmap_intent, ri_list);
- return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
-}
-
-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_rui_log_item *ruip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- ruip = xfs_rui_init(tp->t_mountp, count);
- ASSERT(ruip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &ruip->rui_item);
- return ruip;
-}
-
-/* Log rmap updates in the intent item. */
-STATIC void
-xfs_rmap_update_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_rui_log_item *ruip = intent;
- struct xfs_rmap_intent *rmap;
- uint next_extent;
- struct xfs_map_extent *map;
-
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
- ASSERT(next_extent < ruip->rui_format.rui_nextents);
- map = &ruip->rui_format.rui_extents[next_extent];
- map->me_owner = rmap->ri_owner;
- map->me_startblock = rmap->ri_bmap.br_startblock;
- map->me_startoff = rmap->ri_bmap.br_startoff;
- map->me_len = rmap->ri_bmap.br_blockcount;
- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
- rmap->ri_bmap.br_state);
-}
-
-/* Get an RUD so we can process all the deferred rmap updates. */
-STATIC void *
-xfs_rmap_update_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_rud(tp, intent);
-}
-
-/* Process a deferred rmap update. */
-STATIC int
-xfs_rmap_update_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_rmap_intent *rmap;
- int error;
-
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, done_item,
- rmap->ri_type,
- rmap->ri_owner, rmap->ri_whichfork,
- rmap->ri_bmap.br_startoff,
- rmap->ri_bmap.br_startblock,
- rmap->ri_bmap.br_blockcount,
- rmap->ri_bmap.br_state,
- (struct xfs_btree_cur **)state);
- kmem_free(rmap);
- return error;
-}
-
-/* Clean up after processing deferred rmaps. */
-STATIC void
-xfs_rmap_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
-}
-
-/* Abort all pending RUIs. */
-STATIC void
-xfs_rmap_update_abort_intent(
- void *intent)
-{
- xfs_rui_release(intent);
-}
-
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_rmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_rmap_intent *rmap;
-
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_free(rmap);
-}
-
-static const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
- .type = XFS_DEFER_OPS_TYPE_RMAP,
- .max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_rmap_update_diff_items,
- .create_intent = xfs_rmap_update_create_intent,
- .abort_intent = xfs_rmap_update_abort_intent,
- .log_item = xfs_rmap_update_log_item,
- .create_done = xfs_rmap_update_create_done,
- .finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_update_finish_cleanup,
- .cancel_item = xfs_rmap_update_cancel_item,
-};
-
-/* Register the deferred op type. */
-void
-xfs_rmap_update_init_defer_op(void)
-{
- xfs_defer_init_op_type(&xfs_rmap_update_defer_type);
-}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5..cb895b1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -5,15 +5,12 @@
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_acl.h"
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
@@ -33,7 +30,7 @@
value = NULL;
}
- error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+ error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
if (error)
return error;
return asize;
@@ -129,6 +126,9 @@
char *offset;
int arraytop;
+ if (context->count < 0 || context->seen_enough)
+ return;
+
if (!context->alist)
goto compute_size;