Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index e685299..9fac5ea 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -22,6 +22,31 @@
system of your root partition is compiled as a module, you'll need
to use an initial ramdisk (initrd) to boot.
+config XFS_SUPPORT_V4
+ bool "Support deprecated V4 (crc=0) format"
+ depends on XFS_FS
+ default y
+ help
+ The V4 filesystem format lacks certain features that are supported
+ by the V5 format, such as metadata checksumming, strengthened
+ metadata verification, and the ability to store timestamps past the
+ year 2038. Because of this, the V4 format is deprecated. All users
+ should upgrade by backing up their files, reformatting, and restoring
+ from the backup.
+
+ Administrators and users can detect a V4 filesystem by running
+ xfs_info against a filesystem mountpoint and checking for a string
+ beginning with "crc=". If the string "crc=0" is found, the
+ filesystem is a V4 filesystem. If no such string is found, please
+ upgrade xfsprogs to the latest version and try again.
+
+ This option will become default N in September 2025. Support for the
+ V4 format will be removed entirely in September 2030. Distributors
+ can say N here to withdraw support earlier.
+
+ To continue supporting the old V4 format (crc=0), say Y.
+ To close off an attack surface, say N.
+
config XFS_QUOTA
bool "XFS Quota support"
depends on XFS_FS
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 06b68b6..04611a1 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -7,8 +7,6 @@
ccflags-y += -I $(srctree)/$(src) # needed for trace events
ccflags-y += -I $(srctree)/$(src)/libxfs
-ccflags-$(CONFIG_XFS_DEBUG) += -g
-
obj-$(CONFIG_XFS_FS) += xfs.o
# this one should be compiled first, as the tracing macros can easily blow up
@@ -26,8 +24,8 @@
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
+ xfs_btree_staging.o \
xfs_da_btree.o \
- xfs_da_format.o \
xfs_defer.o \
xfs_dir2.o \
xfs_dir2_block.o \
@@ -101,9 +99,12 @@
xfs_log_cil.o \
xfs_bmap_item.o \
xfs_buf_item.o \
+ xfs_buf_item_recover.o \
+ xfs_dquot_item_recover.o \
xfs_extfree_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
+ xfs_inode_item_recover.o \
xfs_refcount_item.o \
xfs_rmap_item.o \
xfs_log_recover.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index da031b9..e986b95 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -32,7 +32,7 @@
/*
- * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * __vmalloc() will allocate data pages and auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
* we need to tell memory reclaim that we are in such a context via
* PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
@@ -48,7 +48,7 @@
if (flags & KM_NOFS)
nofs_flag = memalloc_nofs_save();
- ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags);
if (flags & KM_NOFS)
memalloc_nofs_restore(nofs_flag);
@@ -93,46 +93,3 @@
return ptr;
return __kmem_vmalloc(size, flags);
}
-
-void *
-kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_realloc(newsize, flags, _RET_IP_);
-
- do {
- ptr = krealloc(old, newsize, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)",
- current->comm, current->pid,
- newsize, __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- } while (1);
-}
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
- int retries = 0;
- gfp_t lflags = kmem_flags_convert(flags);
- void *ptr;
-
- trace_kmem_zone_alloc(kmem_cache_size(zone), flags, _RET_IP_);
- do {
- ptr = kmem_cache_alloc(zone, lflags);
- if (ptr || (flags & KM_MAYFAIL))
- return ptr;
- if (!(++retries % 100))
- xfs_err(NULL,
- "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
- current->comm, current->pid,
- __func__, lflags);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- } while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8170d95..3800711 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -19,6 +19,7 @@
#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
+#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u)
/*
* We use a special process flag to avoid recursive callbacks into
@@ -30,7 +31,7 @@
{
gfp_t lflags;
- BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO));
+ BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
lflags = GFP_KERNEL | __GFP_NOWARN;
if (flags & KM_NOFS)
@@ -49,13 +50,15 @@
if (flags & KM_ZERO)
lflags |= __GFP_ZERO;
+ if (flags & KM_NOLOCKDEP)
+ lflags |= __GFP_NOLOCKDEP;
+
return lflags;
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags);
extern void *kmem_alloc_large(size_t size, xfs_km_flags_t);
-extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t);
static inline void kmem_free(const void *ptr)
{
kvfree(ptr);
@@ -68,57 +71,13 @@
return kmem_alloc(size, flags | KM_ZERO);
}
-static inline void *
-kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
-{
- return kmem_alloc_large(size, flags | KM_ZERO);
-}
-
/*
* Zone interfaces
*/
-#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
-#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
-#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
-#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
-
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
-static inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
- return kmem_cache_create(zone_name, size, 0, 0, NULL);
-}
-
-static inline kmem_zone_t *
-kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags,
- void (*construct)(void *))
-{
- return kmem_cache_create(zone_name, size, 0, flags, construct);
-}
-
-static inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
- kmem_cache_free(zone, ptr);
-}
-
-static inline void
-kmem_zone_destroy(kmem_zone_t *zone)
-{
- kmem_cache_destroy(zone);
-}
-
-extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-
-static inline void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
- return kmem_zone_alloc(zone, flags | KM_ZERO);
-}
-
static inline struct page *
kmem_to_page(void *addr)
{
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 14fbdf2..9331f35 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -23,25 +23,28 @@
#include "xfs_ag_resv.h"
#include "xfs_health.h"
-static struct xfs_buf *
+static int
xfs_get_aghdr_buf(
struct xfs_mount *mp,
xfs_daddr_t blkno,
size_t numblks,
+ struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
+ int error;
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0);
- if (!bp)
- return NULL;
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ if (error)
+ return error;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
bp->b_bn = blkno;
bp->b_maps[0].bm_bn = blkno;
bp->b_ops = ops;
- return bp;
+ *bpp = bp;
+ return 0;
}
static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
@@ -228,7 +231,7 @@
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
xfs_sb_to_disk(dsb, &mp->m_sb);
dsb->sb_inprogress = 1;
@@ -240,7 +243,7 @@
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_agf *agf = bp->b_addr;
xfs_extlen_t tmpsize;
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -298,7 +301,7 @@
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
}
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+ agfl_bno = xfs_buf_to_agfl_bno(bp);
for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
}
@@ -309,7 +312,7 @@
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ struct xfs_agi *agi = bp->b_addr;
int bucket;
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -330,6 +333,11 @@
}
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ agi->agi_iblocks = cpu_to_be32(1);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ agi->agi_fblocks = cpu_to_be32(1);
+ }
}
typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
@@ -340,13 +348,13 @@
struct aghdr_init_data *id,
aghdr_init_work_f work,
const struct xfs_buf_ops *ops)
-
{
struct xfs_buf *bp;
+ int error;
- bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops);
- if (!bp)
- return -ENOMEM;
+ error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops);
+ if (error)
+ return error;
(*work)(mp, bp, id);
@@ -499,7 +507,7 @@
if (error)
return error;
- agi = XFS_BUF_TO_AGI(bp);
+ agi = bp->b_addr;
be32_add_cpu(&agi->agi_length, len);
ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
@@ -512,7 +520,7 @@
if (error)
return error;
- agf = XFS_BUF_TO_AGF(bp);
+ agf = bp->b_addr;
be32_add_cpu(&agf->agf_length, len);
ASSERT(agf->agf_length == agi->agi_length);
xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
@@ -560,17 +568,18 @@
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
if (error)
goto out_agi;
- pag = xfs_perag_get(mp, agno);
+
+ pag = agi_bp->b_pag;
/* Fill out form. */
memset(ageo, 0, sizeof(*ageo));
ageo->ag_number = agno;
- agi = XFS_BUF_TO_AGI(agi_bp);
+ agi = agi_bp->b_addr;
ageo->ag_icount = be32_to_cpu(agi->agi_count);
ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
- agf = XFS_BUF_TO_AGF(agf_bp);
+ agf = agf_bp->b_addr;
ageo->ag_length = be32_to_cpu(agf->agf_length);
freeblks = pag->pagf_freeblks +
pag->pagf_flcount +
@@ -580,7 +589,6 @@
xfs_ag_geom_health(pag, ageo);
/* Release resources. */
- xfs_perag_put(pag);
xfs_buf_relse(agf_bp);
out_agi:
xfs_buf_relse(agi_bp);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 87a9747..fdfe6dc 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -19,6 +19,8 @@
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_sb.h"
+#include "xfs_ag_resv.h"
/*
* Per-AG Block Reservations
diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h
index c0352ed..8a8eb4b 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.h
+++ b/fs/xfs/libxfs/xfs_ag_resv.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
@@ -37,16 +37,4 @@
xfs_perag_put(pag);
}
-static inline void
-xfs_ag_resv_rmapbt_free(
- struct xfs_mount *mp,
- xfs_agnumber_t agno)
-{
- struct xfs_perag *pag;
-
- pag = xfs_perag_get(mp, agno);
- xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
- xfs_perag_put(pag);
-}
-
#endif /* __XFS_AG_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 436f686..1564001 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -146,9 +146,13 @@
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
+
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
}
/*
@@ -162,9 +166,13 @@
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
+
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
}
/*
@@ -178,9 +186,19 @@
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_ag.abt.active = (*stat == 1);
+ return error;
+}
+
+static inline bool
+xfs_alloc_cur_active(
+ struct xfs_btree_cur *cur)
+{
+ return cur && cur->bc_ag.abt.active;
}
/*
@@ -212,7 +230,7 @@
int *stat) /* output: success/failure */
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
@@ -313,7 +331,7 @@
xfs_extlen_t newlen1=0; /* length with newbno1 */
xfs_extlen_t newlen2=0; /* length with newbno2 */
xfs_agblock_t wantend; /* end of target extent */
- bool userdata = xfs_alloc_is_userdata(datatype);
+ bool userdata = datatype & XFS_ALLOC_USERDATA;
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
@@ -433,13 +451,17 @@
#ifdef DEBUG
if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp,
- i == 1 && nfbno1 == fbno && nflen1 == flen);
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
/*
* Look up the record in the by-block tree if necessary.
@@ -448,13 +470,17 @@
#ifdef DEBUG
if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp,
- i == 1 && nfbno1 == fbno && nflen1 == flen);
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
#ifdef DEBUG
@@ -465,8 +491,10 @@
bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
- XFS_WANT_CORRUPTED_RETURN(mp,
- bnoblock->bb_numrecs == cntblock->bb_numrecs);
+ if (XFS_IS_CORRUPT(mp,
+ bnoblock->bb_numrecs !=
+ cntblock->bb_numrecs))
+ return -EFSCORRUPTED;
}
#endif
@@ -496,25 +524,30 @@
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
/*
* Fix up the by-block btree entry(s).
@@ -525,7 +558,8 @@
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -539,10 +573,12 @@
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -553,6 +589,7 @@
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
int i;
/*
@@ -578,8 +615,8 @@
return __this_address;
for (i = 0; i < xfs_agfl_size(mp); i++) {
- if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
- be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return __this_address;
}
@@ -673,27 +710,308 @@
STATIC int
xfs_alloc_update_counters(
struct xfs_trans *tp,
- struct xfs_perag *pag,
struct xfs_buf *agbp,
long len)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
- pag->pagf_freeblks += len;
+ agbp->b_pag->pagf_freeblks += len;
be32_add_cpu(&agf->agf_freeblks, len);
xfs_trans_agblocks_delta(tp, len);
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
- be32_to_cpu(agf->agf_length)))
+ be32_to_cpu(agf->agf_length))) {
+ xfs_buf_mark_corrupt(agbp);
return -EFSCORRUPTED;
+ }
xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
return 0;
}
/*
- * Allocation group level functions.
+ * Block allocation algorithm and data structures.
*/
+struct xfs_alloc_cur {
+ struct xfs_btree_cur *cnt; /* btree cursors */
+ struct xfs_btree_cur *bnolt;
+ struct xfs_btree_cur *bnogt;
+ xfs_extlen_t cur_len;/* current search length */
+ xfs_agblock_t rec_bno;/* extent startblock */
+ xfs_extlen_t rec_len;/* extent length */
+ xfs_agblock_t bno; /* alloc bno */
+ xfs_extlen_t len; /* alloc len */
+ xfs_extlen_t diff; /* diff from search bno */
+ unsigned int busy_gen;/* busy state */
+ bool busy;
+};
+
+/*
+ * Set up cursors, etc. in the extent allocation cursor. This function can be
+ * called multiple times to reset an initialized structure without having to
+ * reallocate cursors.
+ */
+static int
+xfs_alloc_cur_setup(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ int error;
+ int i;
+
+ ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
+
+ acur->cur_len = args->maxlen;
+ acur->rec_bno = 0;
+ acur->rec_len = 0;
+ acur->bno = 0;
+ acur->len = 0;
+ acur->diff = -1;
+ acur->busy = false;
+ acur->busy_gen = 0;
+
+ /*
+ * Perform an initial cntbt lookup to check for availability of maxlen
+ * extents. If this fails, we'll return -ENOSPC to signal the caller to
+ * attempt a small allocation.
+ */
+ if (!acur->cnt)
+ acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
+ if (error)
+ return error;
+
+ /*
+ * Allocate the bnobt left and right search cursors.
+ */
+ if (!acur->bnolt)
+ acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ if (!acur->bnogt)
+ acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ return i == 1 ? 0 : -ENOSPC;
+}
+
+static void
+xfs_alloc_cur_close(
+ struct xfs_alloc_cur *acur,
+ bool error)
+{
+ int cur_error = XFS_BTREE_NOERROR;
+
+ if (error)
+ cur_error = XFS_BTREE_ERROR;
+
+ if (acur->cnt)
+ xfs_btree_del_cursor(acur->cnt, cur_error);
+ if (acur->bnolt)
+ xfs_btree_del_cursor(acur->bnolt, cur_error);
+ if (acur->bnogt)
+ xfs_btree_del_cursor(acur->bnogt, cur_error);
+ acur->cnt = acur->bnolt = acur->bnogt = NULL;
+}
+
+/*
+ * Check an extent for allocation and track the best available candidate in the
+ * allocation structure. The cursor is deactivated if it has entered an out of
+ * range state based on allocation arguments. Optionally return the extent
+ * geometry and allocation status if requested by the caller.
+ */
+static int
+xfs_alloc_cur_check(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ int *new)
+{
+ int error, i;
+ xfs_agblock_t bno, bnoa, bnew;
+ xfs_extlen_t len, lena, diff = -1;
+ bool busy;
+ unsigned busy_gen = 0;
+ bool deactivate = false;
+ bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+
+ *new = 0;
+
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * Check minlen and deactivate a cntbt cursor if out of acceptable size
+ * range (i.e., walking backwards looking for a minlen extent).
+ */
+ if (len < args->minlen) {
+ deactivate = !isbnobt;
+ goto out;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
+ &busy_gen);
+ acur->busy |= busy;
+ if (busy)
+ acur->busy_gen = busy_gen;
+ /* deactivate a bnobt cursor outside of locality range */
+ if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
+ deactivate = isbnobt;
+ goto out;
+ }
+ if (lena < args->minlen)
+ goto out;
+
+ args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ASSERT(args->len >= args->minlen);
+ if (args->len < acur->len)
+ goto out;
+
+ /*
+ * We have an aligned record that satisfies minlen and beats or matches
+ * the candidate extent size. Compare locality for near allocation mode.
+ */
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+ diff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->datatype,
+ bnoa, lena, &bnew);
+ if (bnew == NULLAGBLOCK)
+ goto out;
+
+ /*
+ * Deactivate a bnobt cursor with worse locality than the current best.
+ */
+ if (diff > acur->diff) {
+ deactivate = isbnobt;
+ goto out;
+ }
+
+ ASSERT(args->len > acur->len ||
+ (args->len == acur->len && diff <= acur->diff));
+ acur->rec_bno = bno;
+ acur->rec_len = len;
+ acur->bno = bnew;
+ acur->len = args->len;
+ acur->diff = diff;
+ *new = 1;
+
+ /*
+ * We're done if we found a perfect allocation. This only deactivates
+ * the current cursor, but this is just an optimization to terminate a
+ * cntbt search that otherwise runs to the edge of the tree.
+ */
+ if (acur->diff == 0 && acur->len == args->maxlen)
+ deactivate = true;
+out:
+ if (deactivate)
+ cur->bc_ag.abt.active = false;
+ trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
+ *new);
+ return 0;
+}
+
+/*
+ * Complete an allocation of a candidate extent. Remove the extent from both
+ * trees and update the args structure.
+ */
+STATIC int
+xfs_alloc_cur_finish(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
+ int error;
+
+ ASSERT(acur->cnt && acur->bnolt);
+ ASSERT(acur->bno >= acur->rec_bno);
+ ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
+ ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
+
+ error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
+ acur->rec_len, acur->bno, acur->len, 0);
+ if (error)
+ return error;
+
+ args->agbno = acur->bno;
+ args->len = acur->len;
+ args->wasfromfl = 0;
+
+ trace_xfs_alloc_cur(args);
+ return 0;
+}
+
+/*
+ * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
+ * bno optimized lookup to search for extents with ideal size and locality.
+ */
+STATIC int
+xfs_alloc_cntbt_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_btree_cur *cur = acur->cnt;
+ xfs_agblock_t bno;
+ xfs_extlen_t len, cur_len;
+ int error;
+ int i;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ /* locality optimized lookup */
+ cur_len = acur->cur_len;
+ error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ return 0;
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+
+ /* check the current record and update search length from it */
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ ASSERT(len >= acur->cur_len);
+ acur->cur_len = len;
+
+ /*
+ * We looked up the first record >= [agbno, len] above. The agbno is a
+ * secondary key and so the current record may lie just before or after
+ * agbno. If it is past agbno, check the previous record too so long as
+ * the length matches as it may be closer. Don't check a smaller record
+ * because that could deactivate our cursor.
+ */
+ if (bno > args->agbno) {
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (!error && i) {
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (!error && i && len == acur->cur_len)
+ error = xfs_alloc_cur_check(args, acur, cur,
+ &i);
+ }
+ if (error)
+ return error;
+ }
+
+ /*
+ * Increment the search key until we find at least one allocation
+ * candidate or if the extent we found was larger. Otherwise, double the
+ * search key to optimize the search. Efficiency is more important here
+ * than absolute best locality.
+ */
+ cur_len <<= 1;
+ if (!acur->len || acur->cur_len >= cur_len)
+ acur->cur_len++;
+ else
+ acur->cur_len = cur_len;
+
+ return error;
+}
/*
* Deal with the case where only small freespaces remain. Either return the
@@ -708,6 +1026,7 @@
xfs_extlen_t *flenp, /* result length */
int *stat) /* status: 0-freelist, 1-normal/none */
{
+ struct xfs_agf *agf = args->agbp->b_addr;
int error = 0;
xfs_agblock_t fbno = NULLAGBLOCK;
xfs_extlen_t flen = 0;
@@ -727,14 +1046,16 @@
error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
goto out;
}
if (args->minlen != 1 || args->alignment != 1 ||
args->resv == XFS_AG_RESV_AGFL ||
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
- args->minleft))
+ be32_to_cpu(agf->agf_flcount) <= args->minleft)
goto out;
error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
@@ -744,23 +1065,24 @@
goto out;
xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- xfs_alloc_allow_busy_reuse(args->datatype));
+ (args->datatype & XFS_ALLOC_NOBUSY));
- if (xfs_alloc_is_userdata(args->datatype)) {
+ if (args->datatype & XFS_ALLOC_USERDATA) {
struct xfs_buf *bp;
- bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
- if (!bp) {
- error = -EFSCORRUPTED;
+ error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
+ args->mp->m_bsize, 0, &bp);
+ if (error)
goto error;
- }
xfs_trans_binval(args->tp, bp);
}
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error);
+ if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
args->wasfromfl = 1;
trace_xfs_alloc_small_freelist(args);
@@ -852,8 +1174,7 @@
}
if (!args->wasfromfl) {
- error = xfs_alloc_update_counters(args->tp, args->pag,
- args->agbp,
+ error = xfs_alloc_update_counters(args->tp, args->agbp,
-((long)(args->len)));
if (error)
return error;
@@ -879,6 +1200,7 @@
xfs_alloc_ag_vextent_exact(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
+ struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
int error;
@@ -915,7 +1237,10 @@
error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
ASSERT(fbno <= args->agbno);
/*
@@ -954,8 +1279,7 @@
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
args->agno, XFS_BTNUM_CNT);
- ASSERT(args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
args->len, XFSA_FIXUP_BNO_OK);
if (error) {
@@ -984,98 +1308,243 @@
}
/*
- * Search the btree in a given direction via the search cursor and compare
- * the records found against the good extent we've already found.
+ * Search a given number of btree records in a given direction. Check each
+ * record against the good extent we've already found.
*/
STATIC int
-xfs_alloc_find_best_extent(
- struct xfs_alloc_arg *args, /* allocation argument structure */
- struct xfs_btree_cur **gcur, /* good cursor */
- struct xfs_btree_cur **scur, /* searching cursor */
- xfs_agblock_t gdiff, /* difference for search comparison */
- xfs_agblock_t *sbno, /* extent found by search */
- xfs_extlen_t *slen, /* extent length */
- xfs_agblock_t *sbnoa, /* aligned extent found by search */
- xfs_extlen_t *slena, /* aligned extent length */
- int dir) /* 0 = search right, 1 = search left */
+xfs_alloc_walk_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ bool increment,
+ bool find_one, /* quit on first candidate */
+ int count, /* rec count (-1 for infinite) */
+ int *stat)
{
- xfs_agblock_t new;
- xfs_agblock_t sdiff;
int error;
int i;
- unsigned busy_gen;
- /* The good extent is perfect, no need to search. */
- if (!gdiff)
- goto out_use_good;
+ *stat = 0;
/*
- * Look until we find a better one, run out of space or run off the end.
+ * Search so long as the cursor is active or we find a better extent.
+ * The cursor is deactivated if it extends beyond the range of the
+ * current allocation candidate.
*/
- do {
- error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+ while (xfs_alloc_cur_active(cur) && count) {
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
if (error)
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- xfs_alloc_compute_aligned(args, *sbno, *slen,
- sbnoa, slena, &busy_gen);
-
- /*
- * The good extent is closer than this one.
- */
- if (!dir) {
- if (*sbnoa > args->max_agbno)
- goto out_use_good;
- if (*sbnoa >= args->agbno + gdiff)
- goto out_use_good;
- } else {
- if (*sbnoa < args->min_agbno)
- goto out_use_good;
- if (*sbnoa <= args->agbno - gdiff)
- goto out_use_good;
+ return error;
+ if (i == 1) {
+ *stat = 1;
+ if (find_one)
+ break;
}
+ if (!xfs_alloc_cur_active(cur))
+ break;
- /*
- * Same distance, compare length and pick the best.
- */
- if (*slena >= args->minlen) {
- args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
- xfs_alloc_fix_len(args);
-
- sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment,
- args->datatype, *sbnoa,
- *slena, &new);
-
- /*
- * Choose closer size and invalidate other cursor.
- */
- if (sdiff < gdiff)
- goto out_use_search;
- goto out_use_good;
- }
-
- if (!dir)
- error = xfs_btree_increment(*scur, 0, &i);
+ if (increment)
+ error = xfs_btree_increment(cur, 0, &i);
else
- error = xfs_btree_decrement(*scur, 0, &i);
+ error = xfs_btree_decrement(cur, 0, &i);
if (error)
- goto error0;
- } while (i);
+ return error;
+ if (i == 0)
+ cur->bc_ag.abt.active = false;
-out_use_good:
- xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
- *scur = NULL;
+ if (count > 0)
+ count--;
+ }
+
return 0;
+}
-out_use_search:
- xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
- *gcur = NULL;
+/*
+ * Search the by-bno and by-size btrees in parallel in search of an extent with
+ * ideal locality based on the NEAR mode ->agbno locality hint.
+ */
+STATIC int
+xfs_alloc_ag_vextent_locality(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ int *stat)
+{
+ struct xfs_btree_cur *fbcur = NULL;
+ int error;
+ int i;
+ bool fbinc;
+
+ ASSERT(acur->len == 0);
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+
+ *stat = 0;
+
+ error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
+ if (error)
+ return error;
+
+ /*
+ * Search the bnobt and cntbt in parallel. Search the bnobt left and
+ * right and lookup the closest extent to the locality hint for each
+ * extent size key in the cntbt. The entire search terminates
+ * immediately on a bnobt hit because that means we've found best case
+ * locality. Otherwise the search continues until the cntbt cursor runs
+ * off the end of the tree. If no allocation candidate is found at this
+ * point, give up on locality, walk backwards from the end of the cntbt
+ * and take the first available extent.
+ *
+ * The parallel tree searches balance each other out to provide fairly
+ * consistent performance for various situations. The bnobt search can
+ * have pathological behavior in the worst case scenario of larger
+ * allocation requests and fragmented free space. On the other hand, the
+ * bnobt is able to satisfy most smaller allocation requests much more
+ * quickly than the cntbt. The cntbt search can sift through fragmented
+ * free space and sets of free extents for larger allocation requests
+ * more quickly than the bnobt. Since the locality hint is just a hint
+ * and we don't want to scan the entire bnobt for perfect locality, the
+ * cntbt search essentially bounds the bnobt search such that we can
+ * find good enough locality at reasonable performance in most cases.
+ */
+ while (xfs_alloc_cur_active(acur->bnolt) ||
+ xfs_alloc_cur_active(acur->bnogt) ||
+ xfs_alloc_cur_active(acur->cnt)) {
+
+ trace_xfs_alloc_cur_lookup(args);
+
+ /*
+ * Search the bnobt left and right. In the case of a hit, finish
+ * the search in the opposite direction and we're done.
+ */
+ error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
+ true, 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_left(args);
+ fbcur = acur->bnogt;
+ fbinc = true;
+ break;
+ }
+ error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
+ 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_right(args);
+ fbcur = acur->bnolt;
+ fbinc = false;
+ break;
+ }
+
+ /*
+ * Check the extent with best locality based on the current
+ * extent size search key and keep track of the best candidate.
+ */
+ error = xfs_alloc_cntbt_iter(args, acur);
+ if (error)
+ return error;
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ trace_xfs_alloc_cur_lookup_done(args);
+ break;
+ }
+ }
+
+ /*
+ * If we failed to find anything due to busy extents, return empty
+ * handed so the caller can flush and retry. If no busy extents were
+ * found, walk backwards from the end of the cntbt as a last resort.
+ */
+ if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
+ error = xfs_btree_decrement(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ if (i) {
+ acur->cnt->bc_ag.abt.active = true;
+ fbcur = acur->cnt;
+ fbinc = false;
+ }
+ }
+
+ /*
+ * Search in the opposite direction for a better entry in the case of
+ * a bnobt hit or walk backwards from the end of the cntbt.
+ */
+ if (fbcur) {
+ error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
+ &i);
+ if (error)
+ return error;
+ }
+
+ if (acur->len)
+ *stat = 1;
+
return 0;
+}
-error0:
- /* caller invalidates cursors */
- return error;
+/* Check the last block of the cnt btree for allocations. */
+static int
+xfs_alloc_ag_vextent_lastblock(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ xfs_agblock_t *bno,
+ xfs_extlen_t *len,
+ bool *allocated)
+{
+ int error;
+ int i;
+
+#ifdef DEBUG
+ /* Randomly don't execute the first algorithm. */
+ if (prandom_u32() & 1)
+ return 0;
+#endif
+
+ /*
+ * Start from the entry that lookup found, sequence through all larger
+ * free blocks. If we're actually pointing at a record smaller than
+ * maxlen, go to the start of this block, and skip all those smaller
+ * than minlen.
+ */
+ if (*len || args->alignment > 1) {
+ acur->cnt->bc_ptrs[0] = 1;
+ do {
+ error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+ if (*len >= args->minlen)
+ break;
+ error = xfs_btree_increment(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ } while (i);
+ ASSERT(*len >= args->minlen);
+ if (!i)
+ return 0;
+ }
+
+ error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
+ if (error)
+ return error;
+
+ /*
+ * It didn't work. We COULD be in a case where there's a good record
+ * somewhere, so try again.
+ */
+ if (acur->len == 0)
+ return 0;
+
+ trace_xfs_alloc_near_first(args);
+ *allocated = true;
+ return 0;
}
/*
@@ -1084,41 +1553,17 @@
* and of the form k * prod + mod unless there's nothing that large.
* Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
*/
-STATIC int /* error */
+STATIC int
xfs_alloc_ag_vextent_near(
- xfs_alloc_arg_t *args) /* allocation argument structure */
+ struct xfs_alloc_arg *args)
{
- xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
- xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
- xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
- xfs_agblock_t gtbno; /* start bno of right side entry */
- xfs_agblock_t gtbnoa; /* aligned ... */
- xfs_extlen_t gtdiff; /* difference to right side entry */
- xfs_extlen_t gtlen; /* length of right side entry */
- xfs_extlen_t gtlena; /* aligned ... */
- xfs_agblock_t gtnew; /* useful start bno of right side */
- int error; /* error code */
- int i; /* result code, temporary */
- int j; /* result code, temporary */
- xfs_agblock_t ltbno; /* start bno of left side entry */
- xfs_agblock_t ltbnoa; /* aligned ... */
- xfs_extlen_t ltdiff; /* difference to left side entry */
- xfs_extlen_t ltlen; /* length of left side entry */
- xfs_extlen_t ltlena; /* aligned ... */
- xfs_agblock_t ltnew; /* useful start bno of left side */
- xfs_extlen_t rlen; /* length of returned extent */
- bool busy;
- unsigned busy_gen;
-#ifdef DEBUG
- /*
- * Randomly don't execute the first algorithm.
- */
- int dofirst; /* set to do first algorithm */
+ struct xfs_alloc_cur acur = {};
+ int error; /* error code */
+ int i; /* result code, temporary */
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
- dofirst = prandom_u32() & 1;
-#endif
-
- /* handle unitialized agbno range so caller doesn't have to */
+ /* handle uninitialized agbno range so caller doesn't have to */
if (!args->min_agbno && !args->max_agbno)
args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
ASSERT(args->min_agbno <= args->max_agbno);
@@ -1130,40 +1575,27 @@
args->agbno = args->max_agbno;
restart:
- bno_cur_lt = NULL;
- bno_cur_gt = NULL;
- ltlen = 0;
- gtlena = 0;
- ltlena = 0;
- busy = false;
+ len = 0;
/*
- * Get a cursor for the by-size btree.
+ * Set up cursors and see if there are any free extents as big as
+ * maxlen. If not, pick the last entry in the tree unless the tree is
+ * empty.
*/
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
-
- /*
- * See if there are any free extents as big as maxlen.
- */
- if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
- goto error0;
- /*
- * If none, then pick up the last entry in the tree unless the
- * tree is empty.
- */
- if (!i) {
- if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
- &ltlen, &i)))
- goto error0;
- if (i == 0 || ltlen == 0) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ error = xfs_alloc_cur_setup(args, &acur);
+ if (error == -ENOSPC) {
+ error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
+ &len, &i);
+ if (error)
+ goto out;
+ if (i == 0 || len == 0) {
trace_xfs_alloc_near_noentry(args);
- return 0;
+ goto out;
}
ASSERT(i == 1);
+ } else if (error) {
+ goto out;
}
- args->wasfromfl = 0;
/*
* First algorithm.
@@ -1172,311 +1604,47 @@
* near the right edge of the tree. If it's in the last btree leaf
* block, then we just examine all the entries in that block
* that are big enough, and pick the best one.
- * This is written as a while loop so we can break out of it,
- * but we never loop back to the top.
*/
- while (xfs_btree_islastblock(cnt_cur, 0)) {
- xfs_extlen_t bdiff;
- int besti=0;
- xfs_extlen_t blen=0;
- xfs_agblock_t bnew=0;
+ if (xfs_btree_islastblock(acur.cnt, 0)) {
+ bool allocated = false;
-#ifdef DEBUG
- if (dofirst)
- break;
-#endif
- /*
- * Start from the entry that lookup found, sequence through
- * all larger free blocks. If we're actually pointing at a
- * record smaller than maxlen, go to the start of this block,
- * and skip all those smaller than minlen.
- */
- if (ltlen || args->alignment > 1) {
- cnt_cur->bc_ptrs[0] = 1;
- do {
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
- &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- if (ltlen >= args->minlen)
- break;
- if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
- goto error0;
- } while (i);
- ASSERT(ltlen >= args->minlen);
- if (!i)
- break;
- }
- i = cnt_cur->bc_ptrs[0];
- for (j = 1, blen = 0, bdiff = 0;
- !error && j && (blen < args->maxlen || bdiff > 0);
- error = xfs_btree_increment(cnt_cur, 0, &j)) {
- /*
- * For each entry, decide if it's better than
- * the previous best entry.
- */
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
- &ltbnoa, &ltlena, &busy_gen);
- if (ltlena < args->minlen)
- continue;
- if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
- continue;
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- ASSERT(args->len >= args->minlen);
- if (args->len < blen)
- continue;
- ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, ltbnoa,
- ltlena, &ltnew);
- if (ltnew != NULLAGBLOCK &&
- (args->len > blen || ltdiff < bdiff)) {
- bdiff = ltdiff;
- bnew = ltnew;
- blen = args->len;
- besti = cnt_cur->bc_ptrs[0];
- }
- }
- /*
- * It didn't work. We COULD be in a case where
- * there's a good record somewhere, so try again.
- */
- if (blen == 0)
- break;
- /*
- * Point at the best entry, and retrieve it again.
- */
- cnt_cur->bc_ptrs[0] = besti;
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- args->len = blen;
-
- /*
- * We are allocating starting at bnew for blen blocks.
- */
- args->agbno = bnew;
- ASSERT(bnew >= ltbno);
- ASSERT(bnew + blen <= ltbno + ltlen);
- /*
- * Set up a cursor for the by-bno tree.
- */
- bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
- /*
- * Fix up the btree entries.
- */
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
- ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
- goto error0;
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-
- trace_xfs_alloc_near_first(args);
- return 0;
- }
- /*
- * Second algorithm.
- * Search in the by-bno tree to the left and to the right
- * simultaneously, until in each case we find a space big enough,
- * or run into the edge of the tree. When we run into the edge,
- * we deallocate that cursor.
- * If both searches succeed, we compare the two spaces and pick
- * the better one.
- * With alignment, it's possible for both to fail; the upper
- * level algorithm that picks allocation groups for allocations
- * is not supposed to do this.
- */
- /*
- * Allocate and initialize the cursor for the leftward search.
- */
- bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
- /*
- * Lookup <= bno to find the leftward search's starting point.
- */
- if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
- goto error0;
- if (!i) {
- /*
- * Didn't find anything; use this cursor for the rightward
- * search.
- */
- bno_cur_gt = bno_cur_lt;
- bno_cur_lt = NULL;
- }
- /*
- * Found something. Duplicate the cursor for the rightward search.
- */
- else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
- goto error0;
- /*
- * Increment the cursor, so we will point at the entry just right
- * of the leftward entry if any, or to the leftmost entry.
- */
- if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
- goto error0;
- if (!i) {
- /*
- * It failed, there are no rightward entries.
- */
- xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- /*
- * Loop going left with the leftward cursor, right with the
- * rightward cursor, until either both directions give up or
- * we find an entry at least as big as minlen.
- */
- do {
- if (bno_cur_lt) {
- if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
- &ltbnoa, &ltlena, &busy_gen);
- if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
- break;
- if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
- goto error0;
- if (!i || ltbnoa < args->min_agbno) {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- }
- if (bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
- &gtbnoa, &gtlena, &busy_gen);
- if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
- break;
- if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
- goto error0;
- if (!i || gtbnoa > args->max_agbno) {
- xfs_btree_del_cursor(bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- }
- } while (bno_cur_lt || bno_cur_gt);
-
- /*
- * Got both cursors still active, need to find better entry.
- */
- if (bno_cur_lt && bno_cur_gt) {
- if (ltlena >= args->minlen) {
- /*
- * Left side is good, look for a right side entry.
- */
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, ltbnoa,
- ltlena, &ltnew);
-
- error = xfs_alloc_find_best_extent(args,
- &bno_cur_lt, &bno_cur_gt,
- ltdiff, &gtbno, &gtlen,
- &gtbnoa, &gtlena,
- 0 /* search right */);
- } else {
- ASSERT(gtlena >= args->minlen);
-
- /*
- * Right side is good, look for a left side entry.
- */
- args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
- xfs_alloc_fix_len(args);
- gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, gtbnoa,
- gtlena, &gtnew);
-
- error = xfs_alloc_find_best_extent(args,
- &bno_cur_gt, &bno_cur_lt,
- gtdiff, &ltbno, &ltlen,
- &ltbnoa, &ltlena,
- 1 /* search left */);
- }
-
+ error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
+ &allocated);
if (error)
- goto error0;
+ goto out;
+ if (allocated)
+ goto alloc_finish;
}
/*
+ * Second algorithm. Combined cntbt and bnobt search to find ideal
+ * locality.
+ */
+ error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
+ if (error)
+ goto out;
+
+ /*
* If we couldn't get anything, give up.
*/
- if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
- if (busy) {
+ if (!acur.len) {
+ if (acur.busy) {
trace_xfs_alloc_near_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+ xfs_extent_busy_flush(args->mp, args->pag,
+ acur.busy_gen);
goto restart;
}
trace_xfs_alloc_size_neither(args);
args->agbno = NULLAGBLOCK;
- return 0;
+ goto out;
}
- /*
- * At this point we have selected a freespace entry, either to the
- * left or to the right. If it's on the right, copy all the
- * useful variables to the "left" set so we only have one
- * copy of this code.
- */
- if (bno_cur_gt) {
- bno_cur_lt = bno_cur_gt;
- bno_cur_gt = NULL;
- ltbno = gtbno;
- ltbnoa = gtbnoa;
- ltlen = gtlen;
- ltlena = gtlena;
- j = 1;
- } else
- j = 0;
+alloc_finish:
+ /* fix up btrees on a successful allocation */
+ error = xfs_alloc_cur_finish(args, &acur);
- /*
- * Fix up the length and compute the useful address.
- */
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- args->datatype, ltbnoa, ltlena, &ltnew);
- ASSERT(ltnew >= ltbno);
- ASSERT(ltnew + rlen <= ltbnoa + ltlena);
- ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
- args->agbno = ltnew;
-
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
- ltnew, rlen, XFSA_FIXUP_BNO_OK)))
- goto error0;
-
- if (j)
- trace_xfs_alloc_near_greater(args);
- else
- trace_xfs_alloc_near_lesser(args);
-
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
- return 0;
-
- error0:
- trace_xfs_alloc_near_error(args);
- if (cnt_cur != NULL)
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
- if (bno_cur_lt != NULL)
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
- if (bno_cur_gt != NULL)
- xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+out:
+ xfs_alloc_cur_close(&acur, error);
return error;
}
@@ -1490,6 +1658,7 @@
xfs_alloc_ag_vextent_size(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
+ struct xfs_agf *agf = args->agbp->b_addr;
xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
int error; /* error result */
@@ -1545,7 +1714,10 @@
error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
@@ -1579,8 +1751,13 @@
* This can't happen in the second case above.
*/
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
- (rlen <= flen && rbno + rlen <= fbno + flen), error0);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rlen < args->maxlen) {
xfs_agblock_t bestfbno;
xfs_extlen_t bestflen;
@@ -1599,15 +1776,22 @@
if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (flen < bestrlen)
break;
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
- (rlen <= flen && rbno + rlen <= fbno + flen),
- error0);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rlen > bestrlen) {
bestrlen = rlen;
bestrbno = rbno;
@@ -1620,7 +1804,10 @@
if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
rlen = bestrlen;
rbno = bestrbno;
flen = bestflen;
@@ -1643,7 +1830,10 @@
xfs_alloc_fix_len(args);
rlen = args->len;
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
+ if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Allocate and initialize a cursor for the by-block tree.
*/
@@ -1657,10 +1847,12 @@
cnt_cur = bno_cur = NULL;
args->len = rlen;
args->agbno = rbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error0);
+ if (XFS_IS_CORRUPT(args->mp,
+ args->agbno + args->len >
+ be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
trace_xfs_alloc_size_done(args);
return 0;
@@ -1693,7 +1885,6 @@
enum xfs_ag_resv_type type)
{
struct xfs_mount *mp;
- struct xfs_perag *pag;
struct xfs_btree_cur *bno_cur;
struct xfs_btree_cur *cnt_cur;
xfs_agblock_t gtbno; /* start of right neighbor */
@@ -1732,7 +1923,10 @@
*/
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* It's not contiguous, though.
*/
@@ -1744,8 +1938,10 @@
* space was invalid, it's (partly) already free.
* Very bad.
*/
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltbno + ltlen <= bno, error0);
+ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
}
/*
@@ -1760,7 +1956,10 @@
*/
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* It's not contiguous, though.
*/
@@ -1772,7 +1971,10 @@
* space was invalid, it's (partly) already free.
* Very bad.
*/
- XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
+ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
}
/*
@@ -1789,31 +1991,49 @@
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Delete the old by-size entry on the right.
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Delete the old by-block entry for the right block.
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Move the by-block cursor back to the left neighbor.
*/
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
#ifdef DEBUG
/*
* Check that this is the right record: delete didn't
@@ -1826,9 +2046,13 @@
if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp,
- i == 1 && xxbno == ltbno && xxlen == ltlen,
- error0);
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ xxbno != ltbno ||
+ xxlen != ltlen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
#endif
/*
@@ -1849,17 +2073,26 @@
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Back up the by-block cursor to the left neighbor, and
* update its length.
*/
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
nbno = ltbno;
nlen = len + ltlen;
if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1875,10 +2108,16 @@
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Update the starting block and length of the right
* neighbor in the by-block tree.
@@ -1897,7 +2136,10 @@
nlen = len;
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
bno_cur = NULL;
@@ -1906,20 +2148,24 @@
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
cnt_cur = NULL;
/*
* Update the freespace totals in the ag and superblock.
*/
- pag = xfs_perag_get(mp, agno);
- error = xfs_alloc_update_counters(tp, pag, agbp, len);
- xfs_ag_resv_free_extent(pag, type, tp, len);
- xfs_perag_put(pag);
+ error = xfs_alloc_update_counters(tp, agbp, len);
+ xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
if (error)
goto error0;
@@ -1989,30 +2235,39 @@
* reservations and AGFL rules in place, we can return this extent.
*/
if (pag->pagf_longest > delta)
- return pag->pagf_longest - delta;
+ return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ pag->pagf_longest - delta);
/* Otherwise, let the caller try for 1 block if there's space. */
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
+/*
+ * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
+ * return the largest possible minimum length.
+ */
unsigned int
xfs_alloc_min_freelist(
struct xfs_mount *mp,
struct xfs_perag *pag)
{
+ /* AG btrees have at least 1 level. */
+ static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
+ const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
+ ASSERT(mp->m_ag_maxlevels > 0);
+
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
mp->m_ag_maxlevels);
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- min_free += min_t(unsigned int,
- pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
return min_free;
}
@@ -2086,9 +2341,11 @@
if (error)
return error;
- bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
- if (!bp)
- return -EFSCORRUPTED;
+ error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
+ tp->t_mountp->m_bsize, 0, &bp);
+ if (error)
+ return error;
xfs_trans_binval(tp, bp);
return 0;
@@ -2161,7 +2418,7 @@
struct xfs_perag *pag)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
ASSERT(pag->pagf_agflreset);
trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
@@ -2205,7 +2462,8 @@
ASSERT(xfs_bmap_free_item_zone != NULL);
ASSERT(oinfo != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+ new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
new->xefi_blockcount = 1;
new->xefi_oinfo = *oinfo;
@@ -2240,12 +2498,11 @@
if (!pag->pagf_init) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
- if (error)
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
goto out_no_agbp;
- if (!pag->pagf_init) {
- ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
- ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- goto out_agbp_relse;
}
}
@@ -2254,7 +2511,7 @@
* somewhere else if we are not being asked to try harder at this
* point
*/
- if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
+ if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
@@ -2271,11 +2528,10 @@
*/
if (!agbp) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
- if (error)
- goto out_no_agbp;
- if (!agbp) {
- ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
- ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
+ if (error) {
+ /* Couldn't lock the AGF so skip this AG. */
+ if (error == -EAGAIN)
+ error = 0;
goto out_no_agbp;
}
}
@@ -2395,7 +2651,7 @@
xfs_agblock_t *bnop, /* block address retrieved from freelist */
int btreeblk) /* destination is a AGF btree */
{
- xfs_agf_t *agf; /* a.g. freespace structure */
+ struct xfs_agf *agf = agbp->b_addr;
xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
xfs_agblock_t bno; /* block number returned */
__be32 *agfl_bno;
@@ -2407,7 +2663,6 @@
/*
* Freelist is empty, give up.
*/
- agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -2424,14 +2679,14 @@
/*
* Get the block number and update the data structures.
*/
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
agf->agf_flfirst = 0;
- pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ pag = agbp->b_pag;
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
@@ -2443,7 +2698,6 @@
pag->pagf_btreeblks++;
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
@@ -2485,7 +2739,7 @@
sizeof(xfs_agf_t)
};
- trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+ trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
@@ -2506,11 +2760,10 @@
xfs_buf_t *bp;
int error;
- if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
- return error;
- if (bp)
+ error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
+ if (!error)
xfs_trans_brelse(tp, bp);
- return 0;
+ return error;
}
/*
@@ -2524,18 +2777,15 @@
xfs_agblock_t bno, /* block being freed */
int btreeblk) /* block came from a AGF btree */
{
- xfs_agf_t *agf; /* a.g. freespace structure */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agf *agf = agbp->b_addr;
__be32 *blockp;/* pointer to array entry */
int error;
int logflags;
- xfs_mount_t *mp; /* mount structure */
xfs_perag_t *pag; /* per allocation group data */
__be32 *agfl_bno;
int startoff;
- agf = XFS_BUF_TO_AGF(agbp);
- mp = tp->t_mountp;
-
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
@@ -2543,7 +2793,7 @@
if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
agf->agf_fllast = 0;
- pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
+ pag = agbp->b_pag;
ASSERT(!pag->pagf_agflreset);
be32_add_cpu(&agf->agf_flcount, 1);
xfs_trans_agflist_delta(tp, 1);
@@ -2555,13 +2805,12 @@
pag->pagf_btreeblks--;
logflags |= XFS_AGF_BTREEBLKS;
}
- xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
startoff = (char *)blockp - (char *)agflbp->b_addr;
@@ -2579,13 +2828,12 @@
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ struct xfs_agf *agf = bp->b_addr;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (!xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
return __this_address;
}
@@ -2671,6 +2919,7 @@
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agf *agf = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_agf_verify(bp);
@@ -2683,7 +2932,7 @@
return;
if (bip)
- XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
@@ -2712,14 +2961,11 @@
trace_xfs_read_agf(mp, agno);
ASSERT(agno != NULLAGNUMBER);
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
- if (!*bpp)
- return 0;
ASSERT(!(*bpp)->b_error);
xfs_buf_set_ref(*bpp, XFS_AGF_REF);
@@ -2743,18 +2989,19 @@
trace_xfs_alloc_read_agf(mp, agno);
+ /* We don't support trylock when freeing. */
+ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
+ (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
ASSERT(agno != NULLAGNUMBER);
error = xfs_read_agf(mp, tp, agno,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
bpp);
if (error)
return error;
- if (!*bpp)
- return 0;
ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
- pag = xfs_perag_get(mp, agno);
+ agf = (*bpp)->b_addr;
+ pag = (*bpp)->b_pag;
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2782,7 +3029,6 @@
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
- xfs_perag_put(pag);
return 0;
}
@@ -2973,13 +3219,6 @@
args->len);
#endif
- /* Zero the extent if we were asked to do so */
- if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
- error = xfs_zero_extent(args->ip, args->fsbno, args->len);
- if (error)
- goto error0;
- }
-
}
xfs_perag_put(args->pag);
return 0;
@@ -3041,6 +3280,7 @@
struct xfs_buf *agbp;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
+ struct xfs_agf *agf;
int error;
unsigned int busy_flags = 0;
@@ -3054,13 +3294,18 @@
error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
if (error)
return error;
+ agf = agbp->b_addr;
- XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
+ if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ error = -EFSCORRUPTED;
+ goto err;
+ }
/* validate the extent size is legal now we have the agf locked */
- XFS_WANT_CORRUPTED_GOTO(mp,
- agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
- err);
+ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto err;
+ }
error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
@@ -3168,7 +3413,7 @@
unsigned int i;
int error;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
i = be32_to_cpu(agf->agf_flfirst);
/* Nothing to walk in an empty AGFL. */
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2..6c22b12 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -54,7 +54,6 @@
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
- struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -83,20 +82,7 @@
*/
#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
-
-static inline bool
-xfs_alloc_is_userdata(int datatype)
-{
- return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
-}
-
-static inline bool
-xfs_alloc_allow_busy_reuse(int datatype)
-{
- return (datatype & XFS_ALLOC_NOBUSY) == 0;
-}
+#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
/* freespace limit calculations */
#define XFS_ALLOC_AGFL_RESERVE 4
@@ -250,4 +236,13 @@
int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+static inline __be32 *
+xfs_buf_to_agfl_bno(
+ struct xfs_buf *bp)
+{
+ if (xfs_sb_version_hascrc(&bp->b_mount->m_sb))
+ return bp->b_addr + sizeof(struct xfs_agfl);
+ return bp->b_addr;
+}
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 2a94543..8e01231 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -12,6 +12,7 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
@@ -25,7 +26,7 @@
struct xfs_btree_cur *cur)
{
return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_ag.agbp, cur->bc_ag.agno,
cur->bc_btnum);
}
@@ -35,18 +36,16 @@
union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
int btnum = cur->bc_btnum;
- struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+ struct xfs_perag *pag = agbp->b_pag;
ASSERT(ptr->s != 0);
agf->agf_roots[btnum] = ptr->s;
be32_add_cpu(&agf->agf_levels[btnum], inc);
pag->pagf_levels[btnum] += inc;
- xfs_perag_put(pag);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -62,7 +61,7 @@
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
@@ -72,7 +71,7 @@
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false);
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
@@ -86,8 +85,8 @@
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_agblock_t bno;
int error;
@@ -113,8 +112,7 @@
int ptr,
int reason)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
struct xfs_perag *pag;
__be32 len;
int numrecs;
@@ -159,10 +157,9 @@
}
agf->agf_longest = len;
- pag = xfs_perag_get(cur->bc_mp, seqno);
+ pag = cur->bc_ag.agbp->b_pag;
pag->pagf_longest = be32_to_cpu(len);
- xfs_perag_put(pag);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST);
}
STATIC int
@@ -226,9 +223,9 @@
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
@@ -471,6 +468,43 @@
.recs_inorder = xfs_cntbt_recs_inorder,
};
+/* Allocate most of a new allocation btree cursor. */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
+
+ cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = btnum;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+ if (btnum == XFS_BTNUM_CNT) {
+ cur->bc_ops = &xfs_cntbt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+ } else {
+ cur->bc_ops = &xfs_bnobt_ops;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+ }
+
+ cur->bc_ag.agno = agno;
+ cur->bc_ag.abt.active = false;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ return cur;
+}
+
/*
* Allocate a new allocation btree cursor.
*/
@@ -482,38 +516,62 @@
xfs_agnumber_t agno, /* allocation group number */
xfs_btnum_t btnum) /* btree identifier */
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
- ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
-
- if (btnum == XFS_BTNUM_CNT) {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
- cur->bc_ops = &xfs_cntbt_ops;
+ cur = xfs_allocbt_init_common(mp, tp, agno, btnum);
+ if (btnum == XFS_BTNUM_CNT)
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
- cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
- } else {
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
- cur->bc_ops = &xfs_bnobt_ops;
+ else
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
- }
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
-
- if (xfs_sb_version_hascrc(&mp->m_sb))
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+ cur->bc_ag.agbp = agbp;
return cur;
}
+/* Create a free space btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_allocbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_allocbt_init_common(mp, NULL, agno, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new free space btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_allocbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+ } else {
+ cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+ }
+}
+
/*
* Calculate number of records in an alloc btree block.
*/
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index c9305eb..a5b998e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/*
* Btree block header size depends on a superblock flag.
@@ -48,8 +49,14 @@
extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *,
xfs_agnumber_t, xfs_btnum_t);
+struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+ xfs_btnum_t btnum);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 510ca69..96ac7e5 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -46,6 +46,7 @@
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
/*
* Internal routines when attribute list is more than one block.
@@ -53,42 +54,18 @@
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
+ struct xfs_da_state **state);
STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-
-STATIC int
-xfs_attr_args_init(
- struct xfs_da_args *args,
- struct xfs_inode *dp,
- const unsigned char *name,
- int flags)
-{
-
- if (!name)
- return -EINVAL;
-
- memset(args, 0, sizeof(*args));
- args->geo = dp->i_mount->m_attr_geo;
- args->whichfork = XFS_ATTR_FORK;
- args->dp = dp;
- args->flags = flags;
- args->name = name;
- args->namelen = strlen((const char *)name);
- if (args->namelen >= MAXNAMELEN)
- return -EFAULT; /* match IRIX behaviour */
-
- args->hashval = xfs_da_hashname(args->name, args->namelen);
- return 0;
-}
-
int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
if (!XFS_IFORK_Q(ip) ||
- (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_anextents == 0))
+ (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0))
return 0;
return 1;
}
@@ -103,84 +80,60 @@
*/
int
xfs_attr_get_ilocked(
- struct xfs_inode *ip,
struct xfs_da_args *args)
{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- if (!xfs_inode_hasattr(ip))
+ if (!xfs_inode_hasattr(args->dp))
return -ENOATTR;
- else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+
+ if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_getvalue(args);
- else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
+ if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK))
return xfs_attr_leaf_get(args);
- else
- return xfs_attr_node_get(args);
+ return xfs_attr_node_get(args);
}
/*
* Retrieve an extended attribute by name, and its value if requested.
*
- * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value,
- * just an indication whether the attribute exists and the size of the value if
- * it exists. The size is returned in @valuelenp,
+ * If args->valuelen is zero, then the caller does not want the value, just an
+ * indication whether the attribute exists and the size of the value if it
+ * exists. The size is returned in args->valuelen.
+ *
+ * If args->value is NULL but args->valuelen is non-zero, allocate the buffer
+ * for the value after existence of the attribute has been determined. The
+ * caller always has to free args->value if it is set, no matter if this
+ * function was successful or not.
*
* If the attribute is found, but exceeds the size limit set by the caller in
- * @valuelenp, return -ERANGE with the size of the attribute that was found in
- * @valuelenp.
- *
- * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after
- * existence of the attribute has been determined. On success, return that
- * buffer to the caller and leave them to free it. On failure, free any
- * allocated buffer and ensure the buffer pointer returned to the caller is
- * null.
+ * args->valuelen, return -ERANGE with the size of the attribute that was found
+ * in args->valuelen.
*/
int
xfs_attr_get(
- struct xfs_inode *ip,
- const unsigned char *name,
- unsigned char **value,
- int *valuelenp,
- int flags)
+ struct xfs_da_args *args)
{
- struct xfs_da_args args;
uint lock_mode;
int error;
- ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value);
+ XFS_STATS_INC(args->dp->i_mount, xs_attr_get);
- XFS_STATS_INC(ip->i_mount, xs_attr_get);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ if (XFS_FORCED_SHUTDOWN(args->dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, ip, name, flags);
- if (error)
- return error;
+ args->geo = args->dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
/* Entirely possible to look up a name which doesn't exist */
- args.op_flags = XFS_DA_OP_OKNOENT;
- if (flags & ATTR_ALLOC)
- args.op_flags |= XFS_DA_OP_ALLOCVAL;
- else
- args.value = *value;
- args.valuelen = *valuelenp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
- lock_mode = xfs_ilock_attr_map_shared(ip);
- error = xfs_attr_get_ilocked(ip, &args);
- xfs_iunlock(ip, lock_mode);
- *valuelenp = args.valuelen;
+ lock_mode = xfs_ilock_attr_map_shared(args->dp);
+ error = xfs_attr_get_ilocked(args);
+ xfs_iunlock(args->dp, lock_mode);
- /* on error, we have to clean up allocated value buffers */
- if (error) {
- if (flags & ATTR_ALLOC) {
- kmem_free(args.value);
- *value = NULL;
- }
- return error;
- }
- *value = args.value;
- return 0;
+ return error;
}
/*
@@ -225,8 +178,13 @@
struct xfs_da_args *args)
{
- struct xfs_mount *mp = dp->i_mount;
- int error, error2;
+ int error;
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
error = xfs_attr_shortform_addname(args);
if (error == -ENOSPC)
@@ -236,15 +194,73 @@
* Commit the shortform mods, and we're done.
* NOTE: this is also the error path (EEXIST, etc).
*/
- if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+ if (!error && !(args->op_flags & XFS_DA_OP_NOTIME))
xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
- if (mp->m_flags & XFS_MOUNT_WSYNC)
+ if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args->trans);
- error2 = xfs_trans_commit(args->trans);
- args->trans = NULL;
- return error ? error : error2;
+ return error;
+}
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0);
+}
+
+/*
+ * Attempts to set an attr in shortform, or converts short form to leaf form if
+ * there is not enough room. Unless the add fails with ENOSPC, the transaction
+ * is committed and set to NULL.
+ */
+STATIC int
+xfs_attr_set_shortform(
+ struct xfs_da_args *args,
+ struct xfs_buf **leaf_bp)
+{
+ struct xfs_inode *dp = args->dp;
+ int error, error2 = 0;
+
+ /*
+ * Try to add the attr to the attribute list in the inode.
+ */
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ error2 = xfs_trans_commit(args->trans);
+ args->trans = NULL;
+ return error ? error : error2;
+ }
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+ * push cannot grab the half-baked leaf buffer and run into problems
+ * with the write verifier. Once we're done rolling the transaction we
+ * can release the hold and add the attr to the leaf.
+ */
+ xfs_trans_bhold(args->trans, *leaf_bp);
+ error = xfs_defer_finish(&args->trans);
+ xfs_trans_bhold_release(args->trans, *leaf_bp);
+ if (error) {
+ xfs_trans_brelse(args->trans, *leaf_bp);
+ return error;
+ }
+
+ return 0;
}
/*
@@ -256,61 +272,94 @@
{
struct xfs_inode *dp = args->dp;
struct xfs_buf *leaf_bp = NULL;
- int error;
+ int error = 0;
/*
- * If the attribute list is non-existent or a shortform list,
- * upgrade it to a single-leaf-block attribute list.
+ * If the attribute list is already in leaf format, jump straight to
+ * leaf handling. Otherwise, try to add the attribute to the shortform
+ * list; if there's no room then convert the list to leaf format and try
+ * again.
*/
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (xfs_attr_is_shortform(dp)) {
/*
- * Build initial attribute list (if required).
+ * If the attr was successfully set in shortform, the
+ * transaction is committed and set to NULL. Otherwise, it is
+ * converted from shortform to leaf, and the transaction is
+ * retained.
*/
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
- xfs_attr_shortform_create(args);
+ error = xfs_attr_set_shortform(args, &leaf_bp);
+ if (error || !args->trans)
+ return error;
+ }
- /*
- * Try to add the attr to the attribute list in the inode.
- */
- error = xfs_attr_try_sf_addname(dp, args);
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_addname(args);
if (error != -ENOSPC)
return error;
/*
- * It won't fit in the shortform, transform to a leaf block.
- * GROT: another possible req'mt for a double-split btree op.
+ * Promote the attribute list to the Btree format.
*/
- error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
+ error = xfs_attr3_leaf_to_node(args);
if (error)
return error;
/*
- * Prevent the leaf buffer from being unlocked so that a
- * concurrent AIL push cannot grab the half-baked leaf
- * buffer and run into problems with the write verifier.
- * Once we're done rolling the transaction we can release
- * the hold and add the attr to the leaf.
+ * Finish any deferred work items and roll the transaction once
+ * more. The goal here is to call node_addname with the inode
+ * and transaction in the same state (inode locked and joined,
+ * transaction clean) no matter how we got to this step.
*/
- xfs_trans_bhold(args->trans, leaf_bp);
error = xfs_defer_finish(&args->trans);
- xfs_trans_bhold_release(args->trans, leaf_bp);
- if (error) {
- xfs_trans_brelse(args->trans, leaf_bp);
+ if (error)
return error;
- }
+
+ /*
+ * Commit the current trans (including the inode) and
+ * start a new one.
+ */
+ error = xfs_trans_roll_inode(&args->trans, dp);
+ if (error)
+ return error;
}
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
- error = xfs_attr_leaf_addname(args);
- else
- error = xfs_attr_node_addname(args);
+ error = xfs_attr_node_addname(args);
return error;
}
/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ */
+int
+xfs_has_attr(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int error;
+
+ if (!xfs_inode_hasattr(dp))
+ return -ENOATTR;
+
+ if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
+ ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+ return xfs_attr_sf_findname(args, NULL, NULL);
+ }
+
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_hasname(args, &bp);
+
+ if (bp)
+ xfs_trans_brelse(args->trans, bp);
+
+ return error;
+ }
+
+ return xfs_attr_node_hasname(args, NULL);
+}
+
+/*
* Remove the attribute specified in @args.
*/
int
@@ -322,7 +371,7 @@
if (!xfs_inode_hasattr(dp)) {
error = -ENOATTR;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ } else if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) {
ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
error = xfs_attr_shortform_remove(args);
} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
@@ -334,79 +383,115 @@
return error;
}
+/*
+ * Note: If args->value is NULL the attribute will be removed, just like the
+ * Linux ->setattr API.
+ */
int
xfs_attr_set(
- struct xfs_inode *dp,
- const unsigned char *name,
- unsigned char *value,
- int valuelen,
- int flags)
+ struct xfs_da_args *args)
{
+ struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_da_args args;
struct xfs_trans_res tres;
- int rsvd = (flags & ATTR_ROOT) != 0;
+ bool rsvd = (args->attr_filter & XFS_ATTR_ROOT);
int error, local;
-
- XFS_STATS_INC(mp, xs_attr_set);
+ unsigned int total;
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- error = xfs_attr_args_init(&args, dp, name, flags);
- if (error)
- return error;
-
- args.value = value;
- args.valuelen = valuelen;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
- args.total = xfs_attr_calc_size(&args, &local);
-
error = xfs_qm_dqattach(dp);
if (error)
return error;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+
/*
- * If the inode doesn't have an attribute fork, add one.
- * (inode must not be locked when we call this routine)
+ * We have no control over the attribute names that userspace passes us
+ * to remove, so we have to allow the name lookup prior to attribute
+ * removal to fail as well.
*/
- if (XFS_IFORK_Q(dp) == 0) {
- int sf_size = sizeof(xfs_attr_sf_hdr_t) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
+ args->op_flags = XFS_DA_OP_OKNOENT;
- error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
- if (error)
- return error;
+ if (args->value) {
+ XFS_STATS_INC(mp, xs_attr_set);
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ args->total = xfs_attr_calc_size(args, &local);
+
+ /*
+ * If the inode doesn't have an attribute fork, add one.
+ * (inode must not be locked when we call this routine)
+ */
+ if (XFS_IFORK_Q(dp) == 0) {
+ int sf_size = sizeof(struct xfs_attr_sf_hdr) +
+ xfs_attr_sf_entsize_byname(args->namelen,
+ args->valuelen);
+
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
+ }
+
+ tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ total = args->total;
+ } else {
+ XFS_STATS_INC(mp, xs_attr_remove);
+
+ tres = M_RES(mp)->tr_attrrm;
+ total = XFS_ATTRRM_SPACE_RES(mp);
}
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
-
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
- error = xfs_trans_alloc(mp, &tres, args.total, 0,
- rsvd ? XFS_TRANS_RESERVE : 0, &args.trans);
+ error = xfs_trans_alloc(mp, &tres, total, 0,
+ rsvd ? XFS_TRANS_RESERVE : 0, &args->trans);
if (error)
return error;
xfs_ilock(dp, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
- rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
+ xfs_trans_ijoin(args->trans, dp, 0);
+ if (args->value) {
+ unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS;
- xfs_trans_ijoin(args.trans, dp, 0);
- error = xfs_attr_set_args(&args);
- if (error)
- goto out_trans_cancel;
- if (!args.trans) {
+ if (rsvd)
+ quota_flags |= XFS_QMOPT_FORCE_RES;
+ error = xfs_trans_reserve_quota_nblks(args->trans, dp,
+ args->total, 0, quota_flags);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_has_attr(args);
+ if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
+ goto out_trans_cancel;
+ if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+ goto out_trans_cancel;
+ if (error != -ENOATTR && error != -EEXIST)
+ goto out_trans_cancel;
+
+ error = xfs_attr_set_args(args);
+ if (error)
+ goto out_trans_cancel;
/* shortform attribute has already been committed */
- goto out_unlock;
+ if (!args->trans)
+ goto out_unlock;
+ } else {
+ error = xfs_has_attr(args);
+ if (error != -EEXIST)
+ goto out_trans_cancel;
+
+ error = xfs_attr_remove_args(args);
+ if (error)
+ goto out_trans_cancel;
}
/*
@@ -414,112 +499,38 @@
* transaction goes to disk before returning to the user.
*/
if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
+ xfs_trans_set_sync(args->trans);
- if ((flags & ATTR_KERNOTIME) == 0)
- xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
+ if (!(args->op_flags & XFS_DA_OP_NOTIME))
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
/*
* Commit the last in the sequence of transactions.
*/
- xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans);
+ xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+ error = xfs_trans_commit(args->trans);
out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
- if (args.trans)
- xfs_trans_cancel(args.trans);
+ if (args->trans)
+ xfs_trans_cancel(args->trans);
goto out_unlock;
}
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-int
-xfs_attr_remove(
- struct xfs_inode *dp,
- const unsigned char *name,
- int flags)
-{
- struct xfs_mount *mp = dp->i_mount;
- struct xfs_da_args args;
- int error;
-
- XFS_STATS_INC(mp, xs_attr_remove);
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return -EIO;
-
- error = xfs_attr_args_init(&args, dp, name, flags);
- if (error)
- return error;
-
- /*
- * we have no control over the attribute names that userspace passes us
- * to remove, so we have to allow the name lookup prior to attribute
- * removal to fail.
- */
- args.op_flags = XFS_DA_OP_OKNOENT;
-
- error = xfs_qm_dqattach(dp);
- if (error)
- return error;
-
- /*
- * Root fork attributes can use reserved data blocks for this
- * operation if necessary
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm,
- XFS_ATTRRM_SPACE_RES(mp), 0,
- (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0,
- &args.trans);
- if (error)
- return error;
-
- xfs_ilock(dp, XFS_ILOCK_EXCL);
- /*
- * No need to make quota reservations here. We expect to release some
- * blocks not allocate in the common case.
- */
- xfs_trans_ijoin(args.trans, dp, 0);
-
- error = xfs_attr_remove_args(&args);
- if (error)
- goto out;
-
- /*
- * If this is a synchronous mount, make sure that the
- * transaction goes to disk before returning to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
-
- if ((flags & ATTR_KERNOTIME) == 0)
- xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
-
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- return error;
-
-out:
- if (args.trans)
- xfs_trans_cancel(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
-}
-
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
+static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
+{
+ struct xfs_attr_shortform *sf;
+
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+ return be16_to_cpu(sf->hdr.totsize);
+}
+
/*
* Add a name to the shortform attribute list structure
* This is the external routine.
@@ -532,10 +543,10 @@
trace_xfs_attr_sf_addname(args);
retval = xfs_attr_shortform_lookup(args);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+ if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
return retval;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE)
+ if (retval == -EEXIST) {
+ if (args->attr_flags & XATTR_CREATE)
return retval;
retval = xfs_attr_shortform_remove(args);
if (retval)
@@ -545,15 +556,15 @@
* that the leaf format add routine won't trip over the attr
* not being around.
*/
- args->flags &= ~ATTR_REPLACE;
+ args->attr_flags &= ~XATTR_REPLACE;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
return -ENOSPC;
- newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
- newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ newsize = xfs_attr_sf_totsize(args->dp);
+ newsize += xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
if (!forkoff)
@@ -568,54 +579,65 @@
* External routines when attribute list is one block
*========================================================================*/
-/*
- * Add a name to the leaf attribute list structure
- *
- * This leaf block cannot have a "remote" value, we only call this routine
- * if bmap_one_block() says there is only one block (ie: no remote blks).
- */
-STATIC int
-xfs_attr_leaf_addname(
+/* Store info about a remote block */
+STATIC void
+xfs_attr_save_rmt_blk(
struct xfs_da_args *args)
{
- struct xfs_inode *dp;
- struct xfs_buf *bp;
- int retval, error, forkoff;
+ args->blkno2 = args->blkno;
+ args->index2 = args->index;
+ args->rmtblkno2 = args->rmtblkno;
+ args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+}
- trace_xfs_attr_leaf_addname(args);
+/* Set stored info about a remote block */
+STATIC void
+xfs_attr_restore_rmt_blk(
+ struct xfs_da_args *args)
+{
+ args->blkno = args->blkno2;
+ args->index = args->index2;
+ args->rmtblkno = args->rmtblkno2;
+ args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
+}
- /*
- * Read the (only) block in the attribute list in.
- */
- dp = args->dp;
- args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
- if (error)
- return error;
+/*
+ * Tries to add an attribute to an inode in leaf form
+ *
+ * This function is meant to execute as part of a delayed operation and leaves
+ * the transaction handling to the caller. On success the attribute is added
+ * and the inode and transaction are left dirty. If there is not enough space,
+ * the attr data is converted to node format and -ENOSPC is returned. Caller is
+ * responsible for handling the dirty inode and transaction or adding the attr
+ * in node format.
+ */
+STATIC int
+xfs_attr_leaf_try_add(
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
+{
+ int retval;
/*
* Look up the given attribute in the leaf block. Figure out if
* the given flags produce an error or call for an atomic rename.
*/
- retval = xfs_attr3_leaf_lookup_int(bp, args);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
- xfs_trans_brelse(args->trans, bp);
+ retval = xfs_attr_leaf_hasname(args, &bp);
+ if (retval != -ENOATTR && retval != -EEXIST)
return retval;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE) { /* pure create op */
- xfs_trans_brelse(args->trans, bp);
- return retval;
- }
+ if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
+ goto out_brelse;
+ if (retval == -EEXIST) {
+ if (args->attr_flags & XATTR_CREATE)
+ goto out_brelse;
trace_xfs_attr_leaf_replace(args);
/* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
- args->blkno2 = args->blkno; /* set 2nd entry info*/
- args->index2 = args->index;
- args->rmtblkno2 = args->rmtblkno;
- args->rmtblkcnt2 = args->rmtblkcnt;
- args->rmtvaluelen2 = args->rmtvaluelen;
+ xfs_attr_save_rmt_blk(args);
/*
* clear the remote attr state now that it is saved so that the
@@ -628,37 +650,35 @@
}
/*
- * Add the attribute to the leaf block, transitioning to a Btree
- * if required.
+ * Add the attribute to the leaf block
*/
- retval = xfs_attr3_leaf_add(bp, args);
- if (retval == -ENOSPC) {
- /*
- * Promote the attribute list to the Btree format, then
- * Commit that transaction so that the node_addname() call
- * can manage its own transactions.
- */
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
+ return xfs_attr3_leaf_add(bp, args);
- /*
- * Commit the current trans (including the inode) and start
- * a new one.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- return error;
+out_brelse:
+ xfs_trans_brelse(args->trans, bp);
+ return retval;
+}
- /*
- * Fob the whole rest of the problem off on the Btree code.
- */
- error = xfs_attr_node_addname(args);
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_addname(
+ struct xfs_da_args *args)
+{
+ int error, forkoff;
+ struct xfs_buf *bp = NULL;
+ struct xfs_inode *dp = args->dp;
+
+ trace_xfs_attr_leaf_addname(args);
+
+ error = xfs_attr_leaf_try_add(args, bp);
+ if (error)
return error;
- }
/*
* Commit the transaction that added the attr name so that
@@ -680,71 +700,92 @@
return error;
}
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- */
- if (args->op_flags & XFS_DA_OP_RENAME) {
- /*
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
-
- /*
- * Dismantle the "old" attribute/value pair by removing
- * a "remote" value (if it exists).
- */
- args->index = args->index2;
- args->blkno = args->blkno2;
- args->rmtblkno = args->rmtblkno2;
- args->rmtblkcnt = args->rmtblkcnt2;
- args->rmtvaluelen = args->rmtvaluelen2;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
- }
-
- /*
- * Read in the block containing the "old" attr, then
- * remove the "old" attr from that block (neat, huh!)
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- -1, &bp);
- if (error)
- return error;
-
- xfs_attr3_leaf_remove(bp, args);
-
- /*
- * If the result is small enough, shrink it all into the inode.
- */
- if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
- }
-
- /*
- * Commit the remove and start the next trans in series.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
-
- } else if (args->rmtblkno > 0) {
+ if (!(args->op_flags & XFS_DA_OP_RENAME)) {
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr3_leaf_clearflag(args);
+ if (args->rmtblkno > 0)
+ error = xfs_attr3_leaf_clearflag(args);
+
+ return error;
}
+
+ /*
+ * If this is an atomic rename operation, we must "flip" the incomplete
+ * flags on the "new" and "old" attribute/value pairs so that one
+ * disappears and one appears atomically. Then we must remove the "old"
+ * attribute/value pair.
+ *
+ * In a separate transaction, set the incomplete flag on the "old" attr
+ * and clear the incomplete flag on the "new" attr.
+ */
+
+ error = xfs_attr3_leaf_flipflags(args);
+ if (error)
+ return error;
+ /*
+ * Commit the flag value change and start the next trans in series.
+ */
+ error = xfs_trans_roll_inode(&args->trans, args->dp);
+ if (error)
+ return error;
+
+ /*
+ * Dismantle the "old" attribute/value pair by removing a "remote" value
+ * (if it exists).
+ */
+ xfs_attr_restore_rmt_blk(args);
+
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_invalidate(args);
+ if (error)
+ return error;
+
+ error = xfs_attr_rmtval_remove(args);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Read in the block containing the "old" attr, then remove the "old"
+ * attr from that block (neat, huh!)
+ */
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
+
+ xfs_attr3_leaf_remove(bp, args);
+
+ /*
+ * If the result is small enough, shrink it all into the inode.
+ */
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+
+ return error;
+}
+
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ */
+STATIC int
+xfs_attr_leaf_hasname(
+ struct xfs_da_args *args,
+ struct xfs_buf **bp)
+{
+ int error = 0;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, bp);
+ if (error)
+ return error;
+
+ error = xfs_attr3_leaf_lookup_int(*bp, args);
+ if (error != -ENOATTR && error != -EEXIST)
+ xfs_trans_brelse(args->trans, *bp);
+
return error;
}
@@ -768,31 +809,25 @@
* Remove the attribute.
*/
dp = args->dp;
- args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
- if (error)
- return error;
- error = xfs_attr3_leaf_lookup_int(bp, args);
+ error = xfs_attr_leaf_hasname(args, &bp);
+
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
return error;
- }
+ } else if (error != -EEXIST)
+ return error;
xfs_attr3_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
*/
- if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ return xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
- }
+
return 0;
}
@@ -812,21 +847,53 @@
trace_xfs_attr_leaf_get(args);
- args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
- if (error)
- return error;
+ error = xfs_attr_leaf_hasname(args, &bp);
- error = xfs_attr3_leaf_lookup_int(bp, args);
- if (error != -EEXIST) {
+ if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
return error;
- }
+ } else if (error != -EEXIST)
+ return error;
+
+
error = xfs_attr3_leaf_getvalue(bp, args);
xfs_trans_brelse(args->trans, bp);
return error;
}
+/*
+ * Return EEXIST if attr is found, or ENOATTR if not
+ * statep: If not null is set to point at the found state. Caller will
+ * be responsible for freeing the state in this case.
+ */
+STATIC int
+xfs_attr_node_hasname(
+ struct xfs_da_args *args,
+ struct xfs_da_state **statep)
+{
+ struct xfs_da_state *state;
+ int retval, error;
+
+ state = xfs_da_state_alloc(args);
+ if (statep != NULL)
+ *statep = NULL;
+
+ /*
+ * Search to see if name exists, and get back a pointer to it.
+ */
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error) {
+ xfs_da_state_free(state);
+ return error;
+ }
+
+ if (statep != NULL)
+ *statep = state;
+ else
+ xfs_da_state_free(state);
+ return retval;
+}
+
/*========================================================================
* External routines when attribute list size > geo->blksize
*========================================================================*/
@@ -848,7 +915,6 @@
struct xfs_da_state *state;
struct xfs_da_state_blk *blk;
struct xfs_inode *dp;
- struct xfs_mount *mp;
int retval, error;
trace_xfs_attr_node_addname(args);
@@ -857,36 +923,29 @@
* Fill in bucket of arguments/results/context to carry around.
*/
dp = args->dp;
- mp = dp->i_mount;
restart:
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = mp;
-
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error)
+ error = 0;
+ retval = xfs_attr_node_hasname(args, &state);
+ if (retval != -ENOATTR && retval != -EEXIST)
goto out;
+
blk = &state->path.blk[ state->path.active-1 ];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
+ if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
goto out;
- } else if (retval == -EEXIST) {
- if (args->flags & ATTR_CREATE)
+ if (retval == -EEXIST) {
+ if (args->attr_flags & XATTR_CREATE)
goto out;
trace_xfs_attr_node_replace(args);
/* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
- args->blkno2 = args->blkno; /* set 2nd entry info*/
- args->index2 = args->index;
- args->rmtblkno2 = args->rmtblkno;
- args->rmtblkcnt2 = args->rmtblkcnt;
- args->rmtvaluelen2 = args->rmtvaluelen;
+ xfs_attr_save_rmt_blk(args);
/*
* clear the remote attr state now that it is saved so that the
@@ -972,82 +1031,75 @@
return error;
}
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- */
- if (args->op_flags & XFS_DA_OP_RENAME) {
- /*
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- goto out;
-
- /*
- * Dismantle the "old" attribute/value pair by removing
- * a "remote" value (if it exists).
- */
- args->index = args->index2;
- args->blkno = args->blkno2;
- args->rmtblkno = args->rmtblkno2;
- args->rmtblkcnt = args->rmtblkcnt2;
- args->rmtvaluelen = args->rmtvaluelen2;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(args);
- if (error)
- return error;
- }
-
- /*
- * Re-find the "old" attribute entry after any split ops.
- * The INCOMPLETE flag means that we will find the "old"
- * attr, not the "new" one.
- */
- args->flags |= XFS_ATTR_INCOMPLETE;
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = mp;
- state->inleaf = 0;
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error)
- goto out;
-
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
-
- /*
- * Check to see if the tree needs to be collapsed.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
- }
-
- /*
- * Commit and start the next trans in the chain.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
- goto out;
-
- } else if (args->rmtblkno > 0) {
+ if (!(args->op_flags & XFS_DA_OP_RENAME)) {
/*
* Added a "remote" value, just clear the incomplete flag.
*/
- error = xfs_attr3_leaf_clearflag(args);
+ if (args->rmtblkno > 0)
+ error = xfs_attr3_leaf_clearflag(args);
+ retval = error;
+ goto out;
+ }
+
+ /*
+ * If this is an atomic rename operation, we must "flip" the incomplete
+ * flags on the "new" and "old" attribute/value pairs so that one
+ * disappears and one appears atomically. Then we must remove the "old"
+ * attribute/value pair.
+ *
+ * In a separate transaction, set the incomplete flag on the "old" attr
+ * and clear the incomplete flag on the "new" attr.
+ */
+ error = xfs_attr3_leaf_flipflags(args);
+ if (error)
+ goto out;
+ /*
+ * Commit the flag value change and start the next trans in series
+ */
+ error = xfs_trans_roll_inode(&args->trans, args->dp);
+ if (error)
+ goto out;
+
+ /*
+ * Dismantle the "old" attribute/value pair by removing a "remote" value
+ * (if it exists).
+ */
+ xfs_attr_restore_rmt_blk(args);
+
+ if (args->rmtblkno) {
+ error = xfs_attr_rmtval_invalidate(args);
+ if (error)
+ return error;
+
+ error = xfs_attr_rmtval_remove(args);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
+ * flag means that we will find the "old" attr, not the "new" one.
+ */
+ args->attr_filter |= XFS_ATTR_INCOMPLETE;
+ state = xfs_da_state_alloc(args);
+ state->inleaf = 0;
+ error = xfs_da3_node_lookup_int(state, &retval);
+ if (error)
+ goto out;
+
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ error = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
+
+ /*
+ * Check to see if the tree needs to be collapsed.
+ */
+ if (retval && (state->path.active > 1)) {
+ error = xfs_da3_join(state);
if (error)
goto out;
}
@@ -1062,6 +1114,114 @@
}
/*
+ * Shrink an attribute from leaf to shortform
+ */
+STATIC int
+xfs_attr_node_shrink(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ struct xfs_inode *dp = args->dp;
+ int error, forkoff;
+ struct xfs_buf *bp;
+
+ /*
+ * Have to get rid of the copy of this dabuf in the state.
+ */
+ ASSERT(state->path.active == 1);
+ ASSERT(state->path.blk[0].bp);
+ state->path.blk[0].bp = NULL;
+
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else
+ xfs_trans_brelse(args->trans, bp);
+
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+STATIC int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
+
+ /*
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
+ */
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
+
+ /*
+ * Mark the attribute as INCOMPLETE
+ */
+ return xfs_attr3_leaf_setflag(args);
+}
+
+/*
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
+ */
+STATIC
+int xfs_attr_node_removename_setup(
+ struct xfs_da_args *args,
+ struct xfs_da_state **state)
+{
+ int error;
+
+ error = xfs_attr_node_hasname(args, state);
+ if (error != -EEXIST)
+ return error;
+
+ ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
+ ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
+
+ if (args->rmtblkno > 0) {
+ error = xfs_attr_leaf_mark_incomplete(args, *state);
+ if (error)
+ return error;
+
+ return xfs_attr_rmtval_invalidate(args);
+ }
+
+ return 0;
+}
+
+STATIC int
+xfs_attr_node_remove_rmt(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error = 0;
+
+ error = xfs_attr_rmtval_remove(args);
+ if (error)
+ return error;
+
+ /*
+ * Refill the state structure with buffers, the prior calls released our
+ * buffers.
+ */
+ return xfs_attr_refillstate(state);
+}
+
+/*
* Remove a name from a B-tree attribute list.
*
* This will involve walking down the Btree, and may involve joining
@@ -1074,64 +1234,22 @@
{
struct xfs_da_state *state;
struct xfs_da_state_blk *blk;
- struct xfs_inode *dp;
- struct xfs_buf *bp;
- int retval, error, forkoff;
+ int retval, error;
+ struct xfs_inode *dp = args->dp;
trace_xfs_attr_node_removename(args);
- /*
- * Tie a string around our finger to remind us where we are.
- */
- dp = args->dp;
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = dp->i_mount;
-
- /*
- * Search to see if name exists, and get back a pointer to it.
- */
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error || (retval != -EEXIST)) {
- if (error == 0)
- error = retval;
+ error = xfs_attr_node_removename_setup(args, &state);
+ if (error)
goto out;
- }
/*
* If there is an out-of-line value, de-allocate the blocks.
* This is done before we remove the attribute so that we don't
* overflow the maximum size of a transaction and/or hit a deadlock.
*/
- blk = &state->path.blk[ state->path.active-1 ];
- ASSERT(blk->bp != NULL);
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
if (args->rmtblkno > 0) {
- /*
- * Fill in disk block numbers in the state structure
- * so that we can get the buffers back after we commit
- * several transactions in the following calls.
- */
- error = xfs_attr_fillstate(state);
- if (error)
- goto out;
-
- /*
- * Mark the attribute as INCOMPLETE, then bunmapi() the
- * remote value.
- */
- error = xfs_attr3_leaf_setflag(args);
- if (error)
- goto out;
- error = xfs_attr_rmtval_remove(args);
- if (error)
- goto out;
-
- /*
- * Refill the state structure with buffers, the prior calls
- * released our buffers.
- */
- error = xfs_attr_refillstate(state);
+ error = xfs_attr_node_remove_rmt(args, state);
if (error)
goto out;
}
@@ -1165,33 +1283,12 @@
/*
* If the result is small enough, push it all into the inode.
*/
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- /*
- * Have to get rid of the copy of this dabuf in the state.
- */
- ASSERT(state->path.active == 1);
- ASSERT(state->path.blk[0].bp);
- state->path.blk[0].bp = NULL;
-
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
- if (error)
- goto out;
-
- if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- if (error)
- goto out;
- error = xfs_defer_finish(&args->trans);
- if (error)
- goto out;
- } else
- xfs_trans_brelse(args->trans, bp);
- }
- error = 0;
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ error = xfs_attr_node_shrink(args, state);
out:
- xfs_da_state_free(state);
+ if (state)
+ xfs_da_state_free(state);
return error;
}
@@ -1266,10 +1363,9 @@
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da3_node_read(state->args->trans,
- state->args->dp,
- blk->blkno, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
if (error)
return error;
} else {
@@ -1285,10 +1381,9 @@
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da3_node_read(state->args->trans,
- state->args->dp,
- blk->blkno, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
if (error)
return error;
} else {
@@ -1309,47 +1404,41 @@
* Returns 0 on successful retrieval, otherwise an error.
*/
STATIC int
-xfs_attr_node_get(xfs_da_args_t *args)
+xfs_attr_node_get(
+ struct xfs_da_args *args)
{
- xfs_da_state_t *state;
- xfs_da_state_blk_t *blk;
- int error, retval;
- int i;
+ struct xfs_da_state *state;
+ struct xfs_da_state_blk *blk;
+ int i;
+ int error;
trace_xfs_attr_node_get(args);
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
-
/*
* Search to see if name exists, and get back a pointer to it.
*/
- error = xfs_da3_node_lookup_int(state, &retval);
- if (error) {
- retval = error;
- goto out_release;
- }
- if (retval != -EEXIST)
+ error = xfs_attr_node_hasname(args, &state);
+ if (error != -EEXIST)
goto out_release;
/*
* Get the value, local or "remote"
*/
blk = &state->path.blk[state->path.active - 1];
- retval = xfs_attr3_leaf_getvalue(blk->bp, args);
+ error = xfs_attr3_leaf_getvalue(blk->bp, args);
/*
* If not in a transaction, we have to release all the buffers.
*/
out_release:
- for (i = 0; i < state->path.active; i++) {
+ for (i = 0; state != NULL && i < state->path.active; i++) {
xfs_trans_brelse(args->trans, state->path.blk[i].bp);
state->path.blk[i].bp = NULL;
}
- xfs_da_state_free(state);
- return retval;
+ if (state)
+ xfs_da_state_free(state);
+ return error;
}
/* Returns true if the attribute entry name is valid. */
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 91c2cb1..3e97a93 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -21,39 +21,6 @@
* as possible so as to fit into the literal area of the inode.
*/
-/*========================================================================
- * External interfaces
- *========================================================================*/
-
-
-#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */
-#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
-#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
-#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
-#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
-#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
-
-#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
-#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
-
-#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */
-#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */
-
-#define ATTR_KERNEL_FLAGS \
- (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC)
-
-#define XFS_ATTR_FLAGS \
- { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \
- { ATTR_ROOT, "ROOT" }, \
- { ATTR_TRUST, "TRUST" }, \
- { ATTR_SECURE, "SECURE" }, \
- { ATTR_CREATE, "CREATE" }, \
- { ATTR_REPLACE, "REPLACE" }, \
- { ATTR_KERNOTIME, "KERNOTIME" }, \
- { ATTR_KERNOVAL, "KERNOVAL" }, \
- { ATTR_INCOMPLETE, "INCOMPLETE" }, \
- { ATTR_ALLOC, "ALLOC" }
-
/*
* The maximum size (into the kernel or returned from the kernel) of an
* attribute value or the buffer used for an attr_list() call. Larger
@@ -62,45 +29,16 @@
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
/*
- * Define how lists of attribute names are returned to the user from
- * the attr_list() call. A large, 32bit aligned, buffer is passed in
- * along with its size. We put an array of offsets at the top that each
- * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
- */
-typedef struct attrlist {
- __s32 al_count; /* number of entries in attrlist */
- __s32 al_more; /* T/F: more attrs (do call again) */
- __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
-} attrlist_t;
-
-/*
- * Show the interesting info about one attribute. This is what the
- * al_offset[i] entry points to.
- */
-typedef struct attrlist_ent { /* data from attr_list() */
- __u32 a_valuelen; /* number bytes in value of attr */
- char a_name[1]; /* attr name (NULL terminated) */
-} attrlist_ent_t;
-
-/*
- * Given a pointer to the (char*) buffer containing the attr_list() result,
- * and an index, return a pointer to the indicated attribute in the buffer.
- */
-#define ATTR_ENTRY(buffer, index) \
- ((attrlist_ent_t *) \
- &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
-
-/*
* Kernel-internal version of the attrlist cursor.
*/
-typedef struct attrlist_cursor_kern {
+struct xfs_attrlist_cursor_kern {
__u32 hashval; /* hash value of next entry to add */
__u32 blkno; /* block containing entry (suggestion) */
__u32 offset; /* offset in list of equal-hashvals */
__u16 pad1; /* padding to match user-level */
__u8 pad2; /* padding to match user-level */
__u8 initted; /* T/F: cursor has been initialized */
-} attrlist_cursor_kern_t;
+};
/*========================================================================
@@ -112,27 +50,28 @@
typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
unsigned char *, int, int);
-typedef struct xfs_attr_list_context {
- struct xfs_trans *tp;
- struct xfs_inode *dp; /* inode */
- struct attrlist_cursor_kern *cursor; /* position in list */
- char *alist; /* output buffer */
+struct xfs_attr_list_context {
+ struct xfs_trans *tp;
+ struct xfs_inode *dp; /* inode */
+ struct xfs_attrlist_cursor_kern cursor; /* position in list */
+ void *buffer; /* output buffer */
/*
* Abort attribute list iteration if non-zero. Can be used to pass
* error values to the xfs_attr_list caller.
*/
- int seen_enough;
+ int seen_enough;
+ bool allow_incomplete;
- ssize_t count; /* num used entries */
- int dupcnt; /* count dup hashvals seen */
- int bufsize; /* total buffer size */
- int firstu; /* first used byte in buffer */
- int flags; /* from VOP call */
- int resynch; /* T/F: resynch with cursor */
- put_listent_func_t put_listent; /* list output fmt function */
- int index; /* index into output buffer */
-} xfs_attr_list_context_t;
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */
+ int resynch; /* T/F: resynch with cursor */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+};
/*========================================================================
@@ -143,19 +82,15 @@
* Overall external interface routines.
*/
int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *);
-int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_attr_list_ilocked(struct xfs_attr_list_context *);
+int xfs_attr_list(struct xfs_attr_list_context *);
int xfs_inode_hasattr(struct xfs_inode *ip);
-int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
- unsigned char **value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
- unsigned char *value, int valuelen, int flags);
+int xfs_attr_get_ilocked(struct xfs_da_args *args);
+int xfs_attr_get(struct xfs_da_args *args);
+int xfs_attr_set(struct xfs_da_args *args);
int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_has_attr(struct xfs_da_args *args);
int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
- int flags, struct attrlist_cursor_kern *cursor);
bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index de33efc..d6ef69a 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -233,6 +233,61 @@
}
static xfs_failaddr_t
+xfs_attr3_leaf_verify_entry(
+ struct xfs_mount *mp,
+ char *buf_end, /* one past the last byte of the leaf block */
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ __u32 *last_hashval) /* in/out: hashval of the previous entry */
+{
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ char *name_end;
+ unsigned int nameidx;
+ unsigned int namesize;
+ __u32 hashval;
+
+ /* hash order check: hashvals must never decrease between entries */
+ hashval = be32_to_cpu(ent->hashval);
+ if (hashval < *last_hashval)
+ return __this_address;
+ *last_hashval = hashval;
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize)
+ return __this_address; /* name info outside [firstused, blksize) */
+
+ /*
+ * Check the name information. The namelen fields are u8 so we can't
+ * possibly exceed the maximum name length of 255 bytes.
+ */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ return __this_address;
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0)
+ return __this_address;
+ if (!(ent->flags & XFS_ATTR_INCOMPLETE) &&
+ rentry->valueblk == 0)
+ return __this_address; /* complete remote attr with no value block */
+ }
+
+ if (name_end > buf_end)
+ return __this_address; /* name info runs off the end of the block */
+
+ return NULL; /* entry passed every check */
+}
+
+static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
{
@@ -240,7 +295,10 @@
struct xfs_mount *mp = bp->b_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *ent;
+ char *buf_end;
uint32_t end; /* must be 32bit - see below */
+ __u32 last_hashval = 0;
int i;
xfs_failaddr_t fa;
@@ -251,14 +309,6 @@
return fa;
/*
- * In recovery there is a transient state where count == 0 is valid
- * because we may have transitioned an empty shortform attr to a leaf
- * if the attr didn't fit in shortform.
- */
- if (!xfs_log_in_recovery(mp) && ichdr.count == 0)
- return __this_address;
-
- /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -273,8 +323,20 @@
(char *)bp->b_addr + ichdr.firstused)
return __this_address;
- /* XXX: need to range check rest of attr header values */
- /* XXX: hash order check? */
+ /*
+ * NOTE: This verifier historically failed empty leaf buffers because
+ * we expect the fork to be in another format. Empty attr fork format
+ * conversions are possible during xattr set, however, and format
+ * conversion is not atomic with the xattr set that triggers it. We
+ * cannot assume leaf blocks are non-empty until that is addressed.
+ */
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < ichdr.count; ent++, i++) {
+ fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr,
+ ent, i, &last_hashval);
+ if (fa)
+ return fa;
+ }
/*
* Quickly check the freemap information. Attribute data has to be
@@ -367,13 +429,12 @@
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+ err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
+ &xfs_attr3_leaf_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
return err;
@@ -383,14 +444,25 @@
* Namespace helper routines
*========================================================================*/
-/*
- * If namespace bits don't match return 0.
- * If all match then return 1.
- */
-STATIC int
-xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+static bool
+xfs_attr_match(
+ struct xfs_da_args *args,
+ uint8_t namelen,
+ unsigned char *name,
+ int flags)
{
- return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+ if (args->namelen != namelen)
+ return false;
+ if (memcmp(args->name, name, namelen) != 0)
+ return false;
+ /*
+ * Namespace bits must match exactly; if we are looking for incomplete
+ * entries, show only those, else only show complete entries.
+ */
+ if (args->attr_filter !=
+ (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
+ return false;
+ return true;
}
static int
@@ -402,7 +474,7 @@
/*
* No copy if all we have to do is get the length
*/
- if (args->flags & ATTR_KERNOVAL) {
+ if (!args->valuelen) {
args->valuelen = valuelen;
return 0;
}
@@ -415,8 +487,8 @@
return -ERANGE;
}
- if (args->op_flags & XFS_DA_OP_ALLOCVAL) {
- args->value = kmem_alloc_large(valuelen, 0);
+ if (!args->value) {
+ args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP);
if (!args->value)
return -ENOMEM;
}
@@ -443,7 +515,7 @@
*========================================================================*/
/*
- * Query whether the requested number of additional bytes of extended
+ * Query whether the total requested number of attr fork bytes of extended
* attribute space will be able to fit inline.
*
* Returns zero if not, else the di_forkoff fork offset to be used in the
@@ -463,10 +535,16 @@
int maxforkoff;
int offset;
- /* rounded down */
- offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+ /*
+ * Check if the new size could fit at all first:
+ */
+ if (bytes > XFS_LITINO(mp))
+ return 0;
- if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) {
+ /* rounded down */
+ offset = (XFS_LITINO(mp) - bytes) >> 3;
+
+ if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) {
minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
return (offset >= minforkoff) ? minforkoff : 0;
}
@@ -494,7 +572,7 @@
dsize = dp->i_df.if_bytes;
- switch (dp->i_d.di_format) {
+ switch (dp->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
/*
* If there is no attr fork and the data fork is extents,
@@ -531,8 +609,7 @@
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
@@ -564,22 +641,19 @@
* Create the initial contents of a shortform attribute list.
*/
void
-xfs_attr_shortform_create(xfs_da_args_t *args)
+xfs_attr_shortform_create(
+ struct xfs_da_args *args)
{
- xfs_attr_sf_hdr_t *hdr;
- xfs_inode_t *dp;
- struct xfs_ifork *ifp;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = dp->i_afp;
+ struct xfs_attr_sf_hdr *hdr;
trace_xfs_attr_sf_create(args);
- dp = args->dp;
- ASSERT(dp != NULL);
- ifp = dp->i_afp;
- ASSERT(ifp != NULL);
ASSERT(ifp->if_bytes == 0);
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) {
ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */
- dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+ ifp->if_format = XFS_DINODE_FMT_LOCAL;
ifp->if_flags |= XFS_IFINLINE;
} else {
ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -592,18 +666,65 @@
}
/*
+ * Return -EEXIST if attr is found, or -ENOATTR if not
+ * args: args containing attribute name and namelen
+ * sfep: If not null, set to the matching attr entry on -EEXIST. On
+ * -ENOATTR, it is left pointing just past the last entry in the list
+ * basep: If not null, set to the byte offset of the entry on -EEXIST, where
+ * the first entry is at sizeof(struct xfs_attr_sf_hdr). On -ENOATTR, it
+ * is left at the byte offset just past the last entry in the list
+ */
+int
+xfs_attr_sf_findname(
+ struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep)
+{
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ unsigned int base = sizeof(struct xfs_attr_sf_hdr);
+ int size = 0;
+ int end;
+ int i;
+
+ sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
+ sfe = &sf->list[0];
+ end = sf->hdr.count;
+ for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
+ base += size, i++) {
+ size = xfs_attr_sf_entsize(sfe);
+ if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ continue; /* no match: advance sfe and base past it */
+ break; /* match: sfe and base describe this entry */
+ }
+
+ if (sfep != NULL)
+ *sfep = sfe;
+
+ if (basep != NULL)
+ *basep = base;
+
+ if (i == end) /* walked every entry without a match */
+ return -ENOATTR;
+ return -EEXIST;
+}
+
+/*
* Add a name/value pair to the shortform attribute list.
* Overflow from the inode has already been checked for.
*/
void
-xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+xfs_attr_shortform_add(
+ struct xfs_da_args *args,
+ int forkoff)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- int i, offset, size;
- xfs_mount_t *mp;
- xfs_inode_t *dp;
- struct xfs_ifork *ifp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int offset, size;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ struct xfs_ifork *ifp;
trace_xfs_attr_sf_add(args);
@@ -613,29 +734,19 @@
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
-#ifdef DEBUG
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
ASSERT(0);
-#endif
- }
offset = (char *)sfe - (char *)sf;
- size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+ size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
- sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
+ sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
- sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ sfe->flags = args->attr_filter;
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
sf->hdr.count++;
@@ -654,13 +765,12 @@
struct xfs_inode *ip,
struct xfs_trans *tp)
{
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+ ASSERT(ip->i_afp->if_nextents == 0);
+
+ xfs_idestroy_fork(ip->i_afp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ ip->i_afp = NULL;
ip->i_d.di_forkoff = 0;
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
-
- ASSERT(ip->i_d.di_anextents == 0);
- ASSERT(ip->i_afp == NULL);
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -668,35 +778,27 @@
* Remove an attribute from the shortform attribute list structure.
*/
int
-xfs_attr_shortform_remove(xfs_da_args_t *args)
+xfs_attr_shortform_remove(
+ struct xfs_da_args *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- int base, size=0, end, totsize, i;
- xfs_mount_t *mp;
- xfs_inode_t *dp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int size = 0, end, totsize;
+ unsigned int base;
+ struct xfs_mount *mp;
+ struct xfs_inode *dp;
+ int error;
trace_xfs_attr_sf_remove(args);
dp = args->dp;
mp = dp->i_mount;
- base = sizeof(xfs_attr_sf_hdr_t);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
- sfe = &sf->list[0];
- end = sf->hdr.count;
- for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
- base += size, i++) {
- size = XFS_ATTR_SF_ENTSIZE(sfe);
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- break;
- }
- if (i == end)
- return -ENOATTR;
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
+
+ error = xfs_attr_sf_findname(args, &sfe, &base);
+ if (error != -EEXIST)
+ return error;
+ size = xfs_attr_sf_entsize(sfe);
/*
* Fix up the attribute fork data, covering the hole
@@ -714,7 +816,7 @@
totsize -= size;
if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
(mp->m_flags & XFS_MOUNT_ATTR2) &&
- (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & XFS_DA_OP_ADDNAME)) {
xfs_attr_fork_remove(dp, args->trans);
} else {
@@ -724,7 +826,7 @@
ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!(mp->m_flags & XFS_MOUNT_ATTR2) ||
- dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -741,8 +843,8 @@
int
xfs_attr_shortform_lookup(xfs_da_args_t *args)
{
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
int i;
struct xfs_ifork *ifp;
@@ -750,27 +852,23 @@
ifp = args->dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- return -EEXIST;
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return -EEXIST;
}
return -ENOATTR;
}
/*
- * Retreive the attribute value and length.
+ * Retrieve the attribute value and length.
*
- * If ATTR_KERNOVAL is specified, only the length needs to be returned.
- * Unlike a lookup, we only return an error if the attribute does not
- * exist or we can't retrieve the value.
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
*/
int
xfs_attr_shortform_getvalue(
@@ -781,18 +879,14 @@
int i;
ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
- sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
- if (sfe->namelen != args->namelen)
- continue;
- if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, sfe->flags))
- continue;
- return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
- sfe->valuelen);
+ sfe = xfs_attr_sf_nextentry(sfe), i++) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return xfs_attr_copy_value(args,
+ &sfe->nameval[args->namelen], sfe->valuelen);
}
return -ENOATTR;
}
@@ -820,12 +914,12 @@
dp = args->dp;
ifp = dp->i_afp;
- sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (xfs_attr_shortform_t *)tmpbuffer;
+ sf = (struct xfs_attr_shortform *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -856,14 +950,14 @@
nargs.valuelen = sfe->valuelen;
nargs.hashval = xfs_da_hashname(sfe->nameval,
sfe->namelen);
- nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+ nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
ASSERT(error == -ENOATTR);
error = xfs_attr3_leaf_add(bp, &nargs);
ASSERT(error != -ENOSPC);
if (error)
goto out;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
*leaf_bp = bp;
@@ -904,12 +998,11 @@
return 0;
if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
return 0;
- bytes += sizeof(struct xfs_attr_sf_entry) - 1
- + name_loc->namelen
- + be16_to_cpu(name_loc->valuelen);
+ bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
+ be16_to_cpu(name_loc->valuelen));
}
if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
- (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+ (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
return xfs_attr_shortform_bytesfit(dp, bytes);
@@ -928,7 +1021,7 @@
int i;
int64_t size;
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL);
ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
size = ifp->if_bytes;
@@ -951,7 +1044,7 @@
* xfs_attr_sf_entry is defined with a 1-byte variable
* array at the end, so we must subtract that off.
*/
- if (((char *)sfep + sizeof(*sfep) - 1) >= endp)
+ if (((char *)sfep + sizeof(*sfep)) >= endp)
return __this_address;
/* Don't allow names with known bad length. */
@@ -963,7 +1056,7 @@
* within the data buffer. The next entry starts after the
* name component, so nextentry is an acceptable test.
*/
- next_sfep = XFS_ATTR_SF_NEXTENTRY(sfep);
+ next_sfep = xfs_attr_sf_nextentry(sfep);
if ((char *)next_sfep > endp)
return __this_address;
@@ -1034,7 +1127,7 @@
if (forkoff == -1) {
ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
- ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
xfs_attr_fork_remove(dp, args->trans);
goto out;
}
@@ -1064,7 +1157,7 @@
nargs.value = &name_loc->nameval[nargs.namelen];
nargs.valuelen = be16_to_cpu(name_loc->valuelen);
nargs.hashval = be32_to_cpu(entry->hashval);
- nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+ nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK;
xfs_attr_shortform_add(&nargs, forkoff);
}
error = 0;
@@ -1084,7 +1177,6 @@
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr icleafhdr;
struct xfs_attr_leaf_entry *entries;
- struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr icnodehdr;
struct xfs_da_intnode *node;
struct xfs_inode *dp = args->dp;
@@ -1099,11 +1191,11 @@
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
if (error)
goto out;
- error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
+ error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK);
if (error)
goto out;
@@ -1124,18 +1216,17 @@
if (error)
goto out;
node = bp1->b_addr;
- dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node);
leaf = bp2->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
/* both on-disk, don't endian-flip twice */
- btree[0].hashval = entries[icleafhdr.count - 1].hashval;
- btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ icnodehdr.btree[0].before = cpu_to_be32(blkno);
icnodehdr.count = 1;
- dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr);
xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
error = 0;
out:
@@ -1165,7 +1256,7 @@
trace_xfs_attr_leaf_create(args);
- error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+ error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp,
XFS_ATTR_FORK);
if (error)
return error;
@@ -1391,8 +1482,9 @@
entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
ichdr->freemap[mapindex].size);
entry->hashval = cpu_to_be32(args->hashval);
- entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
- entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+ entry->flags = args->attr_filter;
+ if (tmp)
+ entry->flags |= XFS_ATTR_LOCAL;
if (args->op_flags & XFS_DA_OP_RENAME) {
entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
@@ -1937,7 +2029,7 @@
if (blkno == 0)
continue;
error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
- blkno, -1, &bp);
+ blkno, &bp);
if (error)
return error;
@@ -2287,8 +2379,10 @@
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- if (ichdr.count >= args->geo->blksize / 8)
+ if (ichdr.count >= args->geo->blksize / 8) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* Binary search. (note: small blocks will skip this loop)
@@ -2304,10 +2398,14 @@
else
break;
}
- if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count)))
+ if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
- if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval))
+ }
+ if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* Since we may have duplicate hashval's, find the first matching
@@ -2335,33 +2433,17 @@
/*
* GROT: Add code to remove incomplete entries.
*/
- /*
- * If we are looking for INCOMPLETE entries, show only those.
- * If we are looking for complete entries, show only those.
- */
- if ((args->flags & XFS_ATTR_INCOMPLETE) !=
- (entry->flags & XFS_ATTR_INCOMPLETE)) {
- continue;
- }
if (entry->flags & XFS_ATTR_LOCAL) {
name_loc = xfs_attr3_leaf_name_local(leaf, probe);
- if (name_loc->namelen != args->namelen)
- continue;
- if (memcmp(args->name, name_loc->nameval,
- args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ if (!xfs_attr_match(args, name_loc->namelen,
+ name_loc->nameval, entry->flags))
continue;
args->index = probe;
return -EEXIST;
} else {
name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
- if (name_rmt->namelen != args->namelen)
- continue;
- if (memcmp(args->name, name_rmt->name,
- args->namelen) != 0)
- continue;
- if (!xfs_attr_namesp_match(args->flags, entry->flags))
+ if (!xfs_attr_match(args, name_rmt->namelen,
+ name_rmt->name, entry->flags))
continue;
args->index = probe;
args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
@@ -2380,9 +2462,9 @@
* Get the value associated with an attribute name from a leaf attribute
* list structure.
*
- * If ATTR_KERNOVAL is specified, only the length needs to be returned.
- * Unlike a lookup, we only return an error if the attribute does not
- * exist or we can't retrieve the value.
+ * If args->valuelen is zero, only the length needs to be returned. Unlike a
+ * lookup, we only return an error if the attribute does not exist or we can't
+ * retrieve the value.
*/
int
xfs_attr3_leaf_getvalue(
@@ -2667,7 +2749,7 @@
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -2707,10 +2789,7 @@
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- /*
- * Commit the flag value change and start the next trans in series.
- */
- return xfs_trans_roll_inode(&args->trans, args->dp);
+ return 0;
}
/*
@@ -2734,7 +2813,7 @@
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -2758,10 +2837,7 @@
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
- /*
- * Commit the flag value change and start the next trans in series.
- */
- return xfs_trans_roll_inode(&args->trans, args->dp);
+ return 0;
}
/*
@@ -2796,7 +2872,7 @@
/*
* Read the block containing the "old" attr
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
if (error)
return error;
@@ -2805,7 +2881,7 @@
*/
if (args->blkno2 != args->blkno) {
error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
- -1, &bp2);
+ &bp2);
if (error)
return error;
} else {
@@ -2876,10 +2952,5 @@
XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
}
- /*
- * Commit the flag value change and start the next trans in series.
- */
- error = xfs_trans_roll_inode(&args->trans, args->dp);
-
- return error;
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 7b74e18..9b1c59f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
@@ -8,7 +8,6 @@
#define __XFS_ATTR_LEAF_H__
struct attrlist;
-struct attrlist_cursor_kern;
struct xfs_attr_list_context;
struct xfs_da_args;
struct xfs_da_state;
@@ -17,13 +16,27 @@
struct xfs_trans;
/*
- * Used to keep a list of "remote value" extents when unlinking an inode.
+ * Incore version of the attribute leaf header.
*/
-typedef struct xfs_attr_inactive_list {
- xfs_dablk_t valueblk; /* block number of value bytes */
- int valuelen; /* number of bytes in value */
-} xfs_attr_inactive_list_t;
-
+struct xfs_attr3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count; /* number of entries in the leaf block */
+ uint16_t usedbytes;
+ /*
+ * Firstused is 32-bit here instead of 16-bit like the on-disk variant
+ * to support maximum fsb size of 64k without overflow issues throughout
+ * the attr code. Instead, the overflow condition is handled on
+ * conversion to/from disk.
+ */
+ uint32_t firstused; /* block offset of the first name info structure */
+ __u8 holes;
+ struct {
+ uint16_t base;
+ uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* free space map, (base, size) per slot */
+};
/*========================================================================
* Function prototypes for the kernel.
@@ -39,6 +52,9 @@
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
struct xfs_buf **leaf_bp);
int xfs_attr_shortform_remove(struct xfs_da_args *args);
+int xfs_attr_sf_findname(struct xfs_da_args *args,
+ struct xfs_attr_sf_entry **sfep,
+ unsigned int *basep);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
@@ -67,8 +83,8 @@
struct xfs_da_args *args);
int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-void xfs_attr3_leaf_list_int(struct xfs_buf *bp,
- struct xfs_attr_list_context *context);
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+ struct xfs_attr_list_context *context);
/*
* Routines used for shrinking the Btree.
@@ -85,8 +101,7 @@
struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp);
+ xfs_dablk_t bno, struct xfs_buf **bpp);
void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
struct xfs_attr3_icleaf_hdr *to,
struct xfs_attr_leafblock *from);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 3e39b7d..48d8e9c 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -19,12 +19,30 @@
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
/*
+ * Remote Attribute Values
+ * =======================
+ *
+ * Remote extended attribute values are conceptually simple -- they're written
+ * to data blocks mapped by an inode's attribute fork, and they have an upper
+ * size limit of 64k. Setting a value does not involve the XFS log.
+ *
+ * However, on a v5 filesystem, maximally sized remote attr values require one
+ * block more than 64k worth of space to hold both the value and its
+ * header (64 bytes). On a 4k block filesystem this results in a 68k buffer;
+ * on a 64k block filesystem, this would be a 128k buffer. Note that the log
+ * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k).
+ * Therefore, we /must/ ensure that remote attribute value buffers never touch
+ * the logging system and therefore never have a log item.
+ */
+
+/*
* Each contiguous block has a header, so it is not just a simple attribute
* length to FSB conversion.
*/
@@ -78,8 +96,6 @@
{
struct xfs_attr3_rmt_hdr *rmt = ptr;
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return __this_address;
if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
@@ -379,7 +395,7 @@
trace_xfs_attr_rmtval_get(args);
- ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->valuelen != 0);
ASSERT(args->rmtvaluelen == args->valuelen);
valuelen = args->rmtvaluelen;
@@ -400,17 +416,15 @@
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_trans_read_buf(mp, args->trans,
- mp->m_ddev_targp,
- dblkno, dblkcnt, 0, &bp,
- &xfs_attr3_rmt_buf_ops);
+ error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
+ 0, &bp, &xfs_attr3_rmt_buf_ops);
if (error)
return error;
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
&dst);
- xfs_trans_brelse(args->trans, bp);
+ xfs_buf_relse(bp);
if (error)
return error;
@@ -424,6 +438,130 @@
}
/*
+ * Find a "hole" in the attribute address space large enough for us to drop the
+ * new attribute's value into.
+ *
+ * The hole is sized by xfs_attr3_rmt_blocks() for args->rmtvaluelen. On success
+ * the chosen file offset and block count are stored in args->rmtblkno and
+ * args->rmtblkcnt and 0 is returned; otherwise the error from
+ * xfs_bmap_first_unused() is returned and those two fields are not modified.
+ */
+STATIC int
+xfs_attr_rmt_find_hole(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
+ int blkcnt;
+ xfs_fileoff_t lfileoff = 0;
+
+ /*
+ * Because CRC-enabled attributes have headers, we can't just do a
+ * straight byte to FSB conversion and have to take the header space
+ * into account.
+ */
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
+ error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+ XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ args->rmtblkno = (xfs_dablk_t)lfileoff;
+ args->rmtblkcnt = blkcnt;
+
+ return 0;
+}
+
+/*
+ * Copy the attribute value in args->value into the remote blocks that were
+ * already allocated for it, starting at args->rmtblkno.
+ *
+ * Each pass of the loop maps the next extent of the attr fork, gets a buffer
+ * for it with xfs_buf_get(), fills it through xfs_attr_rmtval_copyin() and
+ * writes it out with xfs_bwrite() before releasing it. Returns 0 once the
+ * loop has consumed the whole value (valuelen reaches zero), or the first
+ * error reported by xfs_bmapi_read(), xfs_buf_get() or xfs_bwrite().
+ */
+STATIC int
+xfs_attr_rmtval_set_value(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_bmbt_irec map;
+ xfs_dablk_t lblkno;
+ uint8_t *src = args->value;
+ int blkcnt;
+ int valuelen;
+ int nmap;
+ int error;
+ int offset = 0;
+
+ /*
+ * Roll through the "value", copying the attribute value to the
+ * already-allocated blocks. Blocks are written synchronously
+ * so that we can know they are all on disk before we turn off
+ * the INCOMPLETE flag.
+ */
+ lblkno = args->rmtblkno;
+ blkcnt = args->rmtblkcnt;
+ valuelen = args->rmtvaluelen;
+ while (valuelen > 0) {
+ struct xfs_buf *bp;
+ xfs_daddr_t dblkno;
+ int dblkcnt;
+
+ ASSERT(blkcnt > 0);
+
+ /* Ask for a single mapping covering as much of the range as possible. */
+ nmap = 1;
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
+ blkcnt, &map, &nmap,
+ XFS_BMAPI_ATTRFORK);
+ if (error)
+ return error;
+ ASSERT(nmap == 1);
+ ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+ (map.br_startblock != HOLESTARTBLOCK));
+
+ /* Two separate statements; the original joined them with a comma. */
+ dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+ dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+ error = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+
+ /* Advances offset and src and reduces valuelen via the pointers. */
+ xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
+ &valuelen, &src);
+
+ error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
+ xfs_buf_relse(bp);
+ if (error)
+ return error;
+
+ /* roll attribute extent map forwards */
+ lblkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
+ }
+ ASSERT(valuelen == 0);
+ return 0;
+}
+
+/*
+ * Mark stale any incore buffer for the remote value extent described by @map.
+ *
+ * The caller is expected to hold the inode's ILOCK exclusively (asserted
+ * below). @incore_flags is handed unchanged to xfs_buf_incore(). Returns
+ * -EFSCORRUPTED if @map starts at DELAYSTARTBLOCK or HOLESTARTBLOCK, and 0
+ * otherwise, whether or not an incore buffer was found.
+ */
+int
+xfs_attr_rmtval_stale(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
+ XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+ return -EFSCORRUPTED;
+
+ bp = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map->br_startblock),
+ XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
+ if (bp) {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+
+ return 0;
+}
+
+/*
* Write the value associated with an attribute into the out-of-line buffer
* that we have defined for it.
*/
@@ -432,34 +570,20 @@
struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
- struct xfs_mount *mp = dp->i_mount;
struct xfs_bmbt_irec map;
xfs_dablk_t lblkno;
- xfs_fileoff_t lfileoff = 0;
- uint8_t *src = args->value;
int blkcnt;
- int valuelen;
int nmap;
int error;
- int offset = 0;
trace_xfs_attr_rmtval_set(args);
- /*
- * Find a "hole" in the attribute address space large enough for
- * us to drop the new attribute's value into. Because CRC enable
- * attributes have headers, we can't just do a straight byte to FSB
- * conversion and have to take the header space into account.
- */
- blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
- error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
- XFS_ATTR_FORK);
+ error = xfs_attr_rmt_find_hole(args);
if (error)
return error;
- args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
- args->rmtblkcnt = blkcnt;
-
+ blkcnt = args->rmtblkcnt;
+ lblkno = (xfs_dablk_t)args->rmtblkno;
/*
* Roll through the "value", allocating blocks on disk as required.
*/
@@ -500,55 +624,7 @@
return error;
}
- /*
- * Roll through the "value", copying the attribute value to the
- * already-allocated blocks. Blocks are written synchronously
- * so that we can know they are all on disk before we turn off
- * the INCOMPLETE flag.
- */
- lblkno = args->rmtblkno;
- blkcnt = args->rmtblkcnt;
- valuelen = args->rmtvaluelen;
- while (valuelen > 0) {
- struct xfs_buf *bp;
- xfs_daddr_t dblkno;
- int dblkcnt;
-
- ASSERT(blkcnt > 0);
-
- nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno,
- blkcnt, &map, &nmap,
- XFS_BMAPI_ATTRFORK);
- if (error)
- return error;
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt);
- if (!bp)
- return -ENOMEM;
- bp->b_ops = &xfs_attr3_rmt_buf_ops;
-
- xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
- &valuelen, &src);
-
- error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */
- xfs_buf_relse(bp);
- if (error)
- return error;
-
-
- /* roll attribute extent map forwards */
- lblkno += map.br_blockcount;
- blkcnt -= map.br_blockcount;
- }
- ASSERT(valuelen == 0);
- return 0;
+ return xfs_attr_rmtval_set_value(args);
}
/*
@@ -556,16 +632,12 @@
* out-of-line buffer that it is stored on.
*/
int
-xfs_attr_rmtval_remove(
+xfs_attr_rmtval_invalidate(
struct xfs_da_args *args)
{
- struct xfs_mount *mp = args->dp->i_mount;
xfs_dablk_t lblkno;
int blkcnt;
int error;
- int done;
-
- trace_xfs_attr_rmtval_remove(args);
/*
* Roll through the "value", invalidating the attribute value's blocks.
@@ -574,9 +646,6 @@
blkcnt = args->rmtblkcnt;
while (blkcnt > 0) {
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_daddr_t dblkno;
- int dblkcnt;
int nmap;
/*
@@ -587,41 +656,38 @@
blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
if (error)
return error;
- ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- /*
- * If the "remote" value is in the cache, remove it.
- */
- bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
- bp = NULL;
- }
+ if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
+ error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
+ if (error)
+ return error;
lblkno += map.br_blockcount;
blkcnt -= map.br_blockcount;
}
+ return 0;
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+int
+xfs_attr_rmtval_remove(
+ struct xfs_da_args *args)
+{
+ int error;
+ int retval;
+
+ trace_xfs_attr_rmtval_remove(args);
/*
* Keep de-allocating extents until the remote-value region is gone.
*/
- lblkno = args->rmtblkno;
- blkcnt = args->rmtblkcnt;
- done = 0;
- while (!done) {
- error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK, 1, &done);
- if (error)
- return error;
- error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
+ do {
+ retval = __xfs_attr_rmtval_remove(args);
+ if (retval && retval != -EAGAIN)
+ return retval;
/*
* Close out trans and start the next one in the chain.
@@ -629,6 +695,36 @@
error = xfs_trans_roll_inode(&args->trans, args->dp);
if (error)
return error;
- }
+ } while (retval == -EAGAIN);
+
return 0;
}
+
+/*
+ * Remove the value associated with an attribute by deleting the out-of-line
+ * buffer that it is stored on. Returns -EAGAIN when xfs_bunmapi() reports that
+ * the range is not yet fully unmapped, so that the caller can refresh the
+ * transaction and re-call the function. Returns 0 once the unmap is done, or
+ * the error from xfs_bunmapi() or xfs_defer_finish().
+ */
+int
+__xfs_attr_rmtval_remove(
+ struct xfs_da_args *args)
+{
+ int error, done;
+
+ /*
+ * Unmap value blocks for this attr.
+ */
+ error = xfs_bunmapi(args->trans, args->dp, args->rmtblkno,
+ args->rmtblkcnt, XFS_BMAPI_ATTRFORK, 1, &done);
+ if (error)
+ return error;
+
+ error = xfs_defer_finish(&args->trans);
+ if (error)
+ return error;
+
+ if (!done)
+ return -EAGAIN;
+
+ return error; /* both calls above succeeded, so this is 0 */
+}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 9d20b66..9eee615 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
@@ -11,5 +11,8 @@
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_set(struct xfs_da_args *args);
int xfs_attr_rmtval_remove(struct xfs_da_args *args);
-
+int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags);
+int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
+int __xfs_attr_rmtval_remove(struct xfs_da_args *args);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index aafa4fe..37578b3 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -13,7 +13,6 @@
* to fit into the literal area of the inode.
*/
typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
/*
* We generate this then sort it, attr_list() must return things in hash-order.
@@ -27,16 +26,26 @@
unsigned char *name; /* name value, pointer into buffer */
} xfs_attr_sf_sort_t;
-#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
- (((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
((1 << (NBBY*(int)sizeof(uint8_t))) - 1)
-#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
- ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
-#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
- ((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
-#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
- (be16_to_cpu(((xfs_attr_shortform_t *) \
- ((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+/*
+ * Space a shortform entry uses for a name of @nlen bytes and a value of @vlen
+ * bytes: sizeof(struct xfs_attr_sf_entry) plus both lengths.
+ */
+static inline int xfs_attr_sf_entsize_byname(uint8_t nlen, uint8_t vlen)
+{
+ return sizeof(struct xfs_attr_sf_entry) + nlen + vlen;
+}
+
+/*
+ * Space the existing shortform entry @sfep uses, computed with struct_size()
+ * over its nameval member for the entry's own namelen + valuelen bytes.
+ */
+static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
+{
+ return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
+}
+
+/*
+ * Next entry in struct: @sfep advanced by xfs_attr_sf_entsize(@sfep) bytes.
+ * NOTE(review): nothing here bounds-checks the result; presumably callers make
+ * sure it still lies inside the shortform area -- confirm at the call sites.
+ */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_attr_sf_entsize(sfep);
+}
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 7071ff9..40ce5f3 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -5,6 +5,7 @@
*/
#include "xfs.h"
#include "xfs_log_format.h"
+#include "xfs_bit.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index 99017b8..a04f266 100644
--- a/fs/xfs/libxfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c114d24..d9a6924 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,6 +34,7 @@
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
+#include "xfs_iomap.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -60,10 +61,10 @@
int sz; /* root block size */
/*
- * The maximum number of extents in a file, hence the maximum
- * number of leaf entries, is controlled by the type of di_nextents
- * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
- * (a signed 16-bit number, xfs_aextnum_t).
+ * The maximum number of extents in a file, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count,
+ * either a signed 32-bit number for the data fork, or a signed 16-bit
+ * number for the attr fork.
*
* Note that we can no longer assume that if we are in ATTR1 that
* the fork offset of all the inodes will be
@@ -119,10 +120,11 @@
*/
static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+
return whichfork != XFS_COW_FORK &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >
- XFS_IFORK_MAXEXT(ip, whichfork);
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork);
}
/*
@@ -130,10 +132,11 @@
*/
static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+
return whichfork != XFS_COW_FORK &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <=
- XFS_IFORK_MAXEXT(ip, whichfork);
+ ifp->if_format == XFS_DINODE_FMT_BTREE &&
+ ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork);
}
/*
@@ -192,14 +195,12 @@
struct xfs_mount *mp = ip->i_mount;
uint offset;
- if (mp->m_sb.sb_inodesize == 256) {
- offset = XFS_LITINO(mp, ip->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- } else {
+ if (mp->m_sb.sb_inodesize == 256)
+ offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ else
offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- }
- ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+ ASSERT(offset < XFS_LITINO(mp));
return offset;
}
@@ -214,8 +215,8 @@
int whichfork)
{
if (whichfork == XFS_ATTR_FORK &&
- ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
- ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+ ip->i_df.if_format != XFS_DINODE_FMT_DEV &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
if (dfl_forkoff > ip->i_d.di_forkoff)
@@ -316,31 +317,28 @@
xfs_inode_t *ip, /* incore inode pointer */
int whichfork) /* data or attr fork */
{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
xfs_extnum_t i=0, j; /* index into the extents list */
- struct xfs_ifork *ifp; /* fork structure */
int level; /* btree level, for checking */
- xfs_mount_t *mp; /* file system mount structure */
__be64 *pp; /* pointer to block address */
xfs_bmbt_rec_t *ep; /* pointer to current extent */
xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
xfs_bmbt_rec_t *nextp; /* pointer to next extent */
int bp_release = 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
return;
- }
/* skip large extent count inodes */
- if (ip->i_d.di_nextents > 10000)
+ if (ip->i_df.if_nextents > 10000)
return;
bno = NULLFSBLOCK;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
block = ifp->if_broot;
/*
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
@@ -383,8 +381,10 @@
xfs_check_block(block, mp, 0, 0);
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(mp,
- xfs_verify_fsbno(mp, bno), error0);
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (bp_release) {
bp_release = 0;
xfs_trans_brelse(NULL, bp);
@@ -553,7 +553,8 @@
#endif
ASSERT(xfs_bmap_free_item_zone != NULL);
- new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
+ new = kmem_cache_alloc(xfs_bmap_free_item_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
new->xefi_startblock = bno;
new->xefi_blockcount = (xfs_extlen_t)len;
if (oinfo)
@@ -603,7 +604,7 @@
ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
@@ -611,8 +612,8 @@
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
#ifdef DEBUG
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
- xfs_btree_check_lptr(cur, cbno, 1));
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ return -EFSCORRUPTED;
#endif
error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
@@ -631,7 +632,7 @@
xfs_iroot_realloc(ip, -1, whichfork);
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
*logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -667,7 +668,7 @@
mp = ip->i_mount;
ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS);
/*
* Make space in the inode incore. This needs to be undone if we fail
@@ -687,11 +688,11 @@
* Need a cursor. Can't allocate until bb_level is filled in.
*/
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
/*
* Convert to a btree with two levels, one record in root.
*/
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ ifp->if_format = XFS_DINODE_FMT_BTREE;
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = mp;
@@ -724,14 +725,14 @@
ASSERT(tp->t_firstblock == NULLFSBLOCK ||
args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock));
tp->t_firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
+ cur->bc_ino.allocated++;
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
- if (!abp) {
- error = -EFSCORRUPTED;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, args.fsbno),
+ mp->m_bsize, 0, &abp);
+ if (error)
goto out_unreserve_dquot;
- }
/*
* Fill in the child block.
@@ -749,7 +750,7 @@
xfs_bmbt_disk_set_all(arp, &rec);
cnt++;
}
- ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+ ASSERT(cnt == ifp->if_nextents);
xfs_btree_set_numrecs(ablock, cnt);
/*
@@ -777,7 +778,7 @@
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
out_root_realloc:
xfs_iroot_realloc(ip, -1, whichfork);
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
ASSERT(ifp->if_broot == NULL);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -799,16 +800,16 @@
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(whichfork != XFS_COW_FORK);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(ifp->if_bytes == 0);
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+ ASSERT(ifp->if_nextents == 0);
xfs_bmap_forkoff_reset(ip, whichfork);
ifp->if_flags &= ~XFS_IFINLINE;
ifp->if_flags |= XFS_IFEXTENTS;
ifp->if_u1.if_root = NULL;
ifp->if_height = 0;
- XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -839,7 +840,7 @@
*/
ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
if (!ifp->if_bytes) {
xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
@@ -875,7 +876,11 @@
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
tp->t_firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno);
+ error = xfs_trans_get_buf(tp, args.mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(args.mp, args.fsbno),
+ args.mp->m_bsize, 0, &bp);
+ if (error)
+ goto done;
/*
* Initialize the block, copy the data and log the remote buffer.
@@ -902,7 +907,7 @@
xfs_iext_first(ifp, &icur);
xfs_iext_insert(ip, &icur, &rec, 0);
- XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+ ifp->if_nextents = 1;
ip->i_d.di_nblocks = 1;
xfs_trans_mod_dquot_byino(tp, ip,
XFS_TRANS_DQ_BCOUNT, 1L);
@@ -936,14 +941,17 @@
if (error)
goto error0;
/* must be at least one entry */
- XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
+ if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
goto error0;
if (stat == 0) {
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return -ENOSPC;
}
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
}
return 0;
@@ -964,13 +972,14 @@
xfs_btree_cur_t *cur; /* bmap btree cursor */
int error; /* error return value */
- if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <=
+ XFS_IFORK_DSIZE(ip))
return 0;
cur = NULL;
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
XFS_DATA_FORK);
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -1025,7 +1034,7 @@
int size,
int *version)
{
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_DEV:
ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
break;
@@ -1083,17 +1092,6 @@
goto trans_cancel;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
- if (ip->i_d.di_anextents != 0) {
- error = -EFSCORRUPTED;
- goto trans_cancel;
- }
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
- /*
- * For inodes coming from pre-6.2 filesystems.
- */
- ASSERT(ip->i_d.di_aformat == 0);
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- }
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -1101,10 +1099,14 @@
if (error)
goto trans_cancel;
ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0);
+
+ ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_LOCAL:
error = xfs_bmap_add_attrfork_local(tp, ip, &logflags);
break;
@@ -1154,6 +1156,65 @@
* Internal and external extent tree search functions.
*/
+struct xfs_iread_state { /* progress of one xfs_iread_extents() walk */
+ struct xfs_iext_cursor icur; /* where the next record gets inserted */
+ xfs_extnum_t loaded; /* number of records inserted so far */
+};
+
+/*
+ * Stuff every bmbt record from this block into the incore extent map.
+ *
+ * Passed to xfs_btree_visit_blocks() by xfs_iread_extents(); @priv is the
+ * struct xfs_iread_state holding the insertion cursor and the running record
+ * count. Returns -EFSCORRUPTED if this block would push the count past
+ * ifp->if_nextents or if any record fails xfs_bmap_validate_extent(), and 0
+ * otherwise.
+ */
+static int
+xfs_iread_bmbt_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xfs_iread_state *ir = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_ino.ip;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_rec *frp;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t j;
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /*
+ * Abort if we find more records than nextents. The check runs before
+ * any record of this block is inserted.
+ */
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) {
+ xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).",
+ (unsigned long long)ip->i_ino);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
+ sizeof(*block), __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Copy records into the incore cache, validating each one first and
+ * bumping ir->loaded once per record.
+ */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
+ struct xfs_bmbt_irec new;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(frp, &new);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iread_extents(2)", frp,
+ sizeof(*frp), fa);
+ return -EFSCORRUPTED;
+ }
+ xfs_iext_insert(ip, &ir->icur, &new,
+ xfs_bmap_fork_to_state(whichfork));
+ trace_xfs_read_extent(ip, &ir->icur,
+ xfs_bmap_fork_to_state(whichfork), _THIS_IP_);
+ xfs_iext_next(ifp, &ir->icur);
+ }
+
+ return 0;
+}
+
/*
* Read in extents from a btree-format inode.
*/
@@ -1163,134 +1224,36 @@
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_mount *mp = ip->i_mount;
- int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_iread_state ir;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
- struct xfs_btree_block *block = ifp->if_broot;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec new;
- xfs_fsblock_t bno;
- struct xfs_buf *bp;
- xfs_extnum_t i, j;
- int level;
- __be64 *pp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur;
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- if (unlikely(level == 0)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
-
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- error = xfs_btree_read_bufl(mp, tp, bno, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- goto out;
- block = XFS_BUF_TO_BLOCK(bp);
- if (level == 0)
- break;
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(mp,
- xfs_verify_fsbno(mp, bno), out_brelse);
- xfs_trans_brelse(tp, bp);
- }
-
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- i = 0;
- xfs_iext_first(ifp, &icur);
-
- /*
- * Loop over all leaf nodes. Copy information to the extent records.
- */
- for (;;) {
- xfs_bmbt_rec_t *frp;
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
-
- num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(i + num_recs > nextents)) {
- xfs_warn(ip->i_mount,
- "corrupt dinode %Lu, (btree extents).",
- (unsigned long long) ip->i_ino);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED,
- __func__, block, sizeof(*block),
- __this_address);
- error = -EFSCORRUPTED;
- goto out_brelse;
- }
- /*
- * Read-ahead the next leaf block, if any.
- */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1,
- &xfs_bmbt_buf_ops);
- /*
- * Copy records into the extent records.
- */
- frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- for (j = 0; j < num_recs; j++, frp++, i++) {
- xfs_failaddr_t fa;
-
- xfs_bmbt_disk_get_all(frp, &new);
- fa = xfs_bmap_validate_extent(ip, whichfork, &new);
- if (fa) {
- error = -EFSCORRUPTED;
- xfs_inode_verifier_error(ip, error,
- "xfs_iread_extents(2)",
- frp, sizeof(*frp), fa);
- goto out_brelse;
- }
- xfs_iext_insert(ip, &icur, &new, state);
- trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
- xfs_iext_next(ifp, &icur);
- }
- xfs_trans_brelse(tp, bp);
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
- error = xfs_btree_read_bufl(mp, tp, bno, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- goto out;
- block = XFS_BUF_TO_BLOCK(bp);
- }
-
- if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) {
+ if (XFS_IS_CORRUPT(mp, ifp->if_format != XFS_DINODE_FMT_BTREE)) {
error = -EFSCORRUPTED;
goto out;
}
- ASSERT(i == xfs_iext_count(ifp));
+
+ ir.loaded = 0;
+ xfs_iext_first(ifp, &ir.icur);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block,
+ XFS_BTREE_VISIT_RECORDS, &ir);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
+
+ if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ ASSERT(ir.loaded == xfs_iext_count(ifp));
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
-
-out_brelse:
- xfs_trans_brelse(tp, bp);
out:
xfs_iext_destroy(ifp);
return error;
@@ -1317,15 +1280,13 @@
xfs_fileoff_t lowest, max;
int error;
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
-
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
*first_unused = 0;
return 0;
}
+ ASSERT(xfs_ifork_has_extents(ifp));
+
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
@@ -1366,7 +1327,7 @@
struct xfs_iext_cursor icur;
int error;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
*last_block = 0;
return 0;
@@ -1374,7 +1335,8 @@
case XFS_DINODE_FMT_EXTENTS:
break;
default:
- return -EIO;
+ ASSERT(0);
+ return -EFSCORRUPTED;
}
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -1464,18 +1426,18 @@
xfs_fileoff_t *last_block,
int whichfork)
{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec rec;
int is_empty;
int error;
*last_block = 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return -EIO;
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+ return -EFSCORRUPTED;
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -1492,23 +1454,22 @@
*/
int /* 1=>1 block, 0=>otherwise */
xfs_bmap_one_block(
- xfs_inode_t *ip, /* incore inode */
- int whichfork) /* data or attr fork */
+ struct xfs_inode *ip, /* incore inode */
+ int whichfork) /* data or attr fork */
{
- struct xfs_ifork *ifp; /* inode fork pointer */
- int rval; /* return value */
- xfs_bmbt_irec_t s; /* internal version of extent */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ int rval; /* return value */
+ struct xfs_bmbt_irec s; /* internal version of extent */
struct xfs_iext_cursor icur;
#ifndef DEBUG
if (whichfork == XFS_DATA_FORK)
return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
#endif /* !DEBUG */
- if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+ if (ifp->if_nextents != 1)
return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS)
return 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
xfs_iext_first(ifp, &icur);
xfs_iext_get_extent(ifp, &icur, &s);
@@ -1530,10 +1491,11 @@
struct xfs_bmalloca *bma,
int whichfork)
{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
struct xfs_bmbt_irec *new = &bma->got;
int error; /* error return value */
int i; /* temp state */
- struct xfs_ifork *ifp; /* inode fork pointer */
xfs_fileoff_t new_endoff; /* end offset of new entry */
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
@@ -1543,19 +1505,12 @@
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
int tmp_rval; /* partial logging flags */
- struct xfs_mount *mp;
- xfs_extnum_t *nextents;
struct xfs_bmbt_irec old;
- mp = bma->ip->i_mount;
- ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(whichfork != XFS_ATTR_FORK);
- nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
- &bma->ip->i_d.di_nextents);
-
ASSERT(!isnullstartblock(new->br_startblock));
ASSERT(!bma->cur ||
- (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -1643,7 +1598,7 @@
xfs_iext_remove(bma->ip, &bma->icur, state);
xfs_iext_prev(ifp, &bma->icur);
xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT);
- (*nextents)--;
+ ifp->if_nextents--;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1652,15 +1607,24 @@
error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_delete(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(bma->cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
@@ -1686,7 +1650,10 @@
error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
@@ -1716,7 +1683,10 @@
error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &PREV);
if (error)
goto done;
@@ -1732,8 +1702,8 @@
PREV.br_startblock = new->br_startblock;
PREV.br_state = new->br_state;
xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
+ ifp->if_nextents++;
- (*nextents)++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1741,11 +1711,17 @@
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -1776,7 +1752,10 @@
error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
@@ -1789,7 +1768,8 @@
* The left neighbor is not contiguous.
*/
xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
- (*nextents)++;
+ ifp->if_nextents++;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1797,11 +1777,17 @@
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -1815,7 +1801,7 @@
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
PREV.br_startoff = new_endoff;
PREV.br_blockcount = temp;
@@ -1842,7 +1828,10 @@
error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &RIGHT);
if (error)
goto done;
@@ -1866,7 +1855,8 @@
* The right neighbor is not contiguous.
*/
xfs_iext_update_extent(bma->ip, state, &bma->icur, new);
- (*nextents)++;
+ ifp->if_nextents++;
+
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -1874,11 +1864,17 @@
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -1892,7 +1888,7 @@
temp = PREV.br_blockcount - new->br_blockcount;
da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
startblockval(PREV.br_startblock) -
- (bma->cur ? bma->cur->bc_private.b.allocated : 0));
+ (bma->cur ? bma->cur->bc_ino.allocated : 0));
PREV.br_startblock = nullstartblock(da_new);
PREV.br_blockcount = temp;
@@ -1945,7 +1941,7 @@
xfs_iext_next(ifp, &bma->icur);
xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state);
xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state);
- (*nextents)++;
+ ifp->if_nextents++;
if (bma->cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1954,11 +1950,17 @@
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2007,8 +2009,8 @@
xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
if (bma->cur) {
- da_new += bma->cur->bc_private.b.allocated;
- bma->cur->bc_private.b.allocated = 0;
+ da_new += bma->cur->bc_ino.allocated;
+ bma->cur->bc_ino.allocated = 0;
}
/* adjust for changes in reserved delayed indirect blocks */
@@ -2143,8 +2145,7 @@
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &LEFT);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
+ ifp->if_nextents -= 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2152,19 +2153,34 @@
error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &LEFT);
if (error)
goto done;
@@ -2181,8 +2197,7 @@
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &LEFT);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ ifp->if_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2190,13 +2205,22 @@
error = xfs_bmbt_lookup_eq(cur, &PREV, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &LEFT);
if (error)
goto done;
@@ -2215,9 +2239,8 @@
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &PREV);
+ ifp->if_nextents--;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2225,13 +2248,22 @@
error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2254,7 +2286,10 @@
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2284,7 +2319,10 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2309,8 +2347,8 @@
xfs_iext_update_extent(ip, state, icur, &PREV);
xfs_iext_insert(ip, icur, new, state);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ ifp->if_nextents++;
+
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2318,14 +2356,20 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
cur->bc_rec.b = *new;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -2352,7 +2396,10 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2376,9 +2423,8 @@
xfs_iext_update_extent(ip, state, icur, &PREV);
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, new, state);
+ ifp->if_nextents++;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2386,17 +2432,26 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -2420,9 +2475,8 @@
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, &r[1], state);
xfs_iext_insert(ip, icur, &r[0], state);
+ ifp->if_nextents += 2;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
else {
@@ -2430,7 +2484,10 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/* new right extent - oldext */
error = xfs_bmbt_update(cur, &r[1]);
if (error)
@@ -2439,7 +2496,10 @@
cur->bc_rec.b = PREV;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Reset the cursor to the position of the new extent
* we are about to insert as we can't trust it after
@@ -2448,11 +2508,17 @@
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/* new middle extent - newext */
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -2486,7 +2552,7 @@
/* clear out the allocated field, done with it now in any case. */
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
*curp = cur;
}
@@ -2665,7 +2731,7 @@
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
- ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+ ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
XFS_STATS_INC(mp, xs_add_exlist);
@@ -2725,9 +2791,8 @@
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, state, icur, &left);
+ ifp->if_nextents--;
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
@@ -2735,15 +2800,24 @@
error = xfs_bmbt_lookup_eq(cur, &right, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
@@ -2769,7 +2843,10 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
@@ -2796,7 +2873,10 @@
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &right);
if (error)
goto done;
@@ -2810,8 +2890,8 @@
* Insert a new entry.
*/
xfs_iext_insert(ip, icur, new, state);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ ifp->if_nextents++;
+
if (cur == NULL) {
rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
} else {
@@ -2819,11 +2899,17 @@
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
}
@@ -2847,7 +2933,7 @@
/* clear out the allocated field, done with it now in any case. */
if (cur)
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_bmap_check_leaf_extents(cur, ip, whichfork);
done:
@@ -3058,7 +3144,7 @@
mp = ap->ip->i_mount;
nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
rt = XFS_IS_REALTIME_INODE(ap->ip) &&
- xfs_alloc_is_userdata(ap->datatype);
+ (ap->datatype & XFS_ALLOC_USERDATA);
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
ap->tp->t_firstblock);
/*
@@ -3203,11 +3289,12 @@
pag = xfs_perag_get(mp, ag);
if (!pag->pagf_init) {
error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
- if (error)
- goto out;
-
- if (!pag->pagf_init) {
- *notinit = 1;
+ if (error) {
+ /* Couldn't lock the AGF, so skip this AG. */
+ if (error == -EAGAIN) {
+ *notinit = 1;
+ error = 0;
+ }
goto out;
}
}
@@ -3411,7 +3498,7 @@
if (ap->flags & XFS_BMAPI_COWFORK)
align = xfs_get_cowextsz_hint(ap->ip);
- else if (xfs_alloc_is_userdata(ap->datatype))
+ else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
if (align) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3426,7 +3513,7 @@
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
ap->tp->t_firstblock);
if (nullfb) {
- if (xfs_alloc_is_userdata(ap->datatype) &&
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
@@ -3466,7 +3553,7 @@
* enough for the request. If one isn't found, then adjust
* the minimum allocation size to the largest space found.
*/
- if (xfs_alloc_is_userdata(ap->datatype) &&
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip))
error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
else
@@ -3500,13 +3587,11 @@
args.mod = args.prod - args.mod;
}
/*
- * If we are not low on available data blocks, and the
- * underlying logical volume manager is a stripe, and
- * the file offset is zero then try to allocate data
- * blocks on stripe unit boundary.
- * NOTE: ap->aeof is only set if the allocation length
- * is >= the stripe unit and the allocation offset is
- * at the end of file.
+ * If we are not low on available data blocks, and the underlying
+ * logical volume manager is a stripe, and the file offset is zero then
+ * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof
+ * is only set if the allocation length is >= the stripe unit and the
+ * allocation offset is at the end of file.
*/
if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
if (!ap->offset) {
@@ -3514,9 +3599,11 @@
atype = args.type;
isaligned = 1;
/*
- * Adjust for alignment
+ * Adjust minlen to try and preserve alignment if we
+ * can't guarantee an aligned maxlen extent.
*/
- if (blen > args.alignment && blen <= args.maxlen)
+ if (blen > args.alignment &&
+ blen <= args.maxlen + args.alignment)
args.minlen = blen - args.alignment;
args.minalignslop = 0;
} else {
@@ -3554,8 +3641,6 @@
args.wasdel = ap->wasdel;
args.resv = XFS_AG_RESV_NONE;
args.datatype = ap->datatype;
- if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
- args.ip = ap->ip;
error = xfs_alloc_vextent(&args);
if (error)
@@ -3640,20 +3725,6 @@
return 0;
}
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int
-xfs_bmap_alloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
-{
- if (XFS_IS_REALTIME_INODE(ap->ip) &&
- xfs_alloc_is_userdata(ap->datatype))
- return xfs_bmap_rtalloc(ap);
- return xfs_bmap_btalloc(ap);
-}
-
/* Trim extent to fit a logical block range. */
void
xfs_trim_extent(
@@ -3800,7 +3871,8 @@
int flags)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec got;
xfs_fileoff_t obno;
xfs_fileoff_t end;
@@ -3808,51 +3880,23 @@
int error;
bool eof = false;
int n = 0;
- int whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
- ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
- XFS_BMAPI_COWFORK)));
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+ if (WARN_ON_ONCE(!ifp))
return -EFSCORRUPTED;
- }
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+ return -EFSCORRUPTED;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
XFS_STATS_INC(mp, xs_blk_mapr);
- ifp = XFS_IFORK_PTR(ip, whichfork);
- if (!ifp) {
- /* No CoW fork? Return a hole. */
- if (whichfork == XFS_COW_FORK) {
- mval->br_startoff = bno;
- mval->br_startblock = HOLESTARTBLOCK;
- mval->br_blockcount = len;
- mval->br_state = XFS_EXT_NORM;
- *nmap = 1;
- return 0;
- }
-
- /*
- * A missing attr ifork implies that the inode says we're in
- * extents or btree format but failed to pass the inode fork
- * verifier while trying to load it. Treat that as a file
- * corruption too.
- */
-#ifdef DEBUG
- xfs_alert(mp, "%s: inode %llu missing fork %d",
- __func__, ip->i_ino, whichfork);
-#endif /* DEBUG */
- return -EFSCORRUPTED;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
if (error)
@@ -4010,6 +4054,39 @@
}
static int
+xfs_bmap_alloc_userdata(
+ struct xfs_bmalloca *bma) /* bmap alloc argument struct */
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
+ int error; /* result of the EOF check below */
+
+ /*
+ * Set the data type being allocated. For the data fork, the first data
+ * in the file is treated differently to all other allocations. For the
+ * attribute fork, we only need to ensure the allocated range is not on
+ * the busy list.
+ */
+ bma->datatype = XFS_ALLOC_NOBUSY;
+ if (whichfork == XFS_DATA_FORK) {
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+ /* Only do the EOF check if a stripe unit is set and length covers it. */
+ if (mp->m_dalign && bma->length >= mp->m_dalign) {
+ error = xfs_bmap_isaeof(bma, whichfork);
+ if (error)
+ return error;
+ }
+ /* Data fork of a realtime inode: use xfs_bmap_rtalloc() instead. */
+ if (XFS_IS_REALTIME_INODE(bma->ip))
+ return xfs_bmap_rtalloc(bma);
+ }
+ /* All other cases, including non-data forks, use xfs_bmap_btalloc(). */
+ return xfs_bmap_btalloc(bma);
+}
+
+static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
{
@@ -4028,7 +4105,8 @@
if (bma->wasdel) {
bma->length = (xfs_extlen_t)bma->got.br_blockcount;
bma->offset = bma->got.br_startoff;
- xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev);
+ if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
+ bma->prev.br_startoff = NULLFILEOFF;
} else {
bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
if (!bma->eof)
@@ -4036,43 +4114,24 @@
bma->got.br_startoff - bma->offset);
}
- /*
- * Set the data type being allocated. For the data fork, the first data
- * in the file is treated differently to all other allocations. For the
- * attribute fork, we only need to ensure the allocated range is not on
- * the busy list.
- */
- if (!(bma->flags & XFS_BMAPI_METADATA)) {
- bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK) {
- if (bma->offset == 0)
- bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
- else
- bma->datatype |= XFS_ALLOC_USERDATA;
- }
- if (bma->flags & XFS_BMAPI_ZERO)
- bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
- }
+ if (bma->flags & XFS_BMAPI_CONTIG)
+ bma->minlen = bma->length;
+ else
+ bma->minlen = 1;
- bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+ if (bma->flags & XFS_BMAPI_METADATA)
+ error = xfs_bmap_btalloc(bma);
+ else
+ error = xfs_bmap_alloc_userdata(bma);
+ if (error || bma->blkno == NULLFSBLOCK)
+ return error;
- /*
- * Only want to do the alignment at the eof if it is userdata and
- * allocation length is larger than a stripe unit.
- */
- if (mp->m_dalign && bma->length >= mp->m_dalign &&
- !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
- error = xfs_bmap_isaeof(bma, whichfork);
+ if (bma->flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
if (error)
return error;
}
- error = xfs_bmap_alloc(bma);
- if (error)
- return error;
-
- if (bma->blkno == NULLFSBLOCK)
- return 0;
if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur)
bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
/*
@@ -4082,25 +4141,15 @@
bma->nallocs++;
if (bma->cur)
- bma->cur->bc_private.b.flags =
- bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+ bma->cur->bc_ino.flags =
+ bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
bma->got.br_startoff = bma->offset;
bma->got.br_startblock = bma->blkno;
bma->got.br_blockcount = bma->length;
bma->got.br_state = XFS_EXT_NORM;
- /*
- * In the data fork, a wasdelay extent has been initialized, so
- * shouldn't be flagged as unwritten.
- *
- * For the cow fork, however, we convert delalloc reservations
- * (extents allocated for speculative preallocation) to
- * allocated unwritten extents, and only convert the unwritten
- * extents to real extents when we're about to write the data.
- */
- if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
- (bma->flags & XFS_BMAPI_PREALLOC))
+ if (bma->flags & XFS_BMAPI_PREALLOC)
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
@@ -4214,11 +4263,13 @@
struct xfs_inode *ip,
int fork)
{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork);
+
if (tp && tp->t_firstblock != NULLFSBLOCK)
return 0;
- if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE)
return 1;
- return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+ return be16_to_cpu(ifp->if_broot->bb_level) + 1;
}
/*
@@ -4233,11 +4284,13 @@
int whichfork,
int error)
{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+
if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
bma->logflags &= ~xfs_ilog_fext(whichfork);
else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
bma->logflags &= ~xfs_ilog_fbroot(whichfork);
if (bma->logflags)
@@ -4269,13 +4322,13 @@
.total = total,
};
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
+ int whichfork = xfs_bmapi_whichfork(flags);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
int n; /* current extent index */
xfs_fileoff_t obno; /* old block number (offset) */
- int whichfork; /* data or attr fork */
#ifdef DEBUG
xfs_fileoff_t orig_bno; /* original block number value */
@@ -4290,13 +4343,12 @@
orig_mval = mval;
orig_nmap = *nmap;
#endif
- whichfork = xfs_bmapi_whichfork(flags);
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(tp != NULL);
ASSERT(len > 0);
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & XFS_BMAPI_REMAP));
@@ -4312,19 +4364,14 @@
ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- ifp = XFS_IFORK_PTR(ip, whichfork);
-
XFS_STATS_INC(mp, xs_blk_mapw);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -4434,9 +4481,8 @@
if (error)
goto error0;
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) >
- XFS_IFORK_MAXEXT(ip, whichfork));
+ ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE ||
+ ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
xfs_bmapi_finish(&bma, whichfork, 0);
xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
orig_nmap, *nmap);
@@ -4456,16 +4502,21 @@
xfs_bmapi_convert_delalloc(
struct xfs_inode *ip,
int whichfork,
- xfs_fileoff_t offset_fsb,
- struct xfs_bmbt_irec *imap,
+ xfs_off_t offset,
+ struct iomap *iomap,
unsigned int *seq)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
+ uint16_t flags = 0;
struct xfs_trans *tp;
int error;
+ if (whichfork == XFS_COW_FORK)
+ flags |= IOMAP_F_SHARED;
+
/*
* Space for the extent and indirect blocks was reserved when the
* delalloc extent was created so there's no need to do so here.
@@ -4495,7 +4546,7 @@
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- *imap = bma.got;
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4505,10 +4556,24 @@
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
- bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+
+ /*
+ * When we're converting the delalloc reservations backing dirty pages
+ * in the page cache, we must be careful about how we create the new
+ * extents:
+ *
+ * New CoW fork extents are created unwritten, turned into real extents
+ * when we're about to write the data to disk, and mapped into the data
+ * fork after the write finishes. End of story.
+ *
+ * New data fork extents must be mapped in as unwritten and converted
+ * to real extents after the write succeeds to avoid exposing stale
+ * disk contents if we crash.
+ */
+ bma.flags = XFS_BMAPI_PREALLOC;
if (whichfork == XFS_COW_FORK)
- bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+ bma.flags |= XFS_BMAPI_COWFORK;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
@@ -4528,7 +4593,7 @@
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- *imap = bma.got;
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
@@ -4578,11 +4643,8 @@
ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -4606,7 +4668,7 @@
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
got.br_startoff = bno;
@@ -4625,9 +4687,9 @@
error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
- if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
+ if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS)
logflags &= ~XFS_ILOG_DEXT;
- else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE)
logflags &= ~XFS_ILOG_DBROOT;
if (logflags)
@@ -4977,9 +5039,8 @@
* conversion to btree format, since the transaction will be dirty then.
*/
if (tp->t_blk_res == 0 &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >=
- XFS_IFORK_MAXEXT(ip, whichfork) &&
+ ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) &&
del->br_startoff > got.br_startoff && del_endoff < got_endoff)
return -ENOSPC;
@@ -5018,7 +5079,10 @@
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (got.br_startoff == del->br_startoff)
@@ -5033,8 +5097,8 @@
*/
xfs_iext_remove(ip, icur, state);
xfs_iext_prev(ifp, icur);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ ifp->if_nextents--;
+
flags |= XFS_ILOG_CORE;
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
@@ -5042,7 +5106,10 @@
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case BMAP_LEFT_FILLING:
/*
@@ -5113,7 +5180,10 @@
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Update the btree record back
* to the original value.
@@ -5130,11 +5200,14 @@
error = -ENOSPC;
goto done;
}
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
} else
flags |= xfs_ilog_fext(whichfork);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+
+ ifp->if_nextents++;
xfs_iext_next(ifp, icur);
xfs_iext_insert(ip, icur, &new, state);
break;
@@ -5197,7 +5270,7 @@
int isrt; /* freeing in rt area */
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
- struct xfs_mount *mp; /* mount structure */
+ struct xfs_mount *mp = ip->i_mount;
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
@@ -5214,14 +5287,8 @@
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
- if (unlikely(
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
- ip->i_mount);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
return -EFSCORRUPTED;
- }
- mp = ip->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -5258,9 +5325,9 @@
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
} else
cur = NULL;
@@ -5503,10 +5570,10 @@
* logging the extent records if we've converted to btree format.
*/
if ((logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ ifp->if_format != XFS_DINODE_FMT_EXTENTS)
logflags &= ~xfs_ilog_fext(whichfork);
else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
logflags &= ~xfs_ilog_fbroot(whichfork);
/*
* Log inode even in the error case, if the transaction
@@ -5516,7 +5583,7 @@
xfs_trans_log_inode(tp, ip, logflags);
if (cur) {
if (!error)
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
return error;
@@ -5588,6 +5655,7 @@
struct xfs_btree_cur *cur,
int *logflags) /* output */
{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_bmbt_irec new;
xfs_filblks_t blockcount;
int error, i;
@@ -5606,8 +5674,7 @@
* Update the on-disk extent count, the btree if necessary and log the
* inode.
*/
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ ifp->if_nextents--;
*logflags |= XFS_ILOG_CORE;
if (!cur) {
*logflags |= XFS_ILOG_DEXT;
@@ -5618,18 +5685,21 @@
error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_btree_delete(cur, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_bmbt_update(cur, &new);
if (error)
@@ -5642,7 +5712,7 @@
done:
xfs_iext_remove(ip, icur, 0);
- xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur);
+ xfs_iext_prev(ifp, icur);
xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur,
&new);
@@ -5677,7 +5747,8 @@
error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_bmbt_update(cur, got);
if (error)
@@ -5713,11 +5784,8 @@
int error = 0;
int logflags = 0;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -5734,15 +5802,17 @@
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
*done = true;
goto del_cursor;
}
- XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
- del_cursor);
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
new_startoff = got.br_startoff - offset_shift_fsb;
if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
@@ -5831,11 +5901,8 @@
int error = 0;
int logflags = 0;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -5852,7 +5919,7 @@
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
}
if (*next_fsb == NULLFSBLOCK) {
@@ -5868,11 +5935,13 @@
goto del_cursor;
}
}
- XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
- del_cursor);
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
- if (stop_fsb >= got.br_startoff + got.br_blockcount) {
- error = -EIO;
+ if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+ error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5919,37 +5988,32 @@
* @split_fsb is a block where the extents is split. If split_fsb lies in a
* hole or the first block of extents, just return 0.
*/
-STATIC int
-xfs_bmap_split_extent_at(
+int
+xfs_bmap_split_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t split_fsb)
{
int whichfork = XFS_DATA_FORK;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_btree_cur *cur = NULL;
struct xfs_bmbt_irec got;
struct xfs_bmbt_irec new; /* split extent */
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp;
xfs_fsblock_t gotblkcnt; /* new block count for got */
struct xfs_iext_cursor icur;
int error = 0;
int logflags = 0;
int i = 0;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- ifp = XFS_IFORK_PTR(ip, whichfork);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
/* Read in all the extents */
error = xfs_iread_extents(tp, ip, whichfork);
@@ -5972,11 +6036,14 @@
if (ifp->if_flags & XFS_IFBROOT) {
cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
- cur->bc_private.b.flags = 0;
+ cur->bc_ino.flags = 0;
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
}
got.br_blockcount = gotblkcnt;
@@ -5994,18 +6061,23 @@
/* Add new extent */
xfs_iext_next(ifp, &icur);
xfs_iext_insert(ip, &icur, &new, 0);
- XFS_IFORK_NEXT_SET(ip, whichfork,
- XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ ifp->if_nextents++;
if (cur) {
error = xfs_bmbt_lookup_eq(cur, &new, &i);
if (error)
goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
error = xfs_btree_insert(cur, &i);
if (error)
goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
}
/*
@@ -6022,7 +6094,7 @@
del_cursor:
if (cur) {
- cur->bc_private.b.allocated = 0;
+ cur->bc_ino.allocated = 0;
xfs_btree_del_cursor(cur, error);
}
@@ -6031,34 +6103,6 @@
return error;
}
-int
-xfs_bmap_split_extent(
- struct xfs_inode *ip,
- xfs_fileoff_t split_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- error = xfs_bmap_split_extent_at(tp, ip, split_fsb);
- if (error)
- goto out;
-
- return xfs_trans_commit(tp);
-
-out:
- xfs_trans_cancel(tp);
- return error;
-}
-
/* Deferred mapping is only for real extents in the data fork. */
static bool
xfs_bmap_is_update_needed(
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 093716a..6747e97 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -158,17 +158,22 @@
{ BMAP_ATTRFORK, "ATTR" }, \
{ BMAP_COWFORK, "COW" }
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
/*
* Return true if the extent is a real, allocated extent, or false if it is a
* delayed allocation, and unwritten extent or a hole.
*/
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
{
- return irec->br_state != XFS_EXT_UNWRITTEN &&
- irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !isnullstartblock(irec->br_startblock);
+ return xfs_bmap_is_real_extent(irec) &&
+ irec->br_state != XFS_EXT_UNWRITTEN;
}
/*
@@ -222,14 +227,14 @@
int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
bool *done, xfs_fileoff_t stop_fsb);
-int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
- unsigned int *seq);
+ xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index ffe608d..ecec604 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -166,13 +166,13 @@
struct xfs_btree_cur *new;
new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+ cur->bc_ino.ip, cur->bc_ino.whichfork);
/*
* Copy the firstblock, dfops, and flags values,
* since init cursor doesn't get them.
*/
- new->bc_private.b.flags = cur->bc_private.b.flags;
+ new->bc_ino.flags = cur->bc_ino.flags;
return new;
}
@@ -183,12 +183,12 @@
struct xfs_btree_cur *dst)
{
ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) ||
- (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+ (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
- dst->bc_private.b.allocated += src->bc_private.b.allocated;
+ dst->bc_ino.allocated += src->bc_ino.allocated;
dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock;
- src->bc_private.b.allocated = 0;
+ src->bc_ino.allocated = 0;
}
STATIC int
@@ -205,8 +205,8 @@
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.fsbno = cur->bc_tp->t_firstblock;
- xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
- cur->bc_private.b.whichfork);
+ xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork);
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = be64_to_cpu(start->l);
@@ -230,7 +230,7 @@
}
args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
if (!args.wasdel && args.tp->t_blk_res == 0) {
error = -ENOSPC;
goto error0;
@@ -259,10 +259,10 @@
ASSERT(args.len == 1);
cur->bc_tp->t_firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+ cur->bc_ino.allocated++;
+ cur->bc_ino.ip->i_d.di_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
+ xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
XFS_TRANS_DQ_BCOUNT, 1L);
new->l = cpu_to_be64(args.fsbno);
@@ -280,12 +280,12 @@
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
struct xfs_owner_info oinfo;
- xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork);
+ xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo);
ip->i_d.di_nblocks--;
@@ -302,8 +302,8 @@
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0) / 2;
@@ -320,8 +320,8 @@
if (level == cur->bc_nlevels - 1) {
struct xfs_ifork *ifp;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ ifp = XFS_IFORK_PTR(cur->bc_ino.ip,
+ cur->bc_ino.whichfork);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0);
@@ -347,7 +347,7 @@
{
if (level != cur->bc_nlevels - 1)
return cur->bc_mp->m_bmap_dmxr[level != 0];
- return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
+ return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0);
}
STATIC void
@@ -552,7 +552,7 @@
struct xfs_btree_cur *cur;
ASSERT(whichfork != XFS_COW_FORK);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+ cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
cur->bc_tp = tp;
cur->bc_mp = mp;
@@ -566,11 +566,11 @@
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
- cur->bc_private.b.ip = ip;
- cur->bc_private.b.allocated = 0;
- cur->bc_private.b.flags = 0;
- cur->bc_private.b.whichfork = whichfork;
+ cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_ino.ip = ip;
+ cur->bc_ino.allocated = 0;
+ cur->bc_ino.flags = 0;
+ cur->bc_ino.whichfork = whichfork;
return cur;
}
@@ -636,15 +636,12 @@
ASSERT(tp || buffer_list);
ASSERT(!(tp && buffer_list));
- if (whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
- else
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+ ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
- cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
+ cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 29b407d..72bf74c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002-2005 Silicon Graphics, Inc.
* All Rights Reserved.
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 71de937..2d25bab 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -20,6 +20,7 @@
#include "xfs_trace.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
+#include "xfs_btree_staging.h"
/*
* Cursor allocation zone.
@@ -105,11 +106,10 @@
xfs_failaddr_t fa;
fa = __xfs_btree_check_lblock(cur, block, level, bp);
- if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
- XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -169,11 +169,10 @@
xfs_failaddr_t fa;
fa = __xfs_btree_check_sblock(cur, block, level, bp);
- if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
- XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -216,7 +215,7 @@
{
if (level <= 0)
return false;
- return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno);
+ return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno);
}
/*
@@ -236,8 +235,8 @@
return 0;
xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_private.b.ip->i_ino,
- cur->bc_private.b.whichfork, cur->bc_btnum,
+ cur->bc_ino.ip->i_ino,
+ cur->bc_ino.whichfork, cur->bc_btnum,
level, index);
} else {
if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
@@ -245,7 +244,7 @@
return 0;
xfs_err(cur->bc_mp,
"AG %u: Corrupt btree %d pointer at level %d index %d.",
- cur->bc_private.a.agno, cur->bc_btnum,
+ cur->bc_ag.agno, cur->bc_btnum,
level, index);
}
@@ -380,11 +379,13 @@
* allocated indirect blocks' accounting.
*/
ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_private.b.allocated == 0);
+ cur->bc_ino.allocated == 0);
/*
* Free the cursor.
*/
- kmem_zone_free(xfs_btree_cur_zone, cur);
+ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
+ kmem_free((void *)cur->bc_ops);
+ kmem_cache_free(xfs_btree_cur_zone, cur);
}
/*
@@ -644,6 +645,17 @@
((char *)block + xfs_btree_ptr_offset(cur, n, level));
}
+struct xfs_ifork *
+xfs_btree_ifork_ptr(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ if (cur->bc_flags & XFS_BTREE_STAGING)
+ return cur->bc_ino.ifake->if_fork;
+ return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork);
+}
+
/*
* Get the root block which is stored in the inode.
*
@@ -654,9 +666,8 @@
xfs_btree_get_iroot(
struct xfs_btree_cur *cur)
{
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
return (struct xfs_btree_block *)ifp->if_broot;
}
@@ -681,61 +692,6 @@
}
/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-xfs_buf_t * /* buffer for fsbno */
-xfs_btree_get_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno) /* file system block number */
-{
- xfs_daddr_t d; /* real disk block address */
-
- ASSERT(fsbno != NULLFSBLOCK);
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
-}
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-xfs_buf_t * /* buffer for agno/agbno */
-xfs_btree_get_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno) /* allocation group block number */
-{
- xfs_daddr_t d; /* real disk block address */
-
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agbno != NULLAGBLOCK);
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
-}
-
-/*
- * Check for the cursor referring to the last block at the given level.
- */
-int /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level) /* level to check */
-{
- struct xfs_btree_block *block; /* generic btree block pointer */
- xfs_buf_t *bp; /* buffer containing block */
-
- block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
- else
- return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
-}
-
-/*
* Change the cursor to point to the first record at the given level.
* Other levels are unaffected.
*/
@@ -938,13 +894,13 @@
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
left, 1, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno,
right, 1, cur->bc_ops->buf_ops);
rval++;
}
@@ -1002,7 +958,7 @@
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
} else {
agbno = be32_to_cpu(ptr->s);
- *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno,
agbno);
}
@@ -1071,7 +1027,7 @@
return ptr->s == cpu_to_be32(NULLAGBLOCK);
}
-STATIC void
+void
xfs_btree_set_ptr_null(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
@@ -1107,7 +1063,7 @@
}
}
-STATIC void
+void
xfs_btree_set_sibling(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
@@ -1185,7 +1141,7 @@
btnum, level, numrecs, owner, 0);
}
-STATIC void
+void
xfs_btree_init_block_cur(
struct xfs_btree_cur *cur,
struct xfs_buf *bp,
@@ -1201,9 +1157,9 @@
* code.
*/
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- owner = cur->bc_private.b.ip->i_ino;
+ owner = cur->bc_ino.ip->i_ino;
else
- owner = cur->bc_private.a.agno;
+ owner = cur->bc_ag.agno;
xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
cur->bc_btnum, level, numrecs,
@@ -1277,7 +1233,7 @@
}
}
-STATIC int
+int
xfs_btree_get_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
@@ -1291,11 +1247,10 @@
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
- *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0);
-
- if (!*bpp)
- return -ENOMEM;
+ error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
+ 0, bpp);
+ if (error)
+ return error;
(*bpp)->b_ops = cur->bc_ops->buf_ops;
*block = XFS_BUF_TO_BLOCK(*bpp);
@@ -1338,7 +1293,7 @@
/*
* Copy keys from one btree block to another.
*/
-STATIC void
+void
xfs_btree_copy_keys(
struct xfs_btree_cur *cur,
union xfs_btree_key *dst_key,
@@ -1366,11 +1321,11 @@
/*
* Copy block pointers from one btree block to another.
*/
-STATIC void
+void
xfs_btree_copy_ptrs(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *dst_ptr,
- union xfs_btree_ptr *src_ptr,
+ const union xfs_btree_ptr *src_ptr,
int numptrs)
{
ASSERT(numptrs >= 0);
@@ -1451,8 +1406,8 @@
xfs_btree_key_offset(cur, first),
xfs_btree_key_offset(cur, last + 1) - 1);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1494,8 +1449,8 @@
xfs_btree_ptr_offset(cur, first, level),
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1563,8 +1518,8 @@
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
} else {
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
+ xfs_ilog_fbroot(cur->bc_ino.whichfork));
}
}
@@ -1801,10 +1756,10 @@
/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
- !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+ !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
- cur->bc_private.b.ip->i_ino)
+ cur->bc_ino.ip->i_ino)
goto out_bad;
/* Did we get the level we were looking for? */
@@ -1820,6 +1775,7 @@
out_bad:
*blkp = NULL;
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
return -EFSCORRUPTED;
}
@@ -1867,7 +1823,7 @@
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
- if (cur->bc_nlevels == 0)
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
return -EFSCORRUPTED;
block = NULL;
@@ -1987,7 +1943,8 @@
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
*stat = 1;
return 0;
}
@@ -2408,8 +2365,6 @@
XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
if (level > 0) {
/* It's a nonleaf. operate on keys and ptrs */
- int i; /* loop index */
-
for (i = 0; i < rrecs; i++) {
error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
if (error)
@@ -2442,7 +2397,10 @@
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_decrement(tcur, level, &i);
if (error)
@@ -2609,7 +2567,10 @@
if (error)
goto error0;
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_increment(tcur, level, &i);
if (error)
@@ -2990,9 +2951,9 @@
xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
- xfs_iroot_realloc(cur->bc_private.b.ip,
+ xfs_iroot_realloc(cur->bc_ino.ip,
1 - xfs_btree_get_numrecs(cblock),
- cur->bc_private.b.whichfork);
+ cur->bc_ino.whichfork);
xfs_btree_setbuf(cur, level, cbp);
@@ -3005,7 +2966,7 @@
xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
*logflags |=
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
*stat = 1;
return 0;
error0:
@@ -3157,11 +3118,11 @@
if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
level == cur->bc_nlevels - 1) {
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_inode *ip = cur->bc_ino.ip;
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
- xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
*stat = 1;
} else {
/* A root block that needs replacing */
@@ -3463,7 +3424,10 @@
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
level++;
/*
@@ -3504,8 +3468,8 @@
xfs_btree_kill_iroot(
struct xfs_btree_cur *cur)
{
- int whichfork = cur->bc_private.b.whichfork;
- struct xfs_inode *ip = cur->bc_private.b.ip;
+ int whichfork = cur->bc_ino.whichfork;
+ struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_btree_block *block;
struct xfs_btree_block *cblock;
@@ -3563,8 +3527,8 @@
index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
if (index) {
- xfs_iroot_realloc(cur->bc_private.b.ip, index,
- cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(cur->bc_ino.ip, index,
+ cur->bc_ino.whichfork);
block = ifp->if_broot;
}
@@ -3593,7 +3557,7 @@
cur->bc_bufs[level - 1] = NULL;
be16_add_cpu(&block->bb_level, -1);
xfs_trans_log_inode(cur->bc_tp, ip,
- XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
cur->bc_nlevels--;
out0:
return 0;
@@ -3761,8 +3725,8 @@
*/
if (level == cur->bc_nlevels - 1) {
if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
- xfs_iroot_realloc(cur->bc_private.b.ip, -1,
- cur->bc_private.b.whichfork);
+ xfs_iroot_realloc(cur->bc_ino.ip, -1,
+ cur->bc_ino.whichfork);
error = xfs_btree_kill_iroot(cur);
if (error)
@@ -3867,15 +3831,24 @@
* Actually any entry but the first would suffice.
*/
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_increment(tcur, level, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/* Grab a pointer to the block. */
right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3919,12 +3892,18 @@
rrecs = xfs_btree_get_numrecs(right);
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_decrement(tcur, level, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
}
@@ -3938,13 +3917,19 @@
* previous block.
*/
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_decrement(tcur, level, &i);
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/* Grab a pointer to the block. */
left = xfs_btree_get_block(tcur, level, &lbp);
@@ -4286,6 +4271,7 @@
xfs_btree_visit_blocks(
struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn,
+ unsigned int flags,
void *data)
{
union xfs_btree_ptr lptr;
@@ -4311,6 +4297,11 @@
/* save for the next iteration of the loop */
xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
+
+ if (!(flags & XFS_BTREE_VISIT_LEAVES))
+ continue;
+ } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
+ continue;
}
/* for each buffer in the level */
@@ -4413,7 +4404,7 @@
bbcoi.buffer_list = buffer_list;
return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
- &bbcoi);
+ XFS_BTREE_VISIT_ALL, &bbcoi);
}
/* Verify the v5 fields of a long-format btree block. */
@@ -4865,7 +4856,7 @@
{
*blocks = 0;
return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
- blocks);
+ XFS_BTREE_VISIT_ALL, blocks);
}
/* Compare two btree pointers. */
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index ced1e65..10e50cb 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -10,6 +10,7 @@
struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
+struct xfs_ifork;
extern kmem_zone_t *xfs_btree_cur_zone;
@@ -177,12 +178,37 @@
struct xfs_refcount_irec rc;
};
-/* Per-AG btree private information. */
-union xfs_btree_cur_private {
- struct {
- unsigned long nr_ops; /* # record updates */
- int shape_changes; /* # of extent splits */
- } refc;
+/* Per-AG btree information. */
+struct xfs_btree_cur_ag {
+ union {
+ struct xfs_buf *agbp;
+ struct xbtree_afakeroot *afake; /* for staging cursor */
+ };
+ xfs_agnumber_t agno;
+ union {
+ struct {
+ unsigned long nr_ops; /* # record updates */
+ int shape_changes; /* # of extent splits */
+ } refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
+ };
+};
+
+/* Btree-in-inode cursor information */
+struct xfs_btree_cur_ino {
+ struct xfs_inode *ip;
+ struct xbtree_ifakeroot *ifake; /* for staging cursor */
+ int allocated;
+ short forksize;
+ char whichfork;
+ char flags;
+/* We are converting a delalloc reservation */
+#define XFS_BTCUR_BMBT_WASDEL (1 << 0)
+
+/* For extent swap, ignore owner check in verifier */
+#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1)
};
/*
@@ -206,21 +232,9 @@
xfs_btnum_t bc_btnum; /* identifies which btree type */
int bc_statoff; /* offset of btre stats array */
union {
- struct { /* needed for BNO, CNT, INO */
- struct xfs_buf *agbp; /* agf/agi buffer pointer */
- xfs_agnumber_t agno; /* ag number */
- union xfs_btree_cur_private priv;
- } a;
- struct { /* needed for BMAP */
- struct xfs_inode *ip; /* pointer to our inode */
- int allocated; /* count of alloced */
- short forksize; /* fork's inode space */
- char whichfork; /* data or attr fork */
- char flags; /* flags */
-#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
-#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
- } b;
- } bc_private; /* per-btree type data */
+ struct xfs_btree_cur_ag bc_ag;
+ struct xfs_btree_cur_ino bc_ino;
+ };
} xfs_btree_cur_t;
/* cursor flags */
@@ -229,6 +243,12 @@
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
+/*
+ * The root of this btree is a fakeroot structure so that we can stage a btree
+ * rebuild without leaving it accessible via primary metadata. The ops struct
+ * is dynamically allocated and must be freed when the cursor is deleted.
+ */
+#define XFS_BTREE_STAGING (1<<5)
#define XFS_BTREE_NOERROR 0
@@ -294,35 +314,6 @@
xfs_btree_cur_t **ncur);/* output cursor */
/*
- * Get a buffer for the block, return it with no data read.
- * Long-form addressing.
- */
-struct xfs_buf * /* buffer for fsbno */
-xfs_btree_get_bufl(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno); /* file system block number */
-
-/*
- * Get a buffer for the block, return it with no data read.
- * Short-form addressing.
- */
-struct xfs_buf * /* buffer for agno/agbno */
-xfs_btree_get_bufs(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno); /* allocation group block number */
-
-/*
- * Check for the cursor referring to the last block at the given level.
- */
-int /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level); /* level to check */
-
-/*
* Compute first and last byte offsets for the fields given.
* Interprets the offsets table, which contains struct field offsets.
*/
@@ -482,8 +473,15 @@
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
void *data);
+/* Visit record blocks. */
+#define XFS_BTREE_VISIT_RECORDS (1 << 0)
+/* Visit non-record (child-pointer) blocks -- NOTE(review): confirm. */
+#define XFS_BTREE_VISIT_LEAVES (1 << 1)
+/* Visit all blocks. */
+#define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \
+ XFS_BTREE_VISIT_LEAVES)
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
- xfs_btree_visit_blocks_fn fn, void *data);
+ xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);
int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
@@ -513,5 +511,39 @@
int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
union xfs_btree_irec *high, bool *exists);
bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur);
+
+/* Does this cursor point to the last block in the given level? */
+static inline bool
+xfs_btree_islastblock(
+ xfs_btree_cur_t *cur,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+ return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
+void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr,
+ struct xfs_btree_block **block, struct xfs_buf **bpp);
+void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block, union xfs_btree_ptr *ptr,
+ int lr);
+void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
+ struct xfs_buf *bp, int level, int numrecs);
+void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ const union xfs_btree_ptr *src_ptr, int numptrs);
+void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key, union xfs_btree_key *src_key,
+ int numkeys);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
new file mode 100644
index 0000000..f464a7c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -0,0 +1,879 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_trace.h"
+#include "xfs_btree_staging.h"
+
+/*
+ * Staging Cursors and Fake Roots for Btrees
+ * =========================================
+ *
+ * A staging btree cursor is a special type of btree cursor that callers must
+ * use to construct a new btree index using the btree bulk loader code. The
+ * bulk loading code uses the staging btree cursor to abstract the details of
+ * initializing new btree blocks and filling them with records or key/ptr
+ * pairs. Regular btree operations (e.g. queries and modifications) are not
+ * supported with staging cursors, and callers must not invoke them.
+ *
+ * Fake root structures contain all the information about a btree that is under
+ * construction by the bulk loading code. Staging btree cursors point to fake
+ * root structures instead of the usual AG header or inode structure.
+ *
+ * Callers are expected to initialize a fake root structure and pass it into
+ * the _stage_cursor function for a specific btree type. When bulk loading is
+ * complete, callers should call the _commit_staged_btree function for that
+ * specific btree type to commit the new btree into the filesystem.
+ */
+
+/*
+ * Don't allow staging cursors to be duplicated because they're supposed to be
+ * kept private to a single thread.
+ */
+STATIC struct xfs_btree_cur *
+xfs_btree_fakeroot_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ ASSERT(0);
+ return NULL;
+}
+
+/*
+ * Don't allow block allocation for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ *
+ * Bulk loading uses a separate callback to obtain new blocks from a
+ * preallocated list, which prevents ENOSPC failures during loading.
+ */
+STATIC int
+xfs_btree_fakeroot_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int *stat)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * Don't allow block freeing for a staging cursor, because staging cursors
+ * do not support regular btree modifications.
+ */
+STATIC int
+xfs_btree_fakeroot_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+/* Initialize a pointer to the root block from the fakeroot. */
+STATIC void
+xfs_btree_fakeroot_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xbtree_afakeroot *afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ afake = cur->bc_ag.afake;
+ ptr->s = cpu_to_be32(afake->af_root);
+}
+
+/*
+ * Bulk Loading for AG Btrees
+ * ==========================
+ *
+ * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
+ * staging cursor. Callers should initialize this to zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_afakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/* Update the btree root information for a per-AG fake root. */
+STATIC void
+xfs_btree_afakeroot_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ afake->af_root = be32_to_cpu(ptr->s);
+ afake->af_levels += inc;
+}
+
+/*
+ * Initialize an AG-rooted btree cursor with the given AG btree fake root.
+ * The btree cursor's bc_ops will be overridden as needed to make the staging
+ * functionality work.
+ */
+void
+xfs_btree_stage_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->set_root = xfs_btree_afakeroot_set_root;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ag.afake = afake;
+ cur->bc_nlevels = afake->af_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+}
+
+/*
+ * Transform an AG-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_afakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_afakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ag.agbp = agbp;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading for Inode-Rooted Btrees
+ * ====================================
+ *
+ * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
+ * the staging cursor. This structure should be initialized as follows:
+ *
+ * - if_fork_size field should be set to the number of bytes available to the
+ * fork in the inode.
+ *
+ * - if_fork should point to a freshly allocated struct xfs_ifork.
+ *
+ * - if_format should be set to the appropriate fork type (e.g.
+ * XFS_DINODE_FMT_BTREE).
+ *
+ * All other fields must be zero.
+ *
+ * The _stage_cursor() function for a specific btree type should call
+ * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
+ * cursor. The corresponding _commit_staged_btree() function should log the
+ * new root and call xfs_btree_commit_ifakeroot() to transform the staging
+ * cursor into a regular btree cursor.
+ */
+
+/*
+ * Initialize an inode-rooted btree cursor with the given inode btree fake
+ * root. The btree cursor's bc_ops will be overridden as needed to make the
+ * staging functionality work. If new_ops is not NULL, these new ops will be
+ * passed out to the caller for further overriding.
+ */
+void
+xfs_btree_stage_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops)
+{
+ struct xfs_btree_ops *nops;
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_tp == NULL);
+
+ nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
+ memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
+ nops->alloc_block = xfs_btree_fakeroot_alloc_block;
+ nops->free_block = xfs_btree_fakeroot_free_block;
+ nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
+ nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
+
+ cur->bc_ino.ifake = ifake;
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ops = nops;
+ cur->bc_flags |= XFS_BTREE_STAGING;
+
+ if (new_ops)
+ *new_ops = nops;
+}
+
+/*
+ * Transform an inode-rooted staging btree cursor back into a regular cursor by
+ * substituting a real btree root for the fake one and restoring normal btree
+ * cursor ops. The caller must log the btree root change prior to calling
+ * this.
+ */
+void
+xfs_btree_commit_ifakeroot(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork,
+ const struct xfs_btree_ops *ops)
+{
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(cur->bc_tp == NULL);
+
+ trace_xfs_btree_commit_ifakeroot(cur);
+
+ kmem_free((void *)cur->bc_ops);
+ cur->bc_ino.ifake = NULL;
+ cur->bc_ino.whichfork = whichfork;
+ cur->bc_ops = ops;
+ cur->bc_flags &= ~XFS_BTREE_STAGING;
+ cur->bc_tp = tp;
+}
+
+/*
+ * Bulk Loading of Staged Btrees
+ * =============================
+ *
+ * This interface is used with a staged btree cursor to create a totally new
+ * btree with a large number of records (i.e. more than what would fit in a
+ * single root block). When the creation is complete, the new root can be
+ * linked atomically into the filesystem by committing the staged cursor.
+ *
+ * Creation of a new btree proceeds roughly as follows:
+ *
+ * The first step is to initialize an appropriate fake btree root structure and
+ * then construct a staged btree cursor. Refer to the block comments about
+ * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
+ * more information about how to do this.
+ *
+ * The second step is to initialize a struct xfs_btree_bload context as
+ * documented in the structure definition.
+ *
+ * The third step is to call xfs_btree_bload_compute_geometry to compute the
+ * height of and the number of blocks needed to construct the btree. See the
+ * section "Computing the Geometry of the New Btree" for details about this
+ * computation.
+ *
+ * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
+ * save them for later use by ->claim_block(). Bulk loading requires all
+ * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
+ * rebuild, and to minimize seek distances of the new btree.
+ *
+ * Step five is to call xfs_btree_bload() to start constructing the btree.
+ *
+ * The final step is to commit the staging btree cursor, which logs the new
+ * btree root and turns the staging cursor into a regular cursor. The caller
+ * is responsible for cleaning up the previous btree blocks, if any.
+ *
+ * Computing the Geometry of the New Btree
+ * =======================================
+ *
+ * The number of items placed in each btree block is computed via the following
+ * algorithm: For leaf levels, the number of items for the level is nr_records
+ * in the bload structure. For node levels, the number of items for the level
+ * is the number of blocks in the next lower level of the tree. For each
+ * level, the desired number of items per block is defined as:
+ *
+ * desired = max(minrecs, maxrecs - slack factor)
+ *
+ * The number of blocks for the level is defined to be:
+ *
+ * blocks = floor(nr_items / desired)
+ *
+ * Note this is rounded down so that the npb calculation below will never fall
+ * below minrecs. The number of items that will actually be loaded into each
+ * btree block is defined as:
+ *
+ * npb = nr_items / blocks
+ *
+ * Some of the leftmost blocks in the level will contain one extra record as
+ * needed to handle uneven division. If the number of records in any block
+ * would exceed maxrecs for that level, blocks is incremented and npb is
+ * recalculated.
+ *
+ * In other words, we compute the number of blocks needed to satisfy a given
+ * loading level, then spread the items as evenly as possible.
+ *
+ * The height and number of fs blocks required to create the btree are computed
+ * and returned via btree_height and nr_blocks.
+ */
+
+/*
+ * Put a btree block that we're loading onto the ordered list and release it.
+ * The btree blocks will be written to disk when bulk loading is finished.
+ */
+static void
+xfs_btree_bload_drop_buf(
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
+{
+ if (*bpp == NULL)
+ return;
+
+ if (!xfs_buf_delwri_queue(*bpp, buffers_list))
+ ASSERT(0);
+
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
+}
+
+/*
+ * Allocate and initialize one btree block for bulk loading.
+ *
+ * The new btree block will have its level and numrecs fields set to the values
+ * of the level and nr_this_block parameters, respectively.
+ *
+ * The caller should ensure that ptrp, bpp, and blockp refer to the left
+ * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
+ * will all point to the new block.
+ */
+STATIC int
+xfs_btree_bload_prep_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ unsigned int level,
+ unsigned int nr_this_block,
+ union xfs_btree_ptr *ptrp, /* in/out */
+ struct xfs_buf **bpp, /* in/out */
+ struct xfs_btree_block **blockp, /* in/out */
+ void *priv)
+{
+ union xfs_btree_ptr new_ptr;
+ struct xfs_buf *new_bp;
+ struct xfs_btree_block *new_block;
+ int ret;
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
+ size_t new_size;
+
+ ASSERT(*bpp == NULL);
+
+ /* Allocate a new incore btree root block. */
+ new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ ifp->if_broot = kmem_zalloc(new_size, 0);
+ ifp->if_broot_bytes = (int)new_size;
+ ifp->if_flags |= XFS_IFBROOT;
+
+ /* Initialize it and send it out. */
+ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
+ XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
+ nr_this_block, cur->bc_ino.ip->i_ino,
+ cur->bc_flags);
+
+ *bpp = NULL;
+ *blockp = ifp->if_broot;
+ xfs_btree_set_ptr_null(cur, ptrp);
+ return 0;
+ }
+
+ /* Claim one of the caller's preallocated blocks. */
+ xfs_btree_set_ptr_null(cur, &new_ptr);
+ ret = bbl->claim_block(cur, &new_ptr, priv);
+ if (ret)
+ return ret;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
+ if (ret)
+ return ret;
+
+ /*
+ * The previous block (if any) is the left sibling of the new block,
+ * so set its right sibling pointer to the new block and drop it.
+ */
+ if (*blockp)
+ xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
+ xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ /* Initialize the new btree block. */
+ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
+ xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
+
+ /* Set the out parameters. */
+ *bpp = new_bp;
+ *blockp = new_block;
+ xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
+ return 0;
+}
+
+/* Load one leaf block. */
+STATIC int
+xfs_btree_bload_leaf(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ xfs_btree_bload_get_record_fn get_record,
+ struct xfs_btree_block *block,
+ void *priv)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the leaf block with records. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_rec *block_rec;
+
+ ret = get_record(cur, priv);
+ if (ret)
+ return ret;
+ block_rec = xfs_btree_rec_addr(cur, j, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return 0;
+}
+
+/*
+ * Load one node block with key/ptr pairs.
+ *
+ * child_ptr must point to a block within the next level down in the tree. A
+ * key/ptr entry will be created in the new node block to the block pointed to
+ * by child_ptr. On exit, child_ptr points to the next block on the child
+ * level that needs processing.
+ */
+STATIC int
+xfs_btree_bload_node(
+ struct xfs_btree_cur *cur,
+ unsigned int recs_this_block,
+ union xfs_btree_ptr *child_ptr,
+ struct xfs_btree_block *block)
+{
+ unsigned int j;
+ int ret;
+
+ /* Fill the node block with keys and pointers. */
+ for (j = 1; j <= recs_this_block; j++) {
+ union xfs_btree_key child_key;
+ union xfs_btree_ptr *block_ptr;
+ union xfs_btree_key *block_key;
+ struct xfs_btree_block *child_block;
+ struct xfs_buf *child_bp;
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
+
+ ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ &child_bp);
+ if (ret)
+ return ret;
+
+ block_ptr = xfs_btree_ptr_addr(cur, j, block);
+ xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
+
+ block_key = xfs_btree_key_addr(cur, j, block);
+ xfs_btree_get_keys(cur, child_block, &child_key);
+ xfs_btree_copy_keys(cur, block_key, &child_key, 1);
+
+ xfs_btree_get_sibling(cur, child_block, child_ptr,
+ XFS_BB_RIGHTSIB);
+ xfs_buf_relse(child_bp);
+ }
+
+ return 0;
+}
+
+/*
+ * Compute the maximum number of records (or keyptrs) per block that we want to
+ * install at this level in the btree. Caller is responsible for having set
+ * @cur->bc_ino.forksize to the desired fork size, if appropriate.
+ */
+STATIC unsigned int
+xfs_btree_bload_max_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int ret;
+
+ if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
+ return cur->bc_ops->get_dmaxrecs(cur, level);
+
+ ret = cur->bc_ops->get_maxrecs(cur, level);
+ if (level == 0)
+ ret -= bbl->leaf_slack;
+ else
+ ret -= bbl->node_slack;
+ return ret;
+}
+
+/*
+ * Compute the desired number of records (or keyptrs) per block that we want to
+ * install at this level in the btree, which must be somewhere between minrecs
+ * and max_npb. The caller is free to install fewer records per block.
+ */
+STATIC unsigned int
+xfs_btree_bload_desired_npb(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level)
+{
+ unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level);
+
+ /* Root blocks are not subject to minrecs rules. */
+ if (level == cur->bc_nlevels - 1)
+ return max(1U, npb);
+
+ return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
+}
+
+/*
+ * Compute the number of records to be stored in each block at this level and
+ * the number of blocks for this level. For leaf levels, we must populate an
+ * empty root block even if there are no records, so we have to have at least
+ * one block.
+ */
+STATIC void
+xfs_btree_bload_level_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ unsigned int level,
+ uint64_t nr_this_level,
+ unsigned int *avg_per_block,
+ uint64_t *blocks,
+ uint64_t *blocks_with_extra)
+{
+ uint64_t npb;
+ uint64_t dontcare;
+ unsigned int desired_npb;
+ unsigned int maxnr;
+
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
+
+ /*
+ * Compute the number of blocks we need to fill each block with the
+ * desired number of records/keyptrs per block. Because desired_npb
+ * could be minrecs, we use regular integer division (which rounds
+ * the block count down) so that in the next step the effective # of
+ * items per block will never be less than desired_npb.
+ */
+ desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
+ *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
+ *blocks = max(1ULL, *blocks);
+
+ /*
+ * Compute the number of records that we will actually put in each
+ * block, assuming that we want to spread the records evenly between
+ * the blocks. Take care that the effective # of items per block (npb)
+ * won't exceed maxrecs even for the blocks that get an extra record,
+ * since desired_npb could be maxrecs, and in the previous step we
+ * rounded the block count down.
+ */
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
+ (*blocks)++;
+ npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
+ }
+
+ *avg_per_block = min_t(uint64_t, npb, nr_this_level);
+
+ trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
+ *avg_per_block, desired_npb, *blocks,
+ *blocks_with_extra);
+}
+
+/*
+ * Ensure a slack value is appropriate for the btree.
+ *
+ * If the slack value is negative, set slack so that we fill the block to
+ * halfway between minrecs and maxrecs. Make sure the slack is never so large
+ * that we can underflow minrecs.
+ */
+static void
+xfs_btree_bload_ensure_slack(
+ struct xfs_btree_cur *cur,
+ int *slack,
+ int level)
+{
+ int maxr;
+ int minr;
+
+ maxr = cur->bc_ops->get_maxrecs(cur, level);
+ minr = cur->bc_ops->get_minrecs(cur, level);
+
+ /*
+ * If slack is negative, automatically set slack so that we load the
+ * btree block approximately halfway between minrecs and maxrecs.
+ * Generally, this will net us 75% loading.
+ */
+ if (*slack < 0)
+ *slack = maxr - ((maxr + minr) >> 1);
+
+ *slack = min(*slack, maxr - minr);
+}
+
+/*
+ * Prepare a btree cursor for a bulk load operation by computing the geometry
+ * fields in bbl. Caller must ensure that the btree cursor is a staging
+ * cursor. This function can be called multiple times.
+ */
+int
+xfs_btree_bload_compute_geometry(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ uint64_t nr_records)
+{
+ uint64_t nr_blocks = 0;
+ uint64_t nr_this_level;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ /*
+ * Make sure that the slack values make sense for traditional leaf and
+ * node blocks. Inode-rooted btrees will return different minrecs and
+ * maxrecs values for the root block (level == bc_nlevels - 1). We're
+ * checking levels 0 and 1 here, so set bc_nlevels such that the btree
+ * code doesn't interpret either as the root level.
+ */
+ cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1;
+ xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
+ xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
+
+ bbl->nr_records = nr_this_level = nr_records;
+ for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) {
+ uint64_t level_blocks;
+ uint64_t dontcare64;
+ unsigned int level = cur->bc_nlevels - 1;
+ unsigned int avg_per_block;
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &level_blocks, &dontcare64);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * If all the items we want to store at this level
+ * would fit in the inode root block, then we have our
+ * btree root and are done.
+ *
+ * Note that bmap btrees forbid records in the root.
+ */
+ if (level != 0 && nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /*
+ * Otherwise, we have to store all the items for this
+ * level in traditional btree blocks and therefore need
+ * another level of btree to point to those blocks.
+ *
+ * We have to re-compute the geometry for each level of
+ * an inode-rooted btree because the geometry differs
+ * between a btree root in an inode fork and a
+ * traditional btree block.
+ *
+ * This distinction is made in the btree code based on
+ * whether level == bc_nlevels - 1. Based on the
+ * previous root block size check against the root
+ * block geometry, we know that we aren't yet ready to
+ * populate the root. Increment bc_nlevels and
+ * recalculate the geometry for a traditional
+ * block-based btree level.
+ */
+ cur->bc_nlevels++;
+ xfs_btree_bload_level_geometry(cur, bbl, level,
+ nr_this_level, &avg_per_block,
+ &level_blocks, &dontcare64);
+ } else {
+ /*
+ * If all the items we want to store at this level
+ * would fit in a single root block, we're done.
+ */
+ if (nr_this_level <= avg_per_block) {
+ nr_blocks++;
+ break;
+ }
+
+ /* Otherwise, we need another level of btree. */
+ cur->bc_nlevels++;
+ }
+
+ nr_blocks += level_blocks;
+ nr_this_level = level_blocks;
+ }
+
+ if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS)
+ return -EOVERFLOW;
+
+ bbl->btree_height = cur->bc_nlevels;
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ bbl->nr_blocks = nr_blocks - 1;
+ else
+ bbl->nr_blocks = nr_blocks;
+ return 0;
+}
+
+/* Bulk load a btree given the parameters and geometry established in bbl. */
+int
+xfs_btree_bload(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl,
+ void *priv)
+{
+ struct list_head buffers_list;
+ union xfs_btree_ptr child_ptr;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp = NULL;
+ struct xfs_btree_block *block = NULL;
+ uint64_t nr_this_level = bbl->nr_records;
+ uint64_t blocks;
+ uint64_t i;
+ uint64_t blocks_with_extra;
+ uint64_t total_blocks = 0;
+ unsigned int avg_per_block;
+ unsigned int level = 0;
+ int ret;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ INIT_LIST_HEAD(&buffers_list);
+ cur->bc_nlevels = bbl->btree_height;
+ xfs_btree_set_ptr_null(cur, &child_ptr);
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each leaf block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ /*
+ * Due to rounding, btree blocks will not be evenly populated
+ * in most cases. blocks_with_extra tells us how many blocks
+ * will receive an extra record to distribute the excess across
+ * the current level as evenly as possible.
+ */
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
+ nr_this_block, &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
+ nr_this_block);
+
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ block, priv);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost leaf pointer so we know where to start
+ * with the first node level.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ /* Populate the internal btree nodes. */
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ union xfs_btree_ptr first_ptr;
+
+ nr_this_level = blocks;
+ block = NULL;
+ xfs_btree_set_ptr_null(cur, &ptr);
+
+ xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
+ &avg_per_block, &blocks, &blocks_with_extra);
+
+ /* Load each node block. */
+ for (i = 0; i < blocks; i++) {
+ unsigned int nr_this_block = avg_per_block;
+
+ if (i < blocks_with_extra)
+ nr_this_block++;
+
+ ret = xfs_btree_bload_prep_block(cur, bbl,
+ &buffers_list, level, nr_this_block,
+ &ptr, &bp, &block, priv);
+ if (ret)
+ goto out;
+
+ trace_xfs_btree_bload_block(cur, level, i, blocks,
+ &ptr, nr_this_block);
+
+ ret = xfs_btree_bload_node(cur, nr_this_block,
+ &child_ptr, block);
+ if (ret)
+ goto out;
+
+ /*
+ * Record the leftmost node pointer so that we know
+ * where to start the next node level above this one.
+ */
+ if (i == 0)
+ xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
+ }
+ total_blocks += blocks;
+ xfs_btree_bload_drop_buf(&buffers_list, &bp);
+ xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
+ }
+
+ /* Initialize the new root. */
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
+ cur->bc_ino.ifake->if_blocks = total_blocks - 1;
+ } else {
+ cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
+ cur->bc_ag.afake->af_levels = cur->bc_nlevels;
+ cur->bc_ag.afake->af_blocks = total_blocks;
+ }
+
+ /*
+ * Write the new blocks to disk. If the ordered list isn't empty after
+ * that, then something went wrong and we have to fail. This should
+ * never happen, but we'll check anyway.
+ */
+ ret = xfs_buf_delwri_submit(&buffers_list);
+ if (ret)
+ goto out;
+ if (!list_empty(&buffers_list)) {
+ ASSERT(list_empty(&buffers_list));
+ ret = -EIO;
+ }
+
+out:
+ xfs_buf_delwri_cancel(&buffers_list);
+ if (bp)
+ xfs_buf_relse(bp);
+ return ret;
+}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
new file mode 100644
index 0000000..f0d2976
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2020 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_BTREE_STAGING_H__
+#define __XFS_BTREE_STAGING_H__
+
+/* Fake root for an AG-rooted btree; passed to xfs_btree_stage_afakeroot(). */
+struct xbtree_afakeroot {
+ /* AG block number of the new root (CPU-endian; stored after bulk load). */
+ xfs_agblock_t af_root;
+
+ /* Height of the new btree (bulk load copies the cursor's bc_nlevels). */
+ unsigned int af_levels;
+
+ /* Number of blocks used by the btree (bulk load stores its block total). */
+ unsigned int af_blocks;
+};
+
+/* Cursor interactions with fake roots for AG-rooted btrees. */
+void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_afakeroot *afake);
+void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+
+/* Fake root for an inode-rooted btree; passed to xfs_btree_stage_ifakeroot(). */
+struct xbtree_ifakeroot {
+ /* Fake inode fork. */
+ struct xfs_ifork *if_fork;
+
+ /* Blocks used by the btree (bulk load stores its block total minus one). */
+ int64_t if_blocks;
+
+ /* Height of the new btree (bulk load copies the cursor's bc_nlevels). */
+ unsigned int if_levels;
+
+ /* Number of bytes available for this fork in the inode. */
+ unsigned int if_fork_size;
+
+ /* Fork format. */
+ unsigned int if_format;
+
+ /* Number of records. */
+ unsigned int if_extents;
+};
+
+/* Cursor interactions with fake roots for inode-rooted btrees. */
+void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
+ struct xbtree_ifakeroot *ifake,
+ struct xfs_btree_ops **new_ops);
+void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
+ int whichfork, const struct xfs_btree_ops *ops);
+
+/* Bulk loading of staged btrees. */
+typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr, void *priv);
+typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
+ unsigned int nr_this_level, void *priv);
+
+struct xfs_btree_bload { /* Callbacks and geometry describing one bulk load. */
+ /*
+ * Called nr_records times to load records into the btree. Each call
+ * sets the cursor's bc_rec field in in-core format and returns an int
+ * status. Records must be returned in sort order.
+ */
+ xfs_btree_bload_get_record_fn get_record;
+
+ /*
+ * Called nr_blocks times to obtain a pointer to a new btree block on
+ * disk. Callers must preallocate all space for the new btree before
+ * calling xfs_btree_bload(); this callback is what claims that
+ * reservation.
+ */
+ xfs_btree_bload_claim_block_fn claim_block;
+
+ /*
+ * Returns the size of the in-core btree root block for a level that
+ * holds nr_this_level items. Only necessary for
+ * XFS_BTREE_ROOT_IN_INODE btree types.
+ */
+ xfs_btree_bload_iroot_size_fn iroot_size;
+
+ /*
+ * Set by the caller to the number of records that will be stored in
+ * the new btree.
+ */
+ uint64_t nr_records;
+
+ /*
+ * Number of free records to leave in each leaf block. If the caller
+ * sets this to -1, the slack value is calculated to be halfway
+ * between maxrecs and minrecs, which typically leaves the block 75%
+ * full. Slack values are not enforced on inode root blocks.
+ */
+ int leaf_slack;
+
+ /*
+ * Number of free key/ptr pairs to leave in each node block. This
+ * field has the same semantics as leaf_slack.
+ */
+ int node_slack;
+
+ /*
+ * Output: xfs_btree_bload_compute_geometry() sets this to the number
+ * of btree blocks needed to store nr_records records.
+ */
+ uint64_t nr_blocks;
+
+ /*
+ * Output: xfs_btree_bload_compute_geometry() sets this to the height
+ * of the new btree.
+ */
+ unsigned int btree_height;
+};
+
+int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
+ struct xfs_btree_bload *bbl, uint64_t nr_records);
+int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl,
+ void *priv);
+
+#endif /* __XFS_BTREE_STAGING_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 4fd1223..e46bc03 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -12,9 +12,9 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_attr_leaf.h"
@@ -78,10 +78,16 @@
* Allocate a dir-state structure.
* We don't put them on the stack since they're large.
*/
-xfs_da_state_t *
-xfs_da_state_alloc(void)
+struct xfs_da_state *
+xfs_da_state_alloc( /* Allocate a zeroed da state already bound to @args. */
+ struct xfs_da_args *args)
{
- return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
+ struct xfs_da_state *state;
+
+ state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL);
+ state->args = args; /* no NULL check: the allocation uses __GFP_NOFAIL */
+ state->mp = args->dp->i_mount; /* @args and @args->dp are dereferenced */
+ return state;
}
/*
@@ -107,7 +113,66 @@
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
- kmem_zone_free(xfs_da_state_zone, state);
+ kmem_cache_free(xfs_da_state_zone, state);
+}
+
+static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
+{ /* Return the fsbcount of the geometry that matches @whichfork. */
+ if (whichfork == XFS_DATA_FORK)
+ return mp->m_dir_geo->fsbcount; /* data fork: directory geometry */
+ return mp->m_attr_geo->fsbcount; /* any other fork: attr geometry */
+}
+
+void
+xfs_da3_node_hdr_from_disk( /* Decode an on-disk da node header into @to. */
+ struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) { /* CRC filesystems: da3 layout */
+ struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.__count);
+ to->level = be16_to_cpu(from3->hdr.__level);
+ to->btree = from3->__btree; /* refers to the entries in @from, no copy */
+ ASSERT(to->magic == XFS_DA3_NODE_MAGIC);
+ } else { /* otherwise: the xfs_da_intnode layout is read as-is */
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+ to->btree = from->__btree; /* refers to the entries in @from, no copy */
+ ASSERT(to->magic == XFS_DA_NODE_MAGIC);
+ }
+}
+
+void
+xfs_da3_node_hdr_to_disk( /* Encode in-core header @from into on-disk @to. */
+ struct xfs_mount *mp,
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) { /* CRC filesystems: da3 layout */
+ struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.__count = cpu_to_be16(from->count);
+ to3->hdr.__level = cpu_to_be16(from->level); /* from->btree is not written */
+ } else { /* otherwise: the xfs_da_intnode layout is written as-is */
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level); /* from->btree is not written */
+ }
}
/*
@@ -145,12 +210,9 @@
struct xfs_mount *mp = bp->b_mount;
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
- const struct xfs_dir_ops *ops;
xfs_failaddr_t fa;
- ops = xfs_dir_get_ops(mp, NULL);
-
- ops->node_hdr_from_disk(&ichdr, hdr);
+ xfs_da3_node_hdr_from_disk(mp, &ichdr, hdr);
fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
if (fa)
@@ -275,46 +337,76 @@
.verify_struct = xfs_da3_node_verify_struct,
};
+static int
+xfs_da3_node_set_type( /* Set @bp's buffer type in @tp from its on-disk magic. */
+ struct xfs_trans *tp, /* dereferenced (tp->t_mountp) on the error path */
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ return 0;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ return 0;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ return 0;
+ default: /* unrecognized magic: report corruption, release @bp, fail */
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
+ info, sizeof(*info));
+ xfs_trans_brelse(tp, bp); /* NOTE(review): caller's pointer is not cleared */
+ return -EFSCORRUPTED;
+ }
+}
+
int
xfs_da3_node_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{ /* Read da block @bno; inside a transaction also set its buffer type. */
+ int error;
+
+ error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork,
+ &xfs_da3_node_buf_ops);
+ if (error || !*bpp || !tp) /* read failed, no buffer, or no transaction */
+ return error;
+ return xfs_da3_node_set_type(tp, *bpp);
+}
+
+int
+xfs_da3_node_read_mapped( /* Read a da node buffer at disk address @mappedbno. */
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp,
- int which_fork)
+ int whichfork)
{
- int err;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
- err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- which_fork, &xfs_da3_node_buf_ops);
- if (!err && tp && *bpp) {
- struct xfs_da_blkinfo *info = (*bpp)->b_addr;
- int type;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
+ XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
+ bpp, &xfs_da3_node_buf_ops); /* length: one dabuf for @whichfork */
+ if (error || !*bpp)
+ return error; /* also returns 0 when no buffer came back */
- switch (be16_to_cpu(info->magic)) {
- case XFS_DA_NODE_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- type = XFS_BLFT_DA_NODE_BUF;
- break;
- case XFS_ATTR_LEAF_MAGIC:
- case XFS_ATTR3_LEAF_MAGIC:
- type = XFS_BLFT_ATTR_LEAF_BUF;
- break;
- case XFS_DIR2_LEAFN_MAGIC:
- case XFS_DIR3_LEAFN_MAGIC:
- type = XFS_BLFT_DIR_LEAFN_BUF;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- tp->t_mountp, info, sizeof(*info));
- xfs_trans_brelse(tp, *bpp);
- *bpp = NULL;
- return -EFSCORRUPTED;
- }
- xfs_trans_buf_set_type(tp, *bpp, type);
- }
- return err;
+ if (whichfork == XFS_ATTR_FORK) /* choose the buffer ref value by fork */
+ xfs_buf_set_ref(*bpp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(*bpp, XFS_DIR_BTREE_REF);
+
+ if (!tp) /* the buffer type is only set inside a transaction */
+ return 0;
+ return xfs_da3_node_set_type(tp, *bpp);
}
/*========================================================================
@@ -343,7 +435,7 @@
trace_xfs_da_node_create(args);
ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
- error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, whichfork);
if (error)
return error;
bp->b_ops = &xfs_da3_node_buf_ops;
@@ -363,9 +455,9 @@
}
ichdr.level = level;
- dp->d_ops->node_hdr_to_disk(node, &ichdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &ichdr);
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node, &node->hdr, args->geo->node_hdr_size));
*bpp = bp;
return 0;
@@ -504,6 +596,7 @@
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -516,6 +609,7 @@
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -568,7 +662,7 @@
dp = args->dp;
tp = args->trans;
- error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, args->whichfork);
if (error)
return error;
node = bp->b_addr;
@@ -577,8 +671,8 @@
oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
struct xfs_da3_icnode_hdr icnodehdr;
- dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
- btree = dp->d_ops->node_tree_p(oldroot);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &icnodehdr, oldroot);
+ btree = icnodehdr.btree;
size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
level = icnodehdr.level;
@@ -589,15 +683,14 @@
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
leaf = (xfs_dir2_leaf_t *)oldroot;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
- size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+ size = (int)((char *)&leafhdr.ents[leafhdr.count] -
+ (char *)leaf);
level = 0;
/*
@@ -637,14 +730,14 @@
return error;
node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
btree[0].hashval = cpu_to_be32(blk1->hashval);
btree[0].before = cpu_to_be32(blk1->blkno);
btree[1].hashval = cpu_to_be32(blk2->hashval);
btree[1].before = cpu_to_be32(blk2->blkno);
nodehdr.count = 2;
- dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
#ifdef DEBUG
if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
@@ -686,7 +779,7 @@
trace_xfs_da_node_split(state->args);
node = oldblk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
/*
* With V2 dirs the extra block is data or freespace.
@@ -733,7 +826,7 @@
* If we had double-split op below us, then add the extra block too.
*/
node = oldblk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
if (oldblk->index <= nodehdr.count) {
oldblk->index++;
xfs_da3_node_add(state, oldblk, addblk);
@@ -788,10 +881,10 @@
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
- dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
/*
* Figure out how many entries need to move, and in which direction.
@@ -804,10 +897,10 @@
tmpnode = node1;
node1 = node2;
node2 = tmpnode;
- dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
- dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
swap = 1;
}
@@ -869,14 +962,15 @@
/*
* Log header of node 1 and all current bits of node 2.
*/
- dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node1, &nodehdr1);
xfs_trans_log_buf(tp, blk1->bp,
- XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node1, &node1->hdr,
+ state->args->geo->node_hdr_size));
- dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node2, &nodehdr2);
xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
- dp->d_ops->node_hdr_size +
+ state->args->geo->node_hdr_size +
(sizeof(btree2[0]) * nodehdr2.count)));
/*
@@ -886,10 +980,10 @@
if (swap) {
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
- dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
}
blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
@@ -921,8 +1015,8 @@
trace_xfs_da_node_add(state->args);
node = oldblk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
@@ -945,9 +1039,10 @@
tmp + sizeof(*btree)));
nodehdr.count += 1;
- dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node, &node->hdr,
+ state->args->geo->node_hdr_size));
/*
* Copy the last hash value from the oldblk to propagate upwards.
@@ -1082,7 +1177,6 @@
xfs_dablk_t child;
struct xfs_buf *bp;
struct xfs_da3_icnode_hdr oldroothdr;
- struct xfs_da_node_entry *btree;
int error;
struct xfs_inode *dp = state->args->dp;
@@ -1092,7 +1186,7 @@
args = state->args;
oldroot = root_blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &oldroothdr, oldroot);
ASSERT(oldroothdr.forw == 0);
ASSERT(oldroothdr.back == 0);
@@ -1106,11 +1200,9 @@
* Read in the (only) child block, then copy those bytes into
* the root block's buffer and free the original child block.
*/
- btree = dp->d_ops->node_tree_p(oldroot);
- child = be32_to_cpu(btree[0].before);
+ child = be32_to_cpu(oldroothdr.btree[0].before);
ASSERT(child != 0);
- error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
- args->whichfork);
+ error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork);
if (error)
return error;
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
@@ -1172,7 +1264,7 @@
blk = &state->path.blk[ state->path.active-1 ];
info = blk->bp->b_addr;
node = (xfs_da_intnode_t *)info;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return 0; /* blk over 50%, don't try to join */
@@ -1224,13 +1316,13 @@
blkno = nodehdr.back;
if (blkno == 0)
continue;
- error = xfs_da3_node_read(state->args->trans, dp,
- blkno, -1, &bp, state->args->whichfork);
+ error = xfs_da3_node_read(state->args->trans, dp, blkno, &bp,
+ state->args->whichfork);
if (error)
return error;
node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&thdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node);
xfs_trans_brelse(state->args->trans, bp);
if (count - thdr.count >= 0)
@@ -1272,18 +1364,14 @@
struct xfs_buf *bp,
int *count)
{
- struct xfs_da_intnode *node;
- struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
- node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr);
if (count)
*count = nodehdr.count;
if (!nodehdr.count)
return 0;
- btree = dp->d_ops->node_tree_p(node);
- return be32_to_cpu(btree[nodehdr.count - 1].hashval);
+ return be32_to_cpu(nodehdr.btree[nodehdr.count - 1].hashval);
}
/*
@@ -1328,8 +1416,8 @@
struct xfs_da3_icnode_hdr nodehdr;
node = blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
break;
blk->hashval = lasthash;
@@ -1360,7 +1448,7 @@
trace_xfs_da_node_remove(state->args);
node = drop_blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
ASSERT(drop_blk->index < nodehdr.count);
ASSERT(drop_blk->index >= 0);
@@ -1368,7 +1456,7 @@
* Copy over the offending entry, or just zero it out.
*/
index = drop_blk->index;
- btree = dp->d_ops->node_tree_p(node);
+ btree = nodehdr.btree;
if (index < nodehdr.count - 1) {
tmp = nodehdr.count - index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
@@ -1381,9 +1469,9 @@
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
nodehdr.count -= 1;
- dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size));
/*
* Copy the last hash value from the block to propagate upwards.
@@ -1416,10 +1504,10 @@
drop_node = drop_blk->bp->b_addr;
save_node = save_blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
- dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
- drop_btree = dp->d_ops->node_tree_p(drop_node);
- save_btree = dp->d_ops->node_tree_p(save_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &drop_hdr, drop_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &save_hdr, save_node);
+ drop_btree = drop_hdr.btree;
+ save_btree = save_hdr.btree;
tp = state->args->trans;
/*
@@ -1453,10 +1541,10 @@
memcpy(&save_btree[sindex], &drop_btree[0], tmp);
save_hdr.count += drop_hdr.count;
- dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, save_node, &save_hdr);
xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
- dp->d_ops->node_hdr_size));
+ state->args->geo->node_hdr_size));
/*
* Save the last hashval in the remaining block for upward propagation.
@@ -1517,7 +1605,7 @@
*/
blk->blkno = blkno;
error = xfs_da3_node_read(args->trans, args->dp, blkno,
- -1, &blk->bp, args->whichfork);
+ &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
state->path.active--;
@@ -1541,8 +1629,10 @@
break;
}
- if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC)
+ if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
+ }
blk->magic = XFS_DA_NODE_MAGIC;
@@ -1550,19 +1640,22 @@
* Search an intermediate node for a match.
*/
node = blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
/* Tree taller than we can handle; bail out! */
- if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
+ }
/* Check the level from the root. */
if (blkno == args->geo->leafblk)
expected_level = nodehdr.level - 1;
- else if (expected_level != nodehdr.level)
+ else if (expected_level != nodehdr.level) {
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
- else
+ } else
expected_level--;
max = nodehdr.count;
@@ -1612,11 +1705,11 @@
}
/* We can't point back to the root. */
- if (blkno == args->geo->leafblk)
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
return -EFSCORRUPTED;
}
- if (expected_level != 0)
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
return -EFSCORRUPTED;
/*
@@ -1678,10 +1771,10 @@
node1 = node1_bp->b_addr;
node2 = node2_bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
- dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node1hdr, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node2hdr, node2);
+ btree1 = node1hdr.btree;
+ btree2 = node2hdr.btree;
if (node1hdr.count > 0 && node2hdr.count > 0 &&
((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
@@ -1746,7 +1839,7 @@
if (old_info->back) {
error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->back),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1767,7 +1860,7 @@
if (old_info->forw) {
error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->forw),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1826,7 +1919,7 @@
if (drop_info->back) {
error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->back),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1843,7 +1936,7 @@
if (drop_info->forw) {
error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->forw),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1878,7 +1971,6 @@
{
struct xfs_da_state_blk *blk;
struct xfs_da_blkinfo *info;
- struct xfs_da_intnode *node;
struct xfs_da_args *args;
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
@@ -1900,18 +1992,18 @@
ASSERT(path != NULL);
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
- for (blk = &path->blk[level]; level >= 0; blk--, level--) {
- node = blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ for (; level >= 0; level--) {
+ blk = &path->blk[level];
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ blk->bp->b_addr);
if (forward && (blk->index < nodehdr.count - 1)) {
blk->index++;
- blkno = be32_to_cpu(btree[blk->index].before);
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
break;
} else if (!forward && (blk->index > 0)) {
blk->index--;
- blkno = be32_to_cpu(btree[blk->index].before);
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
break;
}
}
@@ -1929,7 +2021,7 @@
/*
* Read the next child block into a local buffer.
*/
- error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
+ error = xfs_da3_node_read(args->trans, dp, blkno, &bp,
args->whichfork);
if (error)
return error;
@@ -1962,9 +2054,9 @@
case XFS_DA_NODE_MAGIC:
case XFS_DA3_NODE_MAGIC:
blk->magic = XFS_DA_NODE_MAGIC;
- node = (xfs_da_intnode_t *)info;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ bp->b_addr);
+ btree = nodehdr.btree;
blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
if (forward)
blk->index = 0;
@@ -2044,18 +2136,6 @@
XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
}
-static xfs_dahash_t
-xfs_default_hashname(
- struct xfs_name *name)
-{
- return xfs_da_hashname(name->name, name->len);
-}
-
-const struct xfs_nameops xfs_default_nameops = {
- .hashname = xfs_default_hashname,
- .compname = xfs_da_compname
-};
-
int
xfs_da_grow_inode_int(
struct xfs_da_args *args,
@@ -2213,16 +2293,13 @@
error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
- if (unlikely(lastoff == 0)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
- mp);
+ if (XFS_IS_CORRUPT(mp, lastoff == 0))
return -EFSCORRUPTED;
- }
/*
* Read the last block in the btree space.
*/
last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
- error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+ error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w);
if (error)
return error;
/*
@@ -2240,16 +2317,17 @@
struct xfs_dir2_leaf_entry *ents;
dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
- ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr,
+ dead_leaf2);
+ ents = leafhdr.ents;
dead_level = 0;
dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
} else {
struct xfs_da3_icnode_hdr deadhdr;
dead_node = (xfs_da_intnode_t *)dead_info;
- dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
- btree = dp->d_ops->node_tree_p(dead_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &deadhdr, dead_node);
+ btree = deadhdr.btree;
dead_level = deadhdr.level;
dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
}
@@ -2258,15 +2336,13 @@
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
- if (unlikely(
- be32_to_cpu(sib_info->forw) != last_blkno ||
- sib_info->magic != dead_info->magic)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->forw) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
error = -EFSCORRUPTED;
goto done;
}
@@ -2280,15 +2356,13 @@
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
- if (unlikely(
- be32_to_cpu(sib_info->back) != last_blkno ||
- sib_info->magic != dead_info->magic)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->back) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
error = -EFSCORRUPTED;
goto done;
}
@@ -2304,27 +2378,24 @@
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
- if (level >= 0 && level != par_hdr.level + 1) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp,
+ level >= 0 && level != par_hdr.level + 1)) {
error = -EFSCORRUPTED;
goto done;
}
level = par_hdr.level;
- btree = dp->d_ops->node_tree_p(par_node);
+ btree = par_hdr.btree;
for (entno = 0;
entno < par_hdr.count &&
be32_to_cpu(btree[entno].hashval) < dead_hash;
entno++)
continue;
- if (entno == par_hdr.count) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
error = -EFSCORRUPTED;
goto done;
}
@@ -2349,24 +2420,20 @@
par_blkno = par_hdr.forw;
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
- if (unlikely(par_blkno == 0)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
error = -EFSCORRUPTED;
goto done;
}
- error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
- if (par_hdr.level != level) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
error = -EFSCORRUPTED;
goto done;
}
- btree = dp->d_ops->node_tree_p(par_node);
+ btree = par_hdr.btree;
entno = 0;
}
/*
@@ -2429,159 +2496,86 @@
return error;
}
-/*
- * See if the mapping(s) for this btree block are valid, i.e.
- * don't contain holes, are logically contiguous, and cover the whole range.
- */
-STATIC int
-xfs_da_map_covers_blocks(
- int nmap,
- xfs_bmbt_irec_t *mapp,
- xfs_dablk_t bno,
- int count)
-{
- int i;
- xfs_fileoff_t off;
-
- for (i = 0, off = bno; i < nmap; i++) {
- if (mapp[i].br_startblock == HOLESTARTBLOCK ||
- mapp[i].br_startblock == DELAYSTARTBLOCK) {
- return 0;
- }
- if (off != mapp[i].br_startoff) {
- return 0;
- }
- off += mapp[i].br_blockcount;
- }
- return off == bno + count;
-}
-
-/*
- * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
- *
- * For the single map case, it is assumed that the caller has provided a pointer
- * to a valid xfs_buf_map. For the multiple map case, this function will
- * allocate the xfs_buf_map to hold all the maps and replace the caller's single
- * map pointer with the allocated map.
- */
-static int
-xfs_buf_map_from_irec(
- struct xfs_mount *mp,
- struct xfs_buf_map **mapp,
- int *nmaps,
- struct xfs_bmbt_irec *irecs,
- int nirecs)
-{
- struct xfs_buf_map *map;
- int i;
-
- ASSERT(*nmaps == 1);
- ASSERT(nirecs >= 1);
-
- if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
- KM_NOFS);
- if (!map)
- return -ENOMEM;
- *mapp = map;
- }
-
- *nmaps = nirecs;
- map = *mapp;
- for (i = 0; i < *nmaps; i++) {
- ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
- irecs[i].br_startblock != HOLESTARTBLOCK);
- map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
- map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
- }
- return 0;
-}
-
-/*
- * Map the block we are given ready for reading. There are three possible return
- * values:
- * -1 - will be returned if we land in a hole and mappedbno == -2 so the
- * caller knows not to execute a subsequent read.
- * 0 - if we mapped the block successfully
- * >0 - positive error number if there was an error.
- */
static int
xfs_dabuf_map(
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
int whichfork,
- struct xfs_buf_map **map,
+ struct xfs_buf_map **mapp,
int *nmaps)
{
struct xfs_mount *mp = dp->i_mount;
- int nfsb;
- int error = 0;
- struct xfs_bmbt_irec irec;
- struct xfs_bmbt_irec *irecs = &irec;
- int nirecs;
+ int nfsb = xfs_dabuf_nfsb(mp, whichfork);
+ struct xfs_bmbt_irec irec, *irecs = &irec;
+ struct xfs_buf_map *map = *mapp;
+ xfs_fileoff_t off = bno;
+ int error = 0, nirecs, i;
- ASSERT(map && *map);
- ASSERT(*nmaps == 1);
+ if (nfsb > 1)
+ irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
- if (whichfork == XFS_DATA_FORK)
- nfsb = mp->m_dir_geo->fsbcount;
- else
- nfsb = mp->m_attr_geo->fsbcount;
+ nirecs = nfsb;
+ error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
+ xfs_bmapi_aflag(whichfork));
+ if (error)
+ goto out_free_irecs;
/*
- * Caller doesn't have a mapping. -2 means don't complain
- * if we land in a hole.
+ * Use the caller provided map for the single map case, else allocate a
+ * larger one that needs to be freed by the caller.
*/
- if (mappedbno == -1 || mappedbno == -2) {
- /*
- * Optimize the one-block case.
- */
- if (nfsb != 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb,
- KM_NOFS);
-
- nirecs = nfsb;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
- &nirecs, xfs_bmapi_aflag(whichfork));
- if (error)
- goto out;
- } else {
- irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
- irecs->br_startoff = (xfs_fileoff_t)bno;
- irecs->br_blockcount = nfsb;
- irecs->br_state = 0;
- nirecs = 1;
- }
-
- if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
- error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
- if (unlikely(error == -EFSCORRUPTED)) {
- if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
- int i;
- xfs_alert(mp, "%s: bno %lld dir: inode %lld",
- __func__, (long long)bno,
- (long long)dp->i_ino);
- for (i = 0; i < *nmaps; i++) {
- xfs_alert(mp,
-"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
- i,
- (long long)irecs[i].br_startoff,
- (long long)irecs[i].br_startblock,
- (long long)irecs[i].br_blockcount,
- irecs[i].br_state);
- }
- }
- XFS_ERROR_REPORT("xfs_da_do_buf(1)",
- XFS_ERRLEVEL_LOW, mp);
+ if (nirecs > 1) {
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+ if (!map) {
+ error = -ENOMEM;
+ goto out_free_irecs;
}
- goto out;
+ *mapp = map;
}
- error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
-out:
+
+ for (i = 0; i < nirecs; i++) {
+ if (irecs[i].br_startblock == HOLESTARTBLOCK ||
+ irecs[i].br_startblock == DELAYSTARTBLOCK)
+ goto invalid_mapping;
+ if (off != irecs[i].br_startoff)
+ goto invalid_mapping;
+
+ map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
+ map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+ off += irecs[i].br_blockcount;
+ }
+
+ if (off != bno + nfsb)
+ goto invalid_mapping;
+
+ *nmaps = nirecs;
+out_free_irecs:
if (irecs != &irec)
kmem_free(irecs);
return error;
+
+invalid_mapping:
+ /* Caller ok with no mapping. */
+ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ error = -EFSCORRUPTED;
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "%s: bno %u inode %llu",
+ __func__, bno, dp->i_ino);
+
+ for (i = 0; i < nirecs; i++) {
+ xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
+ i, irecs[i].br_startoff,
+ irecs[i].br_startblock,
+ irecs[i].br_blockcount,
+ irecs[i].br_state);
+ }
+ }
+ } else {
+ *nmaps = 0;
+ }
+ goto out_free_irecs;
}
/*
@@ -2589,39 +2583,26 @@
*/
int
xfs_da_get_buf(
- struct xfs_trans *trans,
+ struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp,
int whichfork)
{
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
- struct xfs_buf_map map;
- struct xfs_buf_map *mapp;
- int nmap;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
int error;
*bpp = NULL;
- mapp = &map;
- nmap = 1;
- error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
- &mapp, &nmap);
- if (error) {
- /* mapping a hole is not an error, but we don't continue */
- if (error == -1)
- error = 0;
+ error = xfs_dabuf_map(dp, bno, 0, whichfork, &mapp, &nmap);
+ if (error || nmap == 0)
goto out_free;
- }
- bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
- mapp, nmap, 0);
- error = bp ? bp->b_error : -EIO;
- if (error) {
- if (bp)
- xfs_trans_brelse(trans, bp);
+ error = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0, &bp);
+ if (error)
goto out_free;
- }
*bpp = bp;
@@ -2637,35 +2618,27 @@
*/
int
xfs_da_read_buf(
- struct xfs_trans *trans,
+ struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
struct xfs_buf **bpp,
int whichfork,
const struct xfs_buf_ops *ops)
{
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
- struct xfs_buf_map map;
- struct xfs_buf_map *mapp;
- int nmap;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
int error;
*bpp = NULL;
- mapp = &map;
- nmap = 1;
- error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
- &mapp, &nmap);
- if (error) {
- /* mapping a hole is not an error, but we don't continue */
- if (error == -1)
- error = 0;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
goto out_free;
- }
- error = xfs_trans_read_buf_map(dp->i_mount, trans,
- dp->i_mount->m_ddev_targp,
- mapp, nmap, 0, &bp, ops);
+ error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
+ &bp, ops);
if (error)
goto out_free;
@@ -2688,7 +2661,7 @@
xfs_da_reada_buf(
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
int whichfork,
const struct xfs_buf_ops *ops)
{
@@ -2699,16 +2672,10 @@
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
- &mapp, &nmap);
- if (error) {
- /* mapping a hole is not an error, but we don't continue */
- if (error == -1)
- error = 0;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
goto out_free;
- }
- mappedbno = mapp[0].bm_bn;
xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
out_free:
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ae0bbd2..ad5dd32 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
@@ -10,7 +10,6 @@
struct xfs_inode;
struct xfs_trans;
struct zone;
-struct xfs_dir_ops;
/*
* Directory/attribute geometry information. There will be one of these for each
@@ -18,15 +17,23 @@
* structures will be attached to the xfs_mount.
*/
struct xfs_da_geometry {
- int blksize; /* da block size in bytes */
- int fsbcount; /* da block size in filesystem blocks */
+ unsigned int blksize; /* da block size in bytes */
+ unsigned int fsbcount; /* da block size in filesystem blocks */
uint8_t fsblog; /* log2 of _filesystem_ block size */
uint8_t blklog; /* log2 of da block size */
- uint node_ents; /* # of entries in a danode */
- int magicpct; /* 37% of block size in bytes */
+ unsigned int node_hdr_size; /* danode header size in bytes */
+ unsigned int node_ents; /* # of entries in a danode */
+ unsigned int magicpct; /* 37% of block size in bytes */
xfs_dablk_t datablk; /* blockno of dir data v2 */
+ unsigned int leaf_hdr_size; /* dir2 leaf header size */
+ unsigned int leaf_max_ents; /* # of entries in dir2 leaf */
xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ unsigned int free_hdr_size; /* dir2 free header size */
+ unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
};
/*========================================================================
@@ -50,9 +57,10 @@
const uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
uint8_t filetype; /* filetype of inode for directories */
- uint8_t *value; /* set of bytes (maybe contain NULLs) */
+ void *value; /* set of bytes (maybe contain NULLs) */
int valuelen; /* length of value */
- int flags; /* argument flags (eg: ATTR_NOCREATE) */
+ unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
+ unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
xfs_dahash_t hashval; /* hash value of name */
xfs_ino_t inumber; /* input/output inode number */
struct xfs_inode *dp; /* directory inode to manipulate */
@@ -81,7 +89,7 @@
#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
+#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -89,7 +97,7 @@
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -125,6 +133,25 @@
} xfs_da_state_t;
/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t level;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_da_node_entry *btree;
+};
+
+/*
* Utility macros to aid in logging changed structure fields.
*/
#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
@@ -132,16 +159,6 @@
(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
-/*
- * Name ops for directory and/or attr name operations
- */
-struct xfs_nameops {
- xfs_dahash_t (*hashname)(struct xfs_name *);
- enum xfs_dacmp (*compname)(struct xfs_da_args *,
- const unsigned char *, int);
-};
-
-
/*========================================================================
* Function prototypes.
*========================================================================*/
@@ -172,25 +189,28 @@
int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp, int which_fork);
+ xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_daddr_t mappedbno, struct xfs_buf **bpp,
+ int whichfork);
/*
* Utility routines.
*/
+
+#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
int count);
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bp, int whichfork);
+ xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp, int whichfork,
- const struct xfs_buf_ops *ops);
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
+ int whichfork, const struct xfs_buf_ops *ops);
int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
- xfs_daddr_t mapped_bno, int whichfork,
- const struct xfs_buf_ops *ops);
+ unsigned int flags, int whichfork,
+ const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
@@ -199,10 +219,14 @@
const unsigned char *name, int len);
-xfs_da_state_t *xfs_da_state_alloc(void);
+struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
+void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
+ struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
+
extern struct kmem_zone *xfs_da_state_zone;
-extern const struct xfs_nameops xfs_default_nameops;
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
deleted file mode 100644
index b1ae572..0000000
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ /dev/null
@@ -1,888 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_dir2.h"
-
-/*
- * Shortform directory ops
- */
-static int
-xfs_dir2_sf_entsize(
- struct xfs_dir2_sf_hdr *hdr,
- int len)
-{
- int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
-
- count += len; /* name */
- count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
- return count;
-}
-
-static int
-xfs_dir3_sf_entsize(
- struct xfs_dir2_sf_hdr *hdr,
- int len)
-{
- return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t);
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
-}
-
-
-/*
- * For filetype enabled shortform directories, the file type field is stored at
- * the end of the name. Because it's only a single byte, endian conversion is
- * not necessary. For non-filetype enable directories, the type is always
- * unknown and we never store the value.
- */
-static uint8_t
-xfs_dir2_sfe_get_ftype(
- struct xfs_dir2_sf_entry *sfep)
-{
- return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_sfe_put_ftype(
- struct xfs_dir2_sf_entry *sfep,
- uint8_t ftype)
-{
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static uint8_t
-xfs_dir3_sfe_get_ftype(
- struct xfs_dir2_sf_entry *sfep)
-{
- uint8_t ftype;
-
- ftype = sfep->name[sfep->namelen];
- if (ftype >= XFS_DIR3_FT_MAX)
- return XFS_DIR3_FT_UNKNOWN;
- return ftype;
-}
-
-static void
-xfs_dir3_sfe_put_ftype(
- struct xfs_dir2_sf_entry *sfep,
- uint8_t ftype)
-{
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-
- sfep->name[sfep->namelen] = ftype;
-}
-
-/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide. These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode number the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- uint8_t *from)
-{
- if (hdr->i8count)
- return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
- else
- return get_unaligned_be32(from);
-}
-
-static void
-xfs_dir2_sf_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- uint8_t *to,
- xfs_ino_t ino)
-{
- ASSERT((ino & 0xff00000000000000ULL) == 0);
-
- if (hdr->i8count)
- put_unaligned_be64(ino, to);
- else
- put_unaligned_be32(ino, to);
-}
-
-static xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
- struct xfs_dir2_sf_hdr *hdr)
-{
- return xfs_dir2_sf_get_ino(hdr, hdr->parent);
-}
-
-static void
-xfs_dir2_sf_put_parent_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
-}
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. If the entry stores a filetype value, then it
- * sits between the name and the inode number. Hence the inode numbers may only
- * be accessed through the helpers below.
- */
-static xfs_ino_t
-xfs_dir2_sfe_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
-}
-
-static void
-xfs_dir2_sfe_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
-}
-
-static xfs_ino_t
-xfs_dir3_sfe_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
-}
-
-static void
-xfs_dir3_sfe_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
-}
-
-
-/*
- * Directory data block operations
- */
-
-/*
- * For special situations, the dirent size ends up fixed because we always know
- * what the size of the entry is. That's true for the "." and "..", and
- * therefore we know that they are a fixed size and hence their offsets are
- * constant, as is the first entry.
- *
- * Hence, this calculation is written as a macro to be able to be calculated at
- * compile time and so certain offsets can be calculated directly in the
- * structure initaliser via the macro. There are two macros - one for dirents
- * with ftype and without so there are no unresolvable conditionals in the
- * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
- * of 2 and the compiler doesn't reject it (unlike roundup()).
- */
-#define XFS_DIR2_DATA_ENTSIZE(n) \
- round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
- sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
-
-#define XFS_DIR3_DATA_ENTSIZE(n) \
- round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
- sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \
- XFS_DIR2_DATA_ALIGN)
-
-static int
-xfs_dir2_data_entsize(
- int n)
-{
- return XFS_DIR2_DATA_ENTSIZE(n);
-}
-
-static int
-xfs_dir3_data_entsize(
- int n)
-{
- return XFS_DIR3_DATA_ENTSIZE(n);
-}
-
-static uint8_t
-xfs_dir2_data_get_ftype(
- struct xfs_dir2_data_entry *dep)
-{
- return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_data_put_ftype(
- struct xfs_dir2_data_entry *dep,
- uint8_t ftype)
-{
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static uint8_t
-xfs_dir3_data_get_ftype(
- struct xfs_dir2_data_entry *dep)
-{
- uint8_t ftype = dep->name[dep->namelen];
-
- if (ftype >= XFS_DIR3_FT_MAX)
- return XFS_DIR3_FT_UNKNOWN;
- return ftype;
-}
-
-static void
-xfs_dir3_data_put_ftype(
- struct xfs_dir2_data_entry *dep,
- uint8_t type)
-{
- ASSERT(type < XFS_DIR3_FT_MAX);
- ASSERT(dep->namelen != 0);
-
- dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static __be16 *
-xfs_dir2_data_entry_tag_p(
- struct xfs_dir2_data_entry *dep)
-{
- return (__be16 *)((char *)dep +
- xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-static __be16 *
-xfs_dir3_data_entry_tag_p(
- struct xfs_dir2_data_entry *dep)
-{
- return (__be16 *)((char *)dep +
- xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dotdot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_first_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1) +
- XFS_DIR2_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_dotdot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_first_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
- return hdr->bestfree;
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
- return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_unused *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_unused *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-
-/*
- * Directory Leaf block operations
- */
-static int
-xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
- (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
- return lp->__ents;
-}
-
-static int
-xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
- (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
- return ((struct xfs_dir3_leaf *)lp)->__ents;
-}
-
-static void
-xfs_dir2_leaf_hdr_from_disk(
- struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from)
-{
- to->forw = be32_to_cpu(from->hdr.info.forw);
- to->back = be32_to_cpu(from->hdr.info.back);
- to->magic = be16_to_cpu(from->hdr.info.magic);
- to->count = be16_to_cpu(from->hdr.count);
- to->stale = be16_to_cpu(from->hdr.stale);
-
- ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
- to->magic == XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir2_leaf_hdr_to_disk(
- struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from)
-{
- ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
- from->magic == XFS_DIR2_LEAFN_MAGIC);
-
- to->hdr.info.forw = cpu_to_be32(from->forw);
- to->hdr.info.back = cpu_to_be32(from->back);
- to->hdr.info.magic = cpu_to_be16(from->magic);
- to->hdr.count = cpu_to_be16(from->count);
- to->hdr.stale = cpu_to_be16(from->stale);
-}
-
-static void
-xfs_dir3_leaf_hdr_from_disk(
- struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from)
-{
- struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
-
- to->forw = be32_to_cpu(hdr3->info.hdr.forw);
- to->back = be32_to_cpu(hdr3->info.hdr.back);
- to->magic = be16_to_cpu(hdr3->info.hdr.magic);
- to->count = be16_to_cpu(hdr3->count);
- to->stale = be16_to_cpu(hdr3->stale);
-
- ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
- to->magic == XFS_DIR3_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leaf_hdr_to_disk(
- struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from)
-{
- struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
-
- ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
- from->magic == XFS_DIR3_LEAFN_MAGIC);
-
- hdr3->info.hdr.forw = cpu_to_be32(from->forw);
- hdr3->info.hdr.back = cpu_to_be32(from->back);
- hdr3->info.hdr.magic = cpu_to_be16(from->magic);
- hdr3->count = cpu_to_be16(from->count);
- hdr3->stale = cpu_to_be16(from->stale);
-}
-
-
-/*
- * Directory/Attribute Node block operations
- */
-static struct xfs_da_node_entry *
-xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
-{
- return dap->__btree;
-}
-
-static struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
- return ((struct xfs_da3_intnode *)dap)->__btree;
-}
-
-static void
-xfs_da2_node_hdr_from_disk(
- struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from)
-{
- ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- to->forw = be32_to_cpu(from->hdr.info.forw);
- to->back = be32_to_cpu(from->hdr.info.back);
- to->magic = be16_to_cpu(from->hdr.info.magic);
- to->count = be16_to_cpu(from->hdr.__count);
- to->level = be16_to_cpu(from->hdr.__level);
-}
-
-static void
-xfs_da2_node_hdr_to_disk(
- struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from)
-{
- ASSERT(from->magic == XFS_DA_NODE_MAGIC);
- to->hdr.info.forw = cpu_to_be32(from->forw);
- to->hdr.info.back = cpu_to_be32(from->back);
- to->hdr.info.magic = cpu_to_be16(from->magic);
- to->hdr.__count = cpu_to_be16(from->count);
- to->hdr.__level = cpu_to_be16(from->level);
-}
-
-static void
-xfs_da3_node_hdr_from_disk(
- struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from)
-{
- struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
-
- ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
- to->forw = be32_to_cpu(hdr3->info.hdr.forw);
- to->back = be32_to_cpu(hdr3->info.hdr.back);
- to->magic = be16_to_cpu(hdr3->info.hdr.magic);
- to->count = be16_to_cpu(hdr3->__count);
- to->level = be16_to_cpu(hdr3->__level);
-}
-
-static void
-xfs_da3_node_hdr_to_disk(
- struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from)
-{
- struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
-
- ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
- hdr3->info.hdr.forw = cpu_to_be32(from->forw);
- hdr3->info.hdr.back = cpu_to_be32(from->back);
- hdr3->info.hdr.magic = cpu_to_be16(from->magic);
- hdr3->__count = cpu_to_be16(from->count);
- hdr3->__level = cpu_to_be16(from->level);
-}
-
-
-/*
- * Directory free space block operations
- */
-static int
-xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
- sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
-{
- return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
- (db / xfs_dir2_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return db % xfs_dir2_free_max_bests(geo);
-}
-
-static int
-xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
- sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
-{
- return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
- (db / xfs_dir3_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return db % xfs_dir3_free_max_bests(geo);
-}
-
-static void
-xfs_dir2_free_hdr_from_disk(
- struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from)
-{
- to->magic = be32_to_cpu(from->hdr.magic);
- to->firstdb = be32_to_cpu(from->hdr.firstdb);
- to->nvalid = be32_to_cpu(from->hdr.nvalid);
- to->nused = be32_to_cpu(from->hdr.nused);
- ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
-}
-
-static void
-xfs_dir2_free_hdr_to_disk(
- struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from)
-{
- ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
-
- to->hdr.magic = cpu_to_be32(from->magic);
- to->hdr.firstdb = cpu_to_be32(from->firstdb);
- to->hdr.nvalid = cpu_to_be32(from->nvalid);
- to->hdr.nused = cpu_to_be32(from->nused);
-}
-
-static void
-xfs_dir3_free_hdr_from_disk(
- struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from)
-{
- struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
-
- to->magic = be32_to_cpu(hdr3->hdr.magic);
- to->firstdb = be32_to_cpu(hdr3->firstdb);
- to->nvalid = be32_to_cpu(hdr3->nvalid);
- to->nused = be32_to_cpu(hdr3->nused);
-
- ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
-}
-
-static void
-xfs_dir3_free_hdr_to_disk(
- struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from)
-{
- struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
-
- ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
-
- hdr3->hdr.magic = cpu_to_be32(from->magic);
- hdr3->firstdb = cpu_to_be32(from->firstdb);
- hdr3->nvalid = cpu_to_be32(from->nvalid);
- hdr3->nused = cpu_to_be32(from->nused);
-}
-
-static const struct xfs_dir_ops xfs_dir2_ops = {
- .sf_entsize = xfs_dir2_sf_entsize,
- .sf_nextentry = xfs_dir2_sf_nextentry,
- .sf_get_ftype = xfs_dir2_sfe_get_ftype,
- .sf_put_ftype = xfs_dir2_sfe_put_ftype,
- .sf_get_ino = xfs_dir2_sfe_get_ino,
- .sf_put_ino = xfs_dir2_sfe_put_ino,
- .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
- .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
- .data_entsize = xfs_dir2_data_entsize,
- .data_get_ftype = xfs_dir2_data_get_ftype,
- .data_put_ftype = xfs_dir2_data_put_ftype,
- .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
- .data_bestfree_p = xfs_dir2_data_bestfree_p,
-
- .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
- .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1),
- .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1) +
- XFS_DIR2_DATA_ENTSIZE(2),
- .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
- .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
- .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
- .data_first_entry_p = xfs_dir2_data_first_entry_p,
- .data_entry_p = xfs_dir2_data_entry_p,
- .data_unused_p = xfs_dir2_data_unused_p,
-
- .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
- .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
- .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
- .leaf_max_ents = xfs_dir2_max_leaf_ents,
- .leaf_ents_p = xfs_dir2_leaf_ents_p,
-
- .node_hdr_size = sizeof(struct xfs_da_node_hdr),
- .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
- .node_tree_p = xfs_da2_node_tree_p,
-
- .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
- .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
- .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
- .free_max_bests = xfs_dir2_free_max_bests,
- .free_bests_p = xfs_dir2_free_bests_p,
- .db_to_fdb = xfs_dir2_db_to_fdb,
- .db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
- .sf_entsize = xfs_dir3_sf_entsize,
- .sf_nextentry = xfs_dir3_sf_nextentry,
- .sf_get_ftype = xfs_dir3_sfe_get_ftype,
- .sf_put_ftype = xfs_dir3_sfe_put_ftype,
- .sf_get_ino = xfs_dir3_sfe_get_ino,
- .sf_put_ino = xfs_dir3_sfe_put_ino,
- .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
- .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
- .data_entsize = xfs_dir3_data_entsize,
- .data_get_ftype = xfs_dir3_data_get_ftype,
- .data_put_ftype = xfs_dir3_data_put_ftype,
- .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
- .data_bestfree_p = xfs_dir2_data_bestfree_p,
-
- .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
- .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1),
- .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2),
- .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
- .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
- .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
- .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
- .data_entry_p = xfs_dir2_data_entry_p,
- .data_unused_p = xfs_dir2_data_unused_p,
-
- .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
- .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
- .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
- .leaf_max_ents = xfs_dir2_max_leaf_ents,
- .leaf_ents_p = xfs_dir2_leaf_ents_p,
-
- .node_hdr_size = sizeof(struct xfs_da_node_hdr),
- .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
- .node_tree_p = xfs_da2_node_tree_p,
-
- .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
- .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
- .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
- .free_max_bests = xfs_dir2_free_max_bests,
- .free_bests_p = xfs_dir2_free_bests_p,
- .db_to_fdb = xfs_dir2_db_to_fdb,
- .db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir3_ops = {
- .sf_entsize = xfs_dir3_sf_entsize,
- .sf_nextentry = xfs_dir3_sf_nextentry,
- .sf_get_ftype = xfs_dir3_sfe_get_ftype,
- .sf_put_ftype = xfs_dir3_sfe_put_ftype,
- .sf_get_ino = xfs_dir3_sfe_get_ino,
- .sf_put_ino = xfs_dir3_sfe_put_ino,
- .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
- .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
- .data_entsize = xfs_dir3_data_entsize,
- .data_get_ftype = xfs_dir3_data_get_ftype,
- .data_put_ftype = xfs_dir3_data_put_ftype,
- .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
- .data_bestfree_p = xfs_dir3_data_bestfree_p,
-
- .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
- .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1),
- .data_first_offset = sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2),
- .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
-
- .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
- .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
- .data_first_entry_p = xfs_dir3_data_first_entry_p,
- .data_entry_p = xfs_dir3_data_entry_p,
- .data_unused_p = xfs_dir3_data_unused_p,
-
- .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
- .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
- .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
- .leaf_max_ents = xfs_dir3_max_leaf_ents,
- .leaf_ents_p = xfs_dir3_leaf_ents_p,
-
- .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
- .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
- .node_tree_p = xfs_da3_node_tree_p,
-
- .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
- .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
- .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
- .free_max_bests = xfs_dir3_free_max_bests,
- .free_bests_p = xfs_dir3_free_bests_p,
- .db_to_fdb = xfs_dir3_db_to_fdb,
- .db_to_fdindex = xfs_dir3_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
- .node_hdr_size = sizeof(struct xfs_da_node_hdr),
- .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
- .node_tree_p = xfs_da2_node_tree_p,
-};
-
-static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
- .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
- .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
- .node_tree_p = xfs_da3_node_tree_p,
-};
-
-/*
- * Return the ops structure according to the current config. If we are passed
- * an inode, then that overrides the default config we use which is based on
- * feature bits.
- */
-const struct xfs_dir_ops *
-xfs_dir_get_ops(
- struct xfs_mount *mp,
- struct xfs_inode *dp)
-{
- if (dp)
- return dp->d_ops;
- if (mp->m_dir_inode_ops)
- return mp->m_dir_inode_ops;
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return &xfs_dir3_ops;
- if (xfs_sb_version_hasftype(&mp->m_sb))
- return &xfs_dir2_ftype_ops;
- return &xfs_dir2_ops;
-}
-
-const struct xfs_dir_ops *
-xfs_nondir_get_ops(
- struct xfs_mount *mp,
- struct xfs_inode *dp)
-{
- if (dp)
- return dp->d_ops;
- if (mp->m_nondir_inode_ops)
- return mp->m_nondir_inode_ops;
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return &xfs_dir3_nondir_ops;
- return &xfs_dir2_nondir_ops;
-}
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index ae654e0..b876b44 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
@@ -15,8 +15,8 @@
*/
#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
typedef struct xfs_da_blkinfo {
__be32 forw; /* previous block in list */
@@ -35,8 +35,8 @@
*/
#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
-#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */
struct xfs_da3_blkinfo {
/*
@@ -61,7 +61,7 @@
* Since we have duplicate keys, use a binary search but always follow
* all match in the block, not just the first match found.
*/
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
typedef struct xfs_da_node_hdr {
struct xfs_da_blkinfo info; /* block type, links, etc. */
@@ -94,19 +94,6 @@
};
/*
- * In-core version of the node header to abstract the differences in the v2 and
- * v3 disk format of the headers. Callers need to convert to/from disk format as
- * appropriate.
- */
-struct xfs_da3_icnode_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t level;
-};
-
-/*
* Directory version 2.
*
* There are 4 possible formats:
@@ -230,7 +217,7 @@
* A 64-bit or 32-bit inode number follows here, at a variable offset
* after the name.
*/
-} xfs_dir2_sf_entry_t;
+} __packed xfs_dir2_sf_entry_t;
static inline int xfs_dir2_sf_hdr_size(int i8count)
{
@@ -434,14 +421,6 @@
__be32 pad; /* 64 bit alignment */
};
-struct xfs_dir3_icleaf_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t stale;
-};
-
/*
* Leaf block entry.
*/
@@ -482,7 +461,7 @@
}
/*
- * Free space block defintions for the node format.
+ * Free space block definitions for the node format.
*/
/*
@@ -521,19 +500,6 @@
#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
/*
- * In core version of the free block header, abstracted away from on-disk format
- * differences. Use this in the code, and convert to/from the disk version using
- * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
- */
-struct xfs_dir3_icfree_hdr {
- uint32_t magic;
- uint32_t firstdb;
- uint32_t nvalid;
- uint32_t nused;
-
-};
-
-/*
* Single block format.
*
* The single block format looks like the following drawing on disk:
@@ -613,7 +579,7 @@
/*
* Entries are packed toward the top as tight as possible.
*/
-typedef struct xfs_attr_shortform {
+struct xfs_attr_shortform {
struct xfs_attr_sf_hdr { /* constant-structure header block */
__be16 totsize; /* total bytes in shortform list */
__u8 count; /* count of active entries */
@@ -623,9 +589,9 @@
uint8_t namelen; /* actual length of name (no NULL) */
uint8_t valuelen; /* actual length of value (no NULL) */
uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[1]; /* name & value bytes concatenated */
+ uint8_t nameval[]; /* name & value bytes concatenated */
} list[1]; /* variable sized array */
-} xfs_attr_shortform_t;
+};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
@@ -710,29 +676,6 @@
};
/*
- * incore, neutral version of the attribute leaf header
- */
-struct xfs_attr3_icleaf_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t usedbytes;
- /*
- * firstused is 32-bit here instead of 16-bit like the on-disk variant
- * to support maximum fsb size of 64k without overflow issues throughout
- * the attr code. Instead, the overflow condition is handled on
- * conversion to/from disk.
- */
- uint32_t firstused;
- __u8 holes;
- struct {
- uint16_t base;
- uint16_t size;
- } freemap[XFS_ATTR_LEAF_MAPSIZE];
-};
-
-/*
* Special value to represent fs block size in the leaf header firstused field.
* Only used when block size overflows the 2-bytes available on disk.
*/
@@ -740,8 +683,6 @@
/*
* Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
*/
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
@@ -751,19 +692,7 @@
#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
- ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
- ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
/*
* Alignment for namelist and valuelist entries (since they are mixed
@@ -817,14 +746,14 @@
*/
static inline int xfs_attr_leaf_entsize_remote(int nlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 +
+ nlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+ return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 +
+ nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN);
}
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 2255752..eff4a12 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -16,6 +16,8 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
/*
* Deferred Operations in XFS
@@ -178,6 +180,19 @@
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
};
+/*
+ * Ask the op type of @dfp to create a log intent item covering its queued work
+ * items, unless @dfp already has an intent attached. @sort is handed through
+ * to ->create_intent unchanged (presumably it asks for the work items to be
+ * sorted before they are logged -- confirm against the op implementations).
+ */
+static void
+xfs_defer_create_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp,
+ bool sort)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+
+ /* An existing intent is kept as-is; only a missing one is created. */
+ if (!dfp->dfp_intent)
+ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+ dfp->dfp_count, sort);
+}
+
/*
* For each pending item in the intake list, log its intent item and the
* associated extents, then add the entire intake list to the end of
@@ -187,17 +202,11 @@
xfs_defer_create_intents(
struct xfs_trans *tp)
{
- struct list_head *li;
struct xfs_defer_pending *dfp;
- const struct xfs_defer_op_type *ops;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
- list_for_each(li, &dfp->dfp_work)
- ops->log_item(tp, dfp->dfp_intent, li);
+ xfs_defer_create_intent(tp, dfp, true);
}
}
@@ -234,10 +243,13 @@
struct xfs_log_item *lip;
struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS];
struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES];
+ unsigned int ordered = 0; /* bitmap */
int bpcount = 0, ipcount = 0;
int i;
int error;
+ BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS);
+
list_for_each_entry(lip, &tp->t_items, li_trans) {
switch (lip->li_type) {
case XFS_LI_BUF:
@@ -248,7 +260,10 @@
ASSERT(0);
return -EFSCORRUPTED;
}
- xfs_trans_dirty_buf(tp, bli->bli_buf);
+ if (bli->bli_flags & XFS_BLI_ORDERED)
+ ordered |= (1U << bpcount);
+ else
+ xfs_trans_dirty_buf(tp, bli->bli_buf);
bplist[bpcount++] = bli->bli_buf;
}
break;
@@ -289,6 +304,8 @@
/* Rejoin the buffers and dirty them so the log moves forward. */
for (i = 0; i < bpcount; i++) {
xfs_trans_bjoin(tp, bplist[i]);
+ if (ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, bplist[i]);
xfs_trans_bhold(tp, bplist[i]);
}
@@ -298,22 +315,6 @@
}
/*
- * Reset an already used dfops after finish.
- */
-static void
-xfs_defer_reset(
- struct xfs_trans *tp)
-{
- ASSERT(list_empty(&tp->t_dfops));
-
- /*
- * Low mode state transfers across transaction rolls to mirror dfops
- * lifetime. Clear it now that dfops is reset.
- */
- tp->t_flags &= ~XFS_TRANS_LOWMODE;
-}
-
-/*
* Free up any items left in the list.
*/
static void
@@ -346,6 +347,106 @@
}
/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item; and then log a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ /*
+ * Still no threshold after sampling: stop scanning, no
+ * further intents are relogged by this call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ /* Intents at or beyond the threshold LSN are left untouched. */
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ /* The pending item tracks whatever intent the relog hands back. */
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ /*
+ * Roll only if the transaction is dirty (presumably dirtied by the
+ * relogging above -- confirm); otherwise report success as-is.
+ */
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
+ * Log an intent-done item for @dfp and finish its work items one at a time.
+ *
+ * Returns 0 once every work item is finished; @dfp is then unlinked and freed.
+ * Returns -EAGAIN when ->finish_item wants a fresh transaction: the item is put
+ * back on @dfp and a replacement intent is logged. On -EAGAIN or any other
+ * error @dfp stays on its list. ->finish_cleanup, if set, runs in every case.
+ */
+static int
+xfs_defer_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ struct xfs_btree_cur *state = NULL;
+ struct list_head *li, *n;
+ /* Stays 0 (success) if dfp_work holds no items, so it is never read unset. */
+ int error = 0;
+
+ trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
+
+ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ /* Detach the item before handing it to ->finish_item. */
+ list_del(li);
+ dfp->dfp_count--;
+ error = ops->finish_item(tp, dfp->dfp_done, li, &state);
+ if (error == -EAGAIN) {
+ /*
+ * Caller wants a fresh transaction; put the work item
+ * back on the list and log a new log intent item to
+ * replace the old one. See "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" above.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
+ xfs_defer_create_intent(tp, dfp, false);
+ }
+
+ /* Any failure, -EAGAIN included, stops processing of this dfp. */
+ if (error)
+ goto out;
+ }
+
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+out:
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(tp, state, error);
+ return error;
+}
+
+/*
* Finish all the pending work. This involves logging intent items for
* any work items that wandered in since the last transaction roll (if
* one has even happened), rolling the transaction, and finishing the
@@ -358,11 +459,7 @@
struct xfs_trans **tp)
{
struct xfs_defer_pending *dfp;
- struct list_head *li;
- struct list_head *n;
- void *state;
int error = 0;
- const struct xfs_defer_op_type *ops;
LIST_HEAD(dop_pending);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -371,87 +468,44 @@
/* Until we run out of pending work to finish... */
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
- /* log intents and pull in intake items */
- xfs_defer_create_intents(*tp);
- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
-
/*
- * Roll the transaction.
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
*/
+ xfs_defer_create_intents(*tp);
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+
error = xfs_defer_trans_roll(tp);
if (error)
- goto out;
+ goto out_shutdown;
- /* Log an intent-done item for the first pending item. */
+ /* Possibly relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
- dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
- dfp->dfp_count);
-
- /* Finish the work items. */
- state = NULL;
- list_for_each_safe(li, n, &dfp->dfp_work) {
- list_del(li);
- dfp->dfp_count--;
- error = ops->finish_item(*tp, li, dfp->dfp_done,
- &state);
- if (error == -EAGAIN) {
- /*
- * Caller wants a fresh transaction;
- * put the work item back on the list
- * and jump out.
- */
- list_add(li, &dfp->dfp_work);
- dfp->dfp_count++;
- break;
- } else if (error) {
- /*
- * Clean up after ourselves and jump out.
- * xfs_defer_cancel will take care of freeing
- * all these lists and stuff.
- */
- if (ops->finish_cleanup)
- ops->finish_cleanup(*tp, state, error);
- goto out;
- }
- }
- if (error == -EAGAIN) {
- /*
- * Caller wants a fresh transaction, so log a
- * new log intent item to replace the old one
- * and roll the transaction. See "Requesting
- * a Fresh Transaction while Finishing
- * Deferred Work" above.
- */
- dfp->dfp_intent = ops->create_intent(*tp,
- dfp->dfp_count);
- dfp->dfp_done = NULL;
- list_for_each(li, &dfp->dfp_work)
- ops->log_item(*tp, dfp->dfp_intent, li);
- } else {
- /* Done with the dfp, free it. */
- list_del(&dfp->dfp_list);
- kmem_free(dfp);
- }
-
- if (ops->finish_cleanup)
- ops->finish_cleanup(*tp, state, error);
- }
-
-out:
- if (error) {
- xfs_defer_trans_abort(*tp, &dop_pending);
- xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
- trace_xfs_defer_finish_error(*tp, error);
- xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
- xfs_defer_cancel(*tp);
- return error;
+ error = xfs_defer_finish_one(*tp, dfp);
+ if (error && error != -EAGAIN)
+ goto out_shutdown;
}
trace_xfs_defer_finish_done(*tp, _RET_IP_);
return 0;
+
+out_shutdown:
+ xfs_defer_trans_abort(*tp, &dop_pending);
+ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ trace_xfs_defer_finish_error(*tp, error);
+ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+ xfs_defer_cancel(*tp);
+ return error;
}
int
@@ -475,7 +529,10 @@
return error;
}
}
- xfs_defer_reset(*tp);
+
+ /* Reset LOWMODE now that we've finished all the dfops. */
+ ASSERT(list_empty(&(*tp)->t_dfops));
+ (*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -549,6 +606,139 @@
* that behavior.
*/
dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
+ stp->t_flags &= ~XFS_TRANS_LOWMODE;
+}
- xfs_defer_reset(stp);
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip)
+{
+ struct xfs_defer_capture *dfc;
+
+ /* Nothing deferred on this transaction, so there is nothing to capture. */
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ /* Create an object to capture the defer ops. */
+ /*
+ * NOTE(review): dfc is dereferenced without a NULL check; presumably
+ * kmem_zalloc() without KM_MAYFAIL cannot return NULL -- confirm.
+ */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ xfs_defer_create_intents(tp);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ /* Only the LOWMODE bit is carried over; it is cleared on @tp. */
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ /*
+ * Grab an extra reference to this inode and attach it to the capture
+ * structure.
+ */
+ if (capture_ip) {
+ ihold(VFS_I(capture_ip));
+ dfc->dfc_capture_ip = capture_ip;
+ }
+
+ return dfc;
+}
+
+/*
+ * Release all resources that we used to capture deferred ops: cancel the
+ * captured dfops chain, drop the inode reference if one was captured, and free
+ * @dfc itself. @dfc must not be used after this returns.
+ */
+void
+xfs_defer_ops_release(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+ if (dfc->dfc_capture_ip)
+ xfs_irele(dfc->dfc_capture_ip);
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ *
+ * Returns the result of xfs_trans_commit(). On success with captured work,
+ * the capture structure is appended to @capture_list; on commit failure it is
+ * released and @capture_list is left unchanged.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip,
+ struct list_head *capture_list)
+{
+ /* Sampled up front: mp is still needed after the commit call below. */
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp, capture_ip);
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ /* Drops the captured dfops and the extra inode reference. */
+ xfs_defer_ops_release(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ *
+ * @captured_ipp is always written, so it must not be NULL; it receives the
+ * captured inode pointer, which is NULL when xfs_defer_ops_capture() stored
+ * none (the structure is zero-allocated there). @dfc is freed before return.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp)
+{
+ /* The receiving transaction must be permanent-reservation and clean. */
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock and join the captured inode to the new transaction. */
+ if (dfc->dfc_capture_ip) {
+ xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
+ }
+ *captured_ipp = dfc->dfc_capture_ip;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+
+ kmem_free(dfc);
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7c28d76..05472f7 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
@@ -6,7 +6,9 @@
#ifndef __XFS_DEFER_H__
#define __XFS_DEFER_H__
+struct xfs_btree_cur;
struct xfs_defer_op_type;
+struct xfs_defer_capture;
/*
* Header for deferred operation list.
@@ -28,8 +30,8 @@
struct xfs_defer_pending {
struct list_head dfp_list; /* pending items */
struct list_head dfp_work; /* work items */
- void *dfp_intent; /* log intent item */
- void *dfp_done; /* log done item */
+ struct xfs_log_item *dfp_intent; /* log intent item */
+ struct xfs_log_item *dfp_done; /* log done item */
unsigned int dfp_count; /* # extent items */
enum xfs_defer_ops_type dfp_type;
};
@@ -43,15 +45,16 @@
/* Description of a deferred type. */
struct xfs_defer_op_type {
- void (*abort_intent)(void *);
- void *(*create_done)(struct xfs_trans *, void *, unsigned int);
- int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
- void **);
- void (*finish_cleanup)(struct xfs_trans *, void *, int);
- void (*cancel_item)(struct list_head *);
- int (*diff_items)(void *, struct list_head *, struct list_head *);
- void *(*create_intent)(struct xfs_trans *, uint);
- void (*log_item)(struct xfs_trans *, void *, struct list_head *);
+ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+ struct list_head *items, unsigned int count, bool sort);
+ void (*abort_intent)(struct xfs_log_item *intent);
+ struct xfs_log_item *(*create_done)(struct xfs_trans *tp,
+ struct xfs_log_item *intent, unsigned int count);
+ int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done,
+ struct list_head *item, struct xfs_btree_cur **state);
+ void (*finish_cleanup)(struct xfs_trans *tp,
+ struct xfs_btree_cur *state, int error);
+ void (*cancel_item)(struct list_head *item);
unsigned int max_items;
};
@@ -61,4 +64,40 @@
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /*
+ * Deferred ops state saved from the transaction. dfc_tpflags is OR-ed
+ * back into the flags of the transaction that continues the work.
+ */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /*
+ * Block reservations for the data and rt devices: the amounts still
+ * unused (reserved minus used) when the dfops were captured.
+ */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ /*
+ * An inode reference that must be maintained to complete the deferred
+ * work. NULL when no inode was captured.
+ */
+ struct xfs_inode *dfc_capture_ip;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This normally happens only during log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct xfs_inode *capture_ip, struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp);
+void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 867c5de..612a9c5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -52,7 +52,7 @@
* ASCII case-insensitive (ie. A-Z) support for directories that was
* used in IRIX.
*/
-STATIC xfs_dahash_t
+xfs_dahash_t
xfs_ascii_ci_hashname(
struct xfs_name *name)
{
@@ -65,14 +65,14 @@
return hash;
}
-STATIC enum xfs_dacmp
+enum xfs_dacmp
xfs_ascii_ci_compname(
- struct xfs_da_args *args,
- const unsigned char *name,
- int len)
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
{
- enum xfs_dacmp result;
- int i;
+ enum xfs_dacmp result;
+ int i;
if (args->namelen != len)
return XFS_CMP_DIFFERENT;
@@ -89,26 +89,16 @@
return result;
}
-static const struct xfs_nameops xfs_ascii_ci_nameops = {
- .hashname = xfs_ascii_ci_hashname,
- .compname = xfs_ascii_ci_compname,
-};
-
int
xfs_da_mount(
struct xfs_mount *mp)
{
struct xfs_da_geometry *dageo;
- int nodehdr_size;
ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
- mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
- mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
-
- nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
KM_MAYFAIL);
mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
@@ -125,6 +115,27 @@
dageo->fsblog = mp->m_sb.sb_blocklog;
dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb);
dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir3_data_hdr);
+ } else {
+ dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir2_data_hdr);
+ }
+ dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) /
+ sizeof(struct xfs_dir2_leaf_entry);
+ dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) /
+ sizeof(xfs_dir2_data_off_t);
+
+ dageo->data_first_offset = dageo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, 1) +
+ xfs_dir2_data_entsize(mp, 2);
/*
* Now we've set up the block conversion variables, we can calculate the
@@ -133,7 +144,7 @@
dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
- dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
dageo->magicpct = (dageo->blksize * 37) / 100;
@@ -143,15 +154,10 @@
dageo->fsblog = mp->m_sb.sb_blocklog;
dageo->blksize = 1 << dageo->blklog;
dageo->fsbcount = 1;
- dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
dageo->magicpct = (dageo->blksize * 37) / 100;
-
- if (xfs_sb_version_hasasciici(&mp->m_sb))
- mp->m_dirnameops = &xfs_ascii_ci_nameops;
- else
- mp->m_dirnameops = &xfs_default_nameops;
-
return 0;
}
@@ -191,10 +197,10 @@
{
bool ino_ok = xfs_verify_dir_ino(mp, ino);
- if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) {
+ if (XFS_IS_CORRUPT(mp, !ino_ok) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
- XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -262,7 +268,7 @@
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->inumber = inum;
args->dp = dp;
args->total = total;
@@ -272,7 +278,7 @@
if (!inum)
args->op_flags |= XFS_DA_OP_JUSTCHECK;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_addname(args);
goto out_free;
}
@@ -358,7 +364,7 @@
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->dp = dp;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -367,7 +373,7 @@
args->op_flags |= XFS_DA_OP_CILOOKUP;
lock_mode = xfs_ilock_data_map_shared(dp);
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_lookup(args);
goto out_check_rval;
}
@@ -430,14 +436,14 @@
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->inumber = ino;
args->dp = dp;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_removename(args);
goto out_free;
}
@@ -491,14 +497,14 @@
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->inumber = inum;
args->dp = dp;
args->total = total;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_replace(args);
goto out_free;
}
@@ -600,7 +606,9 @@
if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize)
+ if (XFS_IS_CORRUPT(args->dp->i_mount,
+ rval != 0 &&
+ args->dp->i_d.di_size != args->geo->blksize))
return -EFSCORRUPTED;
*vp = rval;
return 0;
@@ -716,3 +724,24 @@
/* There shouldn't be any slashes or nulls here */
return !memchr(name, '/', length) && !memchr(name, 0, length);
}
+
+xfs_dahash_t
+xfs_dir2_hashname(
+ struct xfs_mount *mp,
+ struct xfs_name *name)
+{
+ if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
+ return xfs_ascii_ci_hashname(name);
+ return xfs_da_hashname(name->name, name->len);
+}
+
+enum xfs_dacmp
+xfs_dir2_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb)))
+ return xfs_ascii_ci_compname(args, name, len);
+ return xfs_da_compname(args, name, len);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index f542447..e553786 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -18,6 +18,8 @@
struct xfs_dir2_data_hdr;
struct xfs_dir2_data_entry;
struct xfs_dir2_data_unused;
+struct xfs_dir3_icfree_hdr;
+struct xfs_dir3_icleaf_hdr;
extern struct xfs_name xfs_name_dotdot;
@@ -27,85 +29,6 @@
extern unsigned char xfs_mode_to_ftype(int mode);
/*
- * directory operations vector for encode/decode routines
- */
-struct xfs_dir_ops {
- int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
- struct xfs_dir2_sf_entry *
- (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep);
- uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
- void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
- uint8_t ftype);
- xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep);
- void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino);
- xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
- void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
- xfs_ino_t ino);
-
- int (*data_entsize)(int len);
- uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
- void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
- uint8_t ftype);
- __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
- struct xfs_dir2_data_free *
- (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
-
- xfs_dir2_data_aoff_t data_dot_offset;
- xfs_dir2_data_aoff_t data_dotdot_offset;
- xfs_dir2_data_aoff_t data_first_offset;
- size_t data_entry_offset;
-
- struct xfs_dir2_data_entry *
- (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_entry *
- (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_entry *
- (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_entry *
- (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_unused *
- (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
-
- int leaf_hdr_size;
- void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from);
- void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from);
- int (*leaf_max_ents)(struct xfs_da_geometry *geo);
- struct xfs_dir2_leaf_entry *
- (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
-
- int node_hdr_size;
- void (*node_hdr_to_disk)(struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from);
- void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from);
- struct xfs_da_node_entry *
- (*node_tree_p)(struct xfs_da_intnode *dap);
-
- int free_hdr_size;
- void (*free_hdr_to_disk)(struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from);
- void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from);
- int (*free_max_bests)(struct xfs_da_geometry *geo);
- __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
- xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
- xfs_dir2_db_t db);
- int (*db_to_fdindex)(struct xfs_da_geometry *geo,
- xfs_dir2_db_t db);
-};
-
-extern const struct xfs_dir_ops *
- xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
-extern const struct xfs_dir_ops *
- xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
-
-/*
* Generic directory interface routines
*/
extern void xfs_dir_startup(void);
@@ -124,6 +47,8 @@
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
+extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp,
+ xfs_ino_t inum);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
@@ -143,10 +68,7 @@
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
-extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo,
- const struct xfs_dir_ops *ops,
- struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
@@ -324,7 +246,7 @@
#define XFS_READDIR_BUFSIZE (32768)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
-void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
+unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
bool xfs_dir2_namecheck(const void *name, size_t length);
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 49e4bc3..5b59d3f 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -114,6 +114,23 @@
.verify_struct = xfs_dir3_block_verify,
};
+static xfs_failaddr_t
+xfs_dir3_block_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_dir3_block_read(
struct xfs_trans *tp,
@@ -121,12 +138,24 @@
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
int err;
- err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_block_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
@@ -172,7 +201,7 @@
struct xfs_dir2_data_unused *enddup = NULL;
*compact = 0;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
/*
* If there are stale entries we'll use one for the leaf.
@@ -311,7 +340,7 @@
* This needs to happen before the next call to use_free.
*/
if (needscan)
- xfs_dir2_data_freescan(args->dp, hdr, needlog);
+ xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog);
}
/*
@@ -355,7 +384,7 @@
if (error)
return error;
- len = dp->d_ops->data_entsize(args->namelen);
+ len = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
/*
* Set up pointers to parts of the block.
@@ -458,7 +487,7 @@
* This needs to happen before the next call to use_free.
*/
if (needscan) {
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
needscan = 0;
}
/*
@@ -541,14 +570,14 @@
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, args->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Clean up the bestfree array and log the header, tail, and entry.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, bp);
xfs_dir2_block_log_tail(tp, bp);
@@ -633,7 +662,7 @@
* Fill in inode number, CI name if appropriate, release the block.
*/
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = dp->d_ops->data_get_ftype(dep);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(args->trans, bp);
return error;
@@ -660,13 +689,11 @@
int high; /* binary search high index */
int low; /* binary search low index */
int mid; /* binary search current idx */
- xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
enum xfs_dacmp cmp; /* comparison result */
dp = args->dp;
tp = args->trans;
- mp = dp->i_mount;
error = xfs_dir3_block_read(tp, dp, &bp);
if (error)
@@ -718,7 +745,7 @@
* and buffer. If it's the first case-insensitive match, store
* the index and buffer and continue looking for an exact match.
*/
- cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
*bpp = bp;
@@ -791,7 +818,8 @@
needlog = needscan = 0;
xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
/*
* Fix up the block tail.
*/
@@ -806,7 +834,7 @@
* Fix up bestfree, log the header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, bp);
xfs_dir3_data_check(dp, bp);
@@ -864,7 +892,7 @@
* Change the inode number to the new value.
*/
dep->inumber = cpu_to_be64(args->inumber);
- dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
xfs_dir2_data_log_entry(args, bp, dep);
xfs_dir3_data_check(dp, bp);
return 0;
@@ -914,7 +942,6 @@
__be16 *tagp; /* end of entry (tag) */
int to; /* block/leaf to index */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_to_block(args);
@@ -923,8 +950,7 @@
tp = args->trans;
mp = dp->i_mount;
leaf = lbp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
@@ -938,7 +964,7 @@
while (dp->i_d.di_size > args->geo->blksize) {
int hdrsz;
- hdrsz = dp->d_ops->data_entry_offset;
+ hdrsz = args->geo->data_entry_offset;
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
args->geo->blksize - hdrsz) {
@@ -953,7 +979,7 @@
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp);
if (error)
return error;
}
@@ -1004,9 +1030,10 @@
*/
lep = xfs_dir2_block_leaf_p(btp);
for (from = to = 0; from < leafhdr.count; from++) {
- if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (leafhdr.ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
- lep[to++] = ents[from];
+ lep[to++] = leafhdr.ents[from];
}
ASSERT(to == be32_to_cpu(btp->count));
xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
@@ -1014,7 +1041,7 @@
* Scan the bestfree if we need it and log the data block header.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
/*
@@ -1039,54 +1066,45 @@
*/
int /* error */
xfs_dir2_sf_to_block(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args)
{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_da_geometry *geo = args->geo;
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail pointer */
xfs_dir2_data_entry_t *dep; /* data entry pointer */
- xfs_inode_t *dp; /* incore directory inode */
int dummy; /* trash */
xfs_dir2_data_unused_t *dup; /* unused entry pointer */
int endoffset; /* end of data objects */
int error; /* error return value */
int i; /* index */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log block header */
int needscan; /* need to scan block freespc */
int newoffset; /* offset from current entry */
- int offset; /* target block offset */
+ unsigned int offset = geo->data_entry_offset;
xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
xfs_dir2_sf_hdr_t *sfp; /* shortform header */
__be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
struct xfs_name name;
- struct xfs_ifork *ifp;
trace_xfs_dir2_sf_to_block(args);
- dp = args->dp;
- tp = args->trans;
- mp = dp->i_mount;
- ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
ASSERT(ifp->if_flags & XFS_IFINLINE);
- /*
- * Bomb out if the shortform directory is way too short.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
ASSERT(ifp->if_bytes == dp->i_d.di_size);
ASSERT(ifp->if_u1.if_data != NULL);
ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
- ASSERT(dp->i_d.di_nextents == 0);
+ ASSERT(dp->i_df.if_nextents == 0);
/*
* Copy the directory into a temporary buffer.
@@ -1123,7 +1141,7 @@
* The whole thing is initialized to free by the init routine.
* Say we're using the leaf and tail area.
*/
- dup = dp->d_ops->data_unused_p(hdr);
+ dup = bp->b_addr + offset;
needlog = needscan = 0;
error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
i, &needlog, &needscan);
@@ -1146,35 +1164,37 @@
be16_to_cpu(dup->length), &needlog, &needscan);
if (error)
goto out_free;
+
/*
* Create entry for .
*/
- dep = dp->d_ops->data_dot_entry_p(hdr);
+ dep = bp->b_addr + offset;
dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
- dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
xfs_dir2_data_log_entry(args, bp, dep);
blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
- blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
- (char *)dep - (char *)hdr));
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
/*
* Create entry for ..
*/
- dep = dp->d_ops->data_dotdot_entry_p(hdr);
- dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+ dep = bp->b_addr + offset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
- dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
xfs_dir2_data_log_entry(args, bp, dep);
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
- blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
- (char *)dep - (char *)hdr));
- offset = dp->d_ops->data_first_offset;
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
/*
* Loop over existing entries, stuff them in.
*/
@@ -1183,6 +1203,7 @@
sfep = NULL;
else
sfep = xfs_dir2_sf_firstentry(sfp);
+
/*
* Need to preserve the existing offset values in the sf directory.
* Insert holes (unused entries) where necessary.
@@ -1199,40 +1220,39 @@
* There should be a hole here, make one.
*/
if (offset < newoffset) {
- dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ dup = bp->b_addr + offset;
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
dup->length = cpu_to_be16(newoffset - offset);
- *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
- ((char *)dup - (char *)hdr));
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset);
xfs_dir2_data_log_unused(args, bp, dup);
xfs_dir2_data_freeinsert(hdr,
- dp->d_ops->data_bestfree_p(hdr),
- dup, &dummy);
+ xfs_dir2_data_bestfree_p(mp, hdr),
+ dup, &dummy);
offset += be16_to_cpu(dup->length);
continue;
}
/*
* Copy a real entry.
*/
- dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
- dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+ dep = bp->b_addr + newoffset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep));
dep->namelen = sfep->namelen;
- dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+ xfs_dir2_data_put_ftype(mp, dep,
+ xfs_dir2_sf_get_ftype(mp, sfep));
memcpy(dep->name, sfep->name, dep->namelen);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(newoffset);
xfs_dir2_data_log_entry(args, bp, dep);
name.name = sfep->name;
name.len = sfep->namelen;
- blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
- hashname(&name));
- blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
- (char *)dep - (char *)hdr));
+ blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name));
+ blp[2 + i].address =
+ cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset));
offset = (int)((char *)(tagp + 1) - (char *)hdr);
if (++i == sfp->count)
sfep = NULL;
else
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/* Done with the temporary buffer */
kmem_free(sfp);
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 2c79be4..375b3ed 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -13,6 +13,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -23,6 +24,71 @@
struct xfs_dir2_data_unused *dup,
struct xfs_dir2_data_free **bf_ent);
+struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+ return hdr->bestfree;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+__be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(mp, dep->namelen) - sizeof(__be16));
+}
+
+uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ uint8_t ftype = dep->name[dep->namelen];
+
+ if (likely(ftype < XFS_DIR3_FT_MAX))
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+void
+xfs_dir2_data_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ dep->name[dep->namelen] = ftype;
+}
+
+/*
+ * The number of leaf entries is limited by the size of the block and the amount
+ * of space used by the data entries. We don't know how much space is used by
+ * the data entries yet, so just ensure that the count falls somewhere inside
+ * the block right now.
+ */
+static inline unsigned int
+xfs_dir2_data_max_leaf_entries(
+ struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_block_tail) -
+ geo->data_entry_offset) /
+ sizeof(struct xfs_dir2_leaf_entry);
+}
+
/*
* Check the consistency of the data block.
* The input can also be a block-format directory.
@@ -38,40 +104,27 @@
xfs_dir2_block_tail_t *btp=NULL; /* block tail */
int count; /* count of entries found */
xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_free_t *dfp; /* bestfree entry */
- xfs_dir2_data_unused_t *dup; /* unused entry */
- char *endp; /* end of useful data */
int freeseen; /* mask of bestfrees seen */
xfs_dahash_t hash; /* hash of current name */
int i; /* leaf index */
int lastfree; /* last entry was unused */
xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
struct xfs_mount *mp = bp->b_mount;
- char *p; /* current data position */
int stale; /* count of stale leaves */
struct xfs_name name;
- const struct xfs_dir_ops *ops;
- struct xfs_da_geometry *geo;
-
- geo = mp->m_dir_geo;
+ unsigned int offset;
+ unsigned int end;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
/*
- * We can be passed a null dp here from a verifier, so we need to go the
- * hard way to get them.
+ * If this isn't a directory, something is seriously wrong. Bail out.
*/
- ops = xfs_dir_get_ops(mp, dp);
-
- /*
- * If this isn't a directory, or we don't get handed the dir ops,
- * something is seriously wrong. Bail out.
- */
- if ((dp && !S_ISDIR(VFS_I(dp)->i_mode)) ||
- ops != xfs_dir_get_ops(mp, NULL))
+ if (dp && !S_ISDIR(VFS_I(dp)->i_mode))
return __this_address;
hdr = bp->b_addr;
- p = (char *)ops->data_entry_p(hdr);
+ offset = geo->data_entry_offset;
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
@@ -79,15 +132,8 @@
btp = xfs_dir2_block_tail_p(geo, hdr);
lep = xfs_dir2_block_leaf_p(btp);
- /*
- * The number of leaf entries is limited by the size of the
- * block and the amount of space used by the data entries.
- * We don't know how much space is used by the data entries yet,
- * so just ensure that the count falls somewhere inside the
- * block right now.
- */
if (be32_to_cpu(btp->count) >=
- ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry))
+ xfs_dir2_data_max_leaf_entries(geo))
return __this_address;
break;
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -96,14 +142,14 @@
default:
return __this_address;
}
- endp = xfs_dir3_data_endp(geo, hdr);
- if (!endp)
+ end = xfs_dir3_data_end_offset(geo, hdr);
+ if (!end)
return __this_address;
/*
* Account for zero bestfree entries.
*/
- bf = ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
count = lastfree = freeseen = 0;
if (!bf[0].length) {
if (bf[0].offset)
@@ -128,8 +174,10 @@
/*
* Loop over the data/unused entries.
*/
- while (p < endp) {
- dup = (xfs_dir2_data_unused_t *)p;
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
/*
* If it's unused, look for the space in the bestfree table.
* If we find it, account for that, else make sure it
@@ -140,10 +188,10 @@
if (lastfree != 0)
return __this_address;
- if (endp < p + be16_to_cpu(dup->length))
+ if (offset + be16_to_cpu(dup->length) > end)
return __this_address;
if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) !=
- (char *)dup - (char *)hdr)
+ offset)
return __this_address;
fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp);
if (fa)
@@ -158,7 +206,7 @@
be16_to_cpu(bf[2].length))
return __this_address;
}
- p += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
lastfree = 1;
continue;
}
@@ -168,17 +216,15 @@
* in the leaf section of the block.
* The linear search is crude but this is DEBUG code.
*/
- dep = (xfs_dir2_data_entry_t *)p;
if (dep->namelen == 0)
return __this_address;
if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)))
return __this_address;
- if (endp < p + ops->data_entsize(dep->namelen))
+ if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
return __this_address;
- if (be16_to_cpu(*ops->data_entry_tag_p(dep)) !=
- (char *)dep - (char *)hdr)
+ if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset)
return __this_address;
- if (ops->data_get_ftype(dep) >= XFS_DIR3_FT_MAX)
+ if (xfs_dir2_data_get_ftype(mp, dep) >= XFS_DIR3_FT_MAX)
return __this_address;
count++;
lastfree = 0;
@@ -189,7 +235,7 @@
((char *)dep - (char *)hdr));
name.name = dep->name;
name.len = dep->namelen;
- hash = mp->m_dirnameops->hashname(&name);
+ hash = xfs_dir2_hashname(mp, &name);
for (i = 0; i < be32_to_cpu(btp->count); i++) {
if (be32_to_cpu(lep[i].address) == addr &&
be32_to_cpu(lep[i].hashval) == hash)
@@ -198,7 +244,7 @@
if (i >= be32_to_cpu(btp->count))
return __this_address;
}
- p += ops->data_entsize(dep->namelen);
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
@@ -348,21 +394,49 @@
.verify_write = xfs_dir3_data_write_verify,
};
+static xfs_failaddr_t
+xfs_dir3_data_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
int
xfs_dir3_data_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mapped_bno,
+ unsigned int flags,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
- err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
- XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_data_buf_ops);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_data_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
}
@@ -370,10 +444,10 @@
xfs_dir3_data_readahead(
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mapped_bno)
+ unsigned int flags)
{
- return xfs_da_reada_buf(dp, bno, mapped_bno,
- XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
+ return xfs_da_reada_buf(dp, bno, flags, XFS_DATA_FORK,
+ &xfs_dir3_data_reada_buf_ops);
}
/*
@@ -561,17 +635,16 @@
* Given a data block, reconstruct its bestfree map.
*/
void
-xfs_dir2_data_freescan_int(
- struct xfs_da_geometry *geo,
- const struct xfs_dir_ops *ops,
- struct xfs_dir2_data_hdr *hdr,
- int *loghead)
+xfs_dir2_data_freescan(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
{
- xfs_dir2_data_entry_t *dep; /* active data entry */
- xfs_dir2_data_unused_t *dup; /* unused data entry */
- struct xfs_dir2_data_free *bf;
- char *endp; /* end of block's data */
- char *p; /* current entry pointer */
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_dir2_data_free *bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ void *addr = hdr;
+ unsigned int offset = geo->data_entry_offset;
+ unsigned int end;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -581,79 +654,60 @@
/*
* Start by clearing the table.
*/
- bf = ops->data_bestfree_p(hdr);
memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
- /*
- * Set up pointers.
- */
- p = (char *)ops->data_entry_p(hdr);
- endp = xfs_dir3_data_endp(geo, hdr);
- /*
- * Loop over the block's entries.
- */
- while (p < endp) {
- dup = (xfs_dir2_data_unused_t *)p;
+
+ end = xfs_dir3_data_end_offset(geo, addr);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = addr + offset;
+ struct xfs_dir2_data_entry *dep = addr + offset;
+
/*
* If it's a free entry, insert it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ASSERT((char *)dup - (char *)hdr ==
+ ASSERT(offset ==
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
- p += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
+ continue;
}
+
/*
* For active entries, check their tags and skip them.
*/
- else {
- dep = (xfs_dir2_data_entry_t *)p;
- ASSERT((char *)dep - (char *)hdr ==
- be16_to_cpu(*ops->data_entry_tag_p(dep)));
- p += ops->data_entsize(dep->namelen);
- }
+ ASSERT(offset ==
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
}
}
-void
-xfs_dir2_data_freescan(
- struct xfs_inode *dp,
- struct xfs_dir2_data_hdr *hdr,
- int *loghead)
-{
- return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops,
- hdr, loghead);
-}
-
/*
* Initialize a data block at the given block number in the directory.
* Give back the buffer for the created block.
*/
int /* error */
xfs_dir3_data_init(
- xfs_da_args_t *args, /* directory operation args */
- xfs_dir2_db_t blkno, /* logical dir block number */
- struct xfs_buf **bpp) /* output block buffer */
+ struct xfs_da_args *args, /* directory operation args */
+ xfs_dir2_db_t blkno, /* logical dir block number */
+ struct xfs_buf **bpp) /* output block buffer */
{
- struct xfs_buf *bp; /* block buffer */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* unused entry pointer */
- struct xfs_dir2_data_free *bf;
- int error; /* error return value */
- int i; /* bestfree index */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
- int t; /* temp */
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = args->geo;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *bf;
+ int error;
+ int i;
- dp = args->dp;
- mp = dp->i_mount;
- tp = args->trans;
/*
* Get the buffer set up for the block.
*/
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
- -1, &bp, XFS_DATA_FORK);
+ &bp, XFS_DATA_FORK);
if (error)
return error;
bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -675,8 +729,9 @@
} else
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- bf = dp->d_ops->data_bestfree_p(hdr);
- bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ bf[0].offset = cpu_to_be16(geo->data_entry_offset);
+ bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset);
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
bf[i].length = 0;
bf[i].offset = 0;
@@ -685,13 +740,11 @@
/*
* Set up an unused entry for the block's body.
*/
- dup = dp->d_ops->data_unused_p(hdr);
+ dup = bp->b_addr + geo->data_entry_offset;
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-
- t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
- bf[0].length = cpu_to_be16(t);
- dup->length = cpu_to_be16(t);
+ dup->length = bf[0].length;
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+
/*
* Log it and return it.
*/
@@ -710,6 +763,7 @@
struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir2_data_hdr *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
@@ -718,7 +772,7 @@
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
- (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+ (uint)((char *)(xfs_dir2_data_entry_tag_p(mp, dep) + 1) -
(char *)hdr - 1));
}
@@ -739,8 +793,7 @@
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
#endif
- xfs_trans_log_buf(args->trans, bp, 0,
- args->dp->d_ops->data_entry_offset - 1);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->data_entry_offset - 1);
}
/*
@@ -789,11 +842,11 @@
{
xfs_dir2_data_hdr_t *hdr; /* data block pointer */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
- char *endptr; /* end of data area */
int needscan; /* need to regen bestfree */
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ unsigned int end;
struct xfs_dir2_data_free *bf;
hdr = bp->b_addr;
@@ -801,14 +854,14 @@
/*
* Figure out where the end of the data area is.
*/
- endptr = xfs_dir3_data_endp(args->geo, hdr);
- ASSERT(endptr != NULL);
+ end = xfs_dir3_data_end_offset(args->geo, hdr);
+ ASSERT(end != 0);
/*
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > args->dp->d_ops->data_entry_offset) {
+ if (offset > args->geo->data_entry_offset) {
__be16 *tagp; /* tag just before us */
tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -821,7 +874,7 @@
* If this isn't the end of the block, see if the entry after
* us is free.
*/
- if ((char *)hdr + offset + len < endptr) {
+ if (offset + len < end) {
postdup =
(xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
@@ -834,7 +887,7 @@
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
- bf = args->dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
@@ -1025,7 +1078,7 @@
* Look up the entry in the bestfree table.
*/
oldlen = be16_to_cpu(dup->length);
- bf = args->dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
dfp = xfs_dir2_data_freefind(hdr, bf, dup);
ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
@@ -1149,19 +1202,22 @@
}
/* Find the end of the entry data in a data/block format dir block. */
-void *
-xfs_dir3_data_endp(
+unsigned int /* byte offset from hdr to the end of the entry data; 0 if the magic is not recognised */
+xfs_dir3_data_end_offset(
struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr)
{
+ void *p;
+
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- return xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr));
+ p = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr)); /* block format: entry data stops where the leaf array located via the block tail begins */
+ return p - (void *)hdr; /* turn that pointer into an offset from the start of the block */
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- return (char *)hdr + geo->blksize;
+ return geo->blksize; /* data format: the end offset is the directory block size */
default:
- return NULL;
+ return 0; /* replaces the old NULL result; xfs_dir2_data_make_free() ASSERTs end != 0 */
}
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a53e458..95d2a3f 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -24,12 +24,73 @@
* Local function declarations.
*/
static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
- int *indexp, struct xfs_buf **dbpp);
+ int *indexp, struct xfs_buf **dbpp,
+ struct xfs_dir3_icleaf_hdr *leafhdr);
static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
struct xfs_buf *bp, int first, int last);
static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
struct xfs_buf *bp);
+void /* Unpack the on-disk directory leaf header in "from" into the in-core header "to". */
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) { /* the superblock check selects the dir3 leaf layout for "from" */
+ struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); /* every scalar field is converted from big-endian to CPU order */
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.count);
+ to->stale = be16_to_cpu(from3->hdr.stale);
+ to->ents = from3->__ents; /* refers to the entry array inside "from"; no entries are copied */
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+ } else { /* otherwise "from" is read with the dir2 leaf layout */
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+ to->ents = from->__ents; /* refers to the entry array inside "from"; no entries are copied */
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+ }
+}
+
+void /* Pack the in-core leaf header "from" into the on-disk header at the start of "to". */
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) { /* the superblock check selects the dir3 leaf layout for "to" */
+ struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw); /* every field is converted from CPU order to big-endian */
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.count = cpu_to_be16(from->count);
+ to3->hdr.stale = cpu_to_be16(from->stale); /* from->ents is not written; only forw, back, magic, count and stale are stored */
+ } else { /* otherwise "to" is written with the dir2 leaf layout */
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale); /* from->ents is not written; only forw, back, magic, count and stale are stored */
+ }
+}
+
/*
* Check the internal consistency of a leaf1 block.
* Pop an assert if something is wrong.
@@ -43,7 +104,7 @@
struct xfs_dir2_leaf *leaf = bp->b_addr;
struct xfs_dir3_icleaf_hdr leafhdr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -52,7 +113,7 @@
} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
return __this_address;
- return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf);
}
static inline void
@@ -76,31 +137,15 @@
xfs_failaddr_t
xfs_dir3_leaf_check_int(
- struct xfs_mount *mp,
- struct xfs_inode *dp,
- struct xfs_dir3_icleaf_hdr *hdr,
- struct xfs_dir2_leaf *leaf)
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf)
{
- struct xfs_dir2_leaf_entry *ents;
- xfs_dir2_leaf_tail_t *ltp;
- int stale;
- int i;
- const struct xfs_dir_ops *ops;
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
- /*
- * we can be passed a null dp here from a verifier, so we need to go the
- * hard way to get them.
- */
- ops = xfs_dir_get_ops(mp, dp);
-
- if (!hdr) {
- ops->leaf_hdr_from_disk(&leafhdr, leaf);
- hdr = &leafhdr;
- }
-
- ents = ops->leaf_ents_p(leaf);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
/*
@@ -108,23 +153,23 @@
* Should factor in the size of the bests table as well.
* We can deduce a value for that from di_size.
*/
- if (hdr->count > ops->leaf_max_ents(geo))
+ if (hdr->count > geo->leaf_max_ents)
return __this_address;
/* Leaves and bests don't overlap in leaf format. */
if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
- (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
return __this_address;
/* Check hash value order, count stale entries. */
for (i = stale = 0; i < hdr->count; i++) {
if (i + 1 < hdr->count) {
- if (be32_to_cpu(ents[i].hashval) >
- be32_to_cpu(ents[i + 1].hashval))
+ if (be32_to_cpu(hdr->ents[i].hashval) >
+ be32_to_cpu(hdr->ents[i + 1].hashval))
return __this_address;
}
- if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
if (hdr->stale != stale)
@@ -139,17 +184,18 @@
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp)
+ struct xfs_buf *bp) /* buffer whose b_addr holds the leaf block to verify */
{
- struct xfs_mount *mp = bp->b_mount;
- struct xfs_dir2_leaf *leaf = bp->b_addr;
- xfs_failaddr_t fa;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_icleaf_hdr leafhdr; /* in-core copy of the header, filled in below */
+ xfs_failaddr_t fa;
fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
if (fa)
return fa;
- return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr); /* decode the header here and hand it to the checker instead of passing NULL */
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr); /* result of the consistency check is returned unchanged */
}
static void
@@ -216,13 +262,12 @@
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leaf1_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
return err;
@@ -233,13 +278,12 @@
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leafn_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
return err;
@@ -311,7 +355,7 @@
bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
- -1, &bp, XFS_DATA_FORK);
+ &bp, XFS_DATA_FORK);
if (error)
return error;
@@ -346,7 +390,6 @@
int needscan; /* need to rescan bestfree */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir2_data_free *bf;
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_block_to_leaf(args);
@@ -375,24 +418,24 @@
xfs_dir3_data_check(dp, dbp);
btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
- bf = dp->d_ops->data_bestfree_p(hdr);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
/*
* Set the counts in the leaf header.
*/
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
leafhdr.count = be32_to_cpu(btp->count);
leafhdr.stale = be32_to_cpu(btp->stale);
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
- memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+ memcpy(leafhdr.ents, blp,
+ be32_to_cpu(btp->count) * sizeof(struct xfs_dir2_leaf_entry));
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
@@ -415,7 +458,7 @@
hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
/*
* Set up leaf tail and bests table.
*/
@@ -594,7 +637,7 @@
trace_xfs_dir2_leaf_addname(args);
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
if (error)
return error;
@@ -607,10 +650,10 @@
index = xfs_dir2_leaf_search_hash(args, lbp);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
bestsp = xfs_dir2_leaf_bests_p(ltp);
- length = dp->d_ops->data_entsize(args->namelen);
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
/*
* See if there are any entries with the same hash value
@@ -773,7 +816,7 @@
else
xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
bestsp[use_block] = bf[0].length;
grown = 1;
} else {
@@ -783,13 +826,13 @@
*/
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, use_block),
- -1, &dbp);
+ 0, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
}
hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
grown = 0;
}
/*
@@ -815,14 +858,14 @@
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Need to scan fix up the bestfree table.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
/*
* Need to log the data block's header.
*/
@@ -852,9 +895,9 @@
/*
* Log the leaf fields and give up the buffers.
*/
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
- xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, lfloglow, lfloghigh);
xfs_dir3_leaf_check(dp, lbp);
xfs_dir3_data_check(dp, dbp);
return 0;
@@ -874,7 +917,6 @@
xfs_dir2_leaf_t *leaf; /* leaf structure */
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_inode *dp = args->dp;
leaf = bp->b_addr;
@@ -884,9 +926,9 @@
/*
* Compress out the stale entries in place.
*/
- ents = dp->d_ops->leaf_ents_p(leaf);
for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
- if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (leafhdr->ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
/*
* Only actually copy the entries that are different.
@@ -894,7 +936,7 @@
if (from > to) {
if (loglow == -1)
loglow = to;
- ents[to] = ents[from];
+ leafhdr->ents[to] = leafhdr->ents[from];
}
to++;
}
@@ -905,10 +947,10 @@
leafhdr->count -= leafhdr->stale;
leafhdr->stale = 0;
- dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, leafhdr);
xfs_dir3_leaf_log_header(args, bp);
if (loglow != -1)
- xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args, leafhdr, bp, loglow, to - 1);
}
/*
@@ -1037,6 +1079,7 @@
void
xfs_dir3_leaf_log_ents(
struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *hdr,
struct xfs_buf *bp,
int first,
int last)
@@ -1044,16 +1087,14 @@
xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
struct xfs_dir2_leaf *leaf = bp->b_addr;
- struct xfs_dir2_leaf_entry *ents;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- ents = args->dp->d_ops->leaf_ents_p(leaf);
- firstlep = &ents[first];
- lastlep = &ents[last];
+ firstlep = &hdr->ents[first];
+ lastlep = &hdr->ents[last];
xfs_trans_log_buf(args->trans, bp,
(uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
@@ -1076,7 +1117,7 @@
xfs_trans_log_buf(args->trans, bp,
(uint)((char *)&leaf->hdr - (char *)leaf),
- args->dp->d_ops->leaf_hdr_size - 1);
+ args->geo->leaf_hdr_size - 1);
}
/*
@@ -1115,28 +1156,27 @@
int error; /* error return code */
int index; /* found entry index */
struct xfs_buf *lbp; /* leaf buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_lookup(args);
/*
* Look up name in the leaf block, returning both buffers and index.
*/
- if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
return error;
- }
+
tp = args->trans;
dp = args->dp;
xfs_dir3_leaf_check(dp, lbp);
- leaf = lbp->b_addr;
- ents = dp->d_ops->leaf_ents_p(leaf);
+
/*
* Get to the leaf entry and contained data entry address.
*/
- lep = &ents[index];
+ lep = &leafhdr.ents[index];
/*
* Point to the data entry.
@@ -1148,7 +1188,7 @@
* Return the found inode number & CI name if appropriate
*/
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = dp->d_ops->data_get_ftype(dep);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(tp, dbp);
xfs_trans_brelse(tp, lbp);
@@ -1166,7 +1206,8 @@
xfs_da_args_t *args, /* operation arguments */
struct xfs_buf **lbpp, /* out: leaf buffer */
int *indexp, /* out: index in leaf block */
- struct xfs_buf **dbpp) /* out: data buffer */
+ struct xfs_buf **dbpp, /* out: data buffer */
+ struct xfs_dir3_icleaf_hdr *leafhdr)
{
xfs_dir2_db_t curdb = -1; /* current data block number */
struct xfs_buf *dbp = NULL; /* data buffer */
@@ -1182,22 +1223,19 @@
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t cidb = -1; /* case match data block no. */
enum xfs_dacmp cmp; /* name compare result */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
if (error)
return error;
*lbpp = lbp;
leaf = lbp->b_addr;
xfs_dir3_leaf_check(dp, lbp);
- ents = dp->d_ops->leaf_ents_p(leaf);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, leafhdr, leaf);
/*
* Look for the first leaf entry with our hash value.
@@ -1207,8 +1245,9 @@
* Loop over all the entries with the right hash value
* looking to match the name.
*/
- for (lep = &ents[index];
- index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ for (lep = &leafhdr->ents[index];
+ index < leafhdr->count &&
+ be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
* Skip over stale leaf entries.
@@ -1229,7 +1268,7 @@
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, newdb),
- -1, &dbp);
+ 0, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1247,7 +1286,7 @@
* and buffer. If it's the first case-insensitive match, store
* the index and buffer and continue looking for an exact match.
*/
- cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
*indexp = index;
@@ -1271,7 +1310,7 @@
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, cidb),
- -1, &dbp);
+ 0, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1297,6 +1336,7 @@
xfs_dir2_leaf_removename(
xfs_da_args_t *args) /* operation arguments */
{
+ struct xfs_da_geometry *geo = args->geo;
__be16 *bestsp; /* leaf block best freespace */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
@@ -1314,7 +1354,6 @@
int needscan; /* need to rescan data frees */
xfs_dir2_data_off_t oldbest; /* old value of best free */
struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_removename(args);
@@ -1322,51 +1361,54 @@
/*
* Lookup the leaf entry, get the leaf and data blocks read in.
*/
- if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
return error;
- }
+
dp = args->dp;
leaf = lbp->b_addr;
hdr = dbp->b_addr;
xfs_dir3_data_check(dp, dbp);
- bf = dp->d_ops->data_bestfree_p(hdr);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
/*
* Point to the leaf entry, use that to point to the data entry.
*/
- lep = &ents[index];
- db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ lep = &leafhdr.ents[index];
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
dep = (xfs_dir2_data_entry_t *)((char *)hdr +
- xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address)));
needscan = needlog = 0;
oldbest = be16_to_cpu(bf[0].length);
- ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- if (be16_to_cpu(bestsp[db]) != oldbest)
+ if (be16_to_cpu(bestsp[db]) != oldbest) {
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
+ }
/*
* Mark the former data entry unused.
*/
xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
/*
* We just mark the leaf entry stale by putting a null in it.
*/
leafhdr.stale++;
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(args, lbp, index, index);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, index, index);
/*
* Scan the freespace in the data block again if necessary,
* log the data block header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
/*
@@ -1382,8 +1424,8 @@
* If the data block is now empty then get rid of the data block.
*/
if (be16_to_cpu(bf[0].length) ==
- args->geo->blksize - dp->d_ops->data_entry_offset) {
- ASSERT(db != args->geo->datablk);
+ geo->blksize - geo->data_entry_offset) {
+ ASSERT(db != geo->datablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
* Nope, can't get rid of it because it caused
@@ -1425,7 +1467,7 @@
/*
* If the data block was not the first one, drop it.
*/
- else if (db != args->geo->datablk)
+ else if (db != geo->datablk)
dbp = NULL;
xfs_dir3_leaf_check(dp, lbp);
@@ -1448,26 +1490,24 @@
int error; /* error return code */
int index; /* index of leaf entry */
struct xfs_buf *lbp; /* leaf buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_replace(args);
/*
* Look up the entry.
*/
- if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
return error;
- }
+
dp = args->dp;
- leaf = lbp->b_addr;
- ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, get data address from it.
*/
- lep = &ents[index];
+ lep = &leafhdr.ents[index];
/*
* Point to the data entry.
*/
@@ -1479,7 +1519,7 @@
* Put the new inode number in, log it.
*/
dep->inumber = cpu_to_be64(args->inumber);
- dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
tp = args->trans;
xfs_dir2_data_log_entry(args, dbp, dep);
xfs_dir3_leaf_check(dp, lbp);
@@ -1501,21 +1541,17 @@
xfs_dahash_t hashwant; /* hash value looking for */
int high; /* high leaf index */
int low; /* low leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int mid=0; /* current leaf index */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
- leaf = lbp->b_addr;
- ents = args->dp->d_ops->leaf_ents_p(leaf);
- args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(args->dp->i_mount, &leafhdr, lbp->b_addr);
/*
* Note, the table cannot be empty, so we have to go through the loop.
* Binary search the leaf entries looking for our hash value.
*/
- for (lep = ents, low = 0, high = leafhdr.count - 1,
+ for (lep = leafhdr.ents, low = 0, high = leafhdr.count - 1,
hashwant = args->hashval;
low <= high; ) {
mid = (low + high) >> 1;
@@ -1552,6 +1588,7 @@
struct xfs_buf *lbp, /* leaf buffer */
xfs_dir2_db_t db) /* data block number */
{
+ struct xfs_da_geometry *geo = args->geo;
__be16 *bestsp; /* leaf bests table */
struct xfs_buf *dbp; /* data block buffer */
xfs_inode_t *dp; /* incore directory inode */
@@ -1565,23 +1602,23 @@
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
- -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp);
if (error)
return error;
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
#ifdef DEBUG
{
struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
- struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+ struct xfs_dir2_data_free *bf =
+ xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
ASSERT(be16_to_cpu(bf[0].length) ==
- args->geo->blksize - dp->d_ops->data_entry_offset);
+ geo->blksize - geo->data_entry_offset);
ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
}
#endif
@@ -1639,7 +1676,6 @@
int error; /* error return code */
struct xfs_buf *fbp; /* buffer for freespace block */
xfs_fileoff_t fo; /* freespace file offset */
- xfs_dir2_free_t *free; /* freespace structure */
struct xfs_buf *lbp; /* buffer for leaf block */
xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -1697,7 +1733,7 @@
return 0;
lbp = state->path.blk[0].bp;
leaf = lbp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -1708,8 +1744,7 @@
error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
if (error)
return error;
- free = fbp->b_addr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr);
ASSERT(!freehdr.firstdb);
@@ -1743,10 +1778,10 @@
/*
* Set up the leaf bests table.
*/
- memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+ memcpy(xfs_dir2_leaf_bests_p(ltp), freehdr.bests,
freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(mp, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
xfs_dir3_leaf_log_tail(args, lbp);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 99d5b2e..5d51265 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -34,6 +34,25 @@
int *rval);
/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + /* block number corresponding to byte offset XFS_DIR2_FREE_OFFSET */
+ (db / geo->free_max_bests); /* advance by one block for every free_max_bests data blocks */
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % geo->free_max_bests; /* remainder of the same division whose quotient xfs_dir2_db_to_fdb() uses */
+}
+
+/*
* Check internal consistency of a leafn block.
*/
#ifdef DEBUG
@@ -45,7 +64,7 @@
struct xfs_dir2_leaf *leaf = bp->b_addr;
struct xfs_dir3_icleaf_hdr leafhdr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -54,7 +73,7 @@
} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
return __this_address;
- return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf);
}
static inline void
@@ -160,10 +179,9 @@
struct xfs_buf *bp)
{
struct xfs_mount *mp = dp->i_mount;
+ int maxbests = mp->m_dir_geo->free_max_bests;
unsigned int firstdb;
- int maxbests;
- maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
maxbests;
@@ -176,6 +194,8 @@
return __this_address;
if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
return __this_address;
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
} else {
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
@@ -194,21 +214,21 @@
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
struct xfs_buf **bpp)
{
xfs_failaddr_t fa;
int err;
- err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+ err = xfs_da_read_buf(tp, dp, fbno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_free_buf_ops);
if (err || !*bpp)
return err;
/* Check things that we can't do in the verifier. */
fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
if (fa) {
- xfs_verifier_error(*bpp, -EFSCORRUPTED, fa);
+ __xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
return -EFSCORRUPTED;
@@ -221,6 +241,58 @@
return 0;
}
+void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from;
+
+ to->magic = be32_to_cpu(from3->hdr.hdr.magic);
+ to->firstdb = be32_to_cpu(from3->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from3->hdr.nvalid);
+ to->nused = be32_to_cpu(from3->hdr.nused);
+ to->bests = from3->bests;
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+ } else {
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ to->bests = from->bests;
+
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+ }
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ to3->hdr.hdr.magic = cpu_to_be32(from->magic);
+ to3->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to3->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to3->hdr.nused = cpu_to_be32(from->nused);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+ }
+}
+
int
xfs_dir2_free_read(
struct xfs_trans *tp,
@@ -228,7 +300,7 @@
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp);
}
static int
@@ -238,7 +310,7 @@
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp);
}
static int
@@ -255,7 +327,7 @@
struct xfs_dir3_icfree_hdr hdr;
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
- -1, &bp, XFS_DATA_FORK);
+ &bp, XFS_DATA_FORK);
if (error)
return error;
@@ -279,7 +351,7 @@
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
- dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+ xfs_dir2_free_hdr_to_disk(mp, bp->b_addr, &hdr);
*bpp = bp;
return 0;
}
@@ -290,21 +362,19 @@
STATIC void
xfs_dir2_free_log_bests(
struct xfs_da_args *args,
+ struct xfs_dir3_icfree_hdr *hdr,
struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
{
- xfs_dir2_free_t *free; /* freespace structure */
- __be16 *bests;
+ struct xfs_dir2_free *free = bp->b_addr;
- free = bp->b_addr;
- bests = args->dp->d_ops->free_bests_p(free);
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
xfs_trans_log_buf(args->trans, bp,
- (uint)((char *)&bests[first] - (char *)free),
- (uint)((char *)&bests[last] - (char *)free +
- sizeof(bests[0]) - 1));
+ (char *)&hdr->bests[first] - (char *)free,
+ (char *)&hdr->bests[last] - (char *)free +
+ sizeof(hdr->bests[0]) - 1);
}
/*
@@ -323,7 +393,7 @@
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
#endif
xfs_trans_log_buf(args->trans, bp, 0,
- args->dp->d_ops->free_hdr_size - 1);
+ args->geo->free_hdr_size - 1);
}
/*
@@ -340,14 +410,12 @@
int error; /* error return value */
struct xfs_buf *fbp; /* freespace buffer */
xfs_dir2_db_t fdb; /* freespace block number */
- xfs_dir2_free_t *free; /* freespace structure */
__be16 *from; /* pointer to freespace entry */
int i; /* leaf freespace index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
int n; /* count of live freespc ents */
xfs_dir2_data_off_t off; /* freespace entry value */
- __be16 *to; /* pointer to freespace entry */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir3_icfree_hdr freehdr;
@@ -369,24 +437,25 @@
if (error)
return error;
- free = fbp->b_addr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, fbp->b_addr);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
if (be32_to_cpu(ltp->bestcount) >
- (uint)dp->i_d.di_size / args->geo->blksize)
+ (uint)dp->i_d.di_size / args->geo->blksize) {
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
+ }
/*
* Copy freespace entries from the leaf block to the new block.
* Count active entries.
*/
from = xfs_dir2_leaf_bests_p(ltp);
- to = dp->d_ops->free_bests_p(free);
- for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
- if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++) {
+ off = be16_to_cpu(*from);
+ if (off != NULLDATAOFF)
n++;
- *to = cpu_to_be16(off);
+ freehdr.bests[i] = cpu_to_be16(off);
}
/*
@@ -395,8 +464,8 @@
freehdr.nused = n;
freehdr.nvalid = be32_to_cpu(ltp->bestcount);
- dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, 0, freehdr.nvalid - 1);
xfs_dir2_free_log_header(args, fbp);
/*
@@ -439,15 +508,17 @@
trace_xfs_dir2_leafn_add(args, index);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
/*
* Quick check just to make sure we are not going to index
* into other peoples memory
*/
- if (index < 0)
+ if (index < 0) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* If there are already the maximum number of leaf entries in
@@ -456,7 +527,7 @@
* a compact.
*/
- if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+ if (leafhdr.count == args->geo->leaf_max_ents) {
if (!leafhdr.stale)
return -ENOSPC;
compact = leafhdr.stale > 1;
@@ -494,9 +565,9 @@
lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
args->blkno, args->index));
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, bp);
- xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, lfloglow, lfloghigh);
xfs_dir3_leaf_check(dp, bp);
return 0;
}
@@ -510,10 +581,9 @@
{
struct xfs_dir3_icfree_hdr hdr;
- dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &hdr, bp->b_addr);
- ASSERT((hdr.firstdb %
- dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+ ASSERT((hdr.firstdb % dp->i_mount->m_dir_geo->free_max_bests) == 0);
ASSERT(hdr.firstdb <= db);
ASSERT(db < hdr.firstdb + hdr.nvalid);
}
@@ -531,11 +601,9 @@
struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
- struct xfs_dir2_leaf *leaf = bp->b_addr;
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, bp->b_addr);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC ||
@@ -546,9 +614,7 @@
*count = leafhdr.count;
if (!leafhdr.count)
return 0;
-
- ents = dp->d_ops->leaf_ents_p(leaf);
- return be32_to_cpu(ents[leafhdr.count - 1].hashval);
+ return be32_to_cpu(leafhdr.ents[leafhdr.count - 1].hashval);
}
/*
@@ -577,15 +643,13 @@
xfs_dir2_db_t newdb; /* new data block number */
xfs_dir2_db_t newfdb; /* new free block number */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
xfs_dir3_leaf_check(dp, bp);
ASSERT(leafhdr.count > 0);
@@ -605,11 +669,11 @@
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
}
- length = dp->d_ops->data_entsize(args->namelen);
+ length = xfs_dir2_data_entsize(mp, args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &ents[index];
+ for (lep = &leafhdr.ents[index];
index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
@@ -631,14 +695,14 @@
* in hand, take a look at it.
*/
if (newdb != curdb) {
- __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
curdb = newdb;
/*
* Convert the data block to the free block
* holding its freespace information.
*/
- newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
+ newfdb = xfs_dir2_db_to_fdb(args->geo, newdb);
/*
* If it's not the one we have in hand, read it in.
*/
@@ -662,20 +726,20 @@
/*
* Get the index for our entry.
*/
- fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
+ fi = xfs_dir2_db_to_fdindex(args->geo, curdb);
/*
* If it has room, return it.
*/
- bests = dp->d_ops->free_bests_p(free);
- if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
- XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, free);
+ if (XFS_IS_CORRUPT(mp,
+ freehdr.bests[fi] ==
+ cpu_to_be16(NULLDATAOFF))) {
if (curfdb != newfdb)
xfs_trans_brelse(tp, curbp);
return -EFSCORRUPTED;
}
curfdb = newfdb;
- if (be16_to_cpu(bests[fi]) >= length)
+ if (be16_to_cpu(freehdr.bests[fi]) >= length)
goto out;
}
}
@@ -729,19 +793,19 @@
xfs_dir2_db_t newdb; /* new data block number */
xfs_trans_t *tp; /* transaction pointer */
enum xfs_dacmp cmp; /* comparison result */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
xfs_dir3_leaf_check(dp, bp);
- if (leafhdr.count <= 0)
+ if (leafhdr.count <= 0) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* Look up the hash value in the leaf entries.
@@ -757,7 +821,7 @@
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &ents[index];
+ for (lep = &leafhdr.ents[index];
index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
@@ -796,7 +860,7 @@
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo,
newdb),
- -1, &curbp);
+ 0, &curbp);
if (error)
return error;
}
@@ -814,7 +878,7 @@
* EEXIST immediately. If it's the first case-insensitive
* match, store the block & inode number and continue looking.
*/
- cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
/* If there is a CI match block, drop it */
if (args->cmpresult != XFS_CMP_DIFFERENT &&
@@ -822,7 +886,7 @@
xfs_trans_brelse(tp, state->extrablk.bp);
args->cmpresult = cmp;
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = dp->d_ops->data_get_ftype(dep);
+ args->filetype = xfs_dir2_data_get_ftype(mp, dep);
*indexp = index;
state->extravalid = 1;
state->extrablk.bp = curbp;
@@ -912,7 +976,7 @@
if (start_d < dhdr->count) {
memmove(&dents[start_d + count], &dents[start_d],
(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d + count,
count + dhdr->count - 1);
}
/*
@@ -934,7 +998,7 @@
*/
memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d, start_d + count - 1);
/*
* If there are source entries after the ones we copied,
@@ -943,7 +1007,8 @@
if (start_s + count < shdr->count) {
memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(args, shdr, bp_s, start_s,
+ start_s + count - 1);
}
/*
@@ -972,10 +1037,10 @@
struct xfs_dir3_icleaf_hdr hdr1;
struct xfs_dir3_icleaf_hdr hdr2;
- dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
- dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
- ents1 = dp->d_ops->leaf_ents_p(leaf1);
- ents2 = dp->d_ops->leaf_ents_p(leaf2);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
if (hdr1.count > 0 && hdr2.count > 0 &&
(be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
@@ -1025,10 +1090,10 @@
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
- dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
- ents1 = dp->d_ops->leaf_ents_p(leaf1);
- ents2 = dp->d_ops->leaf_ents_p(leaf2);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
oldsum = hdr1.count + hdr2.count;
#if defined(DEBUG) || defined(XFS_WARN)
@@ -1074,8 +1139,8 @@
ASSERT(hdr1.stale + hdr2.stale == oldstale);
/* log the changes made when moving the entries */
- dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
- dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf1, &hdr1);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf2, &hdr2);
xfs_dir3_leaf_log_header(args, blk1->bp);
xfs_dir3_leaf_log_header(args, blk2->bp);
@@ -1121,19 +1186,17 @@
int longest)
{
int logfree = 0;
- __be16 *bests;
struct xfs_dir3_icfree_hdr freehdr;
struct xfs_inode *dp = args->dp;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- bests = dp->d_ops->free_bests_p(free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
if (hdr) {
/*
* Data block is not empty, just set the free entry to the new
* value.
*/
- bests[findex] = cpu_to_be16(longest);
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ freehdr.bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
return 0;
}
@@ -1149,18 +1212,18 @@
int i; /* free entry index */
for (i = findex - 1; i >= 0; i--) {
- if (bests[i] != cpu_to_be16(NULLDATAOFF))
+ if (freehdr.bests[i] != cpu_to_be16(NULLDATAOFF))
break;
}
freehdr.nvalid = i + 1;
logfree = 0;
} else {
/* Not the last entry, just punch it out. */
- bests[findex] = cpu_to_be16(NULLDATAOFF);
+ freehdr.bests[findex] = cpu_to_be16(NULLDATAOFF);
logfree = 1;
}
- dp->d_ops->free_hdr_to_disk(free, &freehdr);
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, free, &freehdr);
xfs_dir2_free_log_header(args, fbp);
/*
@@ -1185,7 +1248,7 @@
/* Log the free entry that changed, unless we got rid of it. */
if (logfree)
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
return 0;
}
@@ -1202,6 +1265,7 @@
xfs_da_state_blk_t *dblk, /* data block */
int *rval) /* resulting block needs join */
{
+ struct xfs_da_geometry *geo = args->geo;
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
struct xfs_buf *dbp; /* data block buffer */
@@ -1216,27 +1280,25 @@
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir2_data_free *bf; /* bestfree table */
struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_remove(args, index);
dp = args->dp;
tp = args->trans;
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
/*
* Point to the entry we're removing.
*/
- lep = &ents[index];
+ lep = &leafhdr.ents[index];
/*
* Extract the data block and offset from the entry.
*/
- db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
ASSERT(dblk->blkno == db);
- off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
+ off = xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
/*
@@ -1244,11 +1306,11 @@
* Log the leaf block changes.
*/
leafhdr.stale++;
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, bp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(args, bp, index, index);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, index, index);
/*
* Make the data entry free. Keep track of the longest freespace
@@ -1257,17 +1319,18 @@
dbp = dblk->bp;
hdr = dbp->b_addr;
dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
xfs_dir2_data_make_free(args, dbp, off,
- dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
/*
* Rescan the data block freespaces for bestfree.
* Log the data block header if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
xfs_dir3_data_check(dp, dbp);
@@ -1286,9 +1349,8 @@
* Convert the data block number to a free block,
* read in the free block.
*/
- fdb = dp->d_ops->db_to_fdb(args->geo, db);
- error = xfs_dir2_free_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, fdb),
+ fdb = xfs_dir2_db_to_fdb(geo, db);
+ error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb),
&fbp);
if (error)
return error;
@@ -1296,23 +1358,22 @@
#ifdef DEBUG
{
struct xfs_dir3_icfree_hdr freehdr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
- (fdb - xfs_dir2_byte_to_db(args->geo,
- XFS_DIR2_FREE_OFFSET)));
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+ ASSERT(freehdr.firstdb == geo->free_max_bests *
+ (fdb - xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET)));
}
#endif
/*
* Calculate which entry we need to fix.
*/
- findex = dp->d_ops->db_to_fdindex(args->geo, db);
+ findex = xfs_dir2_db_to_fdindex(geo, db);
longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == args->geo->blksize -
- dp->d_ops->data_entry_offset) {
+ if (longest == geo->blksize - geo->data_entry_offset) {
/*
* Try to punch out the data block.
*/
@@ -1344,9 +1405,9 @@
* Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
- *rval = (dp->d_ops->leaf_hdr_size +
- (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
- args->geo->magicpct;
+	/* Count whole entries; sizeof(leafhdr.ents) is just a pointer. */
+	*rval = (geo->leaf_hdr_size + (uint)sizeof(leafhdr.ents[0]) *
+		 (leafhdr.count - leafhdr.stale)) < geo->magicpct;
return 0;
}
@@ -1445,12 +1506,12 @@
*/
blk = &state->path.blk[state->path.active - 1];
leaf = blk->bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
xfs_dir3_leaf_check(dp, blk->bp);
count = leafhdr.count - leafhdr.stale;
- bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+ bytes = state->args->geo->leaf_hdr_size + count * sizeof(ents[0]);
if (bytes > (state->args->geo->blksize >> 1)) {
/*
* Blk over 50%, don't try to join.
@@ -1495,8 +1556,7 @@
/*
* Read the sibling leaf block.
*/
- error = xfs_dir3_leafn_read(state->args->trans, dp,
- blkno, -1, &bp);
+ error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp);
if (error)
return error;
@@ -1508,8 +1568,8 @@
(state->args->geo->blksize >> 2);
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf);
+ ents = hdr2.ents;
count += hdr2.count - hdr2.stale;
bytes -= count * sizeof(ents[0]);
@@ -1571,10 +1631,10 @@
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
- dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
- sents = dp->d_ops->leaf_ents_p(save_leaf);
- dents = dp->d_ops->leaf_ents_p(drop_leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &savehdr, save_leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &drophdr, drop_leaf);
+ sents = savehdr.ents;
+ dents = drophdr.ents;
/*
* If there are any stale leaf entries, take this opportunity
@@ -1600,8 +1660,8 @@
save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
/* log the changes made when moving the entries */
- dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
- dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, save_leaf, &savehdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, drop_leaf, &drophdr);
xfs_dir3_leaf_log_header(args, save_blk->bp);
xfs_dir3_leaf_log_header(args, drop_blk->bp);
@@ -1620,19 +1680,16 @@
xfs_dir2_db_t *dbno,
struct xfs_buf **dbpp,
struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
int *findex)
{
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_dir3_icfree_hdr freehdr;
struct xfs_dir2_data_free *bf;
- struct xfs_dir2_data_hdr *hdr;
- struct xfs_dir2_free *free = NULL;
xfs_dir2_db_t fbno;
struct xfs_buf *fbp;
struct xfs_buf *dbp;
- __be16 *bests = NULL;
int error;
/* Not allowed to allocate, return failure. */
@@ -1651,7 +1708,7 @@
* Get the freespace block corresponding to the data block
* that was just allocated.
*/
- fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
+ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno);
error = xfs_dir2_free_try_read(tp, dp,
xfs_dir2_db_to_da(args->geo, fbno), &fbp);
if (error)
@@ -1666,11 +1723,13 @@
if (error)
return error;
- if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
+ if (XFS_IS_CORRUPT(mp,
+ xfs_dir2_db_to_fdb(args->geo, *dbno) !=
+ fbno)) {
xfs_alert(mp,
"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
__func__, (unsigned long long)dp->i_ino,
- (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
+ (long long)xfs_dir2_db_to_fdb(args->geo, *dbno),
(long long)*dbno, (long long)fbno);
if (fblk) {
xfs_alert(mp,
@@ -1680,7 +1739,6 @@
} else {
xfs_alert(mp, " ... fblk is NULL");
}
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
@@ -1688,44 +1746,39 @@
error = xfs_dir3_free_get_buf(args, fbno, &fbp);
if (error)
return error;
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
/* Remember the first slot as our empty slot. */
- freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+ hdr->firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
XFS_DIR2_FREE_OFFSET)) *
- dp->d_ops->free_max_bests(args->geo);
+ args->geo->free_max_bests;
} else {
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
}
/* Set the freespace block index from the data block number. */
- *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
+ *findex = xfs_dir2_db_to_fdindex(args->geo, *dbno);
/* Extend the freespace table if the new data block is off the end. */
- if (*findex >= freehdr.nvalid) {
- ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
- freehdr.nvalid = *findex + 1;
- bests[*findex] = cpu_to_be16(NULLDATAOFF);
+ if (*findex >= hdr->nvalid) {
+ ASSERT(*findex < args->geo->free_max_bests);
+ hdr->nvalid = *findex + 1;
+ hdr->bests[*findex] = cpu_to_be16(NULLDATAOFF);
}
/*
* If this entry was for an empty data block (this should always be
* true) then update the header.
*/
- if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
- freehdr.nused++;
- dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ if (hdr->bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+ hdr->nused++;
+ xfs_dir2_free_hdr_to_disk(mp, fbp->b_addr, hdr);
xfs_dir2_free_log_header(args, fbp);
}
/* Update the freespace value for the new block in the table. */
- hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
- bests[*findex] = bf[0].length;
+ bf = xfs_dir2_data_bestfree_p(mp, dbp->b_addr);
+ hdr->bests[*findex] = bf[0].length;
*dbpp = dbp;
*fbpp = fbp;
@@ -1738,11 +1791,10 @@
struct xfs_da_state_blk *fblk,
xfs_dir2_db_t *dbnop,
struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
int *findexp,
int length)
{
- struct xfs_dir3_icfree_hdr freehdr;
- struct xfs_dir2_free *free = NULL;
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
struct xfs_buf *fbp = NULL;
@@ -1752,7 +1804,6 @@
xfs_dir2_db_t dbno = -1;
xfs_dir2_db_t fbno;
xfs_fileoff_t fo;
- __be16 *bests = NULL;
int findex = 0;
int error;
@@ -1763,17 +1814,14 @@
*/
if (fblk) {
fbp = fblk->bp;
- free = fbp->b_addr;
findex = fblk->index;
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
if (findex >= 0) {
/* caller already found the freespace for us. */
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
- ASSERT(findex < freehdr.nvalid);
- ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
- ASSERT(be16_to_cpu(bests[findex]) >= length);
- dbno = freehdr.firstdb + findex;
+ ASSERT(findex < hdr->nvalid);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) >= length);
+ dbno = hdr->firstdb + findex;
goto found_block;
}
@@ -1815,15 +1863,13 @@
if (!fbp)
continue;
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
/* Scan the free entry array for a large enough free space. */
- for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
- if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
- be16_to_cpu(bests[findex]) >= length) {
- dbno = freehdr.firstdb + findex;
+ for (findex = hdr->nvalid - 1; findex >= 0; findex--) {
+ if (be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(hdr->bests[findex]) >= length) {
+ dbno = hdr->firstdb + findex;
goto found_block;
}
}
@@ -1839,7 +1885,6 @@
return 0;
}
-
/*
* Add the data entry for a node-format directory name addition.
* The leaf entry is added in xfs_dir2_leafn_add.
@@ -1854,9 +1899,9 @@
struct xfs_dir2_data_entry *dep; /* data entry pointer */
struct xfs_dir2_data_hdr *hdr; /* data block header */
struct xfs_dir2_data_free *bf;
- struct xfs_dir2_free *free = NULL; /* freespace block structure */
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
+ struct xfs_dir3_icfree_hdr freehdr;
struct xfs_buf *dbp; /* data block buffer */
struct xfs_buf *fbp; /* freespace buffer */
xfs_dir2_data_aoff_t aoff;
@@ -1868,11 +1913,10 @@
int needlog = 0; /* need to log data header */
int needscan = 0; /* need to rescan data frees */
__be16 *tagp; /* data entry tag pointer */
- __be16 *bests;
- length = dp->d_ops->data_entsize(args->namelen);
- error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
- length);
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+ error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &freehdr,
+ &findex, length);
if (error)
return error;
@@ -1894,19 +1938,19 @@
/* we're going to have to log the free block index later */
logfree = 1;
error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
- &findex);
+ &freehdr, &findex);
} else {
/* Read the data block in. */
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, dbno),
- -1, &dbp);
+ 0, &dbp);
}
if (error)
return error;
/* setup for data block up now */
hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
ASSERT(be16_to_cpu(bf[0].length) >= length);
/* Point to the existing unused space. */
@@ -1927,28 +1971,26 @@
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
xfs_dir2_data_log_entry(args, dbp, dep);
/* Rescan the freespace and log the data block if needed. */
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
/* If the freespace block entry is now wrong, update it. */
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- if (bests[findex] != bf[0].length) {
- bests[findex] = bf[0].length;
+ if (freehdr.bests[findex] != bf[0].length) {
+ freehdr.bests[findex] = bf[0].length;
logfree = 1;
}
/* Log the freespace entry if needed. */
if (logfree)
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
/* Return the data block and offset in args. */
args->blkno = (xfs_dablk_t)dbno;
@@ -1973,9 +2015,7 @@
/*
* Allocate and initialize the state (btree cursor).
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
/*
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
@@ -2044,9 +2084,8 @@
/*
* Allocate and initialize the btree cursor.
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
+
/*
* Fill in the path to the entry in the cursor.
*/
@@ -2097,9 +2136,7 @@
/*
* Allocate and initialize the btree cursor.
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
/* Look up the entry we're deleting, set up the cursor. */
error = xfs_da3_node_lookup_int(state, &rval);
@@ -2156,8 +2193,6 @@
int i; /* btree level */
xfs_ino_t inum; /* new inode number */
int ftype; /* new file type */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
int rval; /* internal return value */
xfs_da_state_t *state; /* btree cursor */
@@ -2166,9 +2201,7 @@
/*
* Allocate and initialize the btree cursor.
*/
- state = xfs_da_state_alloc();
- state->args = args;
- state->mp = args->dp->i_mount;
+ state = xfs_da_state_alloc(args);
/*
* We have to save new inode number and ftype since
@@ -2189,16 +2222,17 @@
* and locked it. But paranoia is good.
*/
if (rval == -EEXIST) {
- struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
/*
* Find the leaf entry.
*/
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
- leaf = blk->bp->b_addr;
- ents = args->dp->d_ops->leaf_ents_p(leaf);
- lep = &ents[blk->index];
ASSERT(state->extravalid);
+
+ xfs_dir2_leaf_hdr_from_disk(state->mp, &leafhdr,
+ blk->bp->b_addr);
/*
* Point to the data entry.
*/
@@ -2208,13 +2242,13 @@
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
xfs_dir2_dataptr_to_off(args->geo,
- be32_to_cpu(lep->address)));
+ be32_to_cpu(leafhdr.ents[blk->index].address)));
ASSERT(inum != be64_to_cpu(dep->inumber));
/*
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
- args->dp->d_ops->data_put_ftype(dep, ftype);
+ xfs_dir2_data_put_ftype(state->mp, dep, ftype);
xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
@@ -2271,7 +2305,7 @@
if (!bp)
return 0;
free = bp->b_addr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
/*
* If there are used entries, there's nothing to do.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 59f9fb2..44c6a77 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -8,7 +8,41 @@
struct dir_context;
+/*
+ * In-core version of the leaf and free block headers to abstract the
+ * differences in the v2 and v3 disk format of the headers.
+ */
+struct xfs_dir3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t stale;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_dir2_leaf_entry *ents;
+};
+
+struct xfs_dir3_icfree_hdr {
+ uint32_t magic;
+ uint32_t firstdb;
+ uint32_t nvalid;
+ uint32_t nused;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ __be16 *bests;
+};
+
/* xfs_dir2.c */
+xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name);
+enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
@@ -26,6 +60,15 @@
struct xfs_buf *lbp, struct xfs_buf *dbp);
/* xfs_dir2_data.c */
+struct xfs_dir2_data_free *xfs_dir2_data_bestfree_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr);
+__be16 *xfs_dir2_data_entry_tag_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+uint8_t xfs_dir2_data_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+void xfs_dir2_data_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep, uint8_t ftype);
+
#ifdef DEBUG
extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
#else
@@ -34,10 +77,10 @@
extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp,
struct xfs_buf *bp);
-extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
- xfs_daddr_t mapped_bno);
+int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp);
+int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ unsigned int flags);
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
@@ -47,10 +90,14 @@
struct xfs_buf **bpp);
/* xfs_dir2_leaf.c */
-extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
-extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from);
+void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -62,7 +109,8 @@
extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
struct xfs_buf **bpp, uint16_t magic);
extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
- struct xfs_buf *bp, int first, int last);
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first,
+ int last);
extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
@@ -79,10 +127,11 @@
extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp,
- struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr,
- struct xfs_dir2_leaf *leaf);
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
/* xfs_dir2_node.c */
+void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from);
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
struct xfs_buf *lbp);
extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp,
@@ -108,6 +157,14 @@
xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
+xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *hdr);
+void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino);
+uint8_t xfs_dir2_sf_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep);
+struct xfs_dir2_sf_entry *xfs_dir2_sf_nextentry(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep);
extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
@@ -118,9 +175,33 @@
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+int xfs_dir2_sf_entsize(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, int len);
+void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino);
+void xfs_dir2_sf_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep, uint8_t ftype);
/* xfs_dir2_readdir.c */
extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
struct dir_context *ctx, size_t bufsize);
+static inline unsigned int
+xfs_dir2_data_entsize(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int len;
+
+ len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
+ sizeof(xfs_dir2_data_off_t) /* tag */;
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ len += sizeof(uint8_t);
+ return round_up(len, XFS_DIR2_DATA_ALIGN);
+}
+
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name);
+enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
+
#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index ae16ca7..2463b5d 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -37,6 +37,126 @@
static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+int
+xfs_dir2_sf_entsize(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = len;
+
+ count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ count += sizeof(uint8_t);
+ return count;
+}
+
+struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_dir2_sf_entsize(mp, hdr, sfep->namelen);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. The actual inode numbers can
+ * come in two formats as well, either 4 bytes or 8 bytes wide.
+ */
+xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ uint8_t *from = sfep->name + sfep->namelen;
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ from++;
+
+ if (!hdr->i8count)
+ return get_unaligned_be32(from);
+ return get_unaligned_be64(from) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ uint8_t *to = sfep->name + sfep->namelen;
+
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ to++;
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, to);
+ else
+ put_unaligned_be32(ino, to);
+}
+
+xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ if (!hdr->i8count)
+ return get_unaligned_be32(hdr->parent);
+ return get_unaligned_be64(hdr->parent) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, hdr->parent);
+ else
+ put_unaligned_be32(ino, hdr->parent);
+}
+
+/*
+ * The file type field is stored at the end of the name for filetype enabled
+ * shortform directories, or not at all otherwise.
+ */
+uint8_t
+xfs_dir2_sf_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ uint8_t ftype = sfep->name[sfep->namelen];
+
+ if (ftype < XFS_DIR3_FT_MAX)
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+void
+xfs_dir2_sf_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ sfep->name[sfep->namelen] = ftype;
+}
+
/*
* Given a block directory (dp/block), calculate its size as a shortform (sf)
* directory and a header for the sf directory, if it will fit it the
@@ -125,7 +245,7 @@
*/
sfhp->count = count;
sfhp->i8count = i8count;
- dp->d_ops->sf_put_parent_ino(sfhp, parent);
+ xfs_dir2_sf_put_parent_ino(sfhp, parent);
return size;
}
@@ -135,64 +255,48 @@
*/
int /* error */
xfs_dir2_block_to_sf(
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
struct xfs_buf *bp,
int size, /* shortform directory size */
- xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
+ struct xfs_dir2_sf_hdr *sfhp) /* shortform directory hdr */
{
- xfs_dir2_data_hdr_t *hdr; /* block header */
- xfs_dir2_data_entry_t *dep; /* data entry pointer */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* unused data pointer */
- char *endptr; /* end of data entries */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int error; /* error return value */
int logflags; /* inode logging flags */
- xfs_mount_t *mp; /* filesystem mount point */
- char *ptr; /* current data pointer */
- xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
- xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
+ struct xfs_dir2_sf_entry *sfep; /* shortform entry */
+ struct xfs_dir2_sf_hdr *sfp; /* shortform directory header */
+ unsigned int offset = args->geo->data_entry_offset;
+ unsigned int end;
trace_xfs_dir2_block_to_sf(args);
- dp = args->dp;
- mp = dp->i_mount;
-
/*
- * allocate a temporary destination buffer the size of the inode
- * to format the data into. Once we have formatted the data, we
- * can free the block and copy the formatted data into the inode literal
- * area.
+ * Allocate a temporary destination buffer the size of the inode to
+ * format the data into. Once we have formatted the data, we can free
+ * the block and copy the formatted data into the inode literal area.
*/
- dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
- hdr = bp->b_addr;
-
- /*
- * Copy the header into the newly allocate local space.
- */
- sfp = (xfs_dir2_sf_hdr_t *)dst;
+ sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
/*
- * Set up to loop over the block's entries.
+ * Loop over the active and unused entries. Stop when we reach the
+ * leaf/tail portion of the block.
*/
- ptr = (char *)dp->d_ops->data_entry_p(hdr);
- endptr = xfs_dir3_data_endp(args->geo, hdr);
+ end = xfs_dir3_data_end_offset(args->geo, bp->b_addr);
sfep = xfs_dir2_sf_firstentry(sfp);
- /*
- * Loop over the active and unused entries.
- * Stop when we reach the leaf/tail portion of the block.
- */
- while (ptr < endptr) {
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
/*
* If it's unused, just skip over it.
*/
- dup = (xfs_dir2_data_unused_t *)ptr;
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
+
/*
* Skip .
*/
@@ -204,24 +308,22 @@
else if (dep->namelen == 2 &&
dep->name[0] == '.' && dep->name[1] == '.')
ASSERT(be64_to_cpu(dep->inumber) ==
- dp->d_ops->sf_get_parent_ino(sfp));
+ xfs_dir2_sf_get_parent_ino(sfp));
/*
* Normal entry, copy it into shortform.
*/
else {
sfep->namelen = dep->namelen;
- xfs_dir2_sf_put_offset(sfep,
- (xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)hdr));
+ xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, dep->name, dep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep,
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
be64_to_cpu(dep->inumber));
- dp->d_ops->sf_put_ftype(sfep,
- dp->d_ops->data_get_ftype(dep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_data_get_ftype(mp, dep));
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
- ptr += dp->d_ops->data_entsize(dep->namelen);
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
}
ASSERT((char *)sfep - (char *)sfp == size);
@@ -240,15 +342,15 @@
* Convert the inode to local format and copy the data in.
*/
ASSERT(dp->i_df.if_bytes == 0);
- xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size);
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
dp->i_d.di_size = size;
logflags |= XFS_ILOG_DDATA;
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(dst);
+ kmem_free(sfp);
return error;
}
@@ -277,13 +379,7 @@
ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Make sure the shortform value has some of its header.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -291,7 +387,7 @@
/*
* Compute entry (and change in) size.
*/
- incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
+ incr_isize = xfs_dir2_sf_entsize(dp->i_mount, sfp, args->namelen);
objchange = 0;
/*
@@ -364,18 +460,17 @@
xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
int new_isize) /* new directory size */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int byteoff; /* byte offset in sf dir */
- xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
byteoff = (int)((char *)sfep - (char *)sfp);
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+ xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
@@ -388,8 +483,8 @@
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
- dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
/*
* Update the header and inode.
@@ -416,9 +511,10 @@
int objchange, /* changing inode number size */
int new_isize) /* new directory size */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int add_datasize; /* data size need for new ent */
char *buf; /* buffer for old */
- xfs_inode_t *dp; /* incore directory inode */
int eof; /* reached end of old dir */
int nbytes; /* temp for byte copies */
xfs_dir2_data_aoff_t new_offset; /* next offset value */
@@ -432,8 +528,6 @@
/*
* Copy the old directory to the stack buffer.
*/
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
buf = kmem_alloc(old_isize, 0);
@@ -444,13 +538,13 @@
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = dp->d_ops->data_first_offset,
+ for (offset = args->geo->data_first_offset,
oldsfep = xfs_dir2_sf_firstentry(oldsfp),
- add_datasize = dp->d_ops->data_entsize(args->namelen),
+ add_datasize = xfs_dir2_data_entsize(mp, args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
!eof;
- offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
- oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
+ offset = new_offset + xfs_dir2_data_entsize(mp, oldsfep->namelen),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep),
eof = (char *)oldsfep == &buf[old_isize]) {
new_offset = xfs_dir2_sf_get_offset(oldsfep);
if (offset + add_datasize <= new_offset)
@@ -479,8 +573,8 @@
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
- dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
sfp->count++;
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
sfp->i8count++;
@@ -488,7 +582,7 @@
* If there's more left to copy, do that.
*/
if (!eof) {
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
kmem_free(buf);
@@ -510,7 +604,8 @@
xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int holefit; /* found hole it will fit in */
int i; /* entry number */
xfs_dir2_data_aoff_t offset; /* data block offset */
@@ -519,11 +614,9 @@
int size; /* entry's data size */
int used; /* data bytes used */
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- size = dp->d_ops->data_entsize(args->namelen);
- offset = dp->d_ops->data_first_offset;
+ size = xfs_dir2_data_entsize(mp, args->namelen);
+ offset = args->geo->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
@@ -535,8 +628,8 @@
if (!holefit)
holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
offset = xfs_dir2_sf_get_offset(sfep) +
- dp->d_ops->data_entsize(sfep->namelen);
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/*
* Calculate data bytes used excluding the new entry, if this
@@ -578,7 +671,8 @@
xfs_dir2_sf_check(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int i; /* entry number */
int i8count; /* number of big inode#s */
xfs_ino_t ino; /* entry inode number */
@@ -586,23 +680,21 @@
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- offset = dp->d_ops->data_first_offset;
- ino = dp->d_ops->sf_get_parent_ino(sfp);
+ offset = args->geo->data_first_offset;
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
- ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
offset =
xfs_dir2_sf_get_offset(sfep) +
- dp->d_ops->data_entsize(sfep->namelen);
- ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX);
}
ASSERT(i8count == sfp->i8count);
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
@@ -618,12 +710,11 @@
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
char *endp;
- const struct xfs_dir_ops *dops;
- struct xfs_ifork *ifp;
xfs_ino_t ino;
int i;
int i8count;
@@ -632,14 +723,8 @@
int error;
uint8_t filetype;
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
- /*
- * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops,
- * so we can only trust the mountpoint to have the right pointer.
- */
- dops = xfs_dir_get_ops(mp, NULL);
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
size = ifp->if_bytes;
@@ -653,12 +738,12 @@
endp = (char *)sfp + size;
/* Check .. entry */
- ino = dops->sf_get_parent_ino(sfp);
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
error = xfs_dir_ino_validate(mp, ino);
if (error)
return __this_address;
- offset = dops->data_first_offset;
+ offset = mp->m_dir_geo->data_first_offset;
/* Check all reported entries */
sfep = xfs_dir2_sf_firstentry(sfp);
@@ -680,7 +765,7 @@
* within the data buffer. The next entry starts after the
* name component, so nextentry is an acceptable test.
*/
- next_sfep = dops->sf_nextentry(sfp, sfep);
+ next_sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
if (endp < (char *)next_sfep)
return __this_address;
@@ -689,19 +774,19 @@
return __this_address;
/* Check the inode number. */
- ino = dops->sf_get_ino(sfp, sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
error = xfs_dir_ino_validate(mp, ino);
if (error)
return __this_address;
/* Check the file type. */
- filetype = dops->sf_get_ftype(sfep);
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
if (filetype >= XFS_DIR3_FT_MAX)
return __this_address;
offset = xfs_dir2_sf_get_offset(sfep) +
- dops->data_entsize(sfep->namelen);
+ xfs_dir2_data_entsize(mp, sfep->namelen);
sfep = next_sfep;
}
@@ -741,9 +826,9 @@
* If it's currently a zero-length extent file,
* convert it to local format.
*/
- if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+ if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) {
dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ dp->i_df.if_format = XFS_DINODE_FMT_LOCAL;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
dp->i_df.if_flags |= XFS_IFINLINE;
}
@@ -763,7 +848,7 @@
/*
* Now can put in the inode number, since i8count is set.
*/
- dp->d_ops->sf_put_parent_ino(sfp, pino);
+ xfs_dir2_sf_put_parent_ino(sfp, pino);
sfp->count = 0;
dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
@@ -779,7 +864,8 @@
xfs_dir2_sf_lookup(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
@@ -790,16 +876,9 @@
trace_xfs_dir2_sf_lookup(args);
xfs_dir2_sf_check(args);
- dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Bail out if the directory is way too short.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -818,7 +897,7 @@
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+ args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
args->cmpresult = XFS_CMP_EXACT;
args->filetype = XFS_DIR3_FT_DIR;
return -EEXIST;
@@ -828,18 +907,17 @@
*/
ci_sfep = NULL;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
/*
* Compare name and if it's an exact match, return the inode
* number. If it's the first case-insensitive match, store the
* inode number and continue looking for an exact match.
*/
- cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
- sfep->namelen);
+ cmp = xfs_dir2_compname(args, sfep->name, sfep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
- args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
- args->filetype = dp->d_ops->sf_get_ftype(sfep);
+ args->inumber = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ args->filetype = xfs_dir2_sf_get_ftype(mp, sfep);
if (cmp == XFS_CMP_EXACT)
return -EEXIST;
ci_sfep = sfep;
@@ -864,8 +942,9 @@
xfs_dir2_sf_removename(
xfs_da_args_t *args)
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int byteoff; /* offset of removed entry */
- xfs_inode_t *dp; /* incore directory inode */
int entsize; /* this entry's size */
int i; /* shortform entry index */
int newsize; /* new inode size */
@@ -875,17 +954,9 @@
trace_xfs_dir2_sf_removename(args);
- dp = args->dp;
-
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
oldsize = (int)dp->i_d.di_size;
- /*
- * Bail out if the directory is way too short.
- */
- if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == oldsize);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -895,10 +966,10 @@
* Find the one we're deleting.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
- ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+ ASSERT(xfs_dir2_sf_get_ino(mp, sfp, sfep) ==
args->inumber);
break;
}
@@ -912,7 +983,7 @@
* Calculate sizes.
*/
byteoff = (int)((char *)sfep - (char *)sfp);
- entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
+ entsize = xfs_dir2_sf_entsize(mp, sfp, args->namelen);
newsize = oldsize - entsize;
/*
* Copy the part if any after the removed entry, sliding it down.
@@ -945,13 +1016,35 @@
}
/*
+ * Check whether the sf dir replace operation need more blocks.
+ */
+bool
+xfs_dir2_sf_replace_needblock(
+ struct xfs_inode *dp,
+ xfs_ino_t inum)
+{
+ int newsize;
+ struct xfs_dir2_sf_hdr *sfp;
+
+ if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
+ return false;
+
+ sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
+
+ return inum > XFS_DIR2_MAX_SHORT_INUM &&
+ sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+}
+
+/*
* Replace the inode number of an entry in a shortform directory.
*/
int /* error */
xfs_dir2_sf_replace(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
xfs_ino_t ino=0; /* entry old inode number */
int i8elevated; /* sf_toino8 set i8count=1 */
@@ -960,16 +1053,8 @@
trace_xfs_dir2_sf_replace(args);
- dp = args->dp;
-
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Bail out if the shortform directory is way too small.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -980,17 +1065,14 @@
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
int error; /* error return value */
- int newsize; /* new inode size */
- newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
/*
* Won't fit as shortform, convert to block then do replace.
*/
- if (newsize > XFS_IFORK_DSIZE(dp)) {
+ if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) {
error = xfs_dir2_sf_to_block(args);
- if (error) {
+ if (error)
return error;
- }
return xfs_dir2_block_replace(args);
}
/*
@@ -1008,22 +1090,23 @@
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- ino = dp->d_ops->sf_get_parent_ino(sfp);
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
ASSERT(args->inumber != ino);
- dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
+ xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
}
/*
* Normal entry, look for the name.
*/
else {
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
- ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
ASSERT(args->inumber != ino);
- dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
- dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
break;
}
}
@@ -1076,8 +1159,9 @@
xfs_dir2_sf_toino4(
xfs_da_args_t *args) /* operation arguments */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
char *buf; /* old dir's buffer */
- xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
@@ -1088,8 +1172,6 @@
trace_xfs_dir2_sf_toino4(args);
- dp = args->dp;
-
/*
* Copy the old directory to the buffer.
* Then nuke it from the inode, and add the new buffer to the inode.
@@ -1116,21 +1198,22 @@
*/
sfp->count = oldsfp->count;
sfp->i8count = 0;
- dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
- oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep,
- dp->d_ops->sf_get_ino(oldsfp, oldsfep));
- dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
}
/*
* Clean up the inode.
@@ -1149,8 +1232,9 @@
xfs_dir2_sf_toino8(
xfs_da_args_t *args) /* operation arguments */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
char *buf; /* old dir's buffer */
- xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
@@ -1161,8 +1245,6 @@
trace_xfs_dir2_sf_toino8(args);
- dp = args->dp;
-
/*
* Copy the old directory to the buffer.
* Then nuke it from the inode, and add the new buffer to the inode.
@@ -1189,21 +1271,22 @@
*/
sfp->count = oldsfp->count;
sfp->i8count = 1;
- dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
- oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep,
- dp->d_ops->sf_get_ino(oldsfp, oldsfep));
- dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
}
/*
* Clean up the inode.
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index e8bd688..6766417 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,11 +35,12 @@
xfs_failaddr_t
xfs_dquot_verify(
- struct xfs_mount *mp,
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type) /* used only during quotacheck */
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq,
+ xfs_dqid_t id) /* used only during quotacheck */
{
+ __u8 ddq_type;
+
/*
* We can encounter an uninitialized dquot buffer for 2 reasons:
* 1. If we crash while deleting the quotainode(s), and those blks got
@@ -60,11 +61,19 @@
if (ddq->d_version != XFS_DQUOT_VERSION)
return __this_address;
- if (type && ddq->d_flags != type)
+ if (ddq->d_type & ~XFS_DQTYPE_ANY)
return __this_address;
- if (ddq->d_flags != XFS_DQ_USER &&
- ddq->d_flags != XFS_DQ_PROJ &&
- ddq->d_flags != XFS_DQ_GROUP)
+ ddq_type = ddq->d_type & XFS_DQTYPE_REC_MASK;
+ if (ddq_type != XFS_DQTYPE_USER &&
+ ddq_type != XFS_DQTYPE_PROJ &&
+ ddq_type != XFS_DQTYPE_GROUP)
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ return __this_address;
+
+ if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id)
return __this_address;
if (id != -1 && id != be32_to_cpu(ddq->d_id))
@@ -95,14 +104,13 @@
xfs_dqblk_verify(
struct xfs_mount *mp,
struct xfs_dqblk *dqb,
- xfs_dqid_t id,
- uint type) /* used only during quotacheck */
+ xfs_dqid_t id) /* used only during quotacheck */
{
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type);
+ return xfs_dquot_verify(mp, &dqb->dd_diskdq, id);
}
/*
@@ -113,7 +121,7 @@
struct xfs_mount *mp,
struct xfs_dqblk *dqb,
xfs_dqid_t id,
- uint type)
+ xfs_dqtype_t type)
{
/*
* Typically, a repair is only requested by quotacheck.
@@ -123,7 +131,7 @@
dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- dqb->dd_diskdq.d_flags = type;
+ dqb->dd_diskdq.d_type = type;
dqb->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -205,7 +213,7 @@
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
+ fa = xfs_dqblk_verify(mp, &dqb[i], id + i);
if (fa) {
if (!readahead)
xfs_buf_verifier_error(bp, -EFSCORRUPTED,
@@ -287,3 +295,31 @@
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+
+/* Convert an on-disk timer value into an incore timer value. */
+time64_t
+xfs_dquot_from_disk_ts(
+ struct xfs_disk_dquot *ddq,
+ __be32 dtimer)
+{
+ uint32_t t = be32_to_cpu(dtimer);
+
+ if (t != 0 && (ddq->d_type & XFS_DQTYPE_BIGTIME))
+ return xfs_dq_bigtime_to_unix(t);
+
+ return t;
+}
+
+/* Convert an incore timer value into an on-disk timer value. */
+__be32
+xfs_dquot_to_disk_ts(
+ struct xfs_dquot *dqp,
+ time64_t timer)
+{
+ uint32_t t = timer;
+
+ if (timer != 0 && (dqp->q_type & XFS_DQTYPE_BIGTIME))
+ t = xfs_dq_unix_to_bigtime(timer);
+
+ return cpu_to_be32(t);
+}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 79e6c4f..53b305d 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
* Copyright (C) 2017 Oracle.
@@ -55,7 +55,8 @@
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
#define XFS_ERRTAG_IUNLINK_FALLBACK 34
-#define XFS_ERRTAG_MAX 35
+#define XFS_ERRTAG_BUF_IOERROR 35
+#define XFS_ERRTAG_MAX 36
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -95,5 +96,6 @@
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index c968b60..dd764da 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -449,10 +449,12 @@
#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
#define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */
#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */
+#define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3) /* inobt block counts */
#define XFS_SB_FEAT_RO_COMPAT_ALL \
(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
- XFS_SB_FEAT_RO_COMPAT_REFLINK)
+ XFS_SB_FEAT_RO_COMPAT_REFLINK| \
+ XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -465,10 +467,12 @@
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
+#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
- XFS_SB_FEAT_INCOMPAT_META_UUID)
+ XFS_SB_FEAT_INCOMPAT_META_UUID| \
+ XFS_SB_FEAT_INCOMPAT_BIGTIME)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -497,6 +501,23 @@
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
+/*
+ * v5 file systems support V3 inodes only, earlier file systems support
+ * v2 and v1 inodes.
+ */
+static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline bool xfs_dinode_good_version(struct xfs_sb *sbp,
+ uint8_t version)
+{
+ if (xfs_sb_version_has_v3inode(sbp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
@@ -546,6 +567,23 @@
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
}
+static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME);
+}
+
+/*
+ * Inode btree block counter. We record the number of inobt and finobt blocks
+ * in the AGI header so that we can skip the finobt walk at mount time when
+ * setting up per-AG reservations.
+ */
+static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT);
+}
+
/*
* end of superblock version macros
*/
@@ -560,7 +598,6 @@
#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
-#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \
@@ -707,7 +744,6 @@
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
-#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr))
/*
* Size of the unlinked inode hash table in the agi.
@@ -750,6 +786,9 @@
__be32 agi_free_root; /* root of the free inode btree */
__be32 agi_free_level;/* levels in free inode btree */
+ __be32 agi_iblocks; /* inobt blocks used */
+ __be32 agi_fblocks; /* finobt blocks used */
+
/* structure must be padded to 64 bit alignment */
} xfs_agi_t;
@@ -770,12 +809,12 @@
#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
#define XFS_AGI_FREE_ROOT (1 << 11)
#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_NUM_BITS_R2 13
+#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
-#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr))
/*
* The third a.g. block contains the a.g. freelist, an array
@@ -783,21 +822,15 @@
*/
#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
-#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr))
+#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr))
-#define XFS_BUF_TO_AGFL_BNO(mp, bp) \
- (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
- &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \
- (__be32 *)(bp)->b_addr)
-
-typedef struct xfs_agfl {
+struct xfs_agfl {
__be32 agfl_magicnum;
__be32 agfl_seqno;
uuid_t agfl_uuid;
__be64 agfl_lsn;
__be32 agfl_crc;
- __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */
-} __attribute__((packed)) xfs_agfl_t;
+} __attribute__((packed));
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
@@ -823,10 +856,87 @@
ASSERT(xfs_daddr_to_agno(mp, d) == \
xfs_daddr_to_agno(mp, (d) + (len) - 1)))
-typedef struct xfs_timestamp {
+/*
+ * XFS Timestamps
+ * ==============
+ *
+ * Traditional ondisk inode timestamps consist of signed 32-bit counters for
+ * seconds and nanoseconds; time zero is the Unix epoch, Jan 1 00:00:00 UTC
+ * 1970, which means that the timestamp epoch is the same as the Unix epoch.
+ * Therefore, the ondisk min and max defined here can be used directly to
+ * constrain the incore timestamps on a Unix system. Note that we actually
+ * encode a __be64 value on disk.
+ *
+ * When the bigtime feature is enabled, ondisk inode timestamps become an
+ * unsigned 64-bit nanoseconds counter. This means that the bigtime inode
+ * timestamp epoch is the start of the classic timestamp range, which is
+ * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers
+ * /must/ use the bigtime conversion functions when encoding and decoding raw
+ * timestamps.
+ */
+typedef __be64 xfs_timestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_timestamp {
__be32 t_sec; /* timestamp seconds */
__be32 t_nsec; /* timestamp nanoseconds */
-} xfs_timestamp_t;
+};
+
+/*
+ * Smallest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN)
+
+/*
+ * Largest possible ondisk seconds value with traditional timestamps. This
+ * corresponds exactly with the incore timestamp Jan 19 03:14:07 UTC 2038.
+ */
+#define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX)
+
+/*
+ * Smallest possible ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with the traditional
+ * minimum timestamp of Dec 13 20:45:52 UTC 1901.
+ */
+#define XFS_BIGTIME_TIME_MIN ((int64_t)0)
+
+/*
+ * Largest supported ondisk seconds value with bigtime timestamps. This
+ * corresponds (after conversion to a Unix timestamp) with an incore timestamp
+ * of Jul 2 20:20:24 UTC 2486.
+ *
+ * We round down the ondisk limit so that the bigtime quota and inode max
+ * timestamps will be the same.
+ */
+#define XFS_BIGTIME_TIME_MAX ((int64_t)((-1ULL / NSEC_PER_SEC) & ~0x3ULL))
+
+/*
+ * Bigtime epoch is set exactly to the minimum time value that a traditional
+ * 32-bit timestamp can represent when using the Unix epoch as a reference.
+ * Hence the Unix epoch is at a fixed offset into the supported bigtime
+ * timestamp range.
+ *
+ * The bigtime epoch also matches the minimum value an on-disk 32-bit XFS
+ * timestamp can represent so we will not lose any fidelity in converting
+ * to/from unix and bigtime timestamps.
+ *
+ * The following conversion factor converts a seconds counter from the Unix
+ * epoch to the bigtime epoch.
+ */
+#define XFS_BIGTIME_EPOCH_OFFSET (-(int64_t)S32_MIN)
+
+/* Convert a timestamp from the Unix epoch to the bigtime epoch. */
+static inline uint64_t xfs_unix_to_bigtime(time64_t unix_seconds)
+{
+ return (uint64_t)unix_seconds + XFS_BIGTIME_EPOCH_OFFSET;
+}
+
+/* Convert a timestamp from the bigtime epoch to the Unix epoch. */
+static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds - XFS_BIGTIME_EPOCH_OFFSET;
+}
/*
* On-disk inode structure.
@@ -920,13 +1030,13 @@
* This enum is used in string mapping in xfs_trace.h; please keep the
* TRACE_DEFINE_ENUMs for it up to date.
*/
-typedef enum xfs_dinode_fmt {
+enum xfs_dinode_fmt {
XFS_DINODE_FMT_DEV, /* xfs_dev_t */
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
XFS_DINODE_FMT_UUID /* added long ago, but never used */
-} xfs_dinode_fmt_t;
+};
#define XFS_INODE_FORMAT_STR \
{ XFS_DINODE_FMT_DEV, "dev" }, \
@@ -946,23 +1056,22 @@
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp, version) \
- ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+#define XFS_DINODE_SIZE(sbp) \
+ (xfs_sb_version_has_v3inode(sbp) ? \
+ sizeof(struct xfs_dinode) : \
+ offsetof(struct xfs_dinode, di_crc))
+#define XFS_LITINO(mp) \
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb))
/*
* Inode data & attribute fork sizes, per inode.
*/
-#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp, (dip)->di_version))
+ ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp))
#define XFS_DFORK_ASIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
- 0)
+ ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
XFS_DFORK_DSIZE(dip, mp) : \
@@ -1054,12 +1163,22 @@
#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
+#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
+#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
#define XFS_DIFLAG2_ANY \
- (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
+ (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
+ XFS_DIFLAG2_BIGTIME)
+
+static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
+}
/*
* Inode number format:
@@ -1142,16 +1261,111 @@
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
+#define XFS_DQTYPE_USER 0x01 /* user dquot record */
+#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
+#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
+#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
+
+/* bitmask to determine if this is a user/group/project dquot */
+#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
+ XFS_DQTYPE_PROJ | \
+ XFS_DQTYPE_GROUP)
+
+#define XFS_DQTYPE_ANY (XFS_DQTYPE_REC_MASK | \
+ XFS_DQTYPE_BIGTIME)
+
/*
- * This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
- * is kept in kernel memory. We pad this with some more expansion room
- * to construct the on disk structure.
+ * XFS Quota Timers
+ * ================
+ *
+ * Traditional quota grace period expiration timers are an unsigned 32-bit
+ * seconds counter; time zero is the Unix epoch, Jan 1 00:00:00 UTC 1970.
+ * Note that an expiration value of zero means that the quota limit has not
+ * been reached, and therefore no expiration has been set. Therefore, the
+ * ondisk min and max defined here can be used directly to constrain the incore
+ * quota expiration timestamps on a Unix system.
+ *
+ * When bigtime is enabled, we trade two bits of precision to expand the
+ * expiration timeout range to match that of big inode timestamps. The min and
+ * max recorded here are the on-disk limits, not a Unix timestamp.
+ *
+ * The grace period for each quota type is stored in the root dquot (id = 0)
+ * and is applied to a non-root dquot when it exceeds the soft or hard limits.
+ * The lengths of quota grace periods are unsigned 32-bit quantities measured in
+ * units of seconds. A value of zero means to use the default period.
*/
-typedef struct xfs_disk_dquot {
+
+/*
+ * Smallest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Jan 1 00:00:01 UTC 1970.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MIN ((int64_t)1)
+
+/*
+ * Largest possible ondisk quota expiration value with traditional timestamps.
+ * This corresponds exactly with the incore expiration Feb 7 06:28:15 UTC 2106.
+ */
+#define XFS_DQ_LEGACY_EXPIRY_MAX ((int64_t)U32_MAX)
+
+/*
+ * Smallest possible ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with the incore
+ * expiration of Jan 1 00:00:04 UTC 1970.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MIN (XFS_DQ_LEGACY_EXPIRY_MIN)
+
+/*
+ * Largest supported ondisk quota expiration value with bigtime timestamps.
+ * This corresponds (after conversion to a Unix timestamp) with an incore
+ * expiration of Jul 2 20:20:24 UTC 2486.
+ *
+ * The ondisk field supports values up to -1U, which corresponds to an incore
+ * expiration in 2514. This is beyond the maximum of the bigtime inode timestamp,
+ * so we cap the maximum bigtime quota expiration to the max inode timestamp.
+ */
+#define XFS_DQ_BIGTIME_EXPIRY_MAX ((int64_t)4074815106U)
+
+/*
+ * The following conversion factors assist in converting a quota expiration
+ * timestamp between the incore and ondisk formats.
+ */
+#define XFS_DQ_BIGTIME_SHIFT (2)
+#define XFS_DQ_BIGTIME_SLACK ((int64_t)(1ULL << XFS_DQ_BIGTIME_SHIFT) - 1)
+
+/* Convert an incore quota expiration timestamp to an ondisk bigtime value. */
+static inline uint32_t xfs_dq_unix_to_bigtime(time64_t unix_seconds)
+{
+ /*
+ * Round the expiration timestamp up to the nearest bigtime timestamp
+ * that we can store, to give users the most time to fix problems.
+ */
+ return ((uint64_t)unix_seconds + XFS_DQ_BIGTIME_SLACK) >>
+ XFS_DQ_BIGTIME_SHIFT;
+}
+
+/* Convert an ondisk bigtime quota expiration value to an incore timestamp. */
+static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
+{
+ return (time64_t)ondisk_seconds << XFS_DQ_BIGTIME_SHIFT;
+}
+
+/*
+ * Default quota grace periods, ranging from zero (use the compiled defaults)
+ * to ~136 years. These are applied to a non-root dquot that has exceeded
+ * either limit.
+ */
+#define XFS_DQ_GRACE_MIN ((int64_t)0)
+#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+
+/*
+ * This is the main portion of the on-disk representation of quota information
+ * for a user. We pad this with some more expansion room to construct the on
+ * disk structure.
+ */
+struct xfs_disk_dquot {
__be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
__u8 d_version; /* dquot version */
- __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
+ __u8 d_type; /* XFS_DQTYPE_USER/PROJ/GROUP */
__be32 d_id; /* user,project,group id */
__be64 d_blk_hardlimit;/* absolute limit on disk blks */
__be64 d_blk_softlimit;/* preferred limit on disk blks */
@@ -1171,15 +1385,15 @@
__be32 d_rtbtimer; /* similar to above; for RT disk blocks */
__be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
__be16 d_pad;
-} xfs_disk_dquot_t;
+};
/*
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
typedef struct xfs_dqblk {
- xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
- char dd_fill[4]; /* filling for posterity */
+ struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
+ char dd_fill[4];/* filling for posterity */
/*
* These two are only present on filesystems with the CRC bits set.
@@ -1192,6 +1406,22 @@
#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
/*
+ * This defines the unit of allocation of dquots.
+ *
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ *
+ * However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ *
+ * This is part of the ondisk format because the structure size is not a power
+ * of two, which leaves slack at the end of the disk block.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+
+/*
* Remote symlink format and access functions.
*/
#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
@@ -1540,6 +1770,13 @@
#define BMBT_BLOCKCOUNT_BITLEN 21
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
+#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+
+/*
+ * bmbt records have a file offset (block) field that is 54 bits wide, so this
+ * is the largest xfs_fileoff_t that we ever expect to see.
+ */
+#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
typedef struct xfs_bmbt_rec {
__be64 l0, l1;
@@ -1666,7 +1903,7 @@
struct xfs_acl {
__be32 acl_cnt;
- struct xfs_acl_entry acl_entry[0];
+ struct xfs_acl_entry acl_entry[];
};
/*
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index e9371a8..2a2e3cf 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: LGPL-2.1
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright (c) 1995-2005 Silicon Graphics, Inc.
* All Rights Reserved.
@@ -249,6 +249,7 @@
#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
+#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
/*
* Minimum and maximum sizes need for growth checks.
@@ -324,7 +325,7 @@
* Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
*/
typedef struct xfs_bstime {
- time_t tv_sec; /* seconds */
+ __kernel_long_t tv_sec; /* seconds */
__s32 tv_nsec; /* and nanoseconds */
} xfs_bstime_t;
@@ -416,7 +417,7 @@
/*
* Project quota id helpers (previously projid was 16bit only
- * and using two 16bit values to hold new 32bit projid was choosen
+ * and using two 16bit values to hold new 32bit projid was chosen
* to retain compatibility with "old" filesystems).
*/
static inline uint32_t
@@ -568,10 +569,40 @@
struct fsdmidata __user *data; /* DMAPI data */
} xfs_fsop_setdm_handlereq_t;
+/*
+ * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface.
+ *
+ * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix.
+ */
+#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */
+#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */
+#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */
+#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */
+
typedef struct xfs_attrlist_cursor {
__u32 opaque[4];
} xfs_attrlist_cursor_t;
+/*
+ * Define how lists of attribute names are returned to userspace from the
+ * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the
+ * beginning of the returned buffer, and each entry in al_offset contains the
+ * relative offset of an xfs_attrlist_ent containing the actual entry.
+ *
+ * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and
+ * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr.
+ */
+struct xfs_attrlist {
+ __s32 al_count; /* number of entries in attrlist */
+ __s32 al_more; /* T/F: more attrs (do call again) */
+ __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
+};
+
+struct xfs_attrlist_ent { /* data from attr_list() */
+ __u32 a_valuelen; /* number bytes in value of attr */
+ char a_name[1]; /* attr name (NULL terminated) */
+};
+
typedef struct xfs_fsop_attrlist_handlereq {
struct xfs_fsop_handlereq hreq; /* handle interface structure */
struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
@@ -589,7 +620,7 @@
void __user *am_attrname;
void __user *am_attrvalue;
__u32 am_length;
- __u32 am_flags;
+ __u32 am_flags; /* XFS_IOC_ATTR_* */
} xfs_attr_multiop_t;
typedef struct xfs_fsop_attrmulti_handlereq {
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 272005a..99e7962 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (C) 2019 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 443cf33..974e71b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -105,7 +105,7 @@
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
uint64_t realfree;
@@ -177,7 +177,7 @@
xfs_btnum_t btnum)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agino_t thisino;
int i;
@@ -276,6 +276,7 @@
int i, j;
xfs_daddr_t d;
xfs_ino_t ino = 0;
+ int error;
/*
* Loop over the new block(s), filling in the inodes. For small block
@@ -303,7 +304,7 @@
* That means for v3 inode we log the entire buffer rather than just the
* inode cores.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
@@ -327,19 +328,18 @@
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno +
(j * M_IGEO(mp)->blocks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize *
- M_IGEO(mp)->blocks_per_cluster,
- XBF_UNMAPPED);
- if (!fbuf)
- return -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
+ XBF_UNMAPPED, &fbuf);
+ if (error)
+ return error;
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = xfs_dinode_size(version);
+ uint isize = XFS_DINODE_SIZE(&mp->m_sb);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
@@ -525,7 +525,7 @@
bool merge) /* merge or replace */
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
int error;
int i;
@@ -544,7 +544,10 @@
nrec->ir_free, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
goto out;
}
@@ -557,17 +560,23 @@
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
- XFS_WANT_CORRUPTED_GOTO(mp,
- rec.ir_startino == nrec->ir_startino,
- error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
/*
* This should never fail. If we have coexisting records that
* cannot merge, something is seriously wrong.
*/
- XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
- error);
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
rec.ir_holemask, nrec->ir_startino,
@@ -649,7 +658,7 @@
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
*/
- agi = XFS_BUF_TO_AGI(agbp);
+ agi = agbp->b_addr;
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
@@ -879,10 +888,9 @@
*/
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- pag = xfs_perag_get(args.mp, agno);
+ pag = agbp->b_pag;
pag->pagi_freecount += newlen;
pag->pagi_count += newlen;
- xfs_perag_put(pag);
agi->agi_newino = cpu_to_be32(newino);
/*
@@ -1057,7 +1065,8 @@
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
}
return 0;
@@ -1081,7 +1090,8 @@
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
}
return 0;
@@ -1119,11 +1129,11 @@
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
- struct xfs_perag *pag;
+ struct xfs_perag *pag = agbp->b_pag;
struct xfs_btree_cur *cur, *tcur;
struct xfs_inobt_rec_incore rec, trec;
xfs_ino_t ino;
@@ -1132,8 +1142,6 @@
int i, j;
int searchdistance = 10;
- pag = xfs_perag_get(mp, agno);
-
ASSERT(pag->pagi_init);
ASSERT(pag->pagi_inodeok);
ASSERT(pag->pagi_freecount > 0);
@@ -1161,12 +1169,18 @@
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
+ if (XFS_IS_CORRUPT(mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rec.ir_freecount > 0) {
/*
@@ -1321,19 +1335,28 @@
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
for (;;) {
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rec.ir_freecount > 0)
break;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
alloc_inode:
@@ -1358,14 +1381,12 @@
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
- xfs_perag_put(pag);
*inop = ino;
return 0;
error1:
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- xfs_perag_put(pag);
return error;
}
@@ -1393,7 +1414,8 @@
error = xfs_inobt_get_rec(lcur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
/*
* See if we've landed in the parent inode record. The finobt
@@ -1416,10 +1438,16 @@
error = xfs_inobt_get_rec(rcur, &rrec, &j);
if (error)
goto error_rcur;
- XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
+ if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
}
- XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
if (i == 1 && j == 1) {
/*
* Both the left and right records are valid. Choose the closer
@@ -1472,7 +1500,8 @@
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
return 0;
}
}
@@ -1483,12 +1512,14 @@
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
return 0;
}
@@ -1510,20 +1541,24 @@
error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
- (rec.ir_freecount == frec->ir_freecount));
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rec.ir_free != frec->ir_free ||
+ rec.ir_freecount != frec->ir_freecount))
+ return -EFSCORRUPTED;
return xfs_inobt_update(cur, &rec);
}
@@ -1543,11 +1578,10 @@
xfs_ino_t *inop)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
- struct xfs_perag *pag;
struct xfs_btree_cur *cur; /* finobt cursor */
struct xfs_btree_cur *icur; /* inobt cursor */
struct xfs_inobt_rec_incore rec;
@@ -1559,8 +1593,6 @@
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
- pag = xfs_perag_get(mp, agno);
-
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -1627,7 +1659,7 @@
*/
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- pag->pagi_freecount--;
+ agbp->b_pag->pagi_freecount--;
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
@@ -1640,7 +1672,6 @@
xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- xfs_perag_put(pag);
*inop = ino;
return 0;
@@ -1648,7 +1679,6 @@
xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
error_cur:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- xfs_perag_put(pag);
return error;
}
@@ -1903,9 +1933,8 @@
struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
- struct xfs_perag *pag;
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
int ilen;
@@ -1933,14 +1962,20 @@
__func__, error);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error) {
xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
__func__, error);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Get the offset in the inode chunk.
*/
@@ -1961,6 +1996,8 @@
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ struct xfs_perag *pag = agbp->b_pag;
+
xic->deleted = true;
xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
@@ -1974,10 +2011,8 @@
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
- pag = xfs_perag_get(mp, agno);
pag->pagi_freecount -= ilen - 1;
pag->pagi_count -= ilen;
- xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
@@ -2003,9 +2038,7 @@
*/
be32_add_cpu(&agi->agi_freecount, 1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- pag = xfs_perag_get(mp, agno);
- pag->pagi_freecount++;
- xfs_perag_put(pag);
+ agbp->b_pag->pagi_freecount++;
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
@@ -2033,7 +2066,7 @@
xfs_agino_t agino,
struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_agi *agi = agbp->b_addr;
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
struct xfs_btree_cur *cur;
struct xfs_inobt_rec_incore rec;
@@ -2052,7 +2085,10 @@
* freed an inode in a previously fully allocated chunk. If not,
* something is out of sync.
*/
- XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
+ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
ibtrec->ir_count,
@@ -2075,14 +2111,20 @@
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
rec.ir_free |= XFS_INOBT_MASK(offset);
rec.ir_freecount++;
- XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
- (rec.ir_freecount == ibtrec->ir_freecount),
- error);
+ if (XFS_IS_CORRUPT(mp,
+ rec.ir_free != ibtrec->ir_free ||
+ rec.ir_freecount != ibtrec->ir_freecount)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
/*
* The content of inobt records should always match between the inobt
@@ -2431,12 +2473,12 @@
offsetof(xfs_agi_t, agi_unlinked),
offsetof(xfs_agi_t, agi_free_root),
offsetof(xfs_agi_t, agi_free_level),
+ offsetof(xfs_agi_t, agi_iblocks),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
- xfs_agi_t *agi; /* allocation group header */
+ struct xfs_agi *agi = bp->b_addr;
- agi = XFS_BUF_TO_AGI(bp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
@@ -2468,14 +2510,13 @@
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_mount;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ struct xfs_agi *agi = bp->b_addr;
int i;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
- if (!xfs_log_check_lsn(mp,
- be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
return __this_address;
}
@@ -2538,6 +2579,7 @@
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_agi *agi = bp->b_addr;
xfs_failaddr_t fa;
fa = xfs_agi_verify(bp);
@@ -2550,7 +2592,7 @@
return;
if (bip)
- XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
@@ -2606,8 +2648,8 @@
if (error)
return error;
- agi = XFS_BUF_TO_AGI(*bpp);
- pag = xfs_perag_get(mp, agno);
+ agi = (*bpp)->b_addr;
+ pag = (*bpp)->b_pag;
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
@@ -2620,7 +2662,6 @@
*/
ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
XFS_FORCED_SHUTDOWN(mp));
- xfs_perag_put(pag);
return 0;
}
@@ -2766,6 +2807,10 @@
uint64_t icount;
uint inodes;
+ igeo->new_diflags2 = 0;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
@@ -2818,7 +2863,7 @@
* cannot change the behavior.
*/
igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
int new_size = igeo->inode_cluster_size_raw;
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
@@ -2854,3 +2899,67 @@
else
igeo->ialloc_align = 0;
}
+
+/* Compute the location of the root directory inode that is laid out by mkfs. */
+xfs_ino_t
+xfs_ialloc_calc_rootino(
+ struct xfs_mount *mp,
+ int sunit)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t first_bno;
+
+ /*
+ * Pre-calculate the geometry of AG 0. We know what it looks like
+ * because libxfs knows how to create allocation groups now.
+ *
+ * first_bno is the first block in which mkfs could possibly have
+ * allocated the root directory inode, once we factor in the metadata
+ * that mkfs formats before it. Namely, the four AG headers...
+ */
+ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+
+ /* ...the two free space btree roots... */
+ first_bno += 2;
+
+ /* ...the inode btree root... */
+ first_bno += 1;
+
+ /* ...the initial AGFL... */
+ first_bno += xfs_alloc_min_freelist(mp, NULL);
+
+ /* ...the free inode btree root... */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ first_bno++;
+
+ /* ...the reverse mapping btree root... */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ first_bno++;
+
+ /* ...the reference count btree... */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ first_bno++;
+
+ /*
+ * ...and the log, if it is allocated in the first allocation group.
+ *
+ * This can happen with filesystems that only have a single
+ * allocation group, or very odd geometries created by old mkfs
+ * versions on very small filesystems.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+ first_bno += mp->m_sb.sb_logblocks;
+
+ /*
+ * Now round first_bno up to whatever allocation alignment is given
+ * by the filesystem or was passed in.
+ */
+ if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0)
+ first_bno = roundup(first_bno, sunit);
+ else if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt > 1)
+ first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
+
+ return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 323592d..72b3468 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -152,5 +152,6 @@
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
+xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index b82992f..cc919a2 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -12,6 +12,7 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
@@ -20,7 +21,6 @@
#include "xfs_trans.h"
#include "xfs_rmap.h"
-
STATIC int
xfs_inobt_get_minrecs(
struct xfs_btree_cur *cur,
@@ -34,7 +34,7 @@
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_ag.agbp, cur->bc_ag.agno,
cur->bc_btnum);
}
@@ -44,8 +44,8 @@
union xfs_btree_ptr *nptr,
int inc) /* level change */
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
agi->agi_root = nptr->s;
be32_add_cpu(&agi->agi_level, inc);
@@ -58,8 +58,8 @@
union xfs_btree_ptr *nptr,
int inc) /* level change */
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
agi->agi_free_root = nptr->s;
be32_add_cpu(&agi->agi_free_level, inc);
@@ -67,6 +67,25 @@
XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
}
+/* Update the inode btree block counter for this btree. */
+static inline void
+xfs_inobt_mod_blockcount(
+ struct xfs_btree_cur *cur,
+ int howmuch)
+{
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb))
+ return;
+
+ if (cur->bc_btnum == XFS_BTNUM_FINO)
+ be32_add_cpu(&agi->agi_fblocks, howmuch);
+ else if (cur->bc_btnum == XFS_BTNUM_INO)
+ be32_add_cpu(&agi->agi_iblocks, howmuch);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
+}
+
STATIC int
__xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
@@ -83,7 +102,7 @@
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.oinfo = XFS_RMAP_OINFO_INOBT;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno);
args.minlen = 1;
args.maxlen = 1;
args.prod = 1;
@@ -102,6 +121,7 @@
new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
+ xfs_inobt_mod_blockcount(cur, 1);
return 0;
}
@@ -134,6 +154,7 @@
struct xfs_buf *bp,
enum xfs_ag_resv_type resv)
{
+ xfs_inobt_mod_blockcount(cur, -1);
return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1,
&XFS_RMAP_OINFO_INOBT, resv);
@@ -212,9 +233,9 @@
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_root;
}
@@ -224,9 +245,9 @@
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+ struct xfs_agi *agi = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno));
ptr->s = agi->agi_free_root;
}
@@ -400,32 +421,27 @@
};
/*
- * Allocate a new inode btree cursor.
+ * Initialize a new inode btree cursor.
*/
-struct xfs_btree_cur * /* new inode btree cursor */
-xfs_inobt_init_cursor(
+static struct xfs_btree_cur *
+xfs_inobt_init_common(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* buffer for agi structure */
xfs_agnumber_t agno, /* allocation group number */
xfs_btnum_t btnum) /* ialloc or free ino btree */
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
+ cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
cur->bc_tp = tp;
cur->bc_mp = mp;
cur->bc_btnum = btnum;
if (btnum == XFS_BTNUM_INO) {
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- cur->bc_ops = &xfs_inobt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
+ cur->bc_ops = &xfs_inobt_ops;
} else {
- cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
- cur->bc_ops = &xfs_finobt_ops;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
+ cur->bc_ops = &xfs_finobt_ops;
}
cur->bc_blocklog = mp->m_sb.sb_blocklog;
@@ -433,12 +449,85 @@
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
-
+ cur->bc_ag.agno = agno;
return cur;
}
+/* Create an inode btree cursor. */
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = agbp->b_addr;
+
+ cur = xfs_inobt_init_common(mp, tp, agno, btnum);
+ if (btnum == XFS_BTNUM_INO)
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ else
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ag.agbp = agbp;
+ return cur;
+}
+
+/* Create an inode btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_inobt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_inobt_init_common(mp, NULL, agno, btnum);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new inobt btree root. Caller is responsible for invalidating
+ * and freeing the old btree blocks.
+ */
+void
+xfs_inobt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agi *agi = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+ int fields;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ if (cur->bc_btnum == XFS_BTNUM_INO) {
+ fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
+ agi->agi_root = cpu_to_be32(afake->af_root);
+ agi->agi_level = cpu_to_be32(afake->af_levels);
+ if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ agi->agi_iblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+ } else {
+ fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
+ agi->agi_free_root = cpu_to_be32(afake->af_root);
+ agi->agi_free_level = cpu_to_be32(afake->af_levels);
+ if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) {
+ agi->agi_fblocks = cpu_to_be32(afake->af_blocks);
+ fields |= XFS_AGI_IBLOCKS;
+ }
+ xfs_ialloc_log_agi(tp, agbp, fields);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+ }
+}
+
/*
* Calculate number of records in an inobt btree block.
*/
@@ -615,6 +704,28 @@
return error;
}
+/* Read finobt block count from AGI header. */
+static int
+xfs_finobt_read_blocks(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_extlen_t *tree_blocks)
+{
+ struct xfs_buf *agbp;
+ struct xfs_agi *agi;
+ int error;
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error)
+ return error;
+
+ agi = agbp->b_addr;
+ *tree_blocks = be32_to_cpu(agi->agi_fblocks);
+ xfs_trans_brelse(tp, agbp);
+ return 0;
+}
+
/*
* Figure out how many blocks to reserve and how many are used by this btree.
*/
@@ -632,7 +743,11 @@
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
return 0;
- error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, &tree_len);
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len);
+ else
+ error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO,
+ &tree_len);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 951305e..35bbd97 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -48,6 +48,9 @@
extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
xfs_btnum_t);
+struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno,
+ xfs_btnum_t btnum);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
/* ir_holemask to inode allocation bitmap conversion */
@@ -68,4 +71,7 @@
xfs_agnumber_t agno, xfs_btnum_t btnum,
struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
+void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 5245180..b416425 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -603,7 +603,7 @@
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = kmem_realloc(ifp->if_u1.if_root, new_size, KM_NOFS);
+ new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
ifp->if_u1.if_root = new;
cur->leaf = new;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 28ab3c5..c667c63 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -21,41 +21,6 @@
#include <linux/iversion.h>
/*
- * Check that none of the inode's in the buffer have a next
- * unlinked field of 0.
- */
-#if defined(DEBUG)
-void
-xfs_inobp_check(
- xfs_mount_t *mp,
- xfs_buf_t *bp)
-{
- int i;
- xfs_dinode_t *dip;
-
- for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
- dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
- if (!dip->di_next_unlinked) {
- xfs_alert(mp,
- "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
- i, (long long)bp->b_bn);
- }
- }
-}
-#endif
-
-bool
-xfs_dinode_good_version(
- struct xfs_mount *mp,
- __u8 version)
-{
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return version == 3;
-
- return version == 1 || version == 2;
-}
-
-/*
* If we are doing readahead on an inode buffer, we might be in log recovery
* reading an inode allocation buffer that hasn't yet been replayed, and hence
* has not had the inode cores stamped into it. Hence for readahead, the buffer
@@ -64,10 +29,10 @@
* If the readahead buffer is invalid, we need to mark it with an error and
* clear the DONE status of the buffer so that a followup read will re-read it
* from disk. We don't report the error otherwise to avoid warnings during log
- * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
+ * recovery and we don't get unnecessary panics on debug kernels. We use EIO here
* because all we want to do is say readahead failed; there is no-one to report
* the error to, so this will distinguish it from a non-ra verifier failure.
- * Changes to this readahead error behavour also need to be reflected in
+ * Changes to this readahead error behaviour also need to be reflected in
* xfs_dquot_buf_readahead_verify().
*/
static void
@@ -93,7 +58,7 @@
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
- xfs_dinode_good_version(mp, dip->di_version) &&
+ xfs_dinode_good_version(&mp->m_sb, dip->di_version) &&
xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
@@ -172,8 +137,7 @@
struct xfs_imap *imap,
struct xfs_dinode **dipp,
struct xfs_buf **bpp,
- uint buf_flags,
- uint iget_flags)
+ uint buf_flags)
{
struct xfs_buf *bp;
int error;
@@ -183,49 +147,93 @@
(int)imap->im_len, buf_flags, &bp,
&xfs_inode_buf_ops);
if (error) {
- if (error == -EAGAIN) {
- ASSERT(buf_flags & XBF_TRYLOCK);
- return error;
- }
- xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
- __func__, error);
+ ASSERT(error != -EAGAIN || (buf_flags & XBF_TRYLOCK));
return error;
}
*bpp = bp;
- *dipp = xfs_buf_offset(bp, imap->im_boffset);
+ if (dipp)
+ *dipp = xfs_buf_offset(bp, imap->im_boffset);
return 0;
}
-void
+static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
+{
+ struct timespec64 tv;
+ uint32_t n;
+
+ tv.tv_sec = xfs_bigtime_to_unix(div_u64_rem(ts, NSEC_PER_SEC, &n));
+ tv.tv_nsec = n;
+
+ return tv;
+}
+
+/* Convert an ondisk timestamp to an incore timestamp. */
+struct timespec64
+xfs_inode_from_disk_ts(
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+ struct xfs_legacy_timestamp *lts;
+
+ if (xfs_dinode_has_bigtime(dip))
+ return xfs_inode_decode_bigtime(be64_to_cpu(ts));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ tv.tv_sec = (int)be32_to_cpu(lts->t_sec);
+ tv.tv_nsec = (int)be32_to_cpu(lts->t_nsec);
+
+ return tv;
+}
+
+int
xfs_inode_from_disk(
struct xfs_inode *ip,
struct xfs_dinode *from)
{
struct xfs_icdinode *to = &ip->i_d;
struct inode *inode = VFS_I(ip);
+ int error;
+ xfs_failaddr_t fa;
+ ASSERT(ip->i_cowfp == NULL);
+ ASSERT(ip->i_afp == NULL);
+
+ fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from,
+ sizeof(*from), fa);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * First get the permanent information that is needed to allocate an
+ * inode. If the inode is unused, mode is zero and we shouldn't mess
+ * with the uninitialized part of it.
+ */
+ to->di_flushiter = be16_to_cpu(from->di_flushiter);
+ inode->i_generation = be32_to_cpu(from->di_gen);
+ inode->i_mode = be16_to_cpu(from->di_mode);
+ if (!inode->i_mode)
+ return 0;
/*
* Convert v1 inodes immediately to v2 inode format as this is the
* minimum inode version format we support in the rest of the code.
+ * They will also be unconditionally written back to disk as v2 inodes.
*/
- to->di_version = from->di_version;
- if (to->di_version == 1) {
+ if (unlikely(from->di_version == 1)) {
set_nlink(inode, be16_to_cpu(from->di_onlink));
- to->di_projid_lo = 0;
- to->di_projid_hi = 0;
- to->di_version = 2;
+ to->di_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
- to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
- to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
+ be16_to_cpu(from->di_projid_lo);
}
- to->di_format = from->di_format;
- to->di_uid = be32_to_cpu(from->di_uid);
- to->di_gid = be32_to_cpu(from->di_gid);
- to->di_flushiter = be16_to_cpu(from->di_flushiter);
+ i_uid_write(inode, be32_to_cpu(from->di_uid));
+ i_gid_write(inode, be32_to_cpu(from->di_gid));
/*
* Time is signed, so need to convert to signed 32 bit before
@@ -233,34 +241,60 @@
* a time before epoch is converted to a time long after epoch
* on 64 bit systems.
*/
- inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
- inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
- inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
- inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
- inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
- inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
- inode->i_generation = be32_to_cpu(from->di_gen);
- inode->i_mode = be16_to_cpu(from->di_mode);
+ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
+ inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
+ inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
to->di_size = be64_to_cpu(from->di_size);
to->di_nblocks = be64_to_cpu(from->di_nblocks);
to->di_extsize = be32_to_cpu(from->di_extsize);
- to->di_nextents = be32_to_cpu(from->di_nextents);
- to->di_anextents = be16_to_cpu(from->di_anextents);
to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
to->di_dmstate = be16_to_cpu(from->di_dmstate);
to->di_flags = be16_to_cpu(from->di_flags);
- if (to->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
- to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+ to->di_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
to->di_flags2 = be64_to_cpu(from->di_flags2);
to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
}
+
+ error = xfs_iformat_data_fork(ip, from);
+ if (error)
+ return error;
+ if (from->di_forkoff) {
+ error = xfs_iformat_attr_fork(ip, from);
+ if (error)
+ goto out_destroy_data_fork;
+ }
+ if (xfs_is_reflink_inode(ip))
+ xfs_ifork_init_cow(ip);
+ return 0;
+
+out_destroy_data_fork:
+ xfs_idestroy_fork(&ip->i_df);
+ return error;
+}
+
+/* Convert an incore timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_inode_to_disk_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_timestamp *lts;
+ xfs_timestamp_t ts;
+
+ if (xfs_inode_has_bigtime(ip))
+ return cpu_to_be64(xfs_inode_encode_bigtime(tv));
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lts->t_sec = cpu_to_be32(tv.tv_sec);
+ lts->t_nsec = cpu_to_be32(tv.tv_nsec);
+
+ return ts;
}
void
@@ -275,20 +309,16 @@
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
to->di_onlink = 0;
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = cpu_to_be32(i_uid_read(inode));
+ to->di_gid = cpu_to_be32(i_gid_read(inode));
+ to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff);
+ to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
memset(to->di_pad, 0, sizeof(to->di_pad));
- to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
- to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
- to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
- to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
- to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
@@ -296,18 +326,18 @@
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
+ to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
to->di_dmstate = cpu_to_be16(from->di_dmstate);
to->di_flags = cpu_to_be16(from->di_flags);
- if (from->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+ to->di_crtime = xfs_inode_to_disk_ts(ip, from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
@@ -316,58 +346,7 @@
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
- }
-}
-
-void
-xfs_log_dinode_to_disk(
- struct xfs_log_dinode *from,
- struct xfs_dinode *to)
-{
- to->di_magic = cpu_to_be16(from->di_magic);
- to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_onlink = 0;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_nlink = cpu_to_be32(from->di_nlink);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-
- to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
- to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
- to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
-
- to->di_size = cpu_to_be64(from->di_size);
- to->di_nblocks = cpu_to_be64(from->di_nblocks);
- to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
- to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
- to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
- to->di_dmstate = cpu_to_be16(from->di_dmstate);
- to->di_flags = cpu_to_be16(from->di_flags);
- to->di_gen = cpu_to_be32(from->di_gen);
-
- if (from->di_version == 3) {
- to->di_changecount = cpu_to_be64(from->di_changecount);
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
- to->di_flags2 = cpu_to_be64(from->di_flags2);
- to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
- to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
- uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
- } else {
+ to->di_version = 2;
to->di_flushiter = cpu_to_be16(from->di_flushiter);
}
}
@@ -418,7 +397,7 @@
struct xfs_dinode *dip,
struct xfs_mount *mp)
{
- if (!XFS_DFORK_Q(dip))
+ if (!dip->di_forkoff)
return NULL;
switch (dip->di_format) {
@@ -429,7 +408,7 @@
case XFS_DINODE_FMT_LOCAL: /* fall through ... */
case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
case XFS_DINODE_FMT_BTREE:
- if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3))
+ if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3))
return __this_address;
break;
default:
@@ -455,7 +434,7 @@
/* Verify v3 integrity information first */
if (dip->di_version >= 3) {
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
return __this_address;
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
@@ -521,7 +500,7 @@
return __this_address;
}
- if (XFS_DFORK_Q(dip)) {
+ if (dip->di_forkoff) {
fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK);
if (fa)
return fa;
@@ -578,6 +557,11 @@
if (fa)
return fa;
+ /* bigtime iflag can only happen on bigtime filesystems */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ return __this_address;
+
return NULL;
}
@@ -598,127 +582,6 @@
}
/*
- * Read the disk inode attributes into the in-core inode structure.
- *
- * For version 5 superblocks, if we are initialising a new inode and we are not
- * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
- * inode core with a random generation number. If we are keeping inodes around,
- * we need to read the inode cluster to get the existing generation number off
- * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
- * format) then log recovery is dependent on the di_flushiter field being
- * initialised from the current on-disk value and hence we must also read the
- * inode off disk.
- */
-int
-xfs_iread(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- uint iget_flags)
-{
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- xfs_failaddr_t fa;
- int error;
-
- /*
- * Fill in the location information in the in-core inode.
- */
- error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
- if (error)
- return error;
-
- /* shortcut IO on inode allocation if possible */
- if ((iget_flags & XFS_IGET_CREATE) &&
- xfs_sb_version_hascrc(&mp->m_sb) &&
- !(mp->m_flags & XFS_MOUNT_IKEEP)) {
- /* initialise the on-disk inode core */
- memset(&ip->i_d, 0, sizeof(ip->i_d));
- VFS_I(ip)->i_generation = prandom_u32();
- ip->i_d.di_version = 3;
- return 0;
- }
-
- /*
- * Get pointers to the on-disk inode and the buffer containing it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
- if (error)
- return error;
-
- /* even unallocated inodes are verified */
- fa = xfs_dinode_verify(mp, ip->i_ino, dip);
- if (fa) {
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip,
- sizeof(*dip), fa);
- error = -EFSCORRUPTED;
- goto out_brelse;
- }
-
- /*
- * If the on-disk inode is already linked to a directory
- * entry, copy all of the inode into the in-core inode.
- * xfs_iformat_fork() handles copying in the inode format
- * specific information.
- * Otherwise, just get the truly permanent information.
- */
- if (dip->di_mode) {
- xfs_inode_from_disk(ip, dip);
- error = xfs_iformat_fork(ip, dip);
- if (error) {
-#ifdef DEBUG
- xfs_alert(mp, "%s: xfs_iformat() returned error %d",
- __func__, error);
-#endif /* DEBUG */
- goto out_brelse;
- }
- } else {
- /*
- * Partial initialisation of the in-core inode. Just the bits
- * that xfs_ialloc won't overwrite or relies on being correct.
- */
- ip->i_d.di_version = dip->di_version;
- VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
- ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
-
- /*
- * Make sure to pull in the mode here as well in
- * case the inode is released without being used.
- * This ensures that xfs_inactive() will see that
- * the inode is already free and not try to mess
- * with the uninitialized part of it.
- */
- VFS_I(ip)->i_mode = 0;
- }
-
- ASSERT(ip->i_d.di_version >= 2);
- ip->i_delayed_blks = 0;
-
- /*
- * Mark the buffer containing the inode as something to keep
- * around for a while. This helps to keep recently accessed
- * meta-data in-core longer.
- */
- xfs_buf_set_ref(bp, XFS_INO_REF);
-
- /*
- * Use xfs_trans_brelse() to release the buffer containing the on-disk
- * inode, because it was acquired with xfs_trans_read_buf() in
- * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
- * brelse(). If we're within a transaction, then xfs_trans_brelse()
- * will only release the buffer if it is not dirty within the
- * transaction. It will be OK to release the buffer in this case,
- * because inodes on disk are never destroyed and we will be locking the
- * new in-core inode before putting it in the cache where other
- * processes can find it. Thus we don't have to worry about the inode
- * being changed just because we released the buffer.
- */
- out_brelse:
- xfs_trans_brelse(tp, bp);
- return error;
-}
-
-/*
* Validate di_extsize hint.
*
* The rules are documented at xfs_ioctl_setattr_check_extsize().
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index ab0f841..ef5eaf3 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -16,20 +16,12 @@
* format specific structures at the appropriate time.
*/
struct xfs_icdinode {
- int8_t di_version; /* inode version */
- int8_t di_format; /* format of di_c data */
uint16_t di_flushiter; /* incremented on flush */
- uint32_t di_uid; /* owner's user id */
- uint32_t di_gid; /* owner's group id */
- uint16_t di_projid_lo; /* lower part of owner's project id */
- uint16_t di_projid_hi; /* higher part of owner's project id */
+ prid_t di_projid; /* owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
- int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
uint16_t di_dmstate; /* DMIG state info */
uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
@@ -37,9 +29,14 @@
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
- xfs_ictimestamp_t di_crtime; /* time created */
+ struct timespec64 di_crtime; /* time created */
};
+/*
+ * Report whether XFS_DIFLAG2_BIGTIME is set in the di_flags2 field of the
+ * incore inode core @icd.
+ */
+static inline bool xfs_icdinode_has_bigtime(const struct xfs_icdinode *icd)
+{
+	const uint64_t		flags2 = icd->di_flags2;
+
+	return flags2 & XFS_DIFLAG2_BIGTIME;
+}
+
/*
* Inode location information. Stored in the inode and passed to
* xfs_imap_to_bp() to get a buffer and dinode for a given inode.
@@ -52,23 +49,11 @@
int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
struct xfs_imap *, struct xfs_dinode **,
- struct xfs_buf **, uint, uint);
-int xfs_iread(struct xfs_mount *, struct xfs_trans *,
- struct xfs_inode *, uint);
+ struct xfs_buf **, uint);
void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
xfs_lsn_t lsn);
-void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
-void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
- struct xfs_dinode *to);
-
-bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version);
-
-#if defined(DEBUG)
-void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
-#else
-#define xfs_inobp_check(mp, bp)
-#endif /* DEBUG */
+int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino,
struct xfs_dinode *dip);
@@ -78,4 +63,12 @@
uint32_t cowextsize, uint16_t mode, uint16_t flags,
uint64_t flags2);
+/*
+ * Fold an incore timestamp into one 64-bit value: the seconds are passed
+ * through xfs_unix_to_bigtime() and scaled by NSEC_PER_SEC, then the
+ * nanoseconds part is added.
+ */
+static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv)
+{
+	return tv.tv_nsec + xfs_unix_to_bigtime(tv.tv_sec) * NSEC_PER_SEC;
+}
+
+struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip,
+ const xfs_timestamp_t ts);
+
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8fdd042..7575de5 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -26,104 +26,6 @@
kmem_zone_t *xfs_ifork_zone;
-STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
-STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
-STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
-
-/*
- * Copy inode type and data and attr format specific information from the
- * on-disk inode to the in-core inode and fork structures. For fifos, devices,
- * and sockets this means set i_rdev to the proper value. For files,
- * directories, and symlinks this means to bring in the in-line data or extent
- * pointers as well as the attribute fork. For a fork in B-tree format, only
- * the root is immediately brought in-core. The rest will be read in later when
- * first referenced (see xfs_iread_extents()).
- */
-int
-xfs_iformat_fork(
- struct xfs_inode *ip,
- struct xfs_dinode *dip)
-{
- struct inode *inode = VFS_I(ip);
- struct xfs_attr_shortform *atp;
- int size;
- int error = 0;
- xfs_fsize_t di_size;
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFIFO:
- case S_IFCHR:
- case S_IFBLK:
- case S_IFSOCK:
- ip->i_d.di_size = 0;
- inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
- break;
-
- case S_IFREG:
- case S_IFLNK:
- case S_IFDIR:
- switch (dip->di_format) {
- case XFS_DINODE_FMT_LOCAL:
- di_size = be64_to_cpu(dip->di_size);
- size = (int)di_size;
- error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
- break;
- default:
- return -EFSCORRUPTED;
- }
- break;
-
- default:
- return -EFSCORRUPTED;
- }
- if (error)
- return error;
-
- if (xfs_is_reflink_inode(ip)) {
- ASSERT(ip->i_cowfp == NULL);
- xfs_ifork_init_cow(ip);
- }
-
- if (!XFS_DFORK_Q(dip))
- return 0;
-
- ASSERT(ip->i_afp == NULL);
- ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS);
-
- switch (dip->di_aformat) {
- case XFS_DINODE_FMT_LOCAL:
- atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
- size = be16_to_cpu(atp->hdr.totsize);
-
- error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
- break;
- case XFS_DINODE_FMT_EXTENTS:
- error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
- break;
- case XFS_DINODE_FMT_BTREE:
- error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
- break;
- default:
- error = -EFSCORRUPTED;
- break;
- }
- if (error) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- if (ip->i_cowfp)
- kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
- ip->i_cowfp = NULL;
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
- }
- return error;
-}
-
void
xfs_init_local_fork(
struct xfs_inode *ip,
@@ -177,7 +79,7 @@
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+ "corrupt inode %Lu (bad size %d for local fork, size = %zd).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
@@ -286,12 +188,11 @@
* or the number of extents is greater than the number of
* blocks.
*/
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
- XFS_IFORK_MAXEXT(ip, whichfork) ||
+ if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) ||
nrecs == 0 ||
XFS_BMDR_SPACE_CALC(nrecs) >
XFS_DFORK_SIZE(dip, mp, whichfork) ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
+ ifp->if_nextents > ip->i_d.di_nblocks) ||
level == 0 || level > XFS_BTREE_MAXLEVELS) {
xfs_warn(mp, "corrupt inode %Lu (btree).",
(unsigned long long) ip->i_ino);
@@ -319,6 +220,110 @@
return 0;
}
+/*
+ * Populate the incore data fork of @ip from the ondisk inode @dip.
+ *
+ * Returns 0 on success or a negative errno.  An unrecognised file type or data
+ * fork format is reported through xfs_inode_verifier_error() and fails with
+ * -EFSCORRUPTED.
+ */
+int
+xfs_iformat_data_fork(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	struct inode		*inode = VFS_I(ip);
+	int			error;
+
+	/*
+	 * The per-format routines may look at the fork format and extent
+	 * count, so record both before dispatching.
+	 */
+	ip->i_df.if_format = dip->di_format;
+	ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
+		/* Zero the size and take i_rdev from the ondisk inode. */
+		ip->i_d.di_size = 0;
+		inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip));
+		return 0;
+	case S_IFREG:
+	case S_IFLNK:
+	case S_IFDIR:
+		break;
+	default:
+		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+				sizeof(*dip), __this_address);
+		return -EFSCORRUPTED;
+	}
+
+	/* Regular files, symlinks and directories: dispatch on fork format. */
+	switch (ip->i_df.if_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_iformat_local(ip, dip, XFS_DATA_FORK,
+				be64_to_cpu(dip->di_size));
+		if (error)
+			return error;
+		return xfs_ifork_verify_local_data(ip);
+	case XFS_DINODE_FMT_EXTENTS:
+		return xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+	case XFS_DINODE_FMT_BTREE:
+		return xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+	default:
+		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+				sizeof(*dip), __this_address);
+		return -EFSCORRUPTED;
+	}
+}
+
+/*
+ * Read hdr.totsize from the shortform attr structure located at
+ * XFS_DFORK_APTR(@dip), converted from big-endian to CPU byte order.
+ */
+static uint16_t
+xfs_dfork_attr_shortform_size(
+	struct xfs_dinode		*dip)
+{
+	struct xfs_attr_shortform	*sf;
+
+	sf = (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+	return be16_to_cpu(sf->hdr.totsize);
+}
+
+/*
+ * Allocate ip->i_afp and populate it from the attr fork of the ondisk inode
+ * @dip.
+ *
+ * Returns 0 on success or a negative errno.  On failure the fork allocated
+ * here is freed again and ip->i_afp is reset to NULL.
+ */
+int
+xfs_iformat_attr_fork(
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip)
+{
+	int			error = 0;
+
+	/*
+	 * Initialize the format and extent count early, as the per-format
+	 * routines may depend on them.
+	 */
+	ip->i_afp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL);
+	ip->i_afp->if_format = dip->di_aformat;
+	if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */
+		ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS;
+	ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents);
+
+	switch (ip->i_afp->if_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK,
+				xfs_dfork_attr_shortform_size(dip));
+		if (!error)
+			error = xfs_ifork_verify_local_attr(ip);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+		break;
+	default:
+		/*
+		 * Set the error before reporting so the verifier is handed
+		 * -EFSCORRUPTED, matching xfs_iformat_data_fork().
+		 */
+		error = -EFSCORRUPTED;
+		xfs_inode_verifier_error(ip, error, __func__, dip,
+				sizeof(*dip), __this_address);
+		break;
+	}
+
+	if (error) {
+		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+	}
+	return error;
+}
+
/*
* Reallocate the space for if_broot based on the number of records
* being added or deleted as indicated in rec_diff. Move the records
@@ -381,8 +386,8 @@
cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
- ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
- KM_NOFS);
+ ifp->if_broot = krealloc(ifp->if_broot, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -491,45 +496,31 @@
* in size so that it can be logged and stay on word boundaries.
* We enforce that here.
*/
- ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data,
- roundup(new_size, 4), KM_NOFS);
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
void
xfs_idestroy_fork(
- xfs_inode_t *ip,
- int whichfork)
+ struct xfs_ifork *ifp)
{
- struct xfs_ifork *ifp;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
if (ifp->if_broot != NULL) {
kmem_free(ifp->if_broot);
ifp->if_broot = NULL;
}
/*
- * If the format is local, then we can't have an extents
- * array so just look for an inline data array. If we're
- * not local then we may or may not have an extents list,
- * so check and free it up if we do.
+ * If the format is local, then we can't have an extents array so just
+ * look for an inline data array. If we're not local then we may or may
+ * not have an extents list, so check and free it up if we do.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
- if (ifp->if_u1.if_data != NULL) {
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
- }
- } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) {
- xfs_iext_destroy(ifp);
- }
-
- if (whichfork == XFS_ATTR_FORK) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
- ip->i_afp = NULL;
- } else if (whichfork == XFS_COW_FORK) {
- kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
- ip->i_cowfp = NULL;
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
+ kmem_free(ifp->if_u1.if_data);
+ ifp->if_u1.if_data = NULL;
+ } else if (ifp->if_flags & XFS_IFEXTENTS) {
+ if (ifp->if_height)
+ xfs_iext_destroy(ifp);
}
}
@@ -586,7 +577,7 @@
xfs_iflush_fork(
xfs_inode_t *ip,
xfs_dinode_t *dip,
- xfs_inode_log_item_t *iip,
+ struct xfs_inode_log_item *iip,
int whichfork)
{
char *cp;
@@ -612,7 +603,7 @@
}
cp = XFS_DFORK_PTR(dip, whichfork);
mp = ip->i_mount;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
@@ -627,7 +618,7 @@
!(iip->ili_fields & extflag[whichfork]));
if ((iip->ili_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+ ASSERT(ifp->if_nextents > 0);
(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
whichfork);
}
@@ -682,51 +673,58 @@
if (ip->i_cowfp)
return;
- ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
- KM_NOFS);
+ ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone,
+ GFP_NOFS | __GFP_NOFAIL);
ip->i_cowfp->if_flags = XFS_IFEXTENTS;
- ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
- ip->i_cnextents = 0;
+ ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
}
-/* Default fork content verifiers. */
-struct xfs_ifork_ops xfs_default_ifork_ops = {
- .verify_attr = xfs_attr_shortform_verify,
- .verify_dir = xfs_dir2_sf_verify,
- .verify_symlink = xfs_symlink_shortform_verify,
-};
-
/* Verify the inline contents of the data fork of an inode. */
-xfs_failaddr_t
-xfs_ifork_verify_data(
- struct xfs_inode *ip,
- struct xfs_ifork_ops *ops)
+int
+xfs_ifork_verify_local_data(
+ struct xfs_inode *ip)
{
- /* Non-local data fork, we're done. */
- if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
- return NULL;
+ xfs_failaddr_t fa = NULL;
- /* Check the inline data fork if there is one. */
switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFDIR:
- return ops->verify_dir(ip);
+ fa = xfs_dir2_sf_verify(ip);
+ break;
case S_IFLNK:
- return ops->verify_symlink(ip);
+ fa = xfs_symlink_shortform_verify(ip);
+ break;
default:
- return NULL;
+ break;
}
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
+ ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
/* Verify the inline contents of the attr fork of an inode. */
-xfs_failaddr_t
-xfs_ifork_verify_attr(
- struct xfs_inode *ip,
- struct xfs_ifork_ops *ops)
+int
+xfs_ifork_verify_local_attr(
+ struct xfs_inode *ip)
{
- /* There has to be an attr fork allocated if aformat is local. */
- if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
- return NULL;
- if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK))
- return __this_address;
- return ops->verify_attr(ip);
+ struct xfs_ifork *ifp = ip->i_afp;
+ xfs_failaddr_t fa;
+
+ if (!ifp)
+ fa = __this_address;
+ else
+ fa = xfs_attr_shortform_verify(ip);
+
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
+ ifp ? ifp->if_u1.if_data : NULL,
+ ifp ? ifp->if_bytes : 0, fa);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7b845c0..a4953e9 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -23,6 +23,8 @@
} if_u1;
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
+ int8_t if_format; /* format of this fork */
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
};
/*
@@ -46,53 +48,45 @@
(ip)->i_afp : \
(ip)->i_cowfp))
#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+ (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount))
#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
- XFS_IFORK_BOFF(ip) : \
- 0)
+ (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0)
#define XFS_IFORK_SIZE(ip,w) \
((w) == XFS_DATA_FORK ? \
XFS_IFORK_DSIZE(ip) : \
((w) == XFS_ATTR_FORK ? \
XFS_IFORK_ASIZE(ip) : \
0))
-#define XFS_IFORK_FORMAT(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_format : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_d.di_aformat : \
- (ip)->i_cformat))
-#define XFS_IFORK_FMT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_format = (n)) : \
- ((w) == XFS_ATTR_FORK ? \
- ((ip)->i_d.di_aformat = (n)) : \
- ((ip)->i_cformat = (n))))
-#define XFS_IFORK_NEXTENTS(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_nextents : \
- ((w) == XFS_ATTR_FORK ? \
- (ip)->i_d.di_anextents : \
- (ip)->i_cnextents))
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_nextents = (n)) : \
- ((w) == XFS_ATTR_FORK ? \
- ((ip)->i_d.di_anextents = (n)) : \
- ((ip)->i_cnextents = (n))))
#define XFS_IFORK_MAXEXT(ip, w) \
(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+/* True when the fork's format is XFS_DINODE_FMT_EXTENTS or XFS_DINODE_FMT_BTREE. */
+static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp)
+{
+	switch (ifp->if_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Extent count stored in @ifp; a NULL fork counts as zero extents. */
+static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp)
+{
+	return ifp ? ifp->if_nextents : 0;
+}
+
+/* Format stored in @ifp; a NULL fork is reported as XFS_DINODE_FMT_EXTENTS. */
+static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
+{
+	return ifp ? ifp->if_format : XFS_DINODE_FMT_EXTENTS;
+}
+
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
-int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
+int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *);
+int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
-void xfs_idestroy_fork(struct xfs_inode *, int);
+void xfs_idestroy_fork(struct xfs_ifork *ifp);
void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
@@ -176,18 +170,7 @@
extern void xfs_ifork_init_cow(struct xfs_inode *ip);
-typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *);
-
-struct xfs_ifork_ops {
- xfs_ifork_verifier_t verify_symlink;
- xfs_ifork_verifier_t verify_dir;
- xfs_ifork_verifier_t verify_attr;
-};
-extern struct xfs_ifork_ops xfs_default_ifork_ops;
-
-xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip,
- struct xfs_ifork_ops *ops);
-xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip,
- struct xfs_ifork_ops *ops);
+int xfs_ifork_verify_local_data(struct xfs_inode *ip);
+int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e5f97c6..8bd00da 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -368,10 +368,13 @@
* directly mirrors the xfs_dinode structure as it must contain all the same
* information.
*/
-typedef struct xfs_ictimestamp {
+typedef uint64_t xfs_ictimestamp_t;
+
+/* Legacy timestamp encoding format. */
+struct xfs_legacy_ictimestamp {
int32_t t_sec; /* timestamp seconds */
int32_t t_nsec; /* timestamp nanoseconds */
-} xfs_ictimestamp_t;
+};
/*
* Define the format of the inode core that is logged. This structure must be
@@ -424,17 +427,15 @@
/* structure must be padded to 64 bit alignment */
};
-static inline uint xfs_log_dinode_size(int version)
-{
- if (version == 3)
- return sizeof(struct xfs_log_dinode);
- return offsetof(struct xfs_log_dinode, di_next_unlinked);
-}
+/*
+ * Size of the log dinode for the mount @mp: the whole structure if the
+ * superblock has v3 inodes, otherwise only the part that precedes
+ * di_next_unlinked.
+ */
+#define xfs_log_dinode_size(mp) \
+ (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \
+ sizeof(struct xfs_log_dinode) : \
+ offsetof(struct xfs_log_dinode, di_next_unlinked))
/*
- * Buffer Log Format defintions
+ * Buffer Log Format definitions
*
- * These are the physical dirty bitmap defintions for the log format structure.
+ * These are the physical dirty bitmap definitions for the log format structure.
*/
#define XFS_BLF_CHUNK 128
#define XFS_BLF_SHIFT 7
@@ -462,11 +463,20 @@
#define XFS_BLF_GDQUOT_BUF (1<<4)
/*
- * This is the structure used to lay out a buf log item in the
- * log. The data map describes which 128 byte chunks of the buffer
- * have been logged.
+ * This is the structure used to lay out a buf log item in the log. The data
+ * map describes which 128 byte chunks of the buffer have been logged.
+ *
+ * The placement of blf_map_size causes blf_data_map to start at an odd
+ * multiple of sizeof(unsigned int) offset within the struct. Because the data
+ * bitmap size will always be an even number, the end of the data_map (and
+ * therefore the structure) will also be at an odd multiple of sizeof(unsigned
+ * int). Some 64-bit compilers will insert padding at the end of the struct to
+ * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore,
+ * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and
+ * keep the structure size consistent between 32-bit and 64-bit platforms.
*/
-#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
+#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
typedef struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index f3d18ea..3cca2bf 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -7,6 +7,73 @@
#define __XFS_LOG_RECOVER_H__
/*
+ * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to
+ * define how recovery should work for that type of log item.
+ */
+struct xlog_recover_item;
+
+/* Sorting hat for log items as they're read in. */
+enum xlog_recover_reorder {
+ XLOG_REORDER_BUFFER_LIST,
+ XLOG_REORDER_ITEM_LIST,
+ XLOG_REORDER_INODE_BUFFER_LIST,
+ XLOG_REORDER_CANCEL_LIST,
+};
+
+struct xlog_recover_item_ops {
+ uint16_t item_type; /* XFS_LI_* type code. */
+
+ /*
+ * Help sort recovered log items into the order required to replay them
+ * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do
+ * not have to supply a function here. See the comment preceding
+ * xlog_recover_reorder_trans for more details about what the return
+ * values mean.
+ */
+ enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item);
+
+ /* Start readahead for pass2, if provided. */
+ void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item);
+
+ /* Do whatever work we need to do for pass1, if provided. */
+ int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item);
+
+ /*
+ * This function should do whatever work is needed for pass2 of log
+ * recovery, if provided.
+ *
+ * If the recovered item is an intent item, this function should parse
+ * the recovered item to construct an in-core log intent item and
+ * insert it into the AIL. The in-core log intent item should have 1
+ * refcount so that the item is freed either (a) when we commit the
+ * recovered log item for the intent-done item; (b) replay the work and
+ * log a new intent-done item; or (c) recovery fails and we have to
+ * abort.
+ *
+ * If the recovered item is an intent-done item, this function should
+ * parse the recovered item to find the id of the corresponding intent
+ * log item. Next, it should find the in-core log intent item in the
+ * AIL and release it.
+ */
+ int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list,
+ struct xlog_recover_item *item, xfs_lsn_t lsn);
+};
+
+extern const struct xlog_recover_item_ops xlog_icreate_item_ops;
+extern const struct xlog_recover_item_ops xlog_buf_item_ops;
+extern const struct xlog_recover_item_ops xlog_inode_item_ops;
+extern const struct xlog_recover_item_ops xlog_dquot_item_ops;
+extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops;
+extern const struct xlog_recover_item_ops xlog_bui_item_ops;
+extern const struct xlog_recover_item_ops xlog_bud_item_ops;
+extern const struct xlog_recover_item_ops xlog_efi_item_ops;
+extern const struct xlog_recover_item_ops xlog_efd_item_ops;
+extern const struct xlog_recover_item_ops xlog_rui_item_ops;
+extern const struct xlog_recover_item_ops xlog_rud_item_ops;
+extern const struct xlog_recover_item_ops xlog_cui_item_ops;
+extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+
+/*
* Macros, structures, prototypes for internal log manager use.
*/
@@ -22,22 +89,22 @@
/*
* item headers are in ri_buf[0]. Additional buffers follow.
*/
-typedef struct xlog_recover_item {
+struct xlog_recover_item {
struct list_head ri_list;
- int ri_type;
int ri_cnt; /* count of regions found */
int ri_total; /* total regions */
- xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
-} xlog_recover_item_t;
+ struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ const struct xlog_recover_item_ops *ri_ops;
+};
-typedef struct xlog_recover {
+struct xlog_recover {
struct hlist_node r_list;
xlog_tid_t r_log_tid; /* log's transaction id */
xfs_trans_header_t r_theader; /* trans header for partial */
int r_state; /* not needed */
xfs_lsn_t r_lsn; /* xact lsn */
struct list_head r_itemq; /* q for items */
-} xlog_recover_t;
+};
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
@@ -51,4 +118,11 @@
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
+void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len,
+ const struct xfs_buf_ops *ops);
+bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len);
+
+void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type,
+ uint64_t intent_id);
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index b2113b1..0f0af4e 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -18,23 +18,23 @@
typedef uint64_t xfs_qcnt_t;
typedef uint16_t xfs_qwarncnt_t;
+typedef uint8_t xfs_dqtype_t;
+
+#define XFS_DQTYPE_STRINGS \
+ { XFS_DQTYPE_USER, "USER" }, \
+ { XFS_DQTYPE_PROJ, "PROJ" }, \
+ { XFS_DQTYPE_GROUP, "GROUP" }, \
+ { XFS_DQTYPE_BIGTIME, "BIGTIME" }
+
/*
* flags for q_flags field in the dquot.
*/
-#define XFS_DQ_USER 0x0001 /* a user quota */
-#define XFS_DQ_PROJ 0x0002 /* project quota */
-#define XFS_DQ_GROUP 0x0004 /* a group quota */
-#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
+#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */
-#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
-
-#define XFS_DQ_FLAGS \
- { XFS_DQ_USER, "USER" }, \
- { XFS_DQ_PROJ, "PROJ" }, \
- { XFS_DQ_GROUP, "GROUP" }, \
- { XFS_DQ_DIRTY, "DIRTY" }, \
- { XFS_DQ_FREEING, "FREEING" }
+#define XFS_DQFLAG_STRINGS \
+ { XFS_DQFLAG_DIRTY, "DIRTY" }, \
+ { XFS_DQFLAG_FREEING, "FREEING" }
/*
* We have the possibility of all three quota types being active at once, and
@@ -100,7 +100,6 @@
#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
-#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
@@ -138,11 +137,16 @@
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
- struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
+ struct xfs_disk_dquot *ddq, xfs_dqid_t id);
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
- struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
+ struct xfs_dqblk *dqb, xfs_dqid_t id);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
- xfs_dqid_t id, uint type);
+ xfs_dqid_t id, xfs_dqtype_t type);
+
+struct xfs_dquot;
+time64_t xfs_dquot_from_disk_ts(struct xfs_disk_dquot *ddq,
+ __be32 dtimer);
+__be32 xfs_dquot_to_disk_ts(struct xfs_dquot *ddq, time64_t timer);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 9a7fadb..2076627 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -46,7 +46,7 @@
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -63,7 +63,7 @@
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
XFS_LOOKUP_GE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -80,7 +80,7 @@
xfs_agblock_t bno,
int *stat)
{
- trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno,
XFS_LOOKUP_LE);
cur->bc_rec.rc.rc_startblock = bno;
cur->bc_rec.rc.rc_blockcount = 0;
@@ -108,7 +108,7 @@
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
xfs_agblock_t realstart;
@@ -119,7 +119,7 @@
xfs_refcount_btrec_to_irec(rec, irec);
- agno = cur->bc_private.a.agno;
+ agno = cur->bc_ag.agno;
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
goto out_bad_rec;
@@ -144,7 +144,7 @@
if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT)
goto out_bad_rec;
- trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec);
return 0;
out_bad_rec:
@@ -169,14 +169,14 @@
union xfs_btree_rec rec;
int error;
- trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec);
rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_refcount_update_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -193,19 +193,22 @@
{
int error;
- trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+ trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec);
cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
error = xfs_btree_insert(cur, i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
out_error:
if (error)
trace_xfs_refcount_insert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -227,17 +230,23 @@
error = xfs_refcount_get_rec(cur, &irec, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
- trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec);
error = xfs_btree_delete(cur, i);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (error)
goto out_error;
error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
out_error:
if (error)
trace_xfs_refcount_delete_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -349,12 +358,15 @@
error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
return 0;
*shape_changed = true;
- trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno,
&rcext, agbno);
/* Establish the right extent. */
@@ -371,12 +383,15 @@
error = xfs_refcount_insert(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
return error;
out_error:
trace_xfs_refcount_split_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -396,7 +411,7 @@
int found_rec;
trace_xfs_refcount_merge_center_extents(cur->bc_mp,
- cur->bc_private.a.agno, left, center, right);
+ cur->bc_ag.agno, left, center, right);
/*
* Make sure the center and right extents are not in the btree.
@@ -410,19 +425,27 @@
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (center->rc_refcount > 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* Enlarge the left extent. */
@@ -430,7 +453,10 @@
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
left->rc_blockcount = extlen;
error = xfs_refcount_update(cur, left);
@@ -442,7 +468,7 @@
out_error:
trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -461,7 +487,7 @@
int found_rec;
trace_xfs_refcount_merge_left_extent(cur->bc_mp,
- cur->bc_private.a.agno, left, cleft);
+ cur->bc_ag.agno, left, cleft);
/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
if (cleft->rc_refcount > 1) {
@@ -469,14 +495,18 @@
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* Enlarge the left extent. */
@@ -484,7 +514,10 @@
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
left->rc_blockcount += cleft->rc_blockcount;
error = xfs_refcount_update(cur, left);
@@ -497,7 +530,7 @@
out_error:
trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -515,7 +548,7 @@
int found_rec;
trace_xfs_refcount_merge_right_extent(cur->bc_mp,
- cur->bc_private.a.agno, cright, right);
+ cur->bc_ag.agno, cright, right);
/*
* If the extent ending at agbno+aglen (cright) wasn't synthesized,
@@ -526,14 +559,18 @@
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* Enlarge the right extent. */
@@ -541,7 +578,10 @@
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
right->rc_startblock -= cright->rc_blockcount;
right->rc_blockcount += cright->rc_blockcount;
@@ -554,7 +594,7 @@
out_error:
trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -587,7 +627,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (xfs_refc_next(&tmp) != agbno)
return 0;
@@ -605,8 +648,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* if tmp starts at the end of our range, just use that */
if (tmp.rc_startblock == agbno)
@@ -634,13 +679,13 @@
cleft->rc_blockcount = aglen;
cleft->rc_refcount = 1;
}
- trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno,
left, cleft, agbno);
return error;
out_error:
trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -671,7 +716,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (tmp.rc_startblock != agbno + aglen)
return 0;
@@ -689,8 +737,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* if tmp ends at the end of our range, just use that */
if (xfs_refc_next(&tmp) == agbno + aglen)
@@ -718,13 +768,13 @@
cright->rc_blockcount = aglen;
cright->rc_refcount = 1;
}
- trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno,
cright, right, agbno + aglen);
return error;
out_error:
trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -833,7 +883,7 @@
{
unsigned long overhead;
- overhead = cur->bc_private.a.priv.refc.shape_changes *
+ overhead = cur->bc_ag.refc.shape_changes *
xfs_allocfree_log_count(cur->bc_mp, 1);
overhead *= cur->bc_mp->m_sb.sb_blocksize;
@@ -841,17 +891,17 @@
* Only allow 2 refcount extent updates per transaction if the
* refcount continue update "error" has been injected.
*/
- if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+ if (cur->bc_ag.refc.nr_ops > 2 &&
XFS_TEST_ERROR(false, cur->bc_mp,
XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
- if (cur->bc_private.a.priv.refc.nr_ops == 0)
+ if (cur->bc_ag.refc.nr_ops == 0)
return true;
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
- cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+ cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}
/*
@@ -902,7 +952,7 @@
ext.rc_startblock - *agbno);
tmp.rc_refcount = 1 + adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &tmp);
+ cur->bc_ag.agno, &tmp);
/*
* Either cover the hole (increment) or
@@ -913,12 +963,15 @@
&found_tmp);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_tmp == 1, out_error);
- cur->bc_private.a.priv.refc.nr_ops++;
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_private.a.agno,
+ cur->bc_ag.agno,
tmp.rc_startblock);
xfs_bmap_add_free(cur->bc_tp, fsbno,
tmp.rc_blockcount, oinfo);
@@ -945,23 +998,25 @@
goto skip;
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &ext);
+ cur->bc_ag.agno, &ext);
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_private.a.priv.refc.nr_ops++;
+ cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_rec == 1, out_error);
- cur->bc_private.a.priv.refc.nr_ops++;
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
- cur->bc_private.a.agno,
+ cur->bc_ag.agno,
ext.rc_startblock);
xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount,
oinfo);
@@ -980,7 +1035,7 @@
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1002,10 +1057,10 @@
*new_agbno = agbno;
*new_aglen = aglen;
if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
- trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno,
agbno, aglen);
else
- trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno,
agbno, aglen);
/*
@@ -1033,7 +1088,7 @@
if (shape_changed)
shape_changes++;
if (shape_changes)
- cur->bc_private.a.priv.refc.shape_changes++;
+ cur->bc_ag.refc.shape_changes++;
/* Now that we've taken care of the ends, adjust the middle extents */
error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
@@ -1044,7 +1099,7 @@
return 0;
out_error:
- trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -1060,7 +1115,7 @@
if (rcur == NULL)
return;
- agbp = rcur->bc_private.a.agbp;
+ agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
if (error)
xfs_trans_brelse(tp, agbp);
@@ -1110,9 +1165,9 @@
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_private.a.agno != agno) {
- nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
- shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+ if (rcur != NULL && rcur->bc_ag.agno != agno) {
+ nr_ops = rcur->bc_ag.refc.nr_ops;
+ shape_changes = rcur->bc_ag.refc.shape_changes;
xfs_refcount_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -1122,16 +1177,14 @@
XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
return error;
- if (!agbp)
- return -EFSCORRUPTED;
rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
if (!rcur) {
error = -ENOMEM;
goto out_cur;
}
- rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
- rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+ rcur->bc_ag.refc.nr_ops = nr_ops;
+ rcur->bc_ag.refc.shape_changes = shape_changes;
}
*pcur = rcur;
@@ -1250,7 +1303,7 @@
int have;
int error;
- trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno,
agbno, aglen);
/* By default, skip the whole range */
@@ -1272,7 +1325,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* If the extent ends before the start, look at the next one */
if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
@@ -1284,7 +1340,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* If the extent starts after the range we want, bail out */
@@ -1312,7 +1371,10 @@
error = xfs_refcount_get_rec(cur, &tmp, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (tmp.rc_startblock >= agbno + aglen ||
tmp.rc_startblock != *fbno + *flen)
break;
@@ -1321,12 +1383,12 @@
done:
trace_xfs_refcount_find_shared_result(cur->bc_mp,
- cur->bc_private.a.agno, *fbno, *flen);
+ cur->bc_ag.agno, *fbno, *flen);
out_error:
if (error)
trace_xfs_refcount_find_shared_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1413,39 +1475,52 @@
switch (adj) {
case XFS_REFCOUNT_ADJUST_COW_ALLOC:
/* Adding a CoW reservation, there should be nothing here. */
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_startblock >= agbno + aglen, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ agbno + aglen > ext.rc_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
tmp.rc_startblock = agbno;
tmp.rc_blockcount = aglen;
tmp.rc_refcount = 1;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &tmp);
+ cur->bc_ag.agno, &tmp);
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_tmp == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
break;
case XFS_REFCOUNT_ADJUST_COW_FREE:
/* Removing a CoW reservation, there should be one extent. */
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_startblock == agbno, out_error);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_blockcount == aglen, out_error);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_refcount == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ext.rc_refcount = 0;
trace_xfs_refcount_modify_extent(cur->bc_mp,
- cur->bc_private.a.agno, &ext);
+ cur->bc_ag.agno, &ext);
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
break;
default:
ASSERT(0);
@@ -1454,7 +1529,7 @@
return error;
out_error:
trace_xfs_refcount_modify_extent_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1500,7 +1575,7 @@
return 0;
out_error:
- trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -1514,7 +1589,7 @@
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
+ trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno,
agbno, aglen);
/* Add refcount btree reservation */
@@ -1531,7 +1606,7 @@
xfs_agblock_t agbno,
xfs_extlen_t aglen)
{
- trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
+ trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno,
agbno, aglen);
/* Remove refcount btree reservation */
@@ -1584,14 +1659,15 @@
/* Stuff an extent on the recovery list. */
STATIC int
xfs_refcount_recover_extent(
- struct xfs_btree_cur *cur,
+ struct xfs_btree_cur *cur,
union xfs_btree_rec *rec,
void *priv)
{
struct list_head *debris = priv;
struct xfs_refcount_recovery *rr;
- if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ be32_to_cpu(rec->refc.rc_refcount) != 1))
return -EFSCORRUPTED;
rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
@@ -1640,10 +1716,6 @@
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
goto out_trans;
- if (!agbp) {
- error = -ENOMEM;
- goto out_trans;
- }
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
/* Find all the leftover CoW staging extents. */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 38529db..a6ac60a 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -12,6 +12,7 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
@@ -25,7 +26,7 @@
struct xfs_btree_cur *cur)
{
return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_ag.agbp, cur->bc_ag.agno);
}
STATIC void
@@ -34,17 +35,15 @@
union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
- struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag = agbp->b_pag;
ASSERT(ptr->s != 0);
agf->agf_refcount_root = ptr->s;
be32_add_cpu(&agf->agf_refcount_level, inc);
pag->pagf_refcount_level += inc;
- xfs_perag_put(pag);
xfs_alloc_log_agf(cur->bc_tp, agbp,
XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
@@ -57,8 +56,8 @@
union xfs_btree_ptr *new,
int *stat)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
struct xfs_alloc_arg args; /* block allocation args */
int error; /* error return value */
@@ -66,7 +65,7 @@
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno,
xfs_refc_block(args.mp));
args.oinfo = XFS_RMAP_OINFO_REFC;
args.minlen = args.maxlen = args.prod = 1;
@@ -75,13 +74,13 @@
error = xfs_alloc_vextent(&args);
if (error)
goto out_error;
- trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
args.agbno, 1);
if (args.fsbno == NULLFSBLOCK) {
*stat = 0;
return 0;
}
- ASSERT(args.agno == cur->bc_private.a.agno);
+ ASSERT(args.agno == cur->bc_ag.agno);
ASSERT(args.len == 1);
new->s = cpu_to_be32(args.agbno);
@@ -101,12 +100,12 @@
struct xfs_buf *bp)
{
struct xfs_mount *mp = cur->bc_mp;
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
int error;
- trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
@@ -169,9 +168,9 @@
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_refcount_root;
}
@@ -311,8 +310,36 @@
};
/*
- * Allocate a new refcount btree cursor.
+ * Allocate a refcount btree cursor and fill in the fields both variants share.
*/
+static struct xfs_btree_cur *
+xfs_refcountbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp, /* stored in bc_tp; the staging caller passes NULL */
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+
+ ASSERT(agno != NULLAGNUMBER);
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ /* Zeroed allocation; used without a NULL check (note __GFP_NOFAIL). */
+ cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_btnum = XFS_BTNUM_REFC;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
+
+ cur->bc_ag.agno = agno;
+ cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+ cur->bc_ag.refc.nr_ops = 0; /* explicit, though zalloc already cleared it */
+ cur->bc_ag.refc.shape_changes = 0;
+ cur->bc_ops = &xfs_refcountbt_ops;
+ return cur; /* bc_nlevels and bc_ag.agbp are not set here */
+}
+
+/* Create a btree cursor backed by the AGF buffer @agbp. */
struct xfs_btree_cur *
xfs_refcountbt_init_cursor(
struct xfs_mount *mp,
@@ -320,32 +347,53 @@
struct xfs_buf *agbp,
xfs_agnumber_t agno)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr; /* buffer data viewed as the AGF */
struct xfs_btree_cur *cur;
- ASSERT(agno != NULLAGNUMBER);
- ASSERT(agno < mp->m_sb.sb_agcount);
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
-
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_btnum = XFS_BTNUM_REFC;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_refcountbt_ops;
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
-
+ cur = xfs_refcountbt_init_common(mp, tp, agno); /* fields shared with staging */
cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
-
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
- cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
- cur->bc_private.a.priv.refc.nr_ops = 0;
- cur->bc_private.a.priv.refc.shape_changes = 0;
-
+ cur->bc_ag.agbp = agbp; /* cursor keeps the AGF buffer pointer */
return cur;
}
+/* Create a btree cursor with a fake root for staging. */
+struct xfs_btree_cur *
+xfs_refcountbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, /* fake root handed to the staging helper */
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_refcountbt_init_common(mp, NULL, agno); /* tp is NULL, so bc_tp is NULL */
+ xfs_btree_stage_afakeroot(cur, afake); /* presumably sets bc_ag.afake; confirm */
+ return cur;
+}
+
+/*
+ * Swap in the new btree root. Once we pass this point the newly rebuilt btree
+ * is in place and we have to kill off all the old btree blocks.
+ */
+void
+xfs_refcountbt_commit_staged_btree(
+ struct xfs_btree_cur *cur, /* staging cursor (asserted below) */
+ struct xfs_trans *tp, /* transaction the AGF change is logged to */
+ struct xfs_buf *agbp) /* AGF buffer that receives the new root */
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ /* Copy the staged root, level and block count into the AGF as big-endian. */
+ agf->agf_refcount_root = cpu_to_be32(afake->af_root);
+ agf->agf_refcount_level = cpu_to_be32(afake->af_levels);
+ agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
+ XFS_AGF_REFCOUNT_ROOT |
+ XFS_AGF_REFCOUNT_LEVEL); /* log the three fields just written */
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+}
+
/*
* Calculate the number of records in a refcount btree block.
*/
@@ -420,7 +468,7 @@
if (error)
return error;
- agf = XFS_BUF_TO_AGF(agbp);
+ agf = agbp->b_addr;
agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_refcount_blocks);
xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index ba416f7..69dc515 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -13,6 +13,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/*
* Btree block header size
@@ -46,6 +47,8 @@
extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *agbp,
xfs_agnumber_t agno);
+struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno);
extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
@@ -58,4 +61,7 @@
struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask,
xfs_extlen_t *used);
+void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
+
#endif /* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 9d3c67b..2668ebe 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -79,7 +79,7 @@
union xfs_btree_rec rec;
int error;
- trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno,
irec->rm_startblock, irec->rm_blockcount,
irec->rm_owner, irec->rm_offset, irec->rm_flags);
@@ -91,7 +91,7 @@
error = xfs_btree_update(cur, &rec);
if (error)
trace_xfs_rmap_update_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -107,13 +107,16 @@
int i;
int error;
- trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
rcur->bc_rec.r.rm_startblock = agbno;
rcur->bc_rec.r.rm_blockcount = len;
@@ -123,11 +126,14 @@
error = xfs_btree_insert(rcur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
done:
if (error)
trace_xfs_rmap_insert_error(rcur->bc_mp,
- rcur->bc_private.a.agno, error, _RET_IP_);
+ rcur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -143,22 +149,28 @@
int i;
int error;
- trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+ trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno,
len, owner, offset, flags);
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_delete(rcur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
done:
if (error)
trace_xfs_rmap_delete_error(rcur->bc_mp,
- rcur->bc_private.a.agno, error, _RET_IP_);
+ rcur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -185,7 +197,7 @@
int *stat)
{
struct xfs_mount *mp = cur->bc_mp;
- xfs_agnumber_t agno = cur->bc_private.a.agno;
+ xfs_agnumber_t agno = cur->bc_ag.agno;
union xfs_btree_rec *rec;
int error;
@@ -248,7 +260,7 @@
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
- cur->bc_private.a.agno, rec->rm_startblock,
+ cur->bc_ag.agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -300,7 +312,7 @@
info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
- cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_find_left_neighbor_helper, &info);
@@ -308,7 +320,7 @@
error = 0;
if (*stat)
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, irec->rm_startblock,
+ cur->bc_ag.agno, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner,
irec->rm_offset, irec->rm_flags);
return error;
@@ -324,7 +336,7 @@
struct xfs_find_left_neighbor_info *info = priv;
trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
- cur->bc_private.a.agno, rec->rm_startblock,
+ cur->bc_ag.agno, rec->rm_startblock,
rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
rec->rm_flags);
@@ -373,14 +385,14 @@
info.stat = stat;
trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+ cur->bc_ag.agno, bno, 0, owner, offset, flags);
error = xfs_rmap_query_range(cur, &info.high, &info.high,
xfs_rmap_lookup_le_range_helper, &info);
if (error == -ECANCELED)
error = 0;
if (*stat)
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, irec->rm_startblock,
+ cur->bc_ag.agno, irec->rm_startblock,
irec->rm_blockcount, irec->rm_owner,
irec->rm_offset, irec->rm_flags);
return error;
@@ -406,24 +418,39 @@
return 0;
/* Make sure the unwritten flag matches. */
- XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
- (rec->rm_flags & XFS_RMAP_UNWRITTEN), out);
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
/* Make sure the owner matches what we expect to find in the tree. */
- XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out);
+ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
/* Check the offset, if necessary. */
if (XFS_RMAP_NON_INODE_OWNER(owner))
goto out;
if (flags & XFS_RMAP_BMBT_BLOCK) {
- XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK,
- out);
+ if (XFS_IS_CORRUPT(mp,
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
} else {
- XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out);
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltoff + rec->rm_blockcount >= offset + len,
- out);
+ if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ offset + len > ltoff + rec->rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
}
out:
@@ -471,7 +498,7 @@
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -482,14 +509,20 @@
error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_rmap_get_rec(cur, &ltrec, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, ltrec.rm_startblock,
+ cur->bc_ag.agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
ltoff = ltrec.rm_offset;
@@ -502,8 +535,12 @@
* be the case that the "left" extent goes all the way to EOFS.
*/
if (owner == XFS_RMAP_OWN_NULL) {
- XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock +
- ltrec.rm_blockcount, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ bno <
+ ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
goto out_done;
}
@@ -526,15 +563,22 @@
error = xfs_rmap_get_rec(cur, &rtrec, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (rtrec.rm_startblock >= bno + len)
goto out_done;
}
/* Make sure the extent we found covers the entire freeing range. */
- XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
- ltrec.rm_startblock + ltrec.rm_blockcount >=
- bno + len, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Check owner information. */
error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
@@ -544,14 +588,17 @@
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* exact match, simply remove the record from rmap tree */
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
ltrec.rm_startblock, ltrec.rm_blockcount,
ltrec.rm_owner, ltrec.rm_offset,
ltrec.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
} else if (ltrec.rm_startblock == bno) {
/*
* overlap left hand side of extent: move the start, trim the
@@ -619,7 +666,7 @@
else
cur->bc_rec.r.rm_offset = offset + len;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno,
cur->bc_rec.r.rm_startblock,
cur->bc_rec.r.rm_blockcount,
cur->bc_rec.r.rm_owner,
@@ -631,11 +678,11 @@
}
out_done:
- trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -726,7 +773,7 @@
(flags & XFS_RMAP_BMBT_BLOCK);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
ASSERT(!xfs_rmap_should_skip_owner_update(oinfo));
@@ -743,9 +790,12 @@
error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, ltrec.rm_startblock,
+ cur->bc_ag.agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
ltrec.rm_offset, ltrec.rm_flags);
@@ -753,9 +803,12 @@
have_lt = 0;
}
- XFS_WANT_CORRUPTED_GOTO(mp,
- have_lt == 0 ||
- ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ have_lt != 0 &&
+ ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/*
* Increment the cursor to see if we have a right-adjacent record to our
@@ -769,11 +822,16 @@
error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
- XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock,
- out_error);
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, gtrec.rm_startblock,
+ cur->bc_ag.agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
@@ -812,7 +870,7 @@
* result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
*/
ltrec.rm_blockcount += gtrec.rm_blockcount;
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
gtrec.rm_startblock,
gtrec.rm_blockcount,
gtrec.rm_owner,
@@ -821,7 +879,10 @@
error = xfs_btree_delete(cur, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* point the cursor back to the left record and update */
@@ -860,19 +921,22 @@
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len,
owner, offset, flags);
error = xfs_btree_insert(cur, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
- trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
- trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_map_error(mp, cur->bc_ag.agno,
error, _RET_IP_);
return error;
}
@@ -946,7 +1010,7 @@
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -957,14 +1021,20 @@
error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_rmap_get_rec(cur, &PREV, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_private.a.agno, PREV.rm_startblock,
+ cur->bc_ag.agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
PREV.rm_offset, PREV.rm_flags);
@@ -995,12 +1065,18 @@
error = xfs_rmap_get_rec(cur, &LEFT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- XFS_WANT_CORRUPTED_GOTO(mp,
- LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
- done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, LEFT.rm_startblock,
+ cur->bc_ag.agno, LEFT.rm_startblock,
LEFT.rm_blockcount, LEFT.rm_owner,
LEFT.rm_offset, LEFT.rm_flags);
if (LEFT.rm_startblock + LEFT.rm_blockcount == bno &&
@@ -1017,7 +1093,10 @@
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
@@ -1026,11 +1105,16 @@
error = xfs_rmap_get_rec(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
- done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, RIGHT.rm_startblock,
+ cur->bc_ag.agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (bno + len == RIGHT.rm_startblock &&
@@ -1048,14 +1132,17 @@
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state,
_RET_IP_);
/* reset the cursor back to PREV */
error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Switch out based on the FILLING and CONTIG state bits.
@@ -1071,31 +1158,46 @@
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW = LEFT;
NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
@@ -1108,18 +1210,24 @@
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
PREV.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW = LEFT;
NEW.rm_blockcount += PREV.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
@@ -1135,19 +1243,28 @@
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ trace_xfs_rmap_delete(mp, cur->bc_ag.agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
RIGHT.rm_flags);
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW = PREV;
NEW.rm_blockcount = len + RIGHT.rm_blockcount;
NEW.rm_flags = newext;
@@ -1209,12 +1326,15 @@
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
@@ -1253,19 +1373,25 @@
oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_startblock = bno;
NEW.rm_owner = owner;
NEW.rm_offset = offset;
NEW.rm_blockcount = len;
NEW.rm_flags = newext;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno,
len, owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case 0:
@@ -1288,14 +1414,17 @@
NEW = PREV;
NEW.rm_blockcount = offset - PREV.rm_offset;
cur->bc_rec.r = NEW;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno,
NEW.rm_startblock, NEW.rm_blockcount,
NEW.rm_owner, NEW.rm_offset,
NEW.rm_flags);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Reset the cursor to the position of the new extent
* we are about to insert as we can't trust it after
@@ -1305,16 +1434,22 @@
oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/* new middle extent - newext */
cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
cur->bc_rec.r.rm_flags |= newext;
- trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len,
owner, offset, newext);
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
@@ -1330,12 +1465,12 @@
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1371,7 +1506,7 @@
(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
new_endoff = offset + len;
- trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -1383,7 +1518,10 @@
&PREV, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
ASSERT(PREV.rm_offset <= offset);
ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
@@ -1406,9 +1544,12 @@
goto done;
if (i) {
state |= RMAP_LEFT_VALID;
- XFS_WANT_CORRUPTED_GOTO(mp,
- LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
- done);
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
state |= RMAP_LEFT_CONTIG;
}
@@ -1423,11 +1564,16 @@
error = xfs_rmap_get_rec(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
- done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, RIGHT.rm_startblock,
+ cur->bc_ag.agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
RIGHT.rm_offset, RIGHT.rm_flags);
if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
@@ -1443,7 +1589,7 @@
RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
state &= ~RMAP_RIGHT_CONTIG;
- trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+ trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state,
_RET_IP_);
/*
* Switch out based on the FILLING and CONTIG state bits.
@@ -1472,7 +1618,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1495,7 +1644,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += PREV.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1518,7 +1670,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += RIGHT.rm_blockcount;
NEW.rm_flags = RIGHT.rm_flags;
error = xfs_rmap_update(cur, &NEW);
@@ -1538,7 +1693,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_flags = newext;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1570,7 +1728,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += len;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1612,7 +1773,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount = offset - NEW.rm_offset;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1644,7 +1808,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount -= len;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1679,7 +1846,10 @@
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount = offset - NEW.rm_offset;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1710,12 +1880,12 @@
ASSERT(0);
}
- trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
done:
if (error)
trace_xfs_rmap_convert_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1753,7 +1923,7 @@
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/*
@@ -1765,25 +1935,44 @@
&ltrec, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ltoff = ltrec.rm_offset;
/* Make sure the extent we found covers the entire freeing range. */
- XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
- ltrec.rm_startblock + ltrec.rm_blockcount >=
- bno + len, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Make sure the owner matches what we expect to find in the tree. */
- XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error);
+ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Make sure the unwritten flag matches. */
- XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
- (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Check the offset. */
- XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error);
- XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount,
- out_error);
+ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* Exact match, simply remove the record from rmap tree. */
@@ -1836,7 +2025,10 @@
ltrec.rm_offset, ltrec.rm_flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ltrec.rm_blockcount -= len;
error = xfs_rmap_update(cur, &ltrec);
if (error)
@@ -1862,7 +2054,10 @@
ltrec.rm_offset, ltrec.rm_flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ltrec.rm_blockcount = bno - ltrec.rm_startblock;
error = xfs_rmap_update(cur, &ltrec);
if (error)
@@ -1877,12 +2072,12 @@
goto out_error;
}
- trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_unmap_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -1917,7 +2112,7 @@
xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
if (unwritten)
flags |= XFS_RMAP_UNWRITTEN;
- trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
/* Is there a left record that abuts our range? */
@@ -1938,9 +2133,12 @@
error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
- cur->bc_private.a.agno, gtrec.rm_startblock,
+ cur->bc_ag.agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
gtrec.rm_offset, gtrec.rm_flags);
@@ -1987,7 +2185,10 @@
ltrec.rm_offset, ltrec.rm_flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_rmap_update(cur, &ltrec);
if (error)
@@ -2030,12 +2231,12 @@
goto out_error;
}
- trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+ trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len,
unwritten, oinfo);
out_error:
if (error)
trace_xfs_rmap_map_error(cur->bc_mp,
- cur->bc_private.a.agno, error, _RET_IP_);
+ cur->bc_ag.agno, error, _RET_IP_);
return error;
}
@@ -2135,7 +2336,7 @@
if (rcur == NULL)
return;
- agbp = rcur->bc_private.a.agbp;
+ agbp = rcur->bc_ag.agbp;
xfs_btree_del_cursor(rcur, error);
if (error)
xfs_trans_brelse(tp, agbp);
@@ -2185,7 +2386,7 @@
* the startblock, get one now.
*/
rcur = *pcur;
- if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+ if (rcur != NULL && rcur->bc_ag.agno != agno) {
xfs_rmap_finish_one_cleanup(tp, rcur, 0);
rcur = NULL;
*pcur = NULL;
@@ -2199,7 +2400,7 @@
error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
if (error)
return error;
- if (!agbp)
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
return -EFSCORRUPTED;
rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
@@ -2304,12 +2505,15 @@
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_MAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_MAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Unmap an extent out of a file. */
@@ -2320,12 +2524,15 @@
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP;
+
if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_UNMAP_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/*
@@ -2342,12 +2549,15 @@
int whichfork,
struct xfs_bmbt_irec *PREV)
{
+ enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT;
+
if (!xfs_rmap_update_is_needed(mp, whichfork))
return;
- __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ?
- XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
- whichfork, PREV);
+ if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip))
+ type = XFS_RMAP_CONVERT_SHARED;
+
+ __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV);
}
/* Schedule the creation of an rmap for non-file data. */
@@ -2493,7 +2703,6 @@
uint64_t owner;
uint64_t offset;
unsigned int flags;
- bool has_rmap;
};
/* For each rmap given, figure out if it doesn't match the key we want. */
@@ -2508,7 +2717,6 @@
if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset &&
((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags)
return 0;
- rks->has_rmap = true;
return -ECANCELED;
}
@@ -2530,7 +2738,7 @@
int error;
xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
- rks.has_rmap = false;
+ *has_rmap = false;
low.rm_startblock = bno;
memset(&high, 0xFF, sizeof(high));
@@ -2538,11 +2746,12 @@
error = xfs_rmap_query_range(cur, &low, &high,
xfs_rmap_has_other_keys_helper, &rks);
- if (error < 0)
- return error;
+ if (error == -ECANCELED) {
+ *has_rmap = true;
+ return 0;
+ }
- *has_rmap = rks.has_rmap;
- return 0;
+ return error;
}
const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = {
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index fc78efa..beb81c8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -14,6 +14,7 @@
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
@@ -51,7 +52,7 @@
struct xfs_btree_cur *cur)
{
return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_ag.agbp, cur->bc_ag.agno);
}
STATIC void
@@ -60,18 +61,16 @@
union xfs_btree_ptr *ptr,
int inc)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
- xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
int btnum = cur->bc_btnum;
- struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
+ struct xfs_perag *pag = agbp->b_pag;
ASSERT(ptr->s != 0);
agf->agf_roots[btnum] = ptr->s;
be32_add_cpu(&agf->agf_levels[btnum], inc);
pag->pagf_levels[btnum] += inc;
- xfs_perag_put(pag);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
@@ -83,25 +82,25 @@
union xfs_btree_ptr *new,
int *stat)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
int error;
xfs_agblock_t bno;
/* Allocate the new block from the freelist. If we can't, give up. */
- error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
&bno, 1);
if (error)
return error;
- trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno,
bno, 1);
if (bno == NULLAGBLOCK) {
*stat = 0;
return 0;
}
- xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
+ xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1,
false);
xfs_trans_agbtree_delta(cur->bc_tp, 1);
@@ -109,7 +108,7 @@
be32_add_cpu(&agf->agf_rmap_blocks, 1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
- xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
+ xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno);
*stat = 1;
return 0;
@@ -120,13 +119,14 @@
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- struct xfs_buf *agbp = cur->bc_private.a.agbp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_buf *agbp = cur->bc_ag.agbp;
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xfs_perag *pag;
xfs_agblock_t bno;
int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
- trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+ trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno,
bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
@@ -138,8 +138,8 @@
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
- xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
-
+ pag = cur->bc_ag.agbp->b_pag;
+ xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
return 0;
}
@@ -215,9 +215,9 @@
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ struct xfs_agf *agf = cur->bc_ag.agbp->b_addr;
- ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno));
ptr->s = agf->agf_roots[cur->bc_btnum];
}
@@ -448,9 +448,29 @@
.recs_inorder = xfs_rmapbt_recs_inorder,
};
-/*
- * Allocate a new allocation btree cursor.
- */
+static struct xfs_btree_cur *
+xfs_rmapbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL);
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ /*
+ * Overlapping btree; 2 keys per pointer.
+ *
+ * This helper fills in only the rmapbt cursor fields that do not depend
+ * on where the btree root comes from: transaction, mount, btree type,
+ * flags, block size log, stats offset, AG number and ops. Neither
+ * bc_nlevels nor an AGF buffer / fake root is assigned here; that is
+ * left to the caller. The cursor is requested with __GFP_NOFAIL and is
+ * used without a NULL check.
+ */
+ cur->bc_btnum = XFS_BTNUM_RMAP;
+ cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+ cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
+ cur->bc_ag.agno = agno;
+ cur->bc_ops = &xfs_rmapbt_ops;
+
+ return cur;
+}
+
+/* Create a new reverse mapping btree cursor. */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
struct xfs_mount *mp,
@@ -458,26 +478,52 @@
struct xfs_buf *agbp,
xfs_agnumber_t agno)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_agf *agf = agbp->b_addr;
struct xfs_btree_cur *cur;
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- /* Overlapping btree; 2 keys per pointer. */
- cur->bc_btnum = XFS_BTNUM_RMAP;
- cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_rmapbt_ops;
+ cur = xfs_rmapbt_init_common(mp, tp, agno);
cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
- cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
-
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
-
+ cur->bc_ag.agbp = agbp;
return cur;
}
+/*
+ * Create a new reverse mapping btree cursor with a fake root for staging.
+ *
+ * The cursor is built by xfs_rmapbt_init_common() with a NULL transaction
+ * and is then handed, together with @afake, to xfs_btree_stage_afakeroot().
+ * This function itself does not assign an AGF buffer to the cursor.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake,
+ xfs_agnumber_t agno)
+{
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_rmapbt_init_common(mp, NULL, agno);
+ xfs_btree_stage_afakeroot(cur, afake);
+ return cur;
+}
+
+/*
+ * Install a new reverse mapping btree root. Caller is responsible for
+ * invalidating and freeing the old btree blocks.
+ *
+ * The root block, level count and block count recorded in the cursor's fake
+ * root are converted to big-endian and stored in the AGF held in @agbp.
+ * xfs_alloc_log_agf() is then called on @tp for the ROOTS, LEVELS and
+ * RMAP_BLOCKS fields, and finally the cursor is passed to
+ * xfs_btree_commit_afakeroot() along with the regular rmapbt ops.
+ *
+ * @cur must be a staging cursor (XFS_BTREE_STAGING set); this is checked
+ * only by an ASSERT.
+ */
+void
+xfs_rmapbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp)
+{
+ struct xfs_agf *agf = agbp->b_addr;
+ struct xbtree_afakeroot *afake = cur->bc_ag.afake;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+
+ agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
+ agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+ agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
+ xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
+ XFS_AGF_RMAP_BLOCKS);
+ xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+}
+
/*
* Calculate number of records in an rmap btree block.
*/
@@ -569,7 +615,7 @@
if (error)
return error;
- agf = XFS_BUF_TO_AGF(agbp);
+ agf = agbp->b_addr;
agblocks = be32_to_cpu(agf->agf_length);
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
xfs_trans_brelse(tp, agbp);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 820d668..115c345 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -9,6 +9,7 @@
struct xfs_buf;
struct xfs_btree_cur;
struct xfs_mount;
+struct xbtree_afakeroot;
/* rmaps only exist on crc enabled filesystems */
#define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN
@@ -43,6 +44,10 @@
struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
struct xfs_trans *tp, struct xfs_buf *bp,
xfs_agnumber_t agno);
+struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
+ struct xbtree_afakeroot *afake, xfs_agnumber_t agno);
+void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, struct xfs_buf *agbp);
int xfs_rmapbt_maxrecs(int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 42085e7..6c1aba1 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -15,7 +15,7 @@
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_error.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -66,11 +66,11 @@
ip = issum ? mp->m_rsumip : mp->m_rbmip;
- error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0);
if (error)
return error;
- if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
return -EFSCORRUPTED;
ASSERT(map.br_startblock != NULLFSBLOCK);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ac6cdca..5aeafa5 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
@@ -219,7 +220,7 @@
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
uint32_t agcount = 0;
uint32_t rem;
@@ -242,7 +243,7 @@
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
-"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
+"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits.");
return -EFSCORRUPTED;
}
@@ -327,6 +328,38 @@
return -EFSCORRUPTED;
}
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+ xfs_notice(mp,
+ "realtime extent sanity check failed");
+ return -EFSCORRUPTED;
+ }
+
+ if (sbp->sb_rblocks == 0) {
+ if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
+ sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
+ xfs_notice(mp,
+ "realtime zeroed geometry check failed");
+ return -EFSCORRUPTED;
+ }
+ } else {
+ uint64_t rexts;
+ uint64_t rbmblocks;
+
+ rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
+ rbmblocks = howmany_64(sbp->sb_rextents,
+ NBBY * sbp->sb_blocksize);
+
+ if (sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ sbp->sb_rbmblocks != rbmblocks) {
+ xfs_notice(mp,
+ "realtime geometry sanity check failed");
+ return -EFSCORRUPTED;
+ }
+ }
+
if (sbp->sb_unit) {
if (!xfs_sb_version_hasdalign(sbp) ||
sbp->sb_unit > sbp->sb_width ||
@@ -567,7 +600,7 @@
* disk. If neither are active, we should NULL the inode.
*
* In all cases, the separate pquotino must remain 0 because it
- * it beyond the "end" of the valid non-pquotino superblock.
+ * is beyond the "end" of the valid non-pquotino superblock.
*/
if (from->sb_qflags & XFS_GQUOTA_ACCT)
to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
@@ -680,7 +713,7 @@
{
struct xfs_sb sb;
struct xfs_mount *mp = bp->b_mount;
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
int error;
/*
@@ -706,7 +739,7 @@
* Check all the superblock fields. Don't byteswap the xquota flags
* because _verify_common checks the on-disk values.
*/
- __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+ __xfs_sb_from_disk(&sb, dsb, false);
error = xfs_validate_sb_common(mp, bp, &sb);
if (error)
goto out_error;
@@ -729,7 +762,7 @@
xfs_sb_quiet_read_verify(
struct xfs_buf *bp)
{
- struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+ struct xfs_dsb *dsb = bp->b_addr;
if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
/* XFS filesystem, verify noisily! */
@@ -747,13 +780,14 @@
struct xfs_sb sb;
struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
+ struct xfs_dsb *dsb = bp->b_addr;
int error;
/*
* Check all the superblock fields. Don't byteswap the xquota flags
* because _verify_common checks the on-disk values.
*/
- __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
+ __xfs_sb_from_disk(&sb, dsb, false);
error = xfs_validate_sb_common(mp, bp, &sb);
if (error)
goto out_error;
@@ -765,7 +799,7 @@
return;
if (bip)
- XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
return;
@@ -920,13 +954,13 @@
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *bp = xfs_trans_getsb(tp, mp);
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
}
@@ -984,9 +1018,9 @@
for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
struct xfs_buf *bp;
- bp = xfs_buf_get(mp->m_ddev_targp,
+ error = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
- XFS_FSS_TO_BB(mp, 1));
+ XFS_FSS_TO_BB(mp, 1), &bp);
/*
* If we get an error reading or writing alternate superblocks,
* continue. xfs_repair chooses the "best" superblock based
@@ -994,19 +1028,19 @@
* superblocks un-updated than updated, and xfs_repair may
* pick them over the properly-updated primary.
*/
- if (!bp) {
+ if (error) {
xfs_warn(mp,
"error allocating secondary superblock for ag %d",
agno);
if (!saved_error)
- saved_error = -ENOMEM;
+ saved_error = error;
continue;
}
bp->b_ops = &xfs_sb_buf_ops;
xfs_buf_oneshot(bp);
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_buf_delwri_queue(bp, &buffer_list);
xfs_buf_relse(bp);
@@ -1050,7 +1084,7 @@
if (error)
return error;
- bp = xfs_trans_getsb(tp, mp);
+ bp = xfs_trans_getsb(tp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
xfs_trans_set_sync(tp);
@@ -1132,6 +1166,8 @@
geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT;
if (xfs_sb_version_hasreflink(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK;
+ if (xfs_sb_version_hasbigtime(sbp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME;
if (xfs_sb_version_hassector(sbp))
geo->logsectsize = sbp->sb_logsectsize;
else
@@ -1184,13 +1220,14 @@
struct xfs_buf **bpp)
{
struct xfs_buf *bp;
+ int error;
ASSERT(agno != 0 && agno != NULLAGNUMBER);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
- if (!bp)
- return -ENOMEM;
+ XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ if (error)
+ return error;
bp->b_ops = &xfs_sb_buf_ops;
xfs_buf_oneshot(bp);
*bpp = bp;
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index c45acbd..c795ae4 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -65,6 +65,7 @@
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
+#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
@@ -175,6 +176,9 @@
unsigned int ialloc_align;
unsigned int agino_log; /* #bits for agino in inum */
+
+ /* precomputed value for di_flags2 */
+ uint64_t new_diflags2;
};
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 3b8260c..594bc44 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -204,16 +204,12 @@
xfs_symlink_shortform_verify(
struct xfs_inode *ip)
{
- char *sfp;
- char *endp;
- struct xfs_ifork *ifp;
- int size;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ char *sfp = (char *)ifp->if_u1.if_data;
+ int size = ifp->if_bytes;
+ char *endp = sfp + size;
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- sfp = (char *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
- endp = sfp + size;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
/*
* Zero length symlinks should never occur in memory as they are
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 0ba7368..90f1d56 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -27,7 +29,7 @@
struct xfs_inode *ip,
uint lock_flags)
{
- xfs_inode_log_item_t *iip;
+ struct xfs_inode_log_item *iip;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (ip->i_itemp == NULL)
@@ -56,7 +58,7 @@
int flags)
{
struct inode *inode = VFS_I(ip);
- struct timespec64 tv;
+ struct timespec64 tv;
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -67,33 +69,41 @@
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
- if (flags & XFS_ICHGTIME_CREATE) {
- ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
- ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
- }
+ if (flags & XFS_ICHGTIME_CREATE)
+ ip->i_d.di_crtime = tv;
}
/*
- * This is called to mark the fields indicated in fieldmask as needing
- * to be logged when the transaction is committed. The inode must
- * already be associated with the given transaction.
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed. The inode must already be
+ * associated with the given transaction.
*
- * The values for fieldmask are defined in xfs_inode_item.h. We always
- * log all of the core inode if any of it has changed, and we always log
- * all of the inline data/extents/b-tree root if any of them has changed.
+ * The values for fieldmask are defined in xfs_inode_item.h. We always log all
+ * of the core inode if any of it has changed, and we always log all of the
+ * inline data/extents/b-tree root if any of them has changed.
+ *
+ * Grab and pin the cluster buffer associated with this inode to avoid RMW
+ * cycles at inode writeback time. Avoid the need to add error handling to every
+ * xfs_trans_log_inode() call by shutting down on read error. This will cause
+ * transactions to fail and everything to error out, just like if we return a
+ * read error in a dirty transaction and cancel it.
*/
void
xfs_trans_log_inode(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint flags)
{
- struct inode *inode = VFS_I(ip);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct inode *inode = VFS_I(ip);
+ uint iversion_flags = 0;
- ASSERT(ip->i_itemp != NULL);
+ ASSERT(iip);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+ tp->t_flags |= XFS_TRANS_DIRTY;
+
/*
* Don't bother with i_lock for the I_DIRTY_TIME check here, as races
* don't matter - we either will need an extra transaction in 24 hours
@@ -107,15 +117,6 @@
}
/*
- * Record the specific change for fdatasync optimisation. This
- * allows fdatasync to skip log forces for inodes that are only
- * timestamp dirty. We do this before the change count so that
- * the core being logged in this case does not impact on fdatasync
- * behaviour.
- */
- ip->i_itemp->ili_fsync_fields |= flags;
-
- /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. While we have the
* inode locked exclusively for metadata modification, we can usually
@@ -124,23 +125,75 @@
* set however, then go ahead and bump the i_version counter
* unconditionally.
*/
- if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
- IS_I_VERSION(VFS_I(ip))) {
- if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
- flags |= XFS_ILOG_CORE;
+ if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
+ if (IS_I_VERSION(inode) &&
+ inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
+ iversion_flags = XFS_ILOG_CORE;
}
- tp->t_flags |= XFS_TRANS_DIRTY;
+ /*
+ * If we're updating the inode core or the timestamps and it's possible
+ * to upgrade this inode to bigtime format, do so now.
+ */
+ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+ xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) &&
+ !xfs_inode_has_bigtime(ip)) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_BIGTIME;
+ flags |= XFS_ILOG_CORE;
+ }
/*
- * Always OR in the bits from the ili_last_fields field.
- * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
- * routines in the eventual clearing of the ili_fields bits.
- * See the big comment in xfs_iflush() for an explanation of
- * this coordination mechanism.
+ * Record the specific change for fdatasync optimisation. This allows
+ * fdatasync to skip log forces for inodes that are only timestamp
+ * dirty.
*/
- flags |= ip->i_itemp->ili_last_fields;
- ip->i_itemp->ili_fields |= flags;
+ spin_lock(&iip->ili_lock);
+ iip->ili_fsync_fields |= flags;
+
+ if (!iip->ili_item.li_buf) {
+ struct xfs_buf *bp;
+ int error;
+
+ /*
+ * We hold the ILOCK here, so this inode is not going to be
+ * flushed while we are here. Further, because there is no
+ * buffer attached to the item, we know that there is no IO in
+ * progress, so nothing will clear the ili_fields while we read
+ * in the buffer. Hence we can safely drop the spin lock and
+ * read the buffer knowing that the state will not change from
+ * here.
+ */
+ spin_unlock(&iip->ili_lock);
+ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL,
+ &bp, 0);
+ if (error) {
+ xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ /*
+ * We need an explicit buffer reference for the log item but
+ * don't want the buffer to remain attached to the transaction.
+ * Hold the buffer but release the transaction reference once
+ * we've attached the inode log item to the buffer log item
+ * list.
+ */
+ xfs_buf_hold(bp);
+ spin_lock(&iip->ili_lock);
+ iip->ili_item.li_buf = bp;
+ bp->b_flags |= _XBF_INODES;
+ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+ xfs_trans_brelse(tp, bp);
+ }
+
+ /*
+ * Always OR in the bits from the ili_last_fields field. This is to
+ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+ * in the eventual clearing of the ili_fields bits. See the big comment
+ * in xfs_iflush() for an explanation of this coordination mechanism.
+ */
+ iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
+ spin_unlock(&iip->ili_lock);
}
int
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index b3584cd..d1a0848 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -187,7 +187,7 @@
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_sb_version_has_v3inode(&mp->m_sb))
return res;
size = XFS_FSB_TO_B(mp, 1);
}
@@ -202,7 +202,7 @@
* blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
* as well as the realtime summary block.
*/
-unsigned int
+static unsigned int
xfs_rtalloc_log_count(
struct xfs_mount *mp,
unsigned int num_ops)
@@ -776,7 +776,7 @@
/*
* Adjusting quota limits.
- * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ * the disk quota buffer: sizeof(struct xfs_disk_dquot)
*/
STATIC uint
xfs_calc_qm_setqlim_reservation(void)
@@ -800,7 +800,7 @@
/*
* Turning off quotas.
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
* the superblock for the quota flags: sector size
*/
STATIC uint
@@ -813,7 +813,7 @@
/*
* End of turning off quotas.
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
*/
STATIC uint
xfs_calc_qm_quotaoff_end_reservation(void)
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 300b3e9..397d947 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -21,7 +21,6 @@
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
-typedef int32_t xfs_tid_t; /* transaction identifier */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
@@ -33,7 +32,6 @@
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-typedef int64_t xfs_sfiloff_t; /* signed block number in a file */
/*
* New verifiers will return the instruction address of the failing check.
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index ba0f747..ae8e2e0 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -92,7 +92,7 @@
if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error))
return error;
- sb = XFS_BUF_TO_SBP(bp);
+ sb = bp->b_addr;
/*
* Verify the geometries match. Fields that are permanently
@@ -358,7 +358,7 @@
xchk_agf_xref_freeblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_extlen_t blocks = 0;
int error;
@@ -378,7 +378,7 @@
xchk_agf_xref_cntbt(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_agblock_t agbno;
xfs_extlen_t blocks;
int have;
@@ -410,7 +410,7 @@
xchk_agf_xref_btreeblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t blocks;
xfs_agblock_t btreeblks;
@@ -456,7 +456,7 @@
xchk_agf_xref_refcblks(
struct xfs_scrub *sc)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
xfs_agblock_t blocks;
int error;
@@ -525,7 +525,7 @@
goto out;
xchk_buffer_recheck(sc, sc->sa.agf_bp);
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agf = sc->sa.agf_bp->b_addr;
/* Check the AG length */
eoag = be32_to_cpu(agf->agf_length);
@@ -711,7 +711,7 @@
goto out;
/* Allocate buffer to ensure uniqueness of AGFL entries. */
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
+ agf = sc->sa.agf_bp->b_addr;
agflcount = be32_to_cpu(agf->agf_flcount);
if (agflcount > xfs_agfl_size(sc->mp)) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
@@ -728,7 +728,7 @@
}
/* Check the blocks in the AGFL. */
- error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
sc->sa.agfl_bp, xchk_agfl_block, &sai);
if (error == -ECANCELED) {
error = 0;
@@ -765,7 +765,7 @@
xchk_agi_xref_icounts(
struct xfs_scrub *sc)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
xfs_agino_t icount;
xfs_agino_t freecount;
int error;
@@ -781,6 +781,35 @@
xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
}
+/*
+ * Check agi_[fi]blocks against tree size.
+ *
+ * Does nothing unless the superblock reports the inobtcounts feature. For
+ * each of sc->sa.ino_cur and sc->sa.fino_cur that is set, the blocks in that
+ * btree are counted with xfs_btree_count_blocks() and the AGI buffer is
+ * flagged via xchk_block_xref_set_corrupt() when the count differs from
+ * agi_iblocks or agi_fblocks respectively. If xchk_should_check_xref()
+ * returns false after a count, the function returns immediately and any
+ * remaining comparison is skipped.
+ */
+static inline void
+xchk_agi_xref_fiblocks(
+ struct xfs_scrub *sc)
+{
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ xfs_agblock_t blocks;
+ int error = 0;
+
+ if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb))
+ return;
+
+ if (sc->sa.ino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.ino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.ino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_iblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.fino_cur) {
+ error = xfs_btree_count_blocks(sc->sa.fino_cur, &blocks);
+ if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur))
+ return;
+ if (blocks != be32_to_cpu(agi->agi_fblocks))
+ xchk_block_xref_set_corrupt(sc, sc->sa.agi_bp);
+ }
+}
+
/* Cross-reference with the other btrees. */
STATIC void
xchk_agi_xref(
@@ -804,6 +833,7 @@
xchk_agi_xref_icounts(sc);
xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS);
xchk_xref_is_not_shared(sc, agbno, 1);
+ xchk_agi_xref_fiblocks(sc);
/* scrub teardown will take care of sc->sa for us */
}
@@ -834,7 +864,7 @@
goto out;
xchk_buffer_recheck(sc, sc->sa.agi_bp);
- agi = XFS_BUF_TO_AGI(sc->sa.agi_bp);
+ agi = sc->sa.agi_bp->b_addr;
/* Check the AG length */
eoag = be32_to_cpu(agi->agi_length);
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 7a1a38b..401f715 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -49,7 +49,7 @@
/* Copy AG 0's superblock to this one. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
@@ -140,7 +140,7 @@
struct xrep_find_ag_btree *fab,
struct xfs_buf *agfl_bp)
{
- struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *old_agf = agf_bp->b_addr;
int error;
/* Go find the root data. */
@@ -181,7 +181,7 @@
struct xfs_agf *old_agf)
{
struct xfs_mount *mp = sc->mp;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
memcpy(old_agf, agf, sizeof(*old_agf));
memset(agf, 0, BBTOB(agf_bp->b_length));
@@ -238,7 +238,7 @@
{
struct xrep_agf_allocbt raa = { .sc = sc };
struct xfs_btree_cur *cur = NULL;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agblock_t btreeblks;
xfs_agblock_t blocks;
@@ -302,7 +302,7 @@
struct xfs_buf *agf_bp)
{
struct xfs_perag *pag;
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
/* Trigger fdblocks recalculation */
xfs_force_summary_recalc(sc->mp);
@@ -376,7 +376,7 @@
if (error)
return error;
agf_bp->b_ops = &xfs_agf_buf_ops;
- agf = XFS_BUF_TO_AGF(agf_bp);
+ agf = agf_bp->b_addr;
/*
* Load the AGFL so that we can screen out OWN_AG blocks that are on
@@ -395,7 +395,7 @@
* Spot-check the AGFL blocks; if they're obviously corrupt then
* there's nothing we can do but bail out.
*/
- error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
+ error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp,
xrep_agf_check_agfl_block, sc);
if (error)
return error;
@@ -429,10 +429,10 @@
struct xrep_agfl {
/* Bitmap of other OWN_AG metadata blocks. */
- struct xfs_bitmap agmetablocks;
+ struct xbitmap agmetablocks;
/* Bitmap of free space. */
- struct xfs_bitmap *freesp;
+ struct xbitmap *freesp;
struct xfs_scrub *sc;
};
@@ -453,14 +453,14 @@
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+ fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno,
rec->rm_startblock);
- error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
if (error)
return error;
}
- return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur);
+ return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
}
/*
@@ -476,19 +476,17 @@
xrep_agfl_collect_blocks(
struct xfs_scrub *sc,
struct xfs_buf *agf_bp,
- struct xfs_bitmap *agfl_extents,
+ struct xbitmap *agfl_extents,
xfs_agblock_t *flcount)
{
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *n;
int error;
ra.sc = sc;
ra.freesp = agfl_extents;
- xfs_bitmap_init(&ra.agmetablocks);
+ xbitmap_init(&ra.agmetablocks);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
@@ -500,7 +498,7 @@
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
XFS_BTNUM_BNO);
- error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
xfs_btree_del_cursor(cur, error);
@@ -508,7 +506,7 @@
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
XFS_BTNUM_CNT);
- error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
if (error)
goto err;
@@ -518,8 +516,8 @@
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
- error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks);
- xfs_bitmap_destroy(&ra.agmetablocks);
+ error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
if (error)
return error;
@@ -527,18 +525,12 @@
* Calculate the new AGFL size. If we found more blocks than fit in
* the AGFL we'll free them later.
*/
- *flcount = 0;
- for_each_xfs_bitmap_extent(br, n, agfl_extents) {
- *flcount += br->len;
- if (*flcount > xfs_agfl_size(mp))
- break;
- }
- if (*flcount > xfs_agfl_size(mp))
- *flcount = xfs_agfl_size(mp);
+ *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ xfs_agfl_size(mp));
return 0;
err:
- xfs_bitmap_destroy(&ra.agmetablocks);
+ xbitmap_destroy(&ra.agmetablocks);
xfs_btree_del_cursor(cur, error);
return error;
}
@@ -550,7 +542,7 @@
struct xfs_buf *agf_bp,
xfs_agblock_t flcount)
{
- struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp);
+ struct xfs_agf *agf = agf_bp->b_addr;
ASSERT(flcount <= xfs_agfl_size(sc->mp));
@@ -573,13 +565,13 @@
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
- struct xfs_bitmap *agfl_extents,
+ struct xbitmap *agfl_extents,
xfs_agblock_t flcount)
{
struct xfs_mount *mp = sc->mp;
__be32 *agfl_bno;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *br;
+ struct xbitmap_range *n;
struct xfs_agfl *agfl;
xfs_agblock_t agbno;
unsigned int fl_off;
@@ -602,8 +594,8 @@
* step.
*/
fl_off = 0;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp);
- for_each_xfs_bitmap_extent(br, n, agfl_extents) {
+ agfl_bno = xfs_buf_to_agfl_bno(agfl_bp);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
agbno = XFS_FSB_TO_AGBNO(mp, br->start);
trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len);
@@ -637,7 +629,7 @@
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xfs_bitmap agfl_extents;
+ struct xbitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
struct xfs_buf *agfl_bp;
@@ -649,7 +641,7 @@
return -EOPNOTSUPP;
xchk_perag_get(sc->mp, &sc->sa);
- xfs_bitmap_init(&agfl_extents);
+ xbitmap_init(&agfl_extents);
/*
* Read the AGF so that we can query the rmapbt. We hope that there's
@@ -659,8 +651,6 @@
error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
if (error)
return error;
- if (!agf_bp)
- return -ENOMEM;
/*
* Make sure we have the AGFL buffer, as scrub might have decided it
@@ -698,10 +688,10 @@
goto err;
/* Dump any AGFL overflow. */
- return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
err:
- xfs_bitmap_destroy(&agfl_extents);
+ xbitmap_destroy(&agfl_extents);
return error;
}
@@ -735,8 +725,6 @@
error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp);
if (error)
return error;
- if (!agf_bp)
- return -ENOMEM;
/* Find the btree roots. */
error = xrep_find_ag_btree_roots(sc, agf_bp, fab, NULL);
@@ -765,7 +753,7 @@
struct xfs_buf *agi_bp,
struct xfs_agi *old_agi)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
memcpy(old_agi, agi, sizeof(*old_agi));
@@ -811,7 +799,7 @@
struct xfs_buf *agi_bp)
{
struct xfs_btree_cur *cur;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
struct xfs_mount *mp = sc->mp;
xfs_agino_t count;
xfs_agino_t freecount;
@@ -822,10 +810,34 @@
error = xfs_ialloc_count_inodes(cur, &count, &freecount);
if (error)
goto err;
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ xfs_agblock_t blocks;
+
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ agi->agi_iblocks = cpu_to_be32(blocks);
+ }
xfs_btree_del_cursor(cur, error);
agi->agi_count = cpu_to_be32(count);
agi->agi_freecount = cpu_to_be32(freecount);
+
+ if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
+ xfs_agblock_t blocks;
+
+ cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno,
+ XFS_BTNUM_FINO);
+ if (error)
+ goto err;
+ error = xfs_btree_count_blocks(cur, &blocks);
+ if (error)
+ goto err;
+ xfs_btree_del_cursor(cur, error);
+ agi->agi_fblocks = cpu_to_be32(blocks);
+ }
+
return 0;
err:
xfs_btree_del_cursor(cur, error);
@@ -839,7 +851,7 @@
struct xfs_buf *agi_bp)
{
struct xfs_perag *pag;
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp);
+ struct xfs_agi *agi = agi_bp->b_addr;
/* Trigger inode count recalculation */
xfs_force_summary_recalc(sc->mp);
@@ -896,7 +908,7 @@
if (error)
return error;
agi_bp->b_ops = &xfs_agi_buf_ops;
- agi = XFS_BUF_TO_AGI(agi_bp);
+ agi = agi_bp->b_addr;
/* Find the AGI btree roots. */
error = xrep_agi_find_btrees(sc, fab);
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 5533e48..73d924e 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -94,7 +94,7 @@
union xfs_btree_rec *rec)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t bno;
xfs_extlen_t len;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 0edc7f8..9faddb3 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -98,7 +98,7 @@
/*
* Check that an extended attribute key can be looked up by hash.
*
- * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked)
+ * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked)
* to call this function for every attribute key in an inode. Once
* we're here, we load the attribute value to see if any errors happen,
* or if we get more or less data than we expected.
@@ -147,11 +147,8 @@
return;
}
- args.flags = ATTR_KERNOTIME;
- if (flags & XFS_ATTR_ROOT)
- args.flags |= ATTR_ROOT;
- else if (flags & XFS_ATTR_SECURE)
- args.flags |= ATTR_SECURE;
+ args.op_flags = XFS_DA_OP_NOTIME;
+ args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK;
args.geo = context->dp->i_mount->m_attr_geo;
args.whichfork = XFS_ATTR_FORK;
args.dp = context->dp;
@@ -162,7 +159,10 @@
args.value = xchk_xattr_valuebuf(sx->sc);
args.valuelen = valuelen;
- error = xfs_attr_get_ilocked(context->dp, &args);
+ error = xfs_attr_get_ilocked(&args);
+ /* ENODATA means the hash lookup failed and the attr is bad */
+ if (error == -ENODATA)
+ error = -EFSCORRUPTED;
if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno,
&error))
goto fail_xref;
@@ -398,15 +398,14 @@
STATIC int
xchk_xattr_rec(
struct xchk_da_btree *ds,
- int level,
- void *rec)
+ int level)
{
struct xfs_mount *mp = ds->state->mp;
- struct xfs_attr_leaf_entry *ent = rec;
- struct xfs_da_state_blk *blk;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
struct xfs_attr_leaf_name_local *lentry;
struct xfs_attr_leaf_name_remote *rentry;
struct xfs_buf *bp;
+ struct xfs_attr_leaf_entry *ent;
xfs_dahash_t calc_hash;
xfs_dahash_t hash;
int nameidx;
@@ -414,7 +413,9 @@
unsigned int badflags;
int error;
- blk = &ds->state->path.blk[level];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ ent = xfs_attr3_leaf_entryp(blk->bp->b_addr) + blk->index;
/* Check the whole block, if necessary. */
error = xchk_xattr_block(ds, level);
@@ -473,7 +474,6 @@
struct xfs_scrub *sc)
{
struct xchk_xattr sx;
- struct attrlist_cursor_kern cursor = { 0 };
xfs_dablk_t last_checked = -1U;
int error = 0;
@@ -492,11 +492,10 @@
/* Check that every attr key can also be looked up by hash. */
sx.context.dp = sc->ip;
- sx.context.cursor = &cursor;
sx.context.resynch = 1;
sx.context.put_listent = xchk_xattr_listent;
sx.context.tp = sc->tp;
- sx.context.flags = ATTR_INCOMPLETE;
+ sx.context.allow_incomplete = true;
sx.sc = sc;
/*
@@ -515,7 +514,7 @@
* iteration, which doesn't really follow the usual buffer
* locking order.
*/
- error = xfs_attr_list_int_ilocked(&sx.context);
+ error = xfs_attr_list_ilocked(&sx.context);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 3d47d11..813b5f2 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -18,14 +18,14 @@
* This is the logical equivalent of bitmap |= mask(start, len).
*/
int
-xfs_bitmap_set(
- struct xfs_bitmap *bitmap,
+xbitmap_set(
+ struct xbitmap *bitmap,
uint64_t start,
uint64_t len)
{
- struct xfs_bitmap_range *bmr;
+ struct xbitmap_range *bmr;
- bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL);
+ bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
if (!bmr)
return -ENOMEM;
@@ -39,13 +39,13 @@
/* Free everything related to this bitmap. */
void
-xfs_bitmap_destroy(
- struct xfs_bitmap *bitmap)
+xbitmap_destroy(
+ struct xbitmap *bitmap)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
- for_each_xfs_bitmap_extent(bmr, n, bitmap) {
+ for_each_xbitmap_extent(bmr, n, bitmap) {
list_del(&bmr->list);
kmem_free(bmr);
}
@@ -53,24 +53,24 @@
/* Set up a per-AG block bitmap. */
void
-xfs_bitmap_init(
- struct xfs_bitmap *bitmap)
+xbitmap_init(
+ struct xbitmap *bitmap)
{
INIT_LIST_HEAD(&bitmap->list);
}
/* Compare two btree extents. */
static int
-xfs_bitmap_range_cmp(
+xbitmap_range_cmp(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
- struct xfs_bitmap_range *ap;
- struct xfs_bitmap_range *bp;
+ struct xbitmap_range *ap;
+ struct xbitmap_range *bp;
- ap = container_of(a, struct xfs_bitmap_range, list);
- bp = container_of(b, struct xfs_bitmap_range, list);
+ ap = container_of(a, struct xbitmap_range, list);
+ bp = container_of(b, struct xbitmap_range, list);
if (ap->start > bp->start)
return 1;
@@ -96,14 +96,14 @@
#define LEFT_ALIGNED (1 << 0)
#define RIGHT_ALIGNED (1 << 1)
int
-xfs_bitmap_disunion(
- struct xfs_bitmap *bitmap,
- struct xfs_bitmap *sub)
+xbitmap_disunion(
+ struct xbitmap *bitmap,
+ struct xbitmap *sub)
{
struct list_head *lp;
- struct xfs_bitmap_range *br;
- struct xfs_bitmap_range *new_br;
- struct xfs_bitmap_range *sub_br;
+ struct xbitmap_range *br;
+ struct xbitmap_range *new_br;
+ struct xbitmap_range *sub_br;
uint64_t sub_start;
uint64_t sub_len;
int state;
@@ -113,8 +113,8 @@
return 0;
ASSERT(!list_empty(&sub->list));
- list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp);
- list_sort(NULL, &sub->list, xfs_bitmap_range_cmp);
+ list_sort(NULL, &bitmap->list, xbitmap_range_cmp);
+ list_sort(NULL, &sub->list, xbitmap_range_cmp);
/*
* Now that we've sorted both lists, we iterate bitmap once, rolling
@@ -124,11 +124,11 @@
* list traversal is similar to merge sort, but we're deleting
* instead. In this manner we avoid O(n^2) operations.
*/
- sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range,
+ sub_br = list_first_entry(&sub->list, struct xbitmap_range,
list);
lp = bitmap->list.next;
while (lp != &bitmap->list) {
- br = list_entry(lp, struct xfs_bitmap_range, list);
+ br = list_entry(lp, struct xbitmap_range, list);
/*
* Advance sub_br and/or br until we find a pair that
@@ -181,7 +181,7 @@
* Deleting from the middle: add the new right extent
* and then shrink the left extent.
*/
- new_br = kmem_alloc(sizeof(struct xfs_bitmap_range),
+ new_br = kmem_alloc(sizeof(struct xbitmap_range),
KM_MAYFAIL);
if (!new_br) {
error = -ENOMEM;
@@ -247,8 +247,8 @@
* blocks going from the leaf towards the root.
*/
int
-xfs_bitmap_set_btcur_path(
- struct xfs_bitmap *bitmap,
+xbitmap_set_btcur_path(
+ struct xbitmap *bitmap,
struct xfs_btree_cur *cur)
{
struct xfs_buf *bp;
@@ -261,7 +261,7 @@
if (!bp)
continue;
fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
- error = xfs_bitmap_set(bitmap, fsb, 1);
+ error = xbitmap_set(bitmap, fsb, 1);
if (error)
return error;
}
@@ -271,12 +271,12 @@
/* Collect a btree's block in the bitmap. */
STATIC int
-xfs_bitmap_collect_btblock(
+xbitmap_collect_btblock(
struct xfs_btree_cur *cur,
int level,
void *priv)
{
- struct xfs_bitmap *bitmap = priv;
+ struct xbitmap *bitmap = priv;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
@@ -285,14 +285,30 @@
return 0;
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn);
- return xfs_bitmap_set(bitmap, fsbno, 1);
+ return xbitmap_set(bitmap, fsbno, 1);
}
/* Walk the btree and mark the bitmap wherever a btree block is found. */
int
-xfs_bitmap_set_btblocks(
- struct xfs_bitmap *bitmap,
+xbitmap_set_btblocks(
+ struct xbitmap *bitmap,
struct xfs_btree_cur *cur)
{
- return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, bitmap);
+ return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
+}
+
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap_hweight(
+ struct xbitmap *bitmap)
+{
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
+ uint64_t ret = 0;
+
+ for_each_xbitmap_extent(bmr, n, bitmap)
+ ret += bmr->len;
+
+ return ret;
}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index ae8ecbc..900646b 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -6,31 +6,32 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xfs_bitmap_range {
+struct xbitmap_range {
struct list_head list;
uint64_t start;
uint64_t len;
};
-struct xfs_bitmap {
+struct xbitmap {
struct list_head list;
};
-void xfs_bitmap_init(struct xfs_bitmap *bitmap);
-void xfs_bitmap_destroy(struct xfs_bitmap *bitmap);
+void xbitmap_init(struct xbitmap *bitmap);
+void xbitmap_destroy(struct xbitmap *bitmap);
-#define for_each_xfs_bitmap_extent(bex, n, bitmap) \
+#define for_each_xbitmap_extent(bex, n, bitmap) \
list_for_each_entry_safe((bex), (n), &(bitmap)->list, list)
-#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \
+#define for_each_xbitmap_block(b, bex, n, bitmap) \
list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \
- for ((b) = bex->start; (b) < bex->start + bex->len; (b)++)
+ for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++)
-int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len);
-int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub);
-int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap,
+int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
+int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
+int xbitmap_set_btcur_path(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
-int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap,
+int xbitmap_set_btblocks(struct xbitmap *bitmap,
struct xfs_btree_cur *cur);
+uint64_t xbitmap_hweight(struct xbitmap *bitmap);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 52892f4..fed56d2 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -394,7 +394,7 @@
struct xfs_bmbt_irec iext_irec;
struct xfs_iext_cursor icur;
struct xchk_bmap_info *info = bs->private;
- struct xfs_inode *ip = bs->cur->bc_private.b.ip;
+ struct xfs_inode *ip = bs->cur->bc_ino.ip;
struct xfs_buf *bp = NULL;
struct xfs_btree_block *block;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork);
@@ -521,7 +521,7 @@
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp,
- cur->bc_private.a.agno, rec->rm_startblock))
+ cur->bc_ag.agno, rec->rm_startblock))
xchk_fblock_set_corrupt(sc, sbcri->whichfork,
rec->rm_offset);
if (irec.br_blockcount > rec->rm_blockcount)
@@ -586,8 +586,9 @@
struct xfs_scrub *sc,
int whichfork)
{
- loff_t size;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork);
xfs_agnumber_t agno;
+ bool zero_size;
int error;
if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) ||
@@ -599,6 +600,8 @@
if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK)
return 0;
+ ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL);
+
/*
* Only do this for complex maps that are in btree format, or for
* situations where we would seem to have a size but zero extents.
@@ -606,19 +609,14 @@
* to flag this bmap as corrupt if there are rmaps that need to be
* reattached.
*/
- switch (whichfork) {
- case XFS_DATA_FORK:
- size = i_size_read(VFS_I(sc->ip));
- break;
- case XFS_ATTR_FORK:
- size = XFS_IFORK_Q(sc->ip);
- break;
- default:
- size = 0;
- break;
- }
- if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0))
+
+ if (whichfork == XFS_DATA_FORK)
+ zero_size = i_size_read(VFS_I(sc->ip)) == 0;
+ else
+ zero_size = false;
+
+ if (ifp->if_format != XFS_DINODE_FMT_BTREE &&
+ (zero_size || ifp->if_nextents > 0))
return 0;
for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) {
@@ -647,12 +645,14 @@
struct xchk_bmap_info info = { NULL };
struct xfs_mount *mp = sc->mp;
struct xfs_inode *ip = sc->ip;
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t endoff;
struct xfs_iext_cursor icur;
int error = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* Non-existent forks can be ignored. */
+ if (!ifp)
+ goto out;
info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
info.whichfork = whichfork;
@@ -661,9 +661,6 @@
switch (whichfork) {
case XFS_COW_FORK:
- /* Non-existent CoW forks are ignorable. */
- if (!ifp)
- goto out;
/* No CoW forks on non-reflink inodes/filesystems. */
if (!xfs_is_reflink_inode(ip)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -671,8 +668,6 @@
}
break;
case XFS_ATTR_FORK:
- if (!ifp)
- goto out_check_rmap;
if (!xfs_sb_version_hasattr(&mp->m_sb) &&
!xfs_sb_version_hasattr2(&mp->m_sb))
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -683,7 +678,7 @@
}
/* Check the fork values */
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_UUID:
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
@@ -737,7 +732,6 @@
goto out;
}
-out_check_rmap:
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 77ff9f9..653f328 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -77,40 +77,18 @@
__return_address);
}
-/* Find an entry at a certain level in a da btree. */
-STATIC void *
-xchk_da_btree_entry(
- struct xchk_da_btree *ds,
- int level,
- int rec)
+static struct xfs_da_node_entry *
+xchk_da_btree_node_entry(
+ struct xchk_da_btree *ds,
+ int level)
{
- char *ents;
- struct xfs_da_state_blk *blk;
- void *baddr;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_da3_icnode_hdr hdr;
- /* Dispatch the entry finding function. */
- blk = &ds->state->path.blk[level];
- baddr = blk->bp->b_addr;
- switch (blk->magic) {
- case XFS_ATTR_LEAF_MAGIC:
- case XFS_ATTR3_LEAF_MAGIC:
- ents = (char *)xfs_attr3_leaf_entryp(baddr);
- return ents + (rec * sizeof(struct xfs_attr_leaf_entry));
- case XFS_DIR2_LEAFN_MAGIC:
- case XFS_DIR3_LEAFN_MAGIC:
- ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
- return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
- case XFS_DIR2_LEAF1_MAGIC:
- case XFS_DIR3_LEAF1_MAGIC:
- ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
- return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
- case XFS_DA_NODE_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr);
- return ents + (rec * sizeof(struct xfs_da_node_entry));
- }
+ ASSERT(blk->magic == XFS_DA_NODE_MAGIC);
- return NULL;
+ xfs_da3_node_hdr_from_disk(ds->sc->mp, &hdr, blk->bp->b_addr);
+ return hdr.btree + blk->index;
}
/* Scrub a da btree hash (key). */
@@ -120,7 +98,6 @@
int level,
__be32 *hashp)
{
- struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *entry;
xfs_dahash_t hash;
xfs_dahash_t parent_hash;
@@ -135,8 +112,7 @@
return 0;
/* Is this hash no larger than the parent hash? */
- blks = ds->state->path.blk;
- entry = xchk_da_btree_entry(ds, level - 1, blks[level - 1].index);
+ entry = xchk_da_btree_node_entry(ds, level - 1);
parent_hash = be32_to_cpu(entry->hashval);
if (parent_hash < hash)
xchk_da_set_corrupt(ds, level);
@@ -243,19 +219,21 @@
int direction,
xfs_dablk_t sibling)
{
+ struct xfs_da_state_path *path = &ds->state->path;
+ struct xfs_da_state_path *altpath = &ds->state->altpath;
int retval;
+ int plevel;
int error;
- memcpy(&ds->state->altpath, &ds->state->path,
- sizeof(ds->state->altpath));
+ memcpy(altpath, path, sizeof(ds->state->altpath));
/*
* If the pointer is null, we shouldn't be able to move the upper
* level pointer anywhere.
*/
if (sibling == 0) {
- error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
- direction, false, &retval);
+ error = xfs_da3_path_shift(ds->state, altpath, direction,
+ false, &retval);
if (error == 0 && retval == 0)
xchk_da_set_corrupt(ds, level);
error = 0;
@@ -263,27 +241,33 @@
}
/* Move the alternate cursor one block in the direction given. */
- error = xfs_da3_path_shift(ds->state, &ds->state->altpath,
- direction, false, &retval);
+ error = xfs_da3_path_shift(ds->state, altpath, direction, false,
+ &retval);
if (!xchk_da_process_error(ds, level, &error))
- return error;
+ goto out;
if (retval) {
xchk_da_set_corrupt(ds, level);
- return error;
+ goto out;
}
- if (ds->state->altpath.blk[level].bp)
- xchk_buffer_recheck(ds->sc,
- ds->state->altpath.blk[level].bp);
+ if (altpath->blk[level].bp)
+ xchk_buffer_recheck(ds->sc, altpath->blk[level].bp);
/* Compare upper level pointer to sibling pointer. */
- if (ds->state->altpath.blk[level].blkno != sibling)
+ if (altpath->blk[level].blkno != sibling)
xchk_da_set_corrupt(ds, level);
- if (ds->state->altpath.blk[level].bp) {
- xfs_trans_brelse(ds->dargs.trans,
- ds->state->altpath.blk[level].bp);
- ds->state->altpath.blk[level].bp = NULL;
- }
+
out:
+ /* Free all buffers in the altpath that aren't referenced from path. */
+ for (plevel = 0; plevel < altpath->active; plevel++) {
+ if (altpath->blk[plevel].bp == NULL ||
+ (plevel < path->active &&
+ altpath->blk[plevel].bp == path->blk[plevel].bp))
+ continue;
+
+ xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp);
+ altpath->blk[plevel].bp = NULL;
+ }
+
return error;
}
@@ -355,8 +339,8 @@
goto out_nobuf;
/* Read the buffer. */
- error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2,
- &blk->bp, dargs->whichfork,
+ error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno,
+ XFS_DABUF_MAP_HOLE_OK, &blk->bp, dargs->whichfork,
&xchk_da_btree_buf_ops);
if (!xchk_da_process_error(ds, level, &error))
goto out_nobuf;
@@ -433,8 +417,8 @@
XFS_BLFT_DA_NODE_BUF);
blk->magic = XFS_DA_NODE_MAGIC;
node = blk->bp->b_addr;
- ip->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = ip->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(ip->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
*pmaxrecs = nodehdr.count;
blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval);
if (level == 0) {
@@ -457,6 +441,20 @@
goto out_freebp;
}
+ /*
+ * If we've been handed a block that is below the dabtree root, does
+ * its hashval match what the parent block expected to see?
+ */
+ if (level > 0) {
+ struct xfs_da_node_entry *key;
+
+ key = xchk_da_btree_node_entry(ds, level - 1);
+ if (be32_to_cpu(key->hashval) != blk->hashval) {
+ xchk_da_set_corrupt(ds, level);
+ goto out_freebp;
+ }
+ }
+
out:
return error;
out_freebp:
@@ -479,14 +477,12 @@
struct xfs_mount *mp = sc->mp;
struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *key;
- void *rec;
xfs_dablk_t blkno;
int level;
int error;
/* Skip short format data structures; no btree to scan. */
- if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork)))
return 0;
/* Set up initial da state. */
@@ -494,9 +490,7 @@
ds.dargs.whichfork = whichfork;
ds.dargs.trans = sc->tp;
ds.dargs.op_flags = XFS_DA_OP_OKNOENT;
- ds.state = xfs_da_state_alloc();
- ds.state->args = &ds.dargs;
- ds.state->mp = mp;
+ ds.state = xfs_da_state_alloc(&ds.dargs);
ds.sc = sc;
ds.private = private;
if (whichfork == XFS_ATTR_FORK) {
@@ -538,9 +532,7 @@
}
/* Dispatch record scrubbing. */
- rec = xchk_da_btree_entry(&ds, level,
- blks[level].index);
- error = scrub_fn(&ds, level, rec);
+ error = scrub_fn(&ds, level);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
@@ -562,7 +554,7 @@
}
/* Hashes in order for scrub? */
- key = xchk_da_btree_entry(&ds, level, blks[level].index);
+ key = xchk_da_btree_node_entry(&ds, level);
error = xchk_da_btree_hash(&ds, level, &key->hashval);
if (error)
goto out;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index cb3f000..1f3515c 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -28,8 +28,7 @@
int tree_level;
};
-typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds,
- int level, void *rec);
+typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, int level);
/* Check for da btree operation errors. */
bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 20eca2d..b045e95 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -113,6 +113,9 @@
offset = xfs_dir2_db_to_da(mp->m_dir_geo,
xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+ if (xchk_should_terminate(sdc->sc, &error))
+ return error;
+
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
@@ -182,15 +185,17 @@
STATIC int
xchk_dir_rec(
struct xchk_da_btree *ds,
- int level,
- void *rec)
+ int level)
{
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
struct xfs_mount *mp = ds->state->mp;
- struct xfs_dir2_leaf_entry *ent = rec;
struct xfs_inode *dp = ds->dargs.dp;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
struct xfs_dir2_data_entry *dent;
struct xfs_buf *bp;
- char *p, *endp;
+ struct xfs_dir2_leaf_entry *ent;
+ unsigned int end;
+ unsigned int iter_off;
xfs_ino_t ino;
xfs_dablk_t rec_bno;
xfs_dir2_db_t db;
@@ -198,9 +203,16 @@
xfs_dir2_dataptr_t ptr;
xfs_dahash_t calc_hash;
xfs_dahash_t hash;
+ struct xfs_dir3_icleaf_hdr hdr;
unsigned int tag;
int error;
+ ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC ||
+ blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr);
+ ent = hdr.ents + blk->index;
+
/* Check the hash of the entry. */
error = xchk_da_btree_hash(ds, level, &ent->hashval);
if (error)
@@ -212,15 +224,16 @@
return 0;
/* Find the directory entry's location. */
- db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr);
- off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr);
- rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db);
+ db = xfs_dir2_dataptr_to_db(geo, ptr);
+ off = xfs_dir2_dataptr_to_off(geo, ptr);
+ rec_bno = xfs_dir2_db_to_da(geo, db);
- if (rec_bno >= mp->m_dir_geo->leafblk) {
+ if (rec_bno >= geo->leafblk) {
xchk_da_set_corrupt(ds, level);
goto out;
}
- error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp);
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno,
+ XFS_DABUF_MAP_HOLE_OK, &bp);
if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
&error))
goto out;
@@ -233,38 +246,37 @@
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out_relse;
- dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
+ dent = bp->b_addr + off;
/* Make sure we got a real directory entry. */
- p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr);
- endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr);
- if (!endp) {
+ iter_off = geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ if (!end) {
xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
goto out_relse;
}
- while (p < endp) {
- struct xfs_dir2_data_entry *dep;
- struct xfs_dir2_data_unused *dup;
+ for (;;) {
+ struct xfs_dir2_data_entry *dep = bp->b_addr + iter_off;
+ struct xfs_dir2_data_unused *dup = bp->b_addr + iter_off;
- dup = (struct xfs_dir2_data_unused *)p;
+ if (iter_off >= end) {
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
+
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- p += be16_to_cpu(dup->length);
+ iter_off += be16_to_cpu(dup->length);
continue;
}
- dep = (struct xfs_dir2_data_entry *)p;
if (dep == dent)
break;
- p += mp->m_dir_inode_ops->data_entsize(dep->namelen);
- }
- if (p >= endp) {
- xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
- goto out_relse;
+ iter_off += xfs_dir2_data_entsize(mp, dep->namelen);
}
/* Retrieve the entry, sanity check it, and compare hashes. */
ino = be64_to_cpu(dent->inumber);
hash = be32_to_cpu(ent->hashval);
- tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent));
+ tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent));
if (!xfs_verify_dir_ino(mp, ino) || tag != off)
xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
if (dent->namelen == 0) {
@@ -322,19 +334,15 @@
struct xfs_buf *bp;
struct xfs_dir2_data_free *bf;
struct xfs_mount *mp = sc->mp;
- const struct xfs_dir_ops *d_ops;
- char *ptr;
- char *endptr;
u16 tag;
unsigned int nr_bestfrees = 0;
unsigned int nr_frees = 0;
unsigned int smallest_bestfree;
int newlen;
- int offset;
+ unsigned int offset;
+ unsigned int end;
int error;
- d_ops = sc->ip->d_ops;
-
if (is_block) {
/* dir block format */
if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
@@ -342,7 +350,7 @@
error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
} else {
/* dir data format */
- error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp);
+ error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp);
}
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -354,7 +362,7 @@
goto out_buf;
/* Do the bestfrees correspond to actual free space? */
- bf = d_ops->data_bestfree_p(bp->b_addr);
+ bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr);
smallest_bestfree = UINT_MAX;
for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
offset = be16_to_cpu(dfp->offset);
@@ -364,13 +372,13 @@
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
- dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset);
+ dup = bp->b_addr + offset;
tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
/* bestfree doesn't match the entry it points at? */
if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
- tag != ((char *)dup - (char *)bp->b_addr)) {
+ tag != offset) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
@@ -386,30 +394,30 @@
}
/* Make sure the bestfrees are actually the best free spaces. */
- ptr = (char *)d_ops->data_entry_p(bp->b_addr);
- endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr);
+ offset = mp->m_dir_geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr);
/* Iterate the entries, stopping when we hit or go past the end. */
- while (ptr < endptr) {
- dup = (struct xfs_dir2_data_unused *)ptr;
+ while (offset < end) {
+ dup = bp->b_addr + offset;
+
/* Skip real entries */
if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
- struct xfs_dir2_data_entry *dep;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
- dep = (struct xfs_dir2_data_entry *)ptr;
- newlen = d_ops->data_entsize(dep->namelen);
+ newlen = xfs_dir2_data_entsize(mp, dep->namelen);
if (newlen <= 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
lblk);
goto out_buf;
}
- ptr += newlen;
+ offset += newlen;
continue;
}
/* Spot check this free entry */
tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
- if (tag != ((char *)dup - (char *)bp->b_addr)) {
+ if (tag != offset) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
@@ -428,13 +436,13 @@
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
- ptr += newlen;
- if (ptr <= endptr)
+ offset += newlen;
+ if (offset <= end)
nr_frees++;
}
/* We're required to fill all the space. */
- if (ptr != endptr)
+ if (offset != end)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
/* Did we see at least as many free slots as there are bestfrees? */
@@ -461,7 +469,7 @@
{
struct xfs_dir2_data_free *dfp;
- dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr);
+ dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr);
if (len != be16_to_cpu(dfp->length))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
@@ -478,12 +486,10 @@
xfs_dablk_t lblk)
{
struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir2_leaf_tail *ltp;
struct xfs_dir2_leaf *leaf;
struct xfs_buf *dbp;
struct xfs_buf *bp;
- const struct xfs_dir_ops *d_ops = sc->ip->d_ops;
struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
__be16 *bestp;
__u16 best;
@@ -495,14 +501,13 @@
int error;
/* Read the free space block. */
- error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp);
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
- goto out;
+ return error;
xchk_buffer_recheck(sc, bp);
leaf = bp->b_addr;
- d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
bestcount = be32_to_cpu(ltp->bestcount);
bestp = xfs_dir2_leaf_bests_p(ltp);
@@ -524,24 +529,25 @@
}
/* Is the leaf count even remotely sane? */
- if (leafhdr.count > d_ops->leaf_max_ents(geo)) {
+ if (leafhdr.count > geo->leaf_max_ents) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
/* Leaves and bests don't overlap in leaf format. */
- if ((char *)&ents[leafhdr.count] > (char *)bestp) {
+ if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
/* Check hash value order, count stale entries. */
for (i = 0; i < leafhdr.count; i++) {
- hash = be32_to_cpu(ents[i].hashval);
+ hash = be32_to_cpu(leafhdr.ents[i].hashval);
if (i > 0 && lasthash > hash)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
lasthash = hash;
- if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (leafhdr.ents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
if (leafhdr.stale != stale)
@@ -552,19 +558,33 @@
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
best = be16_to_cpu(*bestp);
- if (best == NULLDATAOFF)
- continue;
error = xfs_dir3_data_read(sc->tp, sc->ip,
- i * args->geo->fsbcount, -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, i),
+ XFS_DABUF_MAP_HOLE_OK,
+ &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
- xchk_directory_check_freesp(sc, lblk, dbp, best);
+
+ if (!dbp) {
+ if (best != NULLDATAOFF) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ lblk);
+ break;
+ }
+ continue;
+ }
+
+ if (best == NULLDATAOFF)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ else
+ xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ break;
}
out:
+ xfs_trans_brelse(sc->tp, bp);
return error;
}
@@ -578,7 +598,6 @@
struct xfs_dir3_icfree_hdr freehdr;
struct xfs_buf *dbp;
struct xfs_buf *bp;
- __be16 *bestp;
__u16 best;
unsigned int stale = 0;
int i;
@@ -587,7 +606,7 @@
/* Read the free space block */
error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
- goto out;
+ return error;
xchk_buffer_recheck(sc, bp);
if (xfs_sb_version_hascrc(&sc->mp->m_sb)) {
@@ -598,20 +617,19 @@
}
/* Check all the entries. */
- sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr);
- bestp = sc->ip->d_ops->free_bests_p(bp->b_addr);
- for (i = 0; i < freehdr.nvalid; i++, bestp++) {
- best = be16_to_cpu(*bestp);
+ xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr);
+ for (i = 0; i < freehdr.nvalid; i++) {
+ best = be16_to_cpu(freehdr.bests[i]);
if (best == NULLDATAOFF) {
stale++;
continue;
}
error = xfs_dir3_data_read(sc->tp, sc->ip,
(freehdr.firstdb + i) * args->geo->fsbcount,
- -1, &dbp);
+ 0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- break;
+ goto out;
xchk_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
}
@@ -619,6 +637,7 @@
if (freehdr.nused + stale != freehdr.nvalid)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
out:
+ xfs_trans_brelse(sc->tp, bp);
return error;
}
@@ -629,7 +648,7 @@
{
struct xfs_bmbt_irec got;
struct xfs_da_args args;
- struct xfs_ifork *ifp;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
xfs_fileoff_t free_lblk;
@@ -641,11 +660,10 @@
int error;
/* Ignore local format directories. */
- if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
- sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_format != XFS_DINODE_FMT_BTREE)
return 0;
- ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 98f82d7..ec2064e 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -83,9 +83,6 @@
error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
if (error)
break;
- error = -ENOMEM;
- if (!agf_bp || !agi_bp)
- break;
/*
* These are supposed to be initialized by the header read
@@ -104,7 +101,7 @@
pag = NULL;
error = 0;
- if (fatal_signal_pending(current))
+ if (xchk_should_terminate(sc, &error))
break;
}
@@ -163,6 +160,7 @@
uint64_t delayed;
xfs_agnumber_t agno;
int tries = 8;
+ int error = 0;
retry:
fsc->icount = 0;
@@ -196,10 +194,13 @@
xfs_perag_put(pag);
- if (fatal_signal_pending(current))
+ if (xchk_should_terminate(sc, &error))
break;
}
+ if (error)
+ return error;
+
/*
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index b2f6028..83d27cd 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -11,6 +11,7 @@
#include "xfs_sb.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
+#include "scrub/health.h"
/*
* Scrub and In-Core Filesystem Health Assessments
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 6817587..6517d67 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -104,7 +104,7 @@
xfs_extlen_t len)
{
struct xfs_mount *mp = bs->cur->bc_mp;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t bno;
bno = XFS_AGINO_TO_AGBNO(mp, agino);
@@ -164,7 +164,7 @@
* the record, compute which fs inode we're talking about.
*/
agino = irec->ir_startino + irec_ino;
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino);
irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
@@ -215,7 +215,7 @@
struct xfs_dinode *dip;
struct xfs_buf *cluster_bp;
unsigned int nr_inodes;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t agbno;
unsigned int cluster_index;
uint16_t cluster_mask = 0;
@@ -278,8 +278,7 @@
&XFS_RMAP_OINFO_INODES);
/* Grab the inode cluster buffer. */
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
- 0, 0);
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, 0);
if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
return error;
@@ -426,7 +425,7 @@
struct xchk_iallocbt *iabt = bs->private;
struct xfs_inobt_rec_incore irec;
uint64_t holes;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agino_t agino;
xfs_extlen_t len;
int holecount;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 1bea029..bb25ff1 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -189,11 +189,30 @@
if ((flags2 & XFS_DIFLAG2_DAX) && (flags2 & XFS_DIFLAG2_REFLINK))
goto bad;
+ /* no bigtime iflag without the bigtime feature */
+ if (xfs_dinode_has_bigtime(dip) &&
+ !xfs_sb_version_hasbigtime(&mp->m_sb))
+ goto bad;
+
return;
bad:
xchk_ino_set_corrupt(sc, ino);
}
+static inline void
+xchk_dinode_nsec(
+ struct xfs_scrub *sc,
+ xfs_ino_t ino,
+ struct xfs_dinode *dip,
+ const xfs_timestamp_t ts)
+{
+ struct timespec64 tv;
+
+ tv = xfs_inode_from_disk_ts(dip, ts);
+ if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC)
+ xchk_ino_set_corrupt(sc, ino);
+}
+
/* Scrub all the ondisk inode fields. */
STATIC void
xchk_dinode(
@@ -292,12 +311,9 @@
}
/* di_[amc]time.nsec */
- if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
- if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_atime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_mtime);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_ctime);
/*
* di_size. xfs_dinode_verify checks for things that screw up
@@ -402,8 +418,7 @@
}
if (dip->di_version >= 3) {
- if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC)
- xchk_ino_set_corrupt(sc, ino);
+ xchk_dinode_nsec(sc, ino, dip, dip->di_crtime);
xchk_inode_flags2(sc, dip, ino, mode, flags, flags2);
xchk_inode_cowextsize(sc, dip, ino, mode, flags,
flags2);
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index c962bd5..855aa8b 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -32,8 +32,10 @@
struct xchk_parent_ctx {
struct dir_context dc;
+ struct xfs_scrub *sc;
xfs_ino_t ino;
xfs_nlink_t nlink;
+ bool cancelled;
};
/* Look for a single entry in a directory pointing to an inode. */
@@ -47,11 +49,21 @@
unsigned type)
{
struct xchk_parent_ctx *spc;
+ int error = 0;
spc = container_of(dc, struct xchk_parent_ctx, dc);
if (spc->ino == ino)
spc->nlink++;
- return 0;
+
+ /*
+ * If we're facing a fatal signal, bail out. Store the cancellation
+ * status separately because the VFS readdir code squashes error codes
+ * into short directory reads.
+ */
+ if (xchk_should_terminate(spc->sc, &error))
+ spc->cancelled = true;
+
+ return error;
}
/* Count the number of dentries in the parent dir that point to this inode. */
@@ -62,10 +74,9 @@
xfs_nlink_t *nlink)
{
struct xchk_parent_ctx spc = {
- .dc.actor = xchk_parent_actor,
- .dc.pos = 0,
- .ino = sc->ip->i_ino,
- .nlink = 0,
+ .dc.actor = xchk_parent_actor,
+ .ino = sc->ip->i_ino,
+ .sc = sc,
};
size_t bufsize;
loff_t oldpos;
@@ -79,8 +90,8 @@
* if there is one.
*/
lock_mode = xfs_ilock_data_map_shared(parent);
- if (parent->i_d.di_nextents > 0)
- error = xfs_dir3_data_readahead(parent, 0, -1);
+ if (parent->i_df.if_nextents > 0)
+ error = xfs_dir3_data_readahead(parent, 0, 0);
xfs_iunlock(parent, lock_mode);
if (error)
return error;
@@ -97,6 +108,10 @@
error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
if (error)
goto out;
+ if (spc.cancelled) {
+ error = -EAGAIN;
+ goto out;
+ }
if (oldpos == spc.dc.pos)
break;
oldpos = spc.dc.pos;
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 0a33b44..e34ca20 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -18,17 +18,17 @@
#include "scrub/common.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline uint
+static inline xfs_dqtype_t
xchk_quota_to_dqtype(
struct xfs_scrub *sc)
{
switch (sc->sm->sm_type) {
case XFS_SCRUB_TYPE_UQUOTA:
- return XFS_DQ_USER;
+ return XFS_DQTYPE_USER;
case XFS_SCRUB_TYPE_GQUOTA:
- return XFS_DQ_GROUP;
+ return XFS_DQTYPE_GROUP;
case XFS_SCRUB_TYPE_PQUOTA:
- return XFS_DQ_PROJ;
+ return XFS_DQTYPE_PROJ;
default:
return 0;
}
@@ -40,7 +40,7 @@
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- uint dqtype;
+ xfs_dqtype_t dqtype;
int error;
if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
@@ -73,52 +73,29 @@
STATIC int
xchk_quota_item(
struct xfs_dquot *dq,
- uint dqtype,
+ xfs_dqtype_t dqtype,
void *priv)
{
struct xchk_quota_info *sqi = priv;
struct xfs_scrub *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
- struct xfs_disk_dquot *d = &dq->q_core;
struct xfs_quotainfo *qi = mp->m_quotainfo;
xfs_fileoff_t offset;
- unsigned long long bsoft;
- unsigned long long isoft;
- unsigned long long rsoft;
- unsigned long long bhard;
- unsigned long long ihard;
- unsigned long long rhard;
- unsigned long long bcount;
- unsigned long long icount;
- unsigned long long rcount;
xfs_ino_t fs_icount;
- xfs_dqid_t id = be32_to_cpu(d->d_id);
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
/*
* Except for the root dquot, the actual dquot we got must either have
* the same or higher id as we saw before.
*/
- offset = id / qi->qi_dqperchunk;
- if (id && id <= sqi->last_id)
+ offset = dq->q_id / qi->qi_dqperchunk;
+ if (dq->q_id && dq->q_id <= sqi->last_id)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- sqi->last_id = id;
-
- /* Did we get the dquot type we wanted? */
- if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-
- if (d->d_pad0 != cpu_to_be32(0) || d->d_pad != cpu_to_be16(0))
- xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-
- /* Check the limits. */
- bhard = be64_to_cpu(d->d_blk_hardlimit);
- ihard = be64_to_cpu(d->d_ino_hardlimit);
- rhard = be64_to_cpu(d->d_rtb_hardlimit);
-
- bsoft = be64_to_cpu(d->d_blk_softlimit);
- isoft = be64_to_cpu(d->d_ino_softlimit);
- rsoft = be64_to_cpu(d->d_rtb_softlimit);
+ sqi->last_id = dq->q_id;
/*
* Warn if the hard limits are larger than the fs.
@@ -128,25 +105,22 @@
* Complain about corruption if the soft limit is greater than
* the hard limit.
*/
- if (bhard > mp->m_sb.sb_dblocks)
+ if (dq->q_blk.hardlimit > mp->m_sb.sb_dblocks)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (bsoft > bhard)
+ if (dq->q_blk.softlimit > dq->q_blk.hardlimit)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- if (ihard > M_IGEO(mp)->maxicount)
+ if (dq->q_ino.hardlimit > M_IGEO(mp)->maxicount)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (isoft > ihard)
+ if (dq->q_ino.softlimit > dq->q_ino.hardlimit)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- if (rhard > mp->m_sb.sb_rblocks)
+ if (dq->q_rtb.hardlimit > mp->m_sb.sb_rblocks)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (rsoft > rhard)
+ if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/* Check the resource counts. */
- bcount = be64_to_cpu(d->d_bcount);
- icount = be64_to_cpu(d->d_icount);
- rcount = be64_to_cpu(d->d_rtbcount);
fs_icount = percpu_counter_sum(&mp->m_icount);
/*
@@ -155,15 +129,15 @@
* if there are no quota limits.
*/
if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- if (mp->m_sb.sb_dblocks < bcount)
+ if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_warning(sc, XFS_DATA_FORK,
offset);
} else {
- if (mp->m_sb.sb_dblocks < bcount)
+ if (mp->m_sb.sb_dblocks < dq->q_blk.count)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
offset);
}
- if (icount > fs_icount || rcount > mp->m_sb.sb_rblocks)
+ if (dq->q_ino.count > fs_icount || dq->q_rtb.count > mp->m_sb.sb_rblocks)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
/*
@@ -171,13 +145,25 @@
* lower limit than the actual usage. However, we flag it for
* admin review.
*/
- if (id != 0 && bhard != 0 && bcount > bhard)
+ if (dq->q_id == 0)
+ goto out;
+
+ if (dq->q_blk.hardlimit != 0 &&
+ dq->q_blk.count > dq->q_blk.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (id != 0 && ihard != 0 && icount > ihard)
+
+ if (dq->q_ino.hardlimit != 0 &&
+ dq->q_ino.count > dq->q_ino.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
- if (id != 0 && rhard != 0 && rcount > rhard)
+
+ if (dq->q_rtb.hardlimit != 0 &&
+ dq->q_rtb.count > dq->q_rtb.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+out:
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -EFSCORRUPTED;
+
return 0;
}
@@ -228,7 +214,7 @@
struct xchk_quota_info sqi;
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
- uint dqtype;
+ xfs_dqtype_t dqtype;
int error = 0;
dqtype = xchk_quota_to_dqtype(sc);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 5c6b71b..dd672e6 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -334,7 +334,7 @@
{
struct xfs_mount *mp = bs->cur->bc_mp;
xfs_agblock_t *cow_blocks = bs->private;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
xfs_agblock_t bno;
xfs_extlen_t len;
xfs_nlink_t refcount;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index b70a88b..25e86c7 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -208,8 +208,10 @@
/* Now grab the block counters from the AGF. */
error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
if (!error) {
- aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
- freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks);
+ struct xfs_agf *agf = bp->b_addr;
+
+ aglen = be32_to_cpu(agf->agf_length);
+ freelen = be32_to_cpu(agf->agf_freeblks);
usedlen = aglen - freelen;
xfs_buf_relse(bp);
}
@@ -341,13 +343,17 @@
struct xfs_trans *tp = sc->tp;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *bp;
+ int error;
trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
XFS_FSB_TO_AGBNO(mp, fsb), btnum);
ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
- XFS_FSB_TO_BB(mp, 1), 0);
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
+ &bp);
+ if (error)
+ return error;
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
@@ -430,10 +436,10 @@
int
xrep_invalidate_blocks(
struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap)
+ struct xbitmap *bitmap)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
struct xfs_buf *bp;
xfs_fsblock_t fsbno;
@@ -445,7 +451,7 @@
* because we never own those; and if we can't TRYLOCK the buffer we
* assume it's owned by someone else.
*/
- for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) {
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
/* Skip AG headers and post-EOFS blocks */
if (!xfs_verify_fsbno(sc->mp, fsbno))
continue;
@@ -542,8 +548,6 @@
error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
if (error)
return error;
- if (!agf_bp)
- return -ENOMEM;
} else {
agf_bp = sc->sa.agf_bp;
}
@@ -593,18 +597,18 @@
int
xrep_reap_extents(
struct xfs_scrub *sc,
- struct xfs_bitmap *bitmap,
+ struct xbitmap *bitmap,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type)
{
- struct xfs_bitmap_range *bmr;
- struct xfs_bitmap_range *n;
+ struct xbitmap_range *bmr;
+ struct xbitmap_range *n;
xfs_fsblock_t fsbno;
int error = 0;
ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
- for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) {
+ for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
ASSERT(sc->ip != NULL ||
XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno);
trace_xrep_dispose_btree_extent(sc->mp,
@@ -613,11 +617,9 @@
error = xrep_reap_block(sc, fsbno, oinfo, type);
if (error)
- goto out;
+ break;
}
-out:
- xfs_bitmap_destroy(bitmap);
return error;
}
@@ -877,7 +879,7 @@
ri.sc = sc;
ri.btree_info = btree_info;
- ri.agf = XFS_BUF_TO_AGF(agf_bp);
+ ri.agf = agf_bp->b_addr;
ri.agfl_bp = agfl_bp;
for (fab = btree_info; fab->buf_ops; fab++) {
ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
@@ -897,11 +899,11 @@
void
xrep_force_quotacheck(
struct xfs_scrub *sc,
- uint dqtype)
+ xfs_dqtype_t type)
{
uint flag;
- flag = xfs_quota_chkd_flag(dqtype);
+ flag = xfs_quota_chkd_flag(type);
if (!(flag & sc->mp->m_qflags))
return;
@@ -937,11 +939,11 @@
"inode %llu repair encountered quota error %d, quotacheck forced.",
(unsigned long long)sc->ip->i_ino, error);
if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
- xrep_force_quotacheck(sc, XFS_DQ_USER);
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
- xrep_force_quotacheck(sc, XFS_DQ_GROUP);
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
- xrep_force_quotacheck(sc, XFS_DQ_PROJ);
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
/* fall through */
case -ESRCH:
error = 0;
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 60c61d7..fe77de0 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -6,6 +6,8 @@
#ifndef __XFS_SCRUB_REPAIR_H__
#define __XFS_SCRUB_REPAIR_H__
+#include "xfs_quota_defs.h"
+
static inline int xrep_notsupported(struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
@@ -28,11 +30,11 @@
struct xfs_buf **bpp, xfs_btnum_t btnum,
const struct xfs_buf_ops *ops);
-struct xfs_bitmap;
+struct xbitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
-int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist);
-int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist,
+int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
+int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
@@ -49,7 +51,7 @@
int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
-void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype);
+void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
/* Metadata repairers */
@@ -75,7 +77,6 @@
xrep_calc_ag_resblks(
struct xfs_scrub *sc)
{
- ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR));
return 0;
}
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8d4cefd..f4fcb47 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -92,7 +92,7 @@
{
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_rmap_irec irec;
- xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
+ xfs_agnumber_t agno = bs->cur->bc_ag.agno;
bool non_inode;
bool is_unwritten;
bool is_bmbt;
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index c642bc2..76e4ffe 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -13,6 +13,7 @@
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
+#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -58,6 +59,41 @@
return 0;
}
+/* Make sure the entire rtbitmap file is mapped with written extents. */
+STATIC int
+xchk_rtbitmap_check_extents(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_bmbt_irec map;
+ xfs_rtblock_t off;
+ int nmap;
+ int error = 0;
+
+ for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+ if (xchk_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ break;
+
+ /* Make sure we have a written extent. */
+ nmap = 1;
+ error = xfs_bmapi_read(mp->m_rbmip, off,
+ mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+ XFS_DATA_FORK);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ break;
+
+ if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ break;
+ }
+
+ off += map.br_blockcount;
+ }
+
+ return error;
+}
+
/* Scrub the realtime bitmap. */
int
xchk_rtbitmap(
@@ -65,11 +101,22 @@
{
int error;
+ /* Is the size of the rtbitmap correct? */
+ if (sc->mp->m_rbmip->i_d.di_size !=
+ XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
+ xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+ return 0;
+ }
+
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
+ error = xchk_rtbitmap_check_extents(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 720bef5..8ebf35b 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -16,6 +16,7 @@
#include "xfs_qm.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 5641ae5..c08be5e 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -22,7 +22,7 @@
struct xfs_inode *ip)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kmem_zalloc_large(XFS_SYMLINK_MAXLEN + 1, 0);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 9eaab2e..2c6c248 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -24,9 +24,9 @@
return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn);
else if (level == cur->bc_nlevels - 1 &&
cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino);
+ return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
- return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0);
+ return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0);
return NULLFSBLOCK;
}
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 3362bae..e46f5ce 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -329,7 +329,7 @@
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
- __field(int, ptr);
+ __field(int, ptr)
__field(int, error)
__field(void *, ret_ip)
),
@@ -379,7 +379,7 @@
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = sc->ip->i_ino;
- __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
@@ -414,7 +414,7 @@
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
- __field(int, ptr);
+ __field(int, ptr)
__field(void *, ret_ip)
),
TP_fast_assign(
@@ -452,14 +452,14 @@
__field(int, level)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, bno)
- __field(int, ptr);
+ __field(int, ptr)
__field(void *, ret_ip)
),
TP_fast_assign(
xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
__entry->dev = sc->mp->m_super->s_dev;
__entry->ino = sc->ip->i_ino;
- __entry->whichfork = cur->bc_private.b.whichfork;
+ __entry->whichfork = cur->bc_ino.whichfork;
__entry->type = sc->sm->sm_type;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 96d7071..c544951 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -12,8 +12,12 @@
#include "xfs_inode.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
-#include <linux/posix_acl_xattr.h>
+#include "xfs_error.h"
+#include "xfs_acl.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include <linux/posix_acl_xattr.h>
/*
* Locking scheme:
@@ -23,6 +27,7 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
+ struct xfs_mount *mp,
const struct xfs_acl *aclp,
int len,
int max_entries)
@@ -32,11 +37,18 @@
const struct xfs_acl_entry *ace;
unsigned int count, i;
- if (len < sizeof(*aclp))
+ if (len < sizeof(*aclp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
return ERR_PTR(-EFSCORRUPTED);
+ }
+
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries || XFS_ACL_SIZE(count) != len)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
return ERR_PTR(-EFSCORRUPTED);
+ }
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
@@ -57,10 +69,12 @@
switch (acl_e->e_tag) {
case ACL_USER:
- acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+ acl_e->e_uid = make_kuid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_GROUP:
- acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
+ acl_e->e_gid = make_kgid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
@@ -93,10 +107,12 @@
ace->ae_tag = cpu_to_be32(acl_e->e_tag);
switch (acl_e->e_tag) {
case ACL_USER:
- ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+ ace->ae_id = cpu_to_be32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
break;
case ACL_GROUP:
- ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+ ace->ae_id = cpu_to_be32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
break;
default:
ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
@@ -110,99 +126,86 @@
struct posix_acl *
xfs_get_acl(struct inode *inode, int type)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct posix_acl *acl = NULL;
- struct xfs_acl *xfs_acl = NULL;
- unsigned char *ea_name;
- int error;
- int len;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct posix_acl *acl = NULL;
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ .valuelen = XFS_ACL_MAX_SIZE(mp),
+ };
+ int error;
trace_xfs_get_acl(ip);
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = SGI_ACL_FILE;
+ args.name = SGI_ACL_FILE;
break;
case ACL_TYPE_DEFAULT:
- ea_name = SGI_ACL_DEFAULT;
+ args.name = SGI_ACL_DEFAULT;
break;
default:
BUG();
}
+ args.namelen = strlen(args.name);
/*
- * If we have a cached ACLs value just return it, not need to
- * go out to the disk.
+ * If the attribute doesn't exist make sure we have a negative cache
+ * entry, for any other error assume it is transient.
*/
- len = XFS_ACL_MAX_SIZE(ip->i_mount);
- error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len,
- ATTR_ALLOC | ATTR_ROOT);
- if (error) {
- /*
- * If the attribute doesn't exist make sure we have a negative
- * cache entry, for any other error assume it is transient.
- */
- if (error != -ENOATTR)
- acl = ERR_PTR(error);
- } else {
- acl = xfs_acl_from_disk(xfs_acl, len,
- XFS_ACL_MAX_ENTRIES(ip->i_mount));
- kmem_free(xfs_acl);
+ error = xfs_attr_get(&args);
+ if (!error) {
+ acl = xfs_acl_from_disk(mp, args.value, args.valuelen,
+ XFS_ACL_MAX_ENTRIES(mp));
+ } else if (error != -ENOATTR) {
+ acl = ERR_PTR(error);
}
+
+ kmem_free(args.value);
return acl;
}
int
__xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct xfs_inode *ip = XFS_I(inode);
- unsigned char *ea_name;
- int error;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_ROOT,
+ };
+ int error;
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = SGI_ACL_FILE;
+ args.name = SGI_ACL_FILE;
break;
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
- ea_name = SGI_ACL_DEFAULT;
+ args.name = SGI_ACL_DEFAULT;
break;
default:
return -EINVAL;
}
+ args.namelen = strlen(args.name);
if (acl) {
- struct xfs_acl *xfs_acl;
- int len = XFS_ACL_MAX_SIZE(ip->i_mount);
-
- xfs_acl = kmem_zalloc_large(len, 0);
- if (!xfs_acl)
+ args.valuelen = XFS_ACL_SIZE(acl->a_count);
+ args.value = kvzalloc(args.valuelen, GFP_KERNEL);
+ if (!args.value)
return -ENOMEM;
-
- xfs_acl_to_disk(xfs_acl, acl);
-
- /* subtract away the unused acl entries */
- len -= sizeof(struct xfs_acl_entry) *
- (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
-
- error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
- len, ATTR_ROOT);
-
- kmem_free(xfs_acl);
- } else {
- /*
- * A NULL ACL argument means we want to remove the ACL.
- */
- error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
-
- /*
- * If the attribute didn't exist to start with that's fine.
- */
- if (error == -ENOATTR)
- error = 0;
+ xfs_acl_to_disk(args.value, acl);
}
+ error = xfs_attr_set(&args);
+ kmem_free(args.value);
+
+ /*
+ * If the attribute didn't exist to start with that's fine.
+ */
+ if (!acl && error == -ENOATTR)
+ error = 0;
if (!error)
set_cached_acl(inode, type, acl);
return error;
@@ -262,3 +265,19 @@
return error;
}
+
+/*
+ * Invalidate any cached ACLs if the user has bypassed the ACL interface.
+ * We don't validate the content whatsoever so it is caller responsibility to
+ * provide data in valid format and ensure i_mode is consistent.
+ */
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name)
+{
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 94615e3..c042c08 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -13,14 +13,16 @@
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+void xfs_forget_acl(struct inode *inode, const char *name);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
# define xfs_set_acl NULL
+static inline void xfs_forget_acl(struct inode *inode, const char *name)
+{
+}
#endif /* CONFIG_XFS_POSIX_ACL */
-extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
-
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f16d5f1..4304c64 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -18,108 +18,22 @@
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
-/*
- * structure owned by writepages passed to individual writepage calls
- */
struct xfs_writepage_ctx {
- struct xfs_bmbt_irec imap;
- int fork;
+ struct iomap_writepage_ctx ctx;
unsigned int data_seq;
unsigned int cow_seq;
- struct xfs_ioend *ioend;
};
-struct block_device *
-xfs_find_bdev_for_inode(
- struct inode *inode)
+static inline struct xfs_writepage_ctx *
+XFS_WPC(struct iomap_writepage_ctx *ctx)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return mp->m_rtdev_targp->bt_bdev;
- else
- return mp->m_ddev_targp->bt_bdev;
-}
-
-struct dax_device *
-xfs_find_daxdev_for_inode(
- struct inode *inode)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return mp->m_rtdev_targp->bt_daxdev;
- else
- return mp->m_ddev_targp->bt_daxdev;
-}
-
-static void
-xfs_finish_page_writeback(
- struct inode *inode,
- struct bio_vec *bvec,
- int error)
-{
- struct iomap_page *iop = to_iomap_page(bvec->bv_page);
-
- if (error) {
- SetPageError(bvec->bv_page);
- mapping_set_error(inode->i_mapping, -EIO);
- }
-
- ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
- ASSERT(!iop || atomic_read(&iop->write_count) > 0);
-
- if (!iop || atomic_dec_and_test(&iop->write_count))
- end_page_writeback(bvec->bv_page);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
- struct xfs_ioend *ioend,
- int error)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- bool quiet = bio_flagged(bio, BIO_QUIET);
-
- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here.
- */
- if (bio == last)
- next = NULL;
- else
- next = bio->bi_private;
-
- /* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, iter_all)
- xfs_finish_page_writeback(inode, bvec, error);
- bio_put(bio);
- }
-
- if (unlikely(error && !quiet)) {
- xfs_err_ratelimited(XFS_I(inode)->i_mount,
- "writeback error on sector %llu", start);
- }
+ return container_of(ctx, struct xfs_writepage_ctx, ctx);
}
/*
* Fast and loose check if this write could update the on-disk inode size.
*/
-static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
return ioend->io_offset + ioend->io_size >
XFS_I(ioend->io_inode)->i_d.di_size;
@@ -127,7 +41,7 @@
STATIC int
xfs_setfilesize_trans_alloc(
- struct xfs_ioend *ioend)
+ struct iomap_ioend *ioend)
{
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
struct xfs_trans *tp;
@@ -137,7 +51,7 @@
if (error)
return error;
- ioend->io_append_trans = tp;
+ ioend->io_private = tp;
/*
* We may pass freeze protection with a transaction. So tell lockdep
@@ -200,11 +114,11 @@
STATIC int
xfs_setfilesize_ioend(
- struct xfs_ioend *ioend,
+ struct iomap_ioend *ioend,
int error)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_trans *tp = ioend->io_append_trans;
+ struct xfs_trans *tp = ioend->io_private;
/*
* The transaction may have been allocated in the I/O submission thread,
@@ -228,9 +142,8 @@
*/
STATIC void
xfs_end_ioend(
- struct xfs_ioend *ioend)
+ struct iomap_ioend *ioend)
{
- struct list_head ioend_list;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
@@ -257,7 +170,7 @@
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_fork == XFS_COW_FORK)
+ if (ioend->io_flags & IOMAP_F_SHARED)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
goto done;
}
@@ -265,154 +178,86 @@
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_fork == XFS_COW_FORK)
+ if (ioend->io_flags & IOMAP_F_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_state == XFS_EXT_UNWRITTEN)
+ else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
else
- ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
+ ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
done:
- if (ioend->io_append_trans)
+ if (ioend->io_private)
error = xfs_setfilesize_ioend(ioend, error);
- list_replace_init(&ioend->io_list, &ioend_list);
- xfs_destroy_ioend(ioend, error);
-
- while (!list_empty(&ioend_list)) {
- ioend = list_first_entry(&ioend_list, struct xfs_ioend,
- io_list);
- list_del_init(&ioend->io_list);
- xfs_destroy_ioend(ioend, error);
- }
-
+ iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-xfs_ioend_can_merge(
- struct xfs_ioend *ioend,
- struct xfs_ioend *next)
-{
- if (ioend->io_bio->bi_status != next->io_bio->bi_status)
- return false;
- if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
- return false;
- if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
- (next->io_state == XFS_EXT_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- return true;
-}
-
-/*
* If the to be merged ioend has a preallocated transaction for file
* size updates we need to ensure the ioend it is merged into also
* has one. If it already has one we can simply cancel the transaction
* as it is guaranteed to be clean.
*/
static void
-xfs_ioend_merge_append_transactions(
- struct xfs_ioend *ioend,
- struct xfs_ioend *next)
+xfs_ioend_merge_private(
+ struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
{
- if (!ioend->io_append_trans) {
- ioend->io_append_trans = next->io_append_trans;
- next->io_append_trans = NULL;
+ if (!ioend->io_private) {
+ ioend->io_private = next->io_private;
+ next->io_private = NULL;
} else {
xfs_setfilesize_ioend(next, -ECANCELED);
}
}
-/* Try to merge adjacent completions. */
-STATIC void
-xfs_ioend_try_merge(
- struct xfs_ioend *ioend,
- struct list_head *more_ioends)
-{
- struct xfs_ioend *next_ioend;
-
- while (!list_empty(more_ioends)) {
- next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
- io_list);
- if (!xfs_ioend_can_merge(ioend, next_ioend))
- break;
- list_move_tail(&next_ioend->io_list, &ioend->io_list);
- ioend->io_size += next_ioend->io_size;
- if (next_ioend->io_append_trans)
- xfs_ioend_merge_append_transactions(ioend, next_ioend);
- }
-}
-
-/* list_sort compare function for ioends */
-static int
-xfs_ioend_compare(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_ioend *ia;
- struct xfs_ioend *ib;
-
- ia = container_of(a, struct xfs_ioend, io_list);
- ib = container_of(b, struct xfs_ioend, io_list);
- if (ia->io_offset < ib->io_offset)
- return -1;
- else if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
/* Finish all pending io completions. */
void
xfs_end_io(
struct work_struct *work)
{
- struct xfs_inode *ip;
- struct xfs_ioend *ioend;
- struct list_head completion_list;
+ struct xfs_inode *ip =
+ container_of(work, struct xfs_inode, i_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head tmp;
unsigned long flags;
- ip = container_of(work, struct xfs_inode, i_ioend_work);
-
spin_lock_irqsave(&ip->i_ioend_lock, flags);
- list_replace_init(&ip->i_ioend_list, &completion_list);
+ list_replace_init(&ip->i_ioend_list, &tmp);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
- list_sort(NULL, &completion_list, xfs_ioend_compare);
-
- while (!list_empty(&completion_list)) {
- ioend = list_first_entry(&completion_list, struct xfs_ioend,
- io_list);
+ iomap_sort_ioends(&tmp);
+ while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+ io_list))) {
list_del_init(&ioend->io_list);
- xfs_ioend_try_merge(ioend, &completion_list);
+ iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
xfs_end_ioend(ioend);
}
}
+static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
+{
+ return ioend->io_private ||
+ ioend->io_type == IOMAP_UNWRITTEN ||
+ (ioend->io_flags & IOMAP_F_SHARED);
+}
+
STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct xfs_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = bio->bi_private;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
- if (ioend->io_fork == XFS_COW_FORK ||
- ioend->io_state == XFS_EXT_UNWRITTEN ||
- ioend->io_append_trans != NULL) {
- spin_lock_irqsave(&ip->i_ioend_lock, flags);
- if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
- &ip->i_ioend_work));
- list_add_tail(&ioend->io_list, &ip->i_ioend_list);
- spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
- } else
- xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
+ ASSERT(xfs_ioend_needs_workqueue(ioend));
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ if (list_empty(&ip->i_ioend_list))
+ WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ &ip->i_ioend_work));
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
/*
@@ -421,19 +266,19 @@
*/
static bool
xfs_imap_valid(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ loff_t offset)
{
- if (offset_fsb < wpc->imap.br_startoff ||
- offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length)
return false;
/*
* If this is a COW mapping, it is sufficient to check that the mapping
* covers the offset. Be careful to check this first because the caller
* can revalidate a COW mapping without updating the data seqno.
*/
- if (wpc->fork == XFS_COW_FORK)
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
return true;
/*
@@ -443,17 +288,17 @@
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
- if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
return false;
if (xfs_inode_has_cow_data(ip) &&
- wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
return false;
return true;
}
/*
* Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->imap.
+ * extent that maps offset_fsb in wpc->iomap.
*
* The current page is held locked so nothing could have removed the block
* backing offset_fsb, although it could have moved from the COW to the data
@@ -461,32 +306,38 @@
*/
static int
xfs_convert_blocks(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ int whichfork,
+ loff_t offset)
{
int error;
+ unsigned *seq;
+
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
/*
- * Attempt to allocate whatever delalloc extent currently backs
- * offset_fsb and put the result into wpc->imap. Allocate in a loop
- * because it may take several attempts to allocate real blocks for a
- * contiguous delalloc extent if free space is sufficiently fragmented.
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into wpc->iomap. Allocate in a loop because it
+ * may take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
*/
do {
- error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
- &wpc->imap, wpc->fork == XFS_COW_FORK ?
- &wpc->cow_seq : &wpc->data_seq);
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error)
return error;
- } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+ } while (wpc->iomap.offset + wpc->iomap.length <= offset);
return 0;
}
-STATIC int
+static int
xfs_map_blocks(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset)
{
@@ -495,7 +346,8 @@
ssize_t count = i_blocksize(inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_fileoff_t cow_fsb = NULLFILEOFF;
+ xfs_fileoff_t cow_fsb;
+ int whichfork;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
@@ -519,7 +371,7 @@
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- if (xfs_imap_valid(wpc, ip, offset_fsb))
+ if (xfs_imap_valid(wpc, ip, offset))
return 0;
/*
@@ -529,8 +381,10 @@
* landed in a hole and we skip the block.
*/
retry:
+ cow_fsb = NULLFILEOFF;
+ whichfork = XFS_DATA_FORK;
xfs_ilock(ip, XFS_ILOCK_SHARED);
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
/*
@@ -541,10 +395,10 @@
xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
cow_fsb = imap.br_startoff;
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
- wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
+ XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- wpc->fork = XFS_COW_FORK;
+ whichfork = XFS_COW_FORK;
goto allocate_blocks;
}
@@ -552,7 +406,7 @@
* No COW extent overlap. Revalidate now that we may have updated
* ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (xfs_imap_valid(wpc, ip, offset_fsb)) {
+ if (xfs_imap_valid(wpc, ip, offset)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -564,11 +418,9 @@
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
- wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
+ XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- wpc->fork = XFS_DATA_FORK;
-
/* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
@@ -592,11 +444,11 @@
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- wpc->imap = imap;
- trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+ trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ error = xfs_convert_blocks(wpc, ip, whichfork, offset);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
@@ -605,7 +457,7 @@
* the former case, but prevent additional retries to avoid
* looping forever for the latter case.
*/
- if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
goto retry;
ASSERT(error != -EAGAIN);
return error;
@@ -616,34 +468,22 @@
* original delalloc one. Trim the return extent to the next COW
* boundary again to force a re-lookup.
*/
- if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
- cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
- wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+ if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
+ loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
- ASSERT(wpc->imap.br_startoff <= offset_fsb);
- ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
+ if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
+ wpc->iomap.length = cow_offset - wpc->iomap.offset;
+ }
+
+ ASSERT(wpc->iomap.offset <= offset);
+ ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
+ trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
return 0;
}
-/*
- * Submit the bio for an ioend. We are passed an ioend with a bio attached to
- * it, and we submit that bio. The ioend may be used for multiple bio
- * submissions, so we only want to allocate an append transaction for the ioend
- * once. In the case of multiple bio submission, each bio will take an IO
- * reference to the ioend to ensure that the ioend completion is only done once
- * all bios have been submitted and the ioend is really done.
- *
- * If @status is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the bio and ioend
- * rather than submit it to IO. This typically only happens on a filesystem
- * shutdown.
- */
-STATIC int
-xfs_submit_ioend(
- struct writeback_control *wbc,
- struct xfs_ioend *ioend,
+static int
+xfs_prepare_ioend(
+ struct iomap_ioend *ioend,
int status)
{
unsigned int nofs_flag;
@@ -656,157 +496,24 @@
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && ioend->io_fork == XFS_COW_FORK) {
+ if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- (ioend->io_fork == XFS_COW_FORK ||
- ioend->io_state != XFS_EXT_UNWRITTEN) &&
+ ((ioend->io_flags & IOMAP_F_SHARED) ||
+ ioend->io_type != IOMAP_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
- !ioend->io_append_trans)
+ !ioend->io_private)
status = xfs_setfilesize_trans_alloc(ioend);
memalloc_nofs_restore(nofs_flag);
- ioend->io_bio->bi_private = ioend;
- ioend->io_bio->bi_end_io = xfs_end_bio;
-
- /*
- * If we are failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- if (status) {
- ioend->io_bio->bi_status = errno_to_blk_status(status);
- bio_endio(ioend->io_bio);
- return status;
- }
-
- submit_bio(ioend->io_bio);
- return 0;
-}
-
-static struct xfs_ioend *
-xfs_alloc_ioend(
- struct inode *inode,
- int fork,
- xfs_exntst_t state,
- xfs_off_t offset,
- struct block_device *bdev,
- sector_t sector,
- struct writeback_control *wbc)
-{
- struct xfs_ioend *ioend;
- struct bio *bio;
-
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- bio->bi_write_hint = inode->i_write_hint;
- wbc_init_bio(wbc, bio);
-
- ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_fork = fork;
- ioend->io_state = state;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = offset;
- ioend->io_append_trans = NULL;
- ioend->io_bio = bio;
- return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to do perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in xfs_destroy_ioend().
- */
-static struct bio *
-xfs_chain_bio(
- struct bio *prev)
-{
- struct bio *new;
-
- new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
- new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
-
- bio_chain(prev, new);
- bio_get(prev); /* for xfs_destroy_ioend */
- submit_bio(prev);
- return new;
-}
-
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first, otherwise finish off the current ioend and start another.
- */
-STATIC void
-xfs_add_to_ioend(
- struct inode *inode,
- xfs_off_t offset,
- struct page *page,
- struct iomap_page *iop,
- struct xfs_writepage_ctx *wpc,
- struct writeback_control *wbc,
- struct list_head *iolist)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct block_device *bdev = xfs_find_bdev_for_inode(inode);
- unsigned len = i_blocksize(inode);
- unsigned poff = offset & (PAGE_SIZE - 1);
- bool merged, same_page = false;
- sector_t sector;
-
- sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
- ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
-
- if (!wpc->ioend ||
- wpc->fork != wpc->ioend->io_fork ||
- wpc->imap.br_state != wpc->ioend->io_state ||
- sector != bio_end_sector(wpc->ioend->io_bio) ||
- offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
- wpc->imap.br_state, offset, bdev, sector, wbc);
- }
-
- merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
- &same_page);
-
- if (iop && !same_page)
- atomic_inc(&iop->write_count);
-
- if (!merged) {
- if (bio_full(wpc->ioend->io_bio, len))
- wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
- bio_add_page(wpc->ioend->io_bio, page, len, poff);
- }
-
- wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, page, len);
-}
-
-STATIC void
-xfs_vm_invalidatepage(
- struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
- iomap_invalidatepage(page, offset, length);
+ if (xfs_ioend_needs_workqueue(ioend))
+ ioend->io_bio->bi_end_io = xfs_end_bio;
+ return status;
}
/*
@@ -820,269 +527,39 @@
* transaction as there is no space left for block reservation (typically why we
* see a ENOSPC in writeback).
*/
-STATIC void
-xfs_aops_discard_page(
- struct page *page)
+static void
+xfs_discard_page(
+ struct page *page,
+ loff_t fileoff)
{
struct inode *inode = page->mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- loff_t offset = page_offset(page);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
+ unsigned int pageoff = offset_in_page(fileoff);
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff);
+ xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
int error;
if (XFS_FORCED_SHUTDOWN(mp))
goto out_invalidate;
- xfs_alert(mp,
+ xfs_alert_ratelimited(mp,
"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
- page, ip->i_ino, offset);
+ page, ip->i_ino, fileoff);
error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- PAGE_SIZE / i_blocksize(inode));
+ i_blocks_per_page(inode, page) - pageoff_fsb);
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
- xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
+ iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
}
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding blocks to is cached on the writepage context, and if the new block
- * does not append to the cached ioend it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-xfs_writepage_map(
- struct xfs_writepage_ctx *wpc,
- struct writeback_control *wbc,
- struct inode *inode,
- struct page *page,
- uint64_t end_offset)
-{
- LIST_HEAD(submit_list);
- struct iomap_page *iop = to_iomap_page(page);
- unsigned len = i_blocksize(inode);
- struct xfs_ioend *ioend, *next;
- uint64_t file_offset; /* file offset of page */
- int error = 0, count = 0, i;
-
- ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
- ASSERT(!iop || atomic_read(&iop->write_count) == 0);
-
- /*
- * Walk through the page to find areas to write back. If we run off the
- * end of the current map or find the current map invalid, grab a new
- * one.
- */
- for (i = 0, file_offset = page_offset(page);
- i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
- i++, file_offset += len) {
- if (iop && !test_bit(i, iop->uptodate))
- continue;
-
- error = xfs_map_blocks(wpc, inode, file_offset);
- if (error)
- break;
- if (wpc->imap.br_startblock == HOLESTARTBLOCK)
- continue;
- xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
- &submit_list);
- count++;
- }
-
- ASSERT(wpc->ioend || list_empty(&submit_list));
- ASSERT(PageLocked(page));
- ASSERT(!PageWriteback(page));
-
- /*
- * On error, we have to fail the ioend here because we may have set
- * pages under writeback, we have to make sure we run IO completion to
- * mark the error state of the IO appropriately, so we can't cancel the
- * ioend directly here. That means we have to mark this page as under
- * writeback if we included any blocks from it in the ioend chain so
- * that completion treats it correctly.
- *
- * If we didn't include the page in the ioend, the on error we can
- * simply discard and unlock it as there are no other users of the page
- * now. The caller will still need to trigger submission of outstanding
- * ioends on the writepage context so they are treated correctly on
- * error.
- */
- if (unlikely(error)) {
- if (!count) {
- xfs_aops_discard_page(page);
- ClearPageUptodate(page);
- unlock_page(page);
- goto done;
- }
-
- /*
- * If the page was not fully cleaned, we need to ensure that the
- * higher layers come back to it correctly. That means we need
- * to keep the page dirty, and for WB_SYNC_ALL writeback we need
- * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
- * so another attempt to write this page in this writeback sweep
- * will be made.
- */
- set_page_writeback_keepwrite(page);
- } else {
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
- }
-
- unlock_page(page);
-
- /*
- * Preserve the original error if there was one, otherwise catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = xfs_submit_ioend(wbc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- end_page_writeback(page);
-done:
- mapping_set_error(page->mapping, error);
- return error;
-}
-
-/*
- * Write out a dirty page.
- *
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
- * regular allocated space.
- */
-STATIC int
-xfs_do_writepage(
- struct page *page,
- struct writeback_control *wbc,
- void *data)
-{
- struct xfs_writepage_ctx *wpc = data;
- struct inode *inode = page->mapping->host;
- loff_t offset;
- uint64_t end_offset;
- pgoff_t end_index;
-
- trace_xfs_writepage(inode, page, 0, 0);
-
- /*
- * Refuse to write the page out if we are called from reclaim context.
- *
- * This avoids stack overflows when called from deeply used stacks in
- * random callers for direct reclaim or memcg reclaim. We explicitly
- * allow reclaim from kswapd as the stack usage there is relatively low.
- *
- * This should never happen except in the case of a VM regression so
- * warn about it.
- */
- if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC))
- goto redirty;
-
- /*
- * Given that we do not allow direct reclaim to call us, we should
- * never be called while in a filesystem transaction.
- */
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
- goto redirty;
-
- /*
- * Is this page beyond the end of the file?
- *
- * The page index is less than the end_index, adjust the end_offset
- * to the highest offset that this page should represent.
- * -----------------------------------------------------
- * | file mapping | <EOF> |
- * -----------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | |
- * ^--------------------------------^----------|--------
- * | desired writeback range | see else |
- * ---------------------------------^------------------|
- */
- offset = i_size_read(inode);
- end_index = offset >> PAGE_SHIFT;
- if (page->index < end_index)
- end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
- else {
- /*
- * Check whether the page to write out is beyond or straddles
- * i_size or not.
- * -------------------------------------------------------
- * | file mapping | <EOF> |
- * -------------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
- * ^--------------------------------^-----------|---------
- * | | Straddles |
- * ---------------------------------^-----------|--------|
- */
- unsigned offset_into_page = offset & (PAGE_SIZE - 1);
-
- /*
- * Skip the page if it is fully outside i_size, e.g. due to a
- * truncate operation that is in progress. We must redirty the
- * page so that reclaim stops reclaiming it. Otherwise
- * xfs_vm_releasepage() is called on it and gets confused.
- *
- * Note that the end_index is unsigned long, it would overflow
- * if the given offset is greater than 16TB on 32-bit system
- * and if we do check the page is fully outside i_size or not
- * via "if (page->index >= end_index + 1)" as "end_index + 1"
- * will be evaluated to 0. Hence this page will be redirtied
- * and be written out repeatedly which would result in an
- * infinite loop, the user program that perform this operation
- * will hang. Instead, we can verify this situation by checking
- * if the page to write is totally beyond the i_size or if it's
- * offset is just equal to the EOF.
- */
- if (page->index > end_index ||
- (page->index == end_index && offset_into_page == 0))
- goto redirty;
-
- /*
- * The page straddles i_size. It must be zeroed out on each
- * and every writepage invocation because it may be mmapped.
- * "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
- * memory is zeroed when mapped, and writes to that region are
- * not written out to the file."
- */
- zero_user_segment(page, offset_into_page, PAGE_SIZE);
-
- /* Adjust the end_offset to the end of file */
- end_offset = offset;
- }
-
- return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
-
-redirty:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+ .map_blocks = xfs_map_blocks,
+ .prepare_ioend = xfs_prepare_ioend,
+ .discard_page = xfs_discard_page,
+};
STATIC int
xfs_vm_writepage(
@@ -1090,12 +567,8 @@
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
- int ret;
- ret = xfs_do_writepage(page, wbc, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
@@ -1104,13 +577,9 @@
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
- int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
@@ -1118,18 +587,11 @@
struct address_space *mapping,
struct writeback_control *wbc)
{
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- return dax_writeback_mapping_range(mapping,
- xfs_find_bdev_for_inode(mapping->host), wbc);
-}
+ struct xfs_inode *ip = XFS_I(mapping->host);
-STATIC int
-xfs_vm_releasepage(
- struct page *page,
- gfp_t gfp_mask)
-{
- trace_xfs_releasepage(page->mapping->host, page, 0, 0);
- return iomap_releasepage(page, gfp_mask);
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return dax_writeback_mapping_range(mapping,
+ xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}
STATIC sector_t
@@ -1152,7 +614,7 @@
*/
if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
- return iomap_bmap(mapping, block, &xfs_iomap_ops);
+ return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}
STATIC int
@@ -1160,19 +622,14 @@
struct file *unused,
struct page *page)
{
- trace_xfs_vm_readpage(page->mapping->host, 1);
- return iomap_readpage(page, &xfs_iomap_ops);
+ return iomap_readpage(page, &xfs_read_iomap_ops);
}
-STATIC int
-xfs_vm_readpages(
- struct file *unused,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned nr_pages)
+STATIC void
+xfs_vm_readahead(
+ struct readahead_control *rac)
{
- trace_xfs_vm_readpages(mapping->host, nr_pages);
- return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
+ iomap_readahead(rac, &xfs_read_iomap_ops);
}
static int
@@ -1181,18 +638,19 @@
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
- return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
+ sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &xfs_read_iomap_ops);
}
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
- .readpages = xfs_vm_readpages,
+ .readahead = xfs_vm_readahead,
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = iomap_set_page_dirty,
- .releasepage = xfs_vm_releasepage,
- .invalidatepage = xfs_vm_invalidatepage,
+ .releasepage = iomap_releasepage,
+ .invalidatepage = iomap_invalidatepage,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 45a1ea2..e0bd684 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -6,29 +6,9 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct bio_set xfs_ioend_bioset;
-
-/*
- * Structure for buffered I/O completions.
- */
-struct xfs_ioend {
- struct list_head io_list; /* next ioend in chain */
- int io_fork; /* inode fork written back */
- xfs_exntst_t io_state; /* extent state */
- struct inode *io_inode; /* file being written to */
- size_t io_size; /* size of the extent */
- xfs_off_t io_offset; /* offset in the file */
- struct xfs_trans *io_append_trans;/* xact. for size update */
- struct bio *io_bio; /* bio being built */
- struct bio io_inline_bio; /* MUST BE LAST! */
-};
-
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
-extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
-extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
-
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index a640a28..bfad669 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -22,24 +22,21 @@
#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_dir2.h"
+#include "xfs_error.h"
/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
+ * Invalidate any incore buffers associated with this remote attribute value
+ * extent. We never log remote attribute value buffers, which means that they
+ * won't be attached to a transaction and are therefore safe to mark stale.
+ * The actual bunmapi will be taken care of later.
*/
STATIC int
-xfs_attr3_leaf_freextent(
- struct xfs_trans **trans,
+xfs_attr3_rmt_stale(
struct xfs_inode *dp,
xfs_dablk_t blkno,
int blkcnt)
{
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_dablk_t tblkno;
- xfs_daddr_t dblkno;
- int tblkcnt;
- int dblkcnt;
int nmap;
int error;
@@ -47,47 +44,29 @@
* Roll through the "value", invalidating the attribute value's
* blocks.
*/
- tblkno = blkno;
- tblkcnt = blkcnt;
- while (tblkcnt > 0) {
+ while (blkcnt > 0) {
/*
* Try to remember where we decided to put the value.
*/
nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt,
&map, &nmap, XFS_BMAPI_ATTRFORK);
- if (error) {
+ if (error)
return error;
- }
- ASSERT(nmap == 1);
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1))
+ return -EFSCORRUPTED;
/*
- * If it's a hole, these are already unmapped
- * so there's nothing to invalidate.
+ * Mark any incore buffers for the remote value as stale. We
+ * never log remote attr value buffers, so the buffer should be
+ * easy to kill.
*/
- if (map.br_startblock != HOLESTARTBLOCK) {
+ error = xfs_attr_rmtval_stale(dp, &map, 0);
+ if (error)
+ return error;
- dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
- map.br_startblock);
- dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
- map.br_blockcount);
- bp = xfs_trans_get_buf(*trans,
- dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, 0);
- if (!bp)
- return -ENOMEM;
- xfs_trans_binval(*trans, bp);
- /*
- * Roll to next transaction.
- */
- error = xfs_trans_roll_inode(trans, dp);
- if (error)
- return error;
- }
-
- tblkno += map.br_blockcount;
- tblkcnt -= map.br_blockcount;
+ blkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
}
return 0;
@@ -101,86 +80,45 @@
*/
STATIC int
xfs_attr3_leaf_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
{
- struct xfs_attr_leafblock *leaf;
- struct xfs_attr3_icleaf_hdr ichdr;
- struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
struct xfs_attr_leaf_name_remote *name_rmt;
- struct xfs_attr_inactive_list *list;
- struct xfs_attr_inactive_list *lp;
- int error;
- int count;
- int size;
- int tmp;
- int i;
- struct xfs_mount *mp = bp->b_mount;
+ int error = 0;
+ int i;
- leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
/*
- * Count the number of "remote" value extents.
+ * Find the remote value extents for this leaf and invalidate their
+ * incore buffers.
*/
- count = 0;
entry = xfs_attr3_leaf_entryp(leaf);
for (i = 0; i < ichdr.count; entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk)
- count++;
- }
+ int blkcnt;
+
+ if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL))
+ continue;
+
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (!name_rmt->valueblk)
+ continue;
+
+ blkcnt = xfs_attr3_rmt_blocks(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ error = xfs_attr3_rmt_stale(dp,
+ be32_to_cpu(name_rmt->valueblk), blkcnt);
+ if (error)
+ goto err;
}
- /*
- * If there are no "remote" values, we're done.
- */
- if (count == 0) {
- xfs_trans_brelse(*trans, bp);
- return 0;
- }
-
- /*
- * Allocate storage for a list of all the "remote" value extents.
- */
- size = count * sizeof(xfs_attr_inactive_list_t);
- list = kmem_alloc(size, 0);
-
- /*
- * Identify each of the "remote" value extents.
- */
- lp = list;
- entry = xfs_attr3_leaf_entryp(leaf);
- for (i = 0; i < ichdr.count; entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk) {
- lp->valueblk = be32_to_cpu(name_rmt->valueblk);
- lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
- lp++;
- }
- }
- }
- xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
-
- /*
- * Invalidate each of the "remote" value extents.
- */
- error = 0;
- for (lp = list, i = 0; i < count; i++, lp++) {
- tmp = xfs_attr3_leaf_freextent(trans, dp,
- lp->valueblk, lp->valuelen);
-
- if (error == 0)
- error = tmp; /* save only the 1st errno */
- }
-
- kmem_free(list);
+ xfs_trans_brelse(*trans, bp);
+err:
return error;
}
@@ -190,37 +128,35 @@
*/
STATIC int
xfs_attr3_node_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp,
- int level)
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int level)
{
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_dablk_t child_fsb;
- xfs_daddr_t parent_blkno, child_blkno;
- int error, i;
- struct xfs_buf *child_bp;
- struct xfs_da_node_entry *btree;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t child_fsb;
+ xfs_daddr_t parent_blkno, child_blkno;
+ struct xfs_buf *child_bp;
struct xfs_da3_icnode_hdr ichdr;
+ int error, i;
/*
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- return -EIO;
+ return -EFSCORRUPTED;
}
- node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&ichdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr);
parent_blkno = bp->b_bn;
if (!ichdr.count) {
xfs_trans_brelse(*trans, bp);
return 0;
}
- btree = dp->d_ops->node_tree_p(node);
- child_fsb = be32_to_cpu(btree[0].before);
+ child_fsb = be32_to_cpu(ichdr.btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
/*
@@ -235,7 +171,7 @@
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp,
+ error = xfs_da3_node_read(*trans, dp, child_fsb, &child_bp,
XFS_ATTR_FORK);
if (error)
return error;
@@ -258,8 +194,9 @@
error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
break;
default:
- error = -EIO;
+ xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
+ error = -EFSCORRUPTED;
break;
}
if (error)
@@ -268,10 +205,17 @@
/*
* Remove the subsidiary block from the cache and from the log.
*/
- error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp,
- XFS_ATTR_FORK);
+ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp,
+ child_blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0,
+ &child_bp);
if (error)
return error;
+ error = bp->b_error;
+ if (error) {
+ xfs_trans_brelse(*trans, child_bp);
+ return error;
+ }
xfs_trans_binval(*trans, child_bp);
/*
@@ -279,13 +223,15 @@
* child block number.
*/
if (i + 1 < ichdr.count) {
- error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
- &bp, XFS_ATTR_FORK);
+ struct xfs_da3_icnode_hdr phdr;
+
+ error = xfs_da3_node_read_mapped(*trans, dp,
+ parent_blkno, &bp, XFS_ATTR_FORK);
if (error)
return error;
- node = bp->b_addr;
- btree = dp->d_ops->node_tree_p(node);
- child_fsb = be32_to_cpu(btree[i + 1].before);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &phdr,
+ bp->b_addr);
+ child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
}
/*
@@ -310,6 +256,7 @@
struct xfs_trans **trans,
struct xfs_inode *dp)
{
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_da_blkinfo *info;
struct xfs_buf *bp;
xfs_daddr_t blkno;
@@ -321,7 +268,7 @@
* the extents in reverse order the extent containing
* block 0 must still be there.
*/
- error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK);
if (error)
return error;
blkno = bp->b_bn;
@@ -341,7 +288,8 @@
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
- error = -EIO;
+ error = -EFSCORRUPTED;
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
break;
}
@@ -351,9 +299,15 @@
/*
* Invalidate the incore copy of the root block.
*/
- error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+ error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &bp);
if (error)
return error;
+ error = bp->b_error;
+ if (error) {
+ xfs_trans_brelse(*trans, bp);
+ return error;
+ }
xfs_trans_binval(*trans, bp); /* remove from cache */
/*
* Commit the invalidate and start the next transaction.
@@ -413,7 +367,7 @@
* removal below.
*/
if (xfs_inode_hasattr(dp) &&
- dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+ dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_attr3_root_inactive(&trans, dp);
if (error)
goto out_cancel;
@@ -434,8 +388,11 @@
xfs_trans_cancel(trans);
out_destroy_fork:
/* kill the in-core attr fork before we drop the inode lock */
- if (dp->i_afp)
- xfs_idestroy_fork(dp, XFS_ATTR_FORK);
+ if (dp->i_afp) {
+ xfs_idestroy_fork(dp->i_afp);
+ kmem_cache_free(xfs_ifork_zone, dp->i_afp);
+ dp->i_afp = NULL;
+ }
if (lock_mode)
xfs_iunlock(dp, lock_mode);
return error;
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 00758fd..8f8837f 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -44,30 +44,27 @@
/*
* Copy out entries of shortform attribute lists for attr_list().
* Shortform attribute lists are not stored in hashval sorted order.
- * If the output buffer is not large enough to hold them all, then we
+ * If the output buffer is not large enough to hold them all, then
* we have to calculate each entries' hashvalue and sort them before
* we can begin returning them to the user.
*/
static int
-xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+xfs_attr_shortform_list(
+ struct xfs_attr_list_context *context)
{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_sf_sort_t *sbuf, *sbp;
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- xfs_inode_t *dp;
- int sbsize, nsbuf, count, i;
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
+ struct xfs_inode *dp = context->dp;
+ struct xfs_attr_sf_sort *sbuf, *sbp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ int sbsize, nsbuf, count, i;
+ int error = 0;
- ASSERT(context != NULL);
- dp = context->dp;
- ASSERT(dp != NULL);
ASSERT(dp->i_afp != NULL);
- sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+ sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
ASSERT(sf != NULL);
if (!sf->hdr.count)
return 0;
- cursor = context->cursor;
- ASSERT(cursor != NULL);
trace_xfs_attr_list_sf(context);
@@ -84,6 +81,10 @@
(XFS_ISRESET_CURSOR(cursor) &&
(dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(sfe->nameval,
+ sfe->namelen)))
+ return -EFSCORRUPTED;
context->put_listent(context,
sfe->flags,
sfe->nameval,
@@ -95,7 +96,7 @@
*/
if (context->seen_enough)
break;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
}
trace_xfs_attr_list_sf_all(context);
return 0;
@@ -135,7 +136,7 @@
/* These are bytes, and both on-disk, don't endian-flip */
sbp->valuelen = sfe->valuelen;
sbp->flags = sfe->flags;
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+ sfe = xfs_attr_sf_nextentry(sfe);
sbp++;
nsbuf++;
}
@@ -161,10 +162,8 @@
break;
}
}
- if (i == nsbuf) {
- kmem_free(sbuf);
- return 0;
- }
+ if (i == nsbuf)
+ goto out;
/*
* Loop putting entries into the user buffer.
@@ -174,6 +173,12 @@
cursor->hashval = sbp->hash;
cursor->offset = 0;
}
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(sbp->name,
+ sbp->namelen))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
context->put_listent(context,
sbp->flags,
sbp->name,
@@ -183,9 +188,9 @@
break;
cursor->offset++;
}
-
+out:
kmem_free(sbuf);
- return 0;
+ return error;
}
/*
@@ -195,7 +200,7 @@
STATIC int
xfs_attr_node_list_lookup(
struct xfs_attr_list_context *context,
- struct attrlist_cursor_kern *cursor,
+ struct xfs_attrlist_cursor_kern *cursor,
struct xfs_buf **pbp)
{
struct xfs_da3_icnode_hdr nodehdr;
@@ -213,7 +218,7 @@
ASSERT(*pbp == NULL);
cursor->blkno = 0;
for (;;) {
- error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp,
+ error = xfs_da3_node_read(tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
if (error)
return error;
@@ -229,7 +234,7 @@
goto out_corruptbuf;
}
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
@@ -243,7 +248,7 @@
else
expected_level--;
- btree = dp->d_ops->node_tree_p(node);
+ btree = nodehdr.btree;
for (i = 0; i < nodehdr.count; btree++, i++) {
if (cursor->hashval <= be32_to_cpu(btree->hashval)) {
cursor->blkno = be32_to_cpu(btree->before);
@@ -258,7 +263,7 @@
return 0;
/* We can't point back to the root. */
- if (cursor->blkno == 0)
+ if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
return -EFSCORRUPTED;
}
@@ -269,6 +274,7 @@
return 0;
out_corruptbuf:
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
return -EFSCORRUPTED;
}
@@ -277,18 +283,17 @@
xfs_attr_node_list(
struct xfs_attr_list_context *context)
{
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_attr3_icleaf_hdr leafhdr;
- struct attrlist_cursor_kern *cursor;
struct xfs_attr_leafblock *leaf;
struct xfs_da_intnode *node;
struct xfs_buf *bp;
struct xfs_inode *dp = context->dp;
struct xfs_mount *mp = dp->i_mount;
- int error;
+ int error = 0;
trace_xfs_attr_node_list(context);
- cursor = context->cursor;
cursor->initted = 1;
/*
@@ -298,8 +303,8 @@
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
+ XFS_ATTR_FORK);
if ((error != 0) && (error != -EFSCORRUPTED))
return error;
if (bp) {
@@ -358,29 +363,32 @@
*/
for (;;) {
leaf = bp->b_addr;
- xfs_attr3_leaf_list_int(bp, context);
+ error = xfs_attr3_leaf_list_int(bp, context);
+ if (error)
+ break;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
if (context->seen_enough || leafhdr.forw == 0)
break;
cursor->blkno = leafhdr.forw;
xfs_trans_brelse(context->tp, bp);
- error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno,
+ &bp);
if (error)
return error;
}
xfs_trans_brelse(context->tp, bp);
- return 0;
+ return error;
}
/*
* Copy out attribute list entries for attr_list(), for leaf attribute lists.
*/
-void
+int
xfs_attr3_leaf_list_int(
struct xfs_buf *bp,
struct xfs_attr_list_context *context)
{
- struct attrlist_cursor_kern *cursor;
+ struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
@@ -394,7 +402,6 @@
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- cursor = context->cursor;
cursor->initted = 1;
/*
@@ -417,7 +424,7 @@
}
if (i == ichdr.count) {
trace_xfs_attr_list_notfound(context);
- return;
+ return 0;
}
} else {
entry = &entries[0];
@@ -438,8 +445,8 @@
}
if ((entry->flags & XFS_ATTR_INCOMPLETE) &&
- !(context->flags & ATTR_INCOMPLETE))
- continue; /* skip incomplete entries */
+ !context->allow_incomplete)
+ continue;
if (entry->flags & XFS_ATTR_LOCAL) {
xfs_attr_leaf_name_local_t *name_loc;
@@ -457,6 +464,9 @@
valuelen = be32_to_cpu(name_rmt->valuelen);
}
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(name, namelen)))
+ return -EFSCORRUPTED;
context->put_listent(context, entry->flags,
name, namelen, valuelen);
if (context->seen_enough)
@@ -464,32 +474,33 @@
cursor->offset++;
}
trace_xfs_attr_list_leaf_end(context);
- return;
+ return 0;
}
/*
* Copy out attribute entries for attr_list(), for leaf attribute lists.
*/
STATIC int
-xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+xfs_attr_leaf_list(
+ struct xfs_attr_list_context *context)
{
- int error;
- struct xfs_buf *bp;
+ struct xfs_buf *bp;
+ int error;
trace_xfs_attr_leaf_list(context);
- context->cursor->blkno = 0;
- error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp);
+ context->cursor.blkno = 0;
+ error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
if (error)
return error;
- xfs_attr3_leaf_list_int(bp, context);
+ error = xfs_attr3_leaf_list_int(bp, context);
xfs_trans_brelse(context->tp, bp);
- return 0;
+ return error;
}
int
-xfs_attr_list_int_ilocked(
+xfs_attr_list_ilocked(
struct xfs_attr_list_context *context)
{
struct xfs_inode *dp = context->dp;
@@ -501,20 +512,20 @@
*/
if (!xfs_inode_hasattr(dp))
return 0;
- else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
return xfs_attr_shortform_list(context);
- else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
return xfs_attr_leaf_list(context);
return xfs_attr_node_list(context);
}
int
-xfs_attr_list_int(
- xfs_attr_list_context_t *context)
+xfs_attr_list(
+ struct xfs_attr_list_context *context)
{
- int error;
- xfs_inode_t *dp = context->dp;
- uint lock_mode;
+ struct xfs_inode *dp = context->dp;
+ uint lock_mode;
+ int error;
XFS_STATS_INC(dp->i_mount, xs_attr_list);
@@ -522,130 +533,7 @@
return -EIO;
lock_mode = xfs_ilock_attr_map_shared(dp);
- error = xfs_attr_list_int_ilocked(context);
+ error = xfs_attr_list_ilocked(context);
xfs_iunlock(dp, lock_mode);
return error;
}
-
-#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
- (((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
- ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \
- & ~(sizeof(uint32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-STATIC void
-xfs_attr_put_listent(
- xfs_attr_list_context_t *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen)
-{
- struct attrlist *alist = (struct attrlist *)context->alist;
- attrlist_ent_t *aep;
- int arraytop;
-
- ASSERT(!context->seen_enough);
- ASSERT(!(context->flags & ATTR_KERNOVAL));
- ASSERT(context->count >= 0);
- ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*alist));
- ASSERT(context->firstu <= context->bufsize);
-
- /*
- * Only list entries in the right namespace.
- */
- if (((context->flags & ATTR_SECURE) == 0) !=
- ((flags & XFS_ATTR_SECURE) == 0))
- return;
- if (((context->flags & ATTR_ROOT) == 0) !=
- ((flags & XFS_ATTR_ROOT) == 0))
- return;
-
- arraytop = sizeof(*alist) +
- context->count * sizeof(alist->al_offset[0]);
- context->firstu -= ATTR_ENTSIZE(namelen);
- if (context->firstu < arraytop) {
- trace_xfs_attr_list_full(context);
- alist->al_more = 1;
- context->seen_enough = 1;
- return;
- }
-
- aep = (attrlist_ent_t *)&context->alist[context->firstu];
- aep->a_valuelen = valuelen;
- memcpy(aep->a_name, name, namelen);
- aep->a_name[namelen] = 0;
- alist->al_offset[context->count++] = context->firstu;
- alist->al_count = context->count;
- trace_xfs_attr_list_add(context);
- return;
-}
-
-/*
- * Generate a list of extended attribute names and optionally
- * also value lengths. Positive return value follows the XFS
- * convention of being an error, zero or negative return code
- * is the length of the buffer returned (negated), indicating
- * success.
- */
-int
-xfs_attr_list(
- xfs_inode_t *dp,
- char *buffer,
- int bufsize,
- int flags,
- attrlist_cursor_kern_t *cursor)
-{
- xfs_attr_list_context_t context;
- struct attrlist *alist;
- int error;
-
- /*
- * Validate the cursor.
- */
- if (cursor->pad1 || cursor->pad2)
- return -EINVAL;
- if ((cursor->initted == 0) &&
- (cursor->hashval || cursor->blkno || cursor->offset))
- return -EINVAL;
-
- /* Only internal consumers can retrieve incomplete attrs. */
- if (flags & ATTR_INCOMPLETE)
- return -EINVAL;
-
- /*
- * Check for a properly aligned buffer.
- */
- if (((long)buffer) & (sizeof(int)-1))
- return -EFAULT;
- if (flags & ATTR_KERNOVAL)
- bufsize = 0;
-
- /*
- * Initialize the output buffer.
- */
- memset(&context, 0, sizeof(context));
- context.dp = dp;
- context.cursor = cursor;
- context.resynch = 1;
- context.flags = flags;
- context.alist = buffer;
- context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
- context.firstu = context.bufsize;
- context.put_listent = xfs_attr_put_listent;
-
- alist = (struct attrlist *)context.alist;
- alist->al_count = 0;
- alist->al_more = 0;
- alist->al_offset[0] = context.bufsize;
-
- error = xfs_attr_list_int(&context);
- ASSERT(error <= 0);
- return error;
-}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 83d24e9..984bb48 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -21,21 +21,26 @@
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
-
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_quota.h"
kmem_zone_t *xfs_bui_zone;
kmem_zone_t *xfs_bud_zone;
+static const struct xfs_item_ops xfs_bui_item_ops;
+
static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_bui_log_item, bui_item);
}
-void
+STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_zone_free(xfs_bui_zone, buip);
+ kmem_cache_free(xfs_bui_zone, buip);
}
/*
@@ -45,13 +50,13 @@
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the BUI.
*/
-void
+STATIC void
xfs_bui_release(
struct xfs_bui_log_item *buip)
{
ASSERT(atomic_read(&buip->bui_refcount) > 0);
if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_bui_item_free(buip);
}
}
@@ -124,24 +129,17 @@
xfs_bui_release(BUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_bui_item_ops = {
- .iop_size = xfs_bui_item_size,
- .iop_format = xfs_bui_item_format,
- .iop_unpin = xfs_bui_item_unpin,
- .iop_release = xfs_bui_item_release,
-};
-
/*
* Allocate and initialize an bui item with the given number of extents.
*/
-struct xfs_bui_log_item *
+STATIC struct xfs_bui_log_item *
xfs_bui_init(
struct xfs_mount *mp)
{
struct xfs_bui_log_item *buip;
- buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+ buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
@@ -201,7 +199,7 @@
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_zone_free(xfs_bud_zone, budp);
+ kmem_cache_free(xfs_bud_zone, budp);
}
static const struct xfs_item_ops xfs_bud_item_ops = {
@@ -218,7 +216,7 @@
{
struct xfs_bud_log_item *budp;
- budp = kmem_zone_zalloc(xfs_bud_zone, 0);
+ budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
&xfs_bud_item_ops);
budp->bud_buip = buip;
@@ -267,8 +265,8 @@
static int
xfs_bmap_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_bmap_intent *ba;
struct xfs_bmap_intent *bb;
@@ -278,27 +276,6 @@
return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_bui_log_item *buip;
-
- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- ASSERT(tp != NULL);
-
- buip = xfs_bui_init(tp->t_mountp);
- ASSERT(buip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &buip->bui_item);
- return buip;
-}
-
/* Set the map extent flags for this mapping. */
static void
xfs_trans_set_bmap_flags(
@@ -326,16 +303,12 @@
STATIC void
xfs_bmap_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_bui_log_item *buip,
+ struct xfs_bmap_intent *bmap)
{
- struct xfs_bui_log_item *buip = intent;
- struct xfs_bmap_intent *bmap;
uint next_extent;
struct xfs_map_extent *map;
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
@@ -355,23 +328,44 @@
bmap->bi_bmap.br_state);
}
+static struct xfs_log_item *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_bui_log_item *buip = xfs_bui_init(mp);
+ struct xfs_bmap_intent *bmap;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+
+ xfs_trans_add_item(tp, &buip->bui_item);
+ if (sort)
+ list_sort(mp, items, xfs_bmap_update_diff_items);
+ list_for_each_entry(bmap, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bmap);
+ return &buip->bui_item;
+}
+
/* Get an BUD so we can process all the deferred rmap updates. */
-STATIC void *
+static struct xfs_log_item *
xfs_bmap_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_bud(tp, intent);
+ return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
}
/* Process a deferred rmap update. */
STATIC int
xfs_bmap_update_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_bmap_intent *bmap;
xfs_filblks_t count;
@@ -379,7 +373,7 @@
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
count = bmap->bi_bmap.br_blockcount;
- error = xfs_trans_log_finish_bmap_update(tp, done_item,
+ error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done),
bmap->bi_type,
bmap->bi_owner, bmap->bi_whichfork,
bmap->bi_bmap.br_startoff,
@@ -398,9 +392,9 @@
/* Abort all pending BUIs. */
STATIC void
xfs_bmap_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_bui_release(intent);
+ xfs_bui_release(BUI_ITEM(intent));
}
/* Cancel a deferred rmap update. */
@@ -416,10 +410,8 @@
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_bmap_update_diff_items,
.create_intent = xfs_bmap_update_create_intent,
.abort_intent = xfs_bmap_update_abort_intent,
- .log_item = xfs_bmap_update_log_item,
.create_done = xfs_bmap_update_create_done,
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
@@ -429,35 +421,29 @@
* Process a bmap update intent item that was recovered from the log.
* We need to update some inode's bmbt.
*/
-int
-xfs_bui_recover(
- struct xfs_trans *parent_tp,
- struct xfs_bui_log_item *buip)
+STATIC int
+xfs_bui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- int error = 0;
- unsigned int bui_type;
+ struct xfs_bmbt_irec irec;
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+ struct xfs_trans *tp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_mount *mp = lip->li_mountp;
struct xfs_map_extent *bmap;
+ struct xfs_bud_log_item *budp;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
xfs_filblks_t count;
- bool op_ok;
- struct xfs_bud_log_item *budp;
- enum xfs_bmap_intent_type type;
- int whichfork;
xfs_exntst_t state;
- struct xfs_trans *tp;
- struct xfs_inode *ip = NULL;
- struct xfs_bmbt_irec irec;
- struct xfs_mount *mp = parent_tp->t_mountp;
-
- ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+ unsigned int bui_type;
+ int whichfork;
+ int error = 0;
/* Only one mapping operation per BUI... */
- if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
- set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_bui_release(buip);
- return -EIO;
- }
+ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ return -EFSCORRUPTED;
/*
* First check the validity of the extent described by the
@@ -468,52 +454,6 @@
XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
XFS_INO_TO_FSB(mp, bmap->me_owner)));
- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- op_ok = true;
- break;
- default:
- op_ok = false;
- break;
- }
- if (!op_ok || startblock_fsb == 0 ||
- bmap->me_len == 0 ||
- inode_fsb == 0 ||
- startblock_fsb >= mp->m_sb.sb_dblocks ||
- bmap->me_len >= mp->m_sb.sb_agblocks ||
- inode_fsb >= mp->m_sb.sb_dblocks ||
- (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the BUI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_bui_release(buip);
- return -EIO;
- }
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
- if (error)
- return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
- budp = xfs_trans_get_bud(tp, buip);
-
- /* Grab the inode. */
- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
- if (error)
- goto err_inode;
-
- if (VFS_I(ip)->i_nlink == 0)
- xfs_iflags_set(ip, XFS_IRECOVERY);
-
- /* Process deferred bmap item. */
state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
@@ -522,22 +462,50 @@
switch (bui_type) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
- type = bui_type;
break;
default:
- error = -EFSCORRUPTED;
- goto err_inode;
+ return -EFSCORRUPTED;
}
+ if (startblock_fsb == 0 ||
+ bmap->me_len == 0 ||
+ inode_fsb == 0 ||
+ startblock_fsb >= mp->m_sb.sb_dblocks ||
+ bmap->me_len >= mp->m_sb.sb_agblocks ||
+ inode_fsb >= mp->m_sb.sb_dblocks ||
+ (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS))
+ return -EFSCORRUPTED;
+
+ /* Grab the inode. */
+ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto err_rele;
+
+ if (VFS_I(ip)->i_nlink == 0)
+ xfs_iflags_set(ip, XFS_IRECOVERY);
+
+ /* Allocate transaction and do the work. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ budp = xfs_trans_get_bud(tp, buip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork,
- bmap->me_startoff, bmap->me_startblock, &count, state);
+ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
if (error)
- goto err_inode;
+ goto err_cancel;
if (count > 0) {
- ASSERT(type == XFS_BMAP_UNMAP);
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
irec.br_startblock = bmap->me_startblock;
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
@@ -545,20 +513,168 @@
xfs_bmap_unmap_extent(tp, ip, &irec);
}
- set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
+ /*
+ * Commit transaction, which frees the transaction and saves the inode
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ if (error)
+ goto err_unlock;
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
+ return 0;
- return error;
-
-err_inode:
- xfs_defer_move(parent_tp, tp);
+err_cancel:
xfs_trans_cancel(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+err_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+err_rele:
+ xfs_irele(ip);
return error;
}
+
+/* Report whether the BUI behind @lip carries the intent id @intent_id. */
+STATIC bool
+xfs_bui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ struct xfs_bui_log_item *buip = BUI_ITEM(lip);
+
+ return buip->bui_format.bui_id == intent_id;
+}
+
+/*
+ * Relog an intent item to push the log tail forward. A done item (BUD) is
+ * obtained for the old BUI in @tp and marked dirty, then a new BUI carrying a
+ * copy of the old extent array is added to @tp, marked dirty and returned.
+ */
+static struct xfs_log_item *
+xfs_bui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_bui_log_item *old_buip = BUI_ITEM(intent);
+ struct xfs_bui_log_item *new_buip;
+ struct xfs_bud_log_item *done;
+ struct xfs_map_extent *old_extents;
+ unsigned int nextents;
+
+ nextents = old_buip->bui_format.bui_nextents;
+ old_extents = old_buip->bui_format.bui_extents;
+
+ /* Dirty the transaction and the done item taken against the old BUI. */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ done = xfs_trans_get_bud(tp, old_buip);
+ set_bit(XFS_LI_DIRTY, &done->bud_item.li_flags);
+
+ /* Build the replacement BUI from a copy of the old extent array. */
+ new_buip = xfs_bui_init(tp->t_mountp);
+ memcpy(new_buip->bui_format.bui_extents, old_extents,
+ nextents * sizeof(*old_extents));
+ atomic_set(&new_buip->bui_next_extent, nextents);
+ xfs_trans_add_item(tp, &new_buip->bui_item);
+ set_bit(XFS_LI_DIRTY, &new_buip->bui_item.li_flags);
+ return &new_buip->bui_item;
+}
+
+static const struct xfs_item_ops xfs_bui_item_ops = { /* log item operations for BUI ("bmap update intent") items */
+ .iop_size = xfs_bui_item_size,
+ .iop_format = xfs_bui_item_format,
+ .iop_unpin = xfs_bui_item_unpin,
+ .iop_release = xfs_bui_item_release,
+ .iop_recover = xfs_bui_item_recover, /* process a BUI that was recovered from the log */
+ .iop_match = xfs_bui_item_match, /* compare the BUI's bui_id against an intent id */
+ .iop_relog = xfs_bui_item_relog, /* relog the intent to push the log tail forward */
+};
+
+/*
+ * Copy a BUI format structure out of the log iovec @buf and into the
+ * destination BUI format structure @dst_bui_fmt. The iovec length must match
+ * exactly the size implied by the extent count stored in the source;
+ * otherwise the error is reported and -EFSCORRUPTED is returned. The BUI/BUD
+ * items were designed not to need any special alignment handling.
+ */
+static int
+xfs_bui_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_bui_log_format *dst_bui_fmt)
+{
+ struct xfs_bui_log_format *src_bui_fmt = buf->i_addr;
+ uint expected_len;
+
+ expected_len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+ if (buf->i_len != expected_len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ memcpy(dst_bui_fmt, src_bui_fmt, expected_len);
+ return 0;
+}
+
+/*
+ * This routine is called to create an in-core extent bmap update
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ *
+ * Returns 0 on success. Returns -EFSCORRUPTED if the logged region is too
+ * short to hold a BUI format structure, if it does not describe exactly
+ * XFS_BUI_MAX_FAST_EXTENTS extents, or if copying the format fails its
+ * length check.
+ */
+STATIC int
+xlog_recover_bui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_bui_log_format *bui_formatp;
+
+ /*
+ * The region contents come from the log, so make sure it is large
+ * enough to hold the format structure before bui_nextents is read
+ * out of it.
+ */
+ if (item->ri_buf[0].i_len < sizeof(struct xfs_bui_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+ bui_formatp = item->ri_buf[0].i_addr;
+
+ /* Only one mapping operation per BUI is accepted. */
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+ buip = xfs_bui_init(mp);
+ error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
+ if (error) {
+ xfs_bui_item_free(buip);
+ return error;
+ }
+ atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn);
+ xfs_bui_release(buip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_bui_item_ops = { /* log recovery operations for logged BUI items */
+ .item_type = XFS_LI_BUI,
+ .commit_pass2 = xlog_recover_bui_commit_pass2, /* builds the in-core BUI and inserts it into the AIL */
+};
+
+/*
+ * This routine is called when a BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ *
+ * A region whose length is not exactly that of a BUD format structure is
+ * reported and rejected with -EFSCORRUPTED.
+ */
+STATIC int
+xlog_recover_bud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_bud_log_format *bud_formatp;
+
+ if (item->ri_buf[0].i_len != sizeof(*bud_formatp)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ bud_formatp = item->ri_buf[0].i_addr;
+ xlog_recover_release_intent(log, XFS_LI_BUI, bud_formatp->bud_bui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_bud_item_ops = { /* log recovery operations for logged BUD items */
+ .item_type = XFS_LI_BUD,
+ .commit_pass2 = xlog_recover_bud_commit_pass2, /* releases the BUI whose id matches the logged BUD */
+};
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index ad479cc..b9be62f 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -33,11 +33,6 @@
#define XFS_BUI_MAX_FAST_EXTENTS 1
/*
- * Define BUI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_BUI_RECOVERED 1
-
-/*
* This is the "bmap update intent" log item. It is used to log the fact that
* some reverse mappings need to change. It is used in conjunction with the
* "bmap update done" log item described below.
@@ -49,7 +44,6 @@
struct xfs_log_item bui_item;
atomic_t bui_refcount;
atomic_t bui_next_extent;
- unsigned long bui_flags; /* misc flags */
struct xfs_bui_log_format bui_format;
};
@@ -74,9 +68,4 @@
extern struct kmem_zone *xfs_bui_zone;
extern struct kmem_zone *xfs_bud_zone;
-struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
-void xfs_bui_item_free(struct xfs_bui_log_item *);
-void xfs_bui_release(struct xfs_bui_log_item *);
-int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
-
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index d6d78e1..7371a7f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -53,15 +53,16 @@
*/
int
xfs_zero_extent(
- struct xfs_inode *ip,
- xfs_fsblock_t start_fsb,
- xfs_off_t count_fsb)
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
- sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
- return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+ return blkdev_issue_zeroout(target->bt_bdev,
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
GFP_NOFS, 0);
@@ -125,7 +126,7 @@
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+ xfs_rtblock_t rtx; /* realtime extent no */
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
@@ -164,13 +165,6 @@
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
-
- /* Zero the extent if we were asked to do so */
- if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
- error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
- if (error)
- return error;
- }
} else {
ap->length = 0;
}
@@ -179,29 +173,6 @@
#endif /* CONFIG_XFS_RT */
/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary. All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
- struct xfs_inode *ip,
- xfs_fileoff_t endoff,
- int whichfork,
- int *eof)
-{
- struct xfs_bmbt_irec rec;
- int error;
-
- error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
- if (error || *eof)
- return error;
-
- *eof = endoff >= rec.br_startoff + rec.br_blockcount;
- return 0;
-}
-
-/*
* Extent tree block counting routines.
*/
@@ -229,106 +200,6 @@
}
/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
- struct xfs_mount *mp,
- struct xfs_btree_block *block,
- int numrecs,
- xfs_filblks_t *count)
-{
- int b;
- xfs_bmbt_rec_t *frp;
-
- for (b = 1; b <= numrecs; b++) {
- frp = XFS_BMBT_REC_ADDR(mp, block, b);
- *count += xfs_bmbt_disk_get_blockcount(frp);
- }
-}
-
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks in use.
- */
-STATIC int
-xfs_bmap_count_tree(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_ifork *ifp,
- xfs_fsblock_t blockno,
- int levelin,
- xfs_extnum_t *nextents,
- xfs_filblks_t *count)
-{
- int error;
- struct xfs_buf *bp, *nbp;
- int level = levelin;
- __be64 *pp;
- xfs_fsblock_t bno = blockno;
- xfs_fsblock_t nextbno;
- struct xfs_btree_block *block, *nextblock;
- int numrecs;
-
- error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
-
- if (--level) {
- /* Not at node above leaves, count this level of nodes */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- while (nextbno != NULLFSBLOCK) {
- error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- nextblock = XFS_BUF_TO_BLOCK(nbp);
- nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
- xfs_trans_brelse(tp, nbp);
- }
-
- /* Dive to the next level */
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents,
- count);
- if (error) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
- xfs_trans_brelse(tp, bp);
- } else {
- /* count all level 1 nodes and their leaves */
- for (;;) {
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- numrecs = be16_to_cpu(block->bb_numrecs);
- (*nextents) += numrecs;
- xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
- xfs_trans_brelse(tp, bp);
- if (nextbno == NULLFSBLOCK)
- break;
- bno = nextbno;
- error = xfs_btree_read_bufl(mp, tp, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
- }
- }
- return 0;
-}
-
-/*
* Count fsblocks of the given fork. Delayed allocation extents are
* not counted towards the totals.
*/
@@ -340,26 +211,19 @@
xfs_extnum_t *nextents,
xfs_filblks_t *count)
{
- struct xfs_mount *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- struct xfs_btree_block *block; /* current btree block */
- struct xfs_ifork *ifp; /* fork structure */
- xfs_fsblock_t bno; /* block # of "block" */
- int level; /* btree level, for checking */
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur;
+ xfs_extlen_t btblocks = 0;
int error;
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
*nextents = 0;
*count = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+
if (!ifp)
return 0;
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
- case XFS_DINODE_FMT_EXTENTS:
- *nextents = xfs_bmap_count_leaves(ifp, count);
- return 0;
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_BTREE:
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
@@ -367,26 +231,23 @@
return error;
}
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- block = ifp->if_broot;
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_count_blocks(cur, &btblocks);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
- error = xfs_bmap_count_tree(mp, tp, ifp, bno, level,
- nextents, count);
- if (error) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
- return 0;
+ /*
+ * xfs_btree_count_blocks includes the root block contained in
+ * the inode fork in @btblocks, so subtract one because we're
+ * only interested in allocated disk blocks.
+ */
+ *count += btblocks - 1;
+
+ /* fall through */
+ case XFS_DINODE_FMT_EXTENTS:
+ *nextents = xfs_bmap_count_leaves(ifp, count);
+ break;
}
return 0;
@@ -588,7 +449,7 @@
break;
}
- switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+ switch (ifp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
break;
@@ -964,8 +825,8 @@
xfs_trans_ijoin(tp, ip, 0);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, alloc_type, resblks,
- imapp, &nimaps);
+ allocatesize_fsb, alloc_type, 0, imapp,
+ &nimaps);
if (error)
goto error0;
@@ -1085,6 +946,14 @@
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
+ /* We can only free complete realtime extents. */
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) {
+ startoffset_fsb = roundup_64(startoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ endoffset_fsb = rounddown_64(endoffset_fsb,
+ mp->m_sb.sb_rextsize);
+ }
+
/*
* Need to zero the stuff we're not freeing, on disk.
*/
@@ -1107,7 +976,8 @@
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops);
+ error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
+ &xfs_buffered_write_iomap_ops);
if (error)
return error;
@@ -1125,48 +995,12 @@
return error;
}
-/*
- * Preallocate and zero a range of a file. This mechanism has the allocation
- * semantics of fallocate and in addition converts data in the range to zeroes.
- */
-int
-xfs_zero_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint blksize;
- int error;
-
- trace_xfs_zero_file_space(ip);
-
- blksize = 1 << mp->m_sb.sb_blocklog;
-
- /*
- * Punch a hole and prealloc the range. We use hole punch rather than
- * unwritten extent conversion for two reasons:
- *
- * 1.) Hole punch handles partial block zeroing for us.
- *
- * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
- * by virtue of the hole punch.
- */
- error = xfs_free_file_space(ip, offset, len);
- if (error || xfs_is_always_cow_inode(ip))
- return error;
-
- return xfs_alloc_file_space(ip, round_down(offset, blksize),
- round_up(offset + len, blksize) -
- round_down(offset, blksize),
- XFS_BMAPI_PREALLOC);
-}
-
static int
xfs_prepare_shift(
struct xfs_inode *ip,
loff_t offset)
{
+ struct xfs_mount *mp = ip->i_mount;
int error;
/*
@@ -1180,6 +1014,17 @@
}
/*
+ * Shift operations must stabilize the start block offset boundary along
+ * with the full range of the operation. If we don't, a COW writeback
+ * completion could race with an insert, front merge with the start
+ * extent (after split) during the shift and corrupt the file. Start
+ * with the block just prior to the start to stabilize the boundary.
+ */
+ offset = round_down(offset, 1 << mp->m_sb.sb_blocklog);
+ if (offset)
+ offset -= (1 << mp->m_sb.sb_blocklog);
+
+ /*
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
@@ -1225,7 +1070,6 @@
int error;
xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
- uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
bool done = false;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1241,32 +1085,34 @@
if (error)
return error;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
- &tp);
- if (error)
- break;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
- ip->i_gdquot, ip->i_pdquot, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+ while (!done) {
error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
&done);
if (error)
goto out_trans_cancel;
+ if (done)
+ break;
- error = xfs_trans_commit(tp);
+ /* finish any deferred frees and roll the transaction */
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out_trans_cancel;
}
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1309,35 +1155,41 @@
if (error)
return error;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
/*
* The extent shifting code works on extent granularity. So, if stop_fsb
* is not the starting block of extent, we need to split the extent at
* stop_fsb.
*/
- error = xfs_bmap_split_extent(ip, stop_fsb);
+ error = xfs_bmap_split_extent(tp, ip, stop_fsb);
if (error)
- return error;
+ goto out_trans_cancel;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0,
- &tp);
+ do {
+ error = xfs_defer_finish(&tp);
if (error)
- break;
+ goto out_trans_cancel;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
&done, stop_fsb);
if (error)
goto out_trans_cancel;
+ } while (!done);
- error = xfs_trans_commit(tp);
- }
-
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1366,17 +1218,26 @@
struct xfs_inode *ip, /* target inode */
struct xfs_inode *tip) /* tmp inode */
{
+ struct xfs_ifork *ifp = &ip->i_df;
+ struct xfs_ifork *tifp = &tip->i_df;
+
+ /* User/group/project quota ids must match if quotas are enforced. */
+ if (XFS_IS_QUOTA_ON(ip->i_mount) &&
+ (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) ||
+ !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) ||
+ ip->i_d.di_projid != tip->i_d.di_projid))
+ return -EINVAL;
/* Should never get a local format */
- if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
- tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ if (ifp->if_format == XFS_DINODE_FMT_LOCAL ||
+ tifp->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
/*
* if the target inode has less extents that then temporary inode then
* why did userspace call us?
*/
- if (ip->i_d.di_nextents < tip->i_d.di_nextents)
+ if (ifp->if_nextents < tifp->if_nextents)
return -EINVAL;
/*
@@ -1391,20 +1252,18 @@
* form then we will end up with the target inode in the wrong format
* as we already know there are less extents in the temp inode.
*/
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ tifp->if_format == XFS_DINODE_FMT_BTREE)
return -EINVAL;
/* Check temp in extent form to max in target */
- if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
- XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ if (tifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
/* Check target in extent form to max in temp */
- if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
- XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ if (ifp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
/*
@@ -1416,22 +1275,20 @@
* (a common defrag case) which will occur when the temp inode is in
* extent format...
*/
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (tifp->if_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_Q(ip) &&
- XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
+ XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip))
return -EINVAL;
- if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
- XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return -EINVAL;
}
/* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
if (XFS_IFORK_Q(tip) &&
XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
return -EINVAL;
- if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
- XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return -EINVAL;
}
@@ -1583,15 +1440,15 @@
/*
* Count the number of extended attribute blocks
*/
- if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
- (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 &&
+ ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk,
&aforkblks);
if (error)
return error;
}
- if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
- (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+ if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 &&
+ tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) {
error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk,
&taforkblks);
if (error)
@@ -1605,12 +1462,12 @@
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
- if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*target_log_flags) |= XFS_ILOG_DOWNER;
- if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*src_log_flags) |= XFS_ILOG_DOWNER;
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
+ if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE)
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
+ }
/*
* Swap the data forks of the inodes
@@ -1624,9 +1481,6 @@
ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
- swap(ip->i_d.di_nextents, tip->i_d.di_nextents);
- swap(ip->i_d.di_format, tip->i_d.di_format);
-
/*
* The extents in the source inode could still contain speculative
* preallocation beyond EOF (e.g. the file is open but not modified
@@ -1640,24 +1494,24 @@
tip->i_delayed_blks = ip->i_delayed_blks;
ip->i_delayed_blks = 0;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_d.di_version < 3 ||
+ ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
(*src_log_flags & XFS_ILOG_DOWNER));
(*src_log_flags) |= XFS_ILOG_DBROOT;
break;
}
- switch (tip->i_d.di_format) {
+ switch (tip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
(*target_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
(*target_log_flags) |= XFS_ILOG_DBROOT;
- ASSERT(tip->i_d.di_version < 3 ||
+ ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
(*target_log_flags & XFS_ILOG_DOWNER));
break;
}
@@ -1721,6 +1575,7 @@
int lock_flags;
uint64_t f;
int resblks = 0;
+ unsigned int flags = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1744,6 +1599,14 @@
goto out_unlock;
}
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_qm_dqattach(tip);
+ if (error)
+ goto out_unlock;
+
error = xfs_swap_extent_flush(ip);
if (error)
goto out_unlock;
@@ -1763,9 +1626,9 @@
* performed with log redo items!
*/
if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
- int w = XFS_DATA_FORK;
- uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w);
- uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w);
+ int w = XFS_DATA_FORK;
+ uint32_t ipnext = ip->i_df.if_nextents;
+ uint32_t tipnext = tip->i_df.if_nextents;
/*
* Conceptually this shouldn't affect the shape of either bmbt,
@@ -1776,17 +1639,16 @@
resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
/*
- * Handle the corner case where either inode might straddle the
- * btree format boundary. If so, the inode could bounce between
- * btree <-> extent format on unmap -> remap cycles, freeing and
- * allocating a bmapbt block each time.
+ * If either inode straddles a bmapbt block allocation boundary,
+ * the rmapbt algorithm triggers repeated allocs and frees as
+ * extents are remapped. This can exhaust the block reservation
+ * prematurely and cause shutdown. Return freed blocks to the
+ * transaction reservation to counter this behavior.
*/
- if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
- resblks += XFS_IFORK_MAXEXT(ip, w);
- if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
- resblks += XFS_IFORK_MAXEXT(tip, w);
+ flags |= XFS_TRANS_RES_FDBLKS;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
+ &tp);
if (error)
goto out_unlock;
@@ -1865,10 +1727,11 @@
/* Swap the cow forks. */
if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
- ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(!ip->i_cowfp ||
+ ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
+ ASSERT(!tip->i_cowfp ||
+ tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS);
- swap(ip->i_cnextents, tip->i_cnextents);
swap(ip->i_cowfp, tip->i_cowfp);
if (ip->i_cowfp && ip->i_cowfp->if_bytes)
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 7a78229..9f99316 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -30,8 +30,6 @@
}
#endif /* CONFIG_XFS_RT */
-int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
- int whichfork, int *eof);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
@@ -59,8 +57,6 @@
xfs_off_t len, int alloc_type);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
-int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1264ac6..1188190 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -14,6 +14,9 @@
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_log_recover.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
@@ -49,6 +52,15 @@
* b_lock (trylock due to inversion)
*/
+static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
+
+static inline int
+xfs_buf_submit(
+ struct xfs_buf *bp)
+{
+ return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
+}
+
static inline int
xfs_buf_is_vmapped(
struct xfs_buf *bp)
@@ -198,20 +210,20 @@
}
}
-static struct xfs_buf *
+static int
_xfs_buf_alloc(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
struct xfs_buf *bp;
int error;
int i;
- bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
- if (unlikely(!bp))
- return NULL;
+ *bpp = NULL;
+ bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL);
/*
* We don't want certain flags to appear in b_flags unless they are
@@ -238,8 +250,8 @@
*/
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
- kmem_zone_free(xfs_buf_zone, bp);
- return NULL;
+ kmem_cache_free(xfs_buf_zone, bp);
+ return error;
}
bp->b_bn = map[0].bm_bn;
@@ -256,7 +268,8 @@
XFS_STATS_INC(bp->b_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
- return bp;
+ *bpp = bp;
+ return 0;
}
/*
@@ -304,7 +317,7 @@
* The buffer must not be on any hash - use xfs_buf_rele instead for
* hashed and refcounted buffers
*/
-void
+static void
xfs_buf_free(
xfs_buf_t *bp)
{
@@ -324,11 +337,14 @@
__free_page(page);
}
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab +=
+ bp->b_page_count;
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
xfs_buf_free_maps(bp);
- kmem_zone_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_zone, bp);
}
/*
@@ -461,7 +477,7 @@
unsigned nofs_flag;
/*
- * vm_map_ram() will allocate auxillary structures (e.g.
+ * vm_map_ram() will allocate auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
* that we are in such a context via PF_MEMALLOC_NOFS to prevent
@@ -471,7 +487,7 @@
nofs_flag = memalloc_nofs_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1, PAGE_KERNEL);
+ -1);
if (bp->b_addr)
break;
vm_unmap_aliases();
@@ -649,7 +665,6 @@
*/
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- ASSERT(bp->b_iodone == NULL);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
@@ -682,53 +697,39 @@
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
* more hits than misses.
*/
-struct xfs_buf *
+int
xfs_buf_get_map(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
int error = 0;
+ *bpp = NULL;
error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
-
- switch (error) {
- case 0:
- /* cache hit */
+ if (!error)
goto found;
- case -EAGAIN:
- /* cache hit, trylock failure, caller handles failure */
- ASSERT(flags & XBF_TRYLOCK);
- return NULL;
- case -ENOENT:
- /* cache miss, go for insert */
- break;
- case -EFSCORRUPTED:
- default:
- /*
- * None of the higher layers understand failure types
- * yet, so return NULL to signal a fatal lookup error.
- */
- return NULL;
- }
+ if (error != -ENOENT)
+ return error;
- new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
- if (unlikely(!new_bp))
- return NULL;
+ error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
+ if (error)
+ return error;
error = xfs_buf_allocate_memory(new_bp, flags);
if (error) {
xfs_buf_free(new_bp);
- return NULL;
+ return error;
}
error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
if (error) {
xfs_buf_free(new_bp);
- return NULL;
+ return error;
}
if (bp != new_bp)
@@ -738,10 +739,11 @@
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
- xfs_warn(target->bt_mount,
- "%s: failed to map pagesn", __func__);
+ xfs_warn_ratelimited(target->bt_mount,
+ "%s: failed to map %u pages", __func__,
+ bp->b_page_count);
xfs_buf_relse(bp);
- return NULL;
+ return error;
}
}
@@ -754,10 +756,11 @@
XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
- return bp;
+ *bpp = bp;
+ return 0;
}
-STATIC int
+int
_xfs_buf_read(
xfs_buf_t *bp,
xfs_buf_flags_t flags)
@@ -765,7 +768,7 @@
ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
- bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
return xfs_buf_submit(bp);
@@ -806,46 +809,77 @@
return bp->b_error;
}
-xfs_buf_t *
+int
xfs_buf_read_map(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
- const struct xfs_buf_ops *ops)
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops,
+ xfs_failaddr_t fa)
{
struct xfs_buf *bp;
+ int error;
flags |= XBF_READ;
+ *bpp = NULL;
- bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (!bp)
- return NULL;
+ error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+ if (error)
+ return error;
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!(bp->b_flags & XBF_DONE)) {
+ /* Initiate the buffer read and wait. */
XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
- _xfs_buf_read(bp, flags);
- return bp;
+ error = _xfs_buf_read(bp, flags);
+
+ /* Readahead iodone already dropped the buffer, so exit. */
+ if (flags & XBF_ASYNC)
+ return 0;
+ } else {
+ /* Buffer already read; all we need to do is check it. */
+ error = xfs_buf_reverify(bp, ops);
+
+ /* Readahead already finished; drop the buffer and exit. */
+ if (flags & XBF_ASYNC) {
+ xfs_buf_relse(bp);
+ return 0;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
}
- xfs_buf_reverify(bp, ops);
+ /*
+ * If we've had a read error, then the contents of the buffer are
+ * invalid and should not be used. To ensure that a followup read tries
+ * to pull the buffer from disk again, we clear the XBF_DONE flag and
+ * mark the buffer stale. This ensures that anyone who has a current
+ * reference to the buffer will interpret its contents correctly and
+ * future cache lookups will also treat it as an empty, uninitialised
+ * buffer.
+ */
+ if (error) {
+ if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
+ xfs_buf_ioerror_alert(bp, fa);
- if (flags & XBF_ASYNC) {
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
xfs_buf_relse(bp);
- return NULL;
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
}
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- ASSERT(bp->b_ops != NULL || ops == NULL);
- return bp;
+ *bpp = bp;
+ return 0;
}
/*
@@ -859,11 +893,14 @@
int nmaps,
const struct xfs_buf_ops *ops)
{
+ struct xfs_buf *bp;
+
if (bdi_read_congested(target->bt_bdev->bd_bdi))
return;
xfs_buf_read_map(target, map, nmaps,
- XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
+ XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
+ __this_address);
}
/*
@@ -880,12 +917,13 @@
const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
+ int error;
*bpp = NULL;
- bp = xfs_buf_get_uncached(target, numblks, flags);
- if (!bp)
- return -ENOMEM;
+ error = xfs_buf_get_uncached(target, numblks, flags, &bp);
+ if (error)
+ return error;
/* set up the buffer for a read IO */
ASSERT(bp->b_map_count == 1);
@@ -896,7 +934,7 @@
xfs_buf_submit(bp);
if (bp->b_error) {
- int error = bp->b_error;
+ error = bp->b_error;
xfs_buf_relse(bp);
return error;
}
@@ -905,20 +943,23 @@
return 0;
}
-xfs_buf_t *
+int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- int flags)
+ int flags,
+ struct xfs_buf **bpp)
{
unsigned long page_count;
int error, i;
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
+ *bpp = NULL;
+
/* flags might contain irrelevant bits, pass only what we care about */
- bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
- if (unlikely(bp == NULL))
+ error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
+ if (error)
goto fail;
page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
@@ -928,8 +969,10 @@
for (i = 0; i < page_count; i++) {
bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
- if (!bp->b_pages[i])
+ if (!bp->b_pages[i]) {
+ error = -ENOMEM;
goto fail_free_mem;
+ }
}
bp->b_flags |= _XBF_PAGES;
@@ -941,7 +984,8 @@
}
trace_xfs_buf_get_uncached(bp, _RET_IP_);
- return bp;
+ *bpp = bp;
+ return 0;
fail_free_mem:
while (--i >= 0)
@@ -949,9 +993,9 @@
_xfs_buf_free_pages(bp);
fail_free_buf:
xfs_buf_free_maps(bp);
- kmem_zone_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_zone, bp);
fail:
- return NULL;
+ return error;
}
/*
@@ -1135,20 +1179,145 @@
set_current_state(TASK_RUNNING);
}
-/*
- * Buffer Utility Routines
- */
+static void
+xfs_buf_ioerror_alert_ratelimited(
+ struct xfs_buf *bp)
+{
+ static unsigned long lasttime;
+ static struct xfs_buftarg *lasttarg;
-void
+ if (bp->b_target != lasttarg ||
+ time_after(jiffies, (lasttime + 5*HZ))) {
+ lasttime = jiffies;
+ xfs_buf_ioerror_alert(bp, __this_address);
+ }
+ lasttarg = bp->b_target;
+}
+
+/*
+ * Account for this latest trip around the retry handler, and decide if
+ * we've failed enough times to constitute a permanent failure.
+ */
+static bool
+xfs_buf_ioerror_permanent(
+ struct xfs_buf *bp,
+ struct xfs_error_cfg *cfg)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
+ ++bp->b_retries > cfg->max_retries)
+ return true;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
+ return true;
+
+ /* At unmount we may treat errors differently */
+ if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+ return true;
+
+ return false;
+}
+
+/*
+ * On a sync write or shutdown we just want to stale the buffer and let the
+ * caller handle the error in bp->b_error appropriately.
+ *
+ * If the write was asynchronous then no one will be looking for the error. If
+ * this is the first failure of this type, clear the error state and write the
+ * buffer out again. This means we always retry an async write failure at least
+ * once, but we also need to set the buffer up to behave correctly now for
+ * repeated failures.
+ *
+ * If we get repeated async write failures, then we take action according to the
+ * error configuration we have been set up to use.
+ *
+ * Returns true if this function took care of error handling and the caller must
+ * not touch the buffer again. Return false if the caller should proceed with
+ * normal I/O completion handling.
+ */
+static bool
+xfs_buf_ioend_handle_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_error_cfg *cfg;
+
+ /*
+ * If we've already decided to shutdown the filesystem because of I/O
+ * errors, there's no point in giving this a retry.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp))
+ goto out_stale;
+
+ xfs_buf_ioerror_alert_ratelimited(bp);
+
+ /*
+ * We're not going to bother about retrying this during recovery.
+ * One strike!
+ */
+ if (bp->b_flags & _XBF_LOGRECOVERY) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+ }
+
+ /*
+ * Synchronous writes will have callers process the error.
+ */
+ if (!(bp->b_flags & XBF_ASYNC))
+ goto out_stale;
+
+ trace_xfs_buf_iodone_async(bp, _RET_IP_);
+
+ cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+ if (bp->b_last_error != bp->b_error ||
+ !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
+ bp->b_last_error = bp->b_error;
+ if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
+ !bp->b_first_retry_time)
+ bp->b_first_retry_time = jiffies;
+ goto resubmit;
+ }
+
+ /*
+ * Permanent error - we need to trigger a shutdown if we haven't already
+ * to indicate that inconsistency will result from this action.
+ */
+ if (xfs_buf_ioerror_permanent(bp, cfg)) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out_stale;
+ }
+
+ /* Still considered a transient error. Caller will schedule retries. */
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_io_fail(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_io_fail(bp);
+ else
+ ASSERT(list_empty(&bp->b_li_list));
+ xfs_buf_ioerror(bp, 0);
+ xfs_buf_relse(bp);
+ return true;
+
+resubmit:
+ xfs_buf_ioerror(bp, 0);
+ bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ xfs_buf_submit(bp);
+ return true;
+out_stale:
+ xfs_buf_stale(bp);
+ bp->b_flags |= XBF_DONE;
+ bp->b_flags &= ~XBF_WRITE;
+ trace_xfs_buf_error_relse(bp, _RET_IP_);
+ return false;
+}
+
+static void
xfs_buf_ioend(
struct xfs_buf *bp)
{
- bool read = bp->b_flags & XBF_READ;
-
trace_xfs_buf_iodone(bp, _RET_IP_);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-
/*
* Pull in IO completion errors now. We are guaranteed to be running
* single threaded, so we don't need the lock to read b_io_error.
@@ -1156,20 +1325,44 @@
if (!bp->b_error && bp->b_io_error)
xfs_buf_ioerror(bp, bp->b_io_error);
- /* Only validate buffers that were read without errors */
- if (read && !bp->b_error && bp->b_ops) {
- ASSERT(!bp->b_iodone);
- bp->b_ops->verify_read(bp);
+ if (bp->b_flags & XBF_READ) {
+ if (!bp->b_error && bp->b_ops)
+ bp->b_ops->verify_read(bp);
+ if (!bp->b_error)
+ bp->b_flags |= XBF_DONE;
+ } else {
+ if (!bp->b_error) {
+ bp->b_flags &= ~XBF_WRITE_FAIL;
+ bp->b_flags |= XBF_DONE;
+ }
+
+ if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
+ return;
+
+ /* clear the retry state */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
+ bp->b_first_retry_time = 0;
+
+ /*
+ * Note that for things like remote attribute buffers, there may
+ * not be a buffer log item here, so processing the buffer log
+ * item must remain optional.
+ */
+ if (bp->b_log_item)
+ xfs_buf_item_done(bp);
+
+ if (bp->b_flags & _XBF_INODES)
+ xfs_buf_inode_iodone(bp);
+ else if (bp->b_flags & _XBF_DQUOTS)
+ xfs_buf_dquot_iodone(bp);
+
}
- if (!bp->b_error) {
- bp->b_flags &= ~XBF_WRITE_FAIL;
- bp->b_flags |= XBF_DONE;
- }
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
+ _XBF_LOGRECOVERY);
- if (bp->b_iodone)
- (*(bp->b_iodone))(bp);
- else if (bp->b_flags & XBF_ASYNC)
+ if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
else
complete(&bp->b_iowait);
@@ -1207,12 +1400,28 @@
void
xfs_buf_ioerror_alert(
struct xfs_buf *bp,
- const char *func)
+ xfs_failaddr_t func)
{
- xfs_alert(bp->b_mount,
-"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
- func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
- -bp->b_error);
+ xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
+ "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
+ func, (uint64_t)XFS_BUF_ADDR(bp),
+ bp->b_length, -bp->b_error);
+}
+
+/*
+ * To simulate an I/O failure, the buffer must be locked and held with at least
+ * three references. The LRU reference is dropped by the stale call. The buf
+ * item reference is dropped via ioend processing. The third reference is owned
+ * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
+ */
+void
+xfs_buf_ioend_fail(
+ struct xfs_buf *bp)
+{
+ bp->b_flags &= ~XBF_DONE;
+ xfs_buf_stale(bp);
+ xfs_buf_ioerror(bp, -EIO);
+ xfs_buf_ioend(bp);
}
int
@@ -1239,6 +1448,11 @@
{
struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
+ if (!bio->bi_status &&
+ (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
+ XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ bio->bi_status = BLK_STS_IOERR;
+
/*
* don't overwrite existing errors - otherwise we can lose errors on
* buffers that require multiple bios to complete.
@@ -1263,8 +1477,7 @@
int map,
int *buf_offset,
int *count,
- int op,
- int op_flags)
+ int op)
{
int page_index;
int total_nr_pages = bp->b_page_count;
@@ -1299,7 +1512,7 @@
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio_set_op_attrs(bio, op, op_flags);
+ bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
@@ -1344,7 +1557,6 @@
{
struct blk_plug plug;
int op;
- int op_flags = 0;
int offset;
int size;
int i;
@@ -1386,15 +1598,14 @@
dump_stack();
}
}
- } else if (bp->b_flags & XBF_READ_AHEAD) {
- op = REQ_OP_READ;
- op_flags = REQ_RAHEAD;
} else {
op = REQ_OP_READ;
+ if (bp->b_flags & XBF_READ_AHEAD)
+ op |= REQ_RAHEAD;
}
/* we only use the buffer cache for meta-data */
- op_flags |= REQ_META;
+ op |= REQ_META;
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
@@ -1406,7 +1617,7 @@
size = BBTOB(bp->b_length);
blk_start_plug(&plug);
for (i = 0; i < bp->b_map_count; i++) {
- xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
+ xfs_buf_ioapply_map(bp, i, &offset, &size, op);
if (bp->b_error)
break;
if (size <= 0)
@@ -1437,7 +1648,7 @@
* safe to reference the buffer after a call to this function unless the caller
* holds an additional reference itself.
*/
-int
+static int
__xfs_buf_submit(
struct xfs_buf *bp,
bool wait)
@@ -1450,10 +1661,7 @@
/* on shutdown we stale and complete the buffer immediately */
if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror(bp, -EIO);
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ xfs_buf_ioend_fail(bp);
return -EIO;
}
@@ -1547,6 +1755,28 @@
}
/*
+ * Log a message about and stale a buffer that a caller has decided is corrupt.
+ *
+ * This function should be called for the kinds of metadata corruption that
+ * cannot be detected by a verifier, such as incorrect inter-block relationship
+ * data. Do /not/ call this function from a verifier function.
+ *
+ * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
+ * be marked stale, but b_error will not be set. The caller is responsible for
+ * releasing the buffer or fixing it.
+ */
+void
+__xfs_buf_mark_corrupt(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+
+ xfs_buf_corruption_error(bp, fa);
+ xfs_buf_stale(bp);
+}
+
+/*
* Handling of buffer targets (buftargs).
*/
@@ -1590,7 +1820,8 @@
struct xfs_buftarg *btp)
{
LIST_HEAD(dispose);
- int loop = 0;
+ int loop = 0;
+ bool write_fail = false;
/*
* First wait on the buftarg I/O count for all in-flight buffers to be
@@ -1618,17 +1849,29 @@
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
if (bp->b_flags & XBF_WRITE_FAIL) {
- xfs_alert(btp->bt_mount,
+ write_fail = true;
+ xfs_buf_alert_ratelimited(bp,
+ "XFS: Corruption Alert",
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
(long long)bp->b_bn);
- xfs_alert(btp->bt_mount,
-"Please run xfs_repair to determine the extent of the problem.");
}
xfs_buf_rele(bp);
}
if (loop++ != 0)
delay(100);
}
+
+ /*
+ * If one or more failed buffers were freed, that means dirty metadata
+ * was thrown away. This should only ever happen after I/O completion
+ * handling has elevated I/O error(s) to permanent failures and shuts
+ * down the fs.
+ */
+ if (write_fail) {
+ ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
+ xfs_alert(btp->bt_mount,
+ "Please run xfs_repair to determine the extent of the problem.");
+ }
}
static enum lru_status
@@ -1761,6 +2004,13 @@
btp->bt_bdev = bdev;
btp->bt_daxdev = dax_dev;
+ /*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages
+ * per 30 seconds so as to not spam logs too much on repeated errors.
+ */
+ ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+ DEFAULT_RATELIMIT_BURST);
+
if (xfs_setsize_buftarg_early(btp, bdev))
goto error_free;
@@ -1864,9 +2114,9 @@
*/
static int
xfs_buf_cmp(
- void *priv,
- struct list_head *a,
- struct list_head *b)
+ void *priv,
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
@@ -2065,8 +2315,11 @@
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
- KM_ZONE_HWALIGN, NULL);
+ xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_buf_zone)
goto out;
@@ -2079,7 +2332,7 @@
void
xfs_buf_terminate(void)
{
- kmem_zone_destroy(xfs_buf_zone);
+ kmem_cache_destroy(xfs_buf_zone);
}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index f6ce17d..bfd2907 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -18,6 +18,7 @@
/*
* Base types
*/
+struct xfs_buf;
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
@@ -30,15 +31,20 @@
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
-/* flags used only as arguments to access routines */
-#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */
+/* buffer type flags for write callbacks */
+#define _XBF_INODES (1 << 16)/* inode buffer */
+#define _XBF_DQUOTS (1 << 17)/* dquot buffer */
+#define _XBF_LOGRECOVERY (1 << 18)/* log recovery buffer */
/* flags used only internally */
#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
+/* flags used only as arguments to access routines */
+#define XBF_TRYLOCK (1 << 30)/* lock requested, but do not wait */
+#define XBF_UNMAPPED (1 << 31)/* do not map the buffer */
+
typedef unsigned int xfs_buf_flags_t;
#define XFS_BUF_FLAGS \
@@ -50,12 +56,15 @@
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
- { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
- { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
+ { _XBF_INODES, "INODES" }, \
+ { _XBF_DQUOTS, "DQUOTS" }, \
+ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }
-
+ { _XBF_DELWRI_Q, "DELWRI_Q" }, \
+ /* The following interface flags should never be set */ \
+ { XBF_TRYLOCK, "TRYLOCK" }, \
+ { XBF_UNMAPPED, "UNMAPPED" }
/*
* Internal state flags.
@@ -91,12 +100,9 @@
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
+ struct ratelimit_state bt_ioerror_rl;
} xfs_buftarg_t;
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-
-
#define XB_PAGES 2
struct xfs_buf_map {
@@ -149,7 +155,6 @@
xfs_buftarg_t *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
struct work_struct b_ioend_work;
- xfs_buf_iodone_t b_iodone; /* I/O completion function */
struct completion b_iowait; /* queue for I/O waiters */
struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
@@ -192,37 +197,40 @@
xfs_daddr_t blkno, size_t numblks,
xfs_buf_flags_t flags);
-struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags);
-struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags,
- const struct xfs_buf_ops *ops);
+int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
+ int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
+int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
+ int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops, xfs_failaddr_t fa);
void xfs_buf_readahead_map(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
const struct xfs_buf_ops *ops);
-static inline struct xfs_buf *
+static inline int
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
- size_t numblks)
+ size_t numblks,
+ struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_get_map(target, &map, 1, 0);
+
+ return xfs_buf_get_map(target, &map, 1, 0, bpp);
}
-static inline struct xfs_buf *
+static inline int
xfs_buf_read(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
size_t numblks,
xfs_buf_flags_t flags,
+ struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_read_map(target, &map, 1, flags, ops);
+
+ return xfs_buf_read_map(target, &map, 1, flags, bpp, ops,
+ __builtin_return_address(0));
}
static inline void
@@ -236,15 +244,15 @@
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
- int flags);
+int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags,
+ struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
size_t numblks, int flags, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
+int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
-extern void xfs_buf_free(xfs_buf_t *);
extern void xfs_buf_rele(xfs_buf_t *);
/* Locking and Unlocking Buffers */
@@ -254,22 +262,23 @@
#define xfs_buf_islocked(bp) \
((bp)->b_sema.count <= 0)
+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+ xfs_buf_unlock(bp);
+ xfs_buf_rele(bp);
+}
+
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-extern void xfs_buf_ioend(struct xfs_buf *bp);
+
extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
-extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
-
-extern int __xfs_buf_submit(struct xfs_buf *bp, bool);
-static inline int xfs_buf_submit(struct xfs_buf *bp)
-{
- bool wait = bp->b_flags & XBF_ASYNC ? false : true;
- return __xfs_buf_submit(bp, wait);
-}
-
+extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
+void xfs_buf_ioend_fail(struct xfs_buf *);
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
+void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
+#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
@@ -318,12 +327,6 @@
return atomic_read(&bp->b_pin_count);
}
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
- xfs_buf_unlock(bp);
- xfs_buf_rele(bp);
-}
-
static inline int
xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
{
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d74fbd1..0356f2e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -12,8 +12,13 @@
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
-#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
@@ -25,7 +30,22 @@
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
-STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
+/* Is this log iovec plausibly large enough to contain the buffer log format? */
+bool
+xfs_buf_log_check_iovec(
+ struct xfs_log_iovec *iovec)
+{
+ struct xfs_buf_log_format *blfp = iovec->i_addr;
+ char *bmp_end;
+ char *item_end;
+
+ if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
+ return false;
+
+ item_end = (char *)iovec->i_addr + iovec->i_len;
+ bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
+ return bmp_end <= item_end;
+}
static inline int
xfs_buf_log_format_size(
@@ -105,7 +125,7 @@
* stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
* in a single iovec.
*
- * Discontiguous buffers need a format structure per region that that is being
+ * Discontiguous buffers need a format structure per region that is being
* logged. This makes the changes in the buffer appear to log recovery as though
* they came from separate buffers, just like would occur if multiple buffers
* were used instead of a single discontiguous buffer. This enables
@@ -328,7 +348,7 @@
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -393,7 +413,6 @@
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
xfs_buf_t *bp = bip->bli_buf;
- struct xfs_ail *ailp = lip->li_ailp;
int stale = bip->bli_flags & XFS_BLI_STALE;
int freed;
@@ -435,58 +454,33 @@
}
/*
- * If we get called here because of an IO error, we may
- * or may not have the item on the AIL. xfs_trans_ail_delete()
- * will take care of that situation.
- * xfs_trans_ail_delete() drops the AIL lock.
+ * If we get called here because of an IO error, we may or may
+ * not have the item on the AIL. xfs_trans_ail_delete() will
+ * take care of that situation. xfs_trans_ail_delete() drops
+ * the AIL lock.
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_do_callbacks(bp);
- bp->b_log_item = NULL;
- list_del_init(&bp->b_li_list);
- bp->b_iodone = NULL;
+ xfs_buf_item_done(bp);
+ xfs_buf_inode_iodone(bp);
+ ASSERT(list_empty(&bp->b_li_list));
} else {
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
} else if (freed && remove) {
/*
- * There are currently two references to the buffer - the active
- * LRU reference and the buf log item. What we are about to do
- * here - simulate a failed IO completion - requires 3
- * references.
- *
- * The LRU reference is removed by the xfs_buf_stale() call. The
- * buf item reference is removed by the xfs_buf_iodone()
- * callback that is run by xfs_buf_do_callbacks() during ioend
- * processing (via the bp->b_iodone callback), and then finally
- * the ioend processing will drop the IO reference if the buffer
- * is marked XBF_ASYNC.
- *
- * Hence we need to take an additional reference here so that IO
- * completion processing doesn't free the buffer prematurely.
+ * The buffer must be locked and held by the caller to simulate
+ * an async I/O failure.
*/
xfs_buf_lock(bp);
xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
- xfs_buf_ioerror(bp, -EIO);
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ xfs_buf_ioend_fail(bp);
}
}
-/*
- * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
- * seconds so as to not spam logs too much on repeated detection of the same
- * buffer being bad..
- */
-
-static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
-
STATIC uint
xfs_buf_item_push(
struct xfs_log_item *lip,
@@ -516,11 +510,10 @@
trace_xfs_buf_item_push(bip);
/* has a previous flush failed due to IO errors? */
- if ((bp->b_flags & XBF_WRITE_FAIL) &&
- ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
- xfs_warn(bp->b_mount,
-"Failing async write on buffer block 0x%llx. Retrying async write.",
- (long long)bp->b_bn);
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ xfs_buf_alert_ratelimited(bp, "XFS: Failing async write",
+ "Failing async write on buffer block 0x%llx. Retrying async write.",
+ (long long)bp->b_bn);
}
if (!xfs_buf_delwri_queue(bp, buffer_list))
@@ -567,7 +560,7 @@
* state.
*/
if (aborted)
- xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(lip, 0);
xfs_buf_item_relse(bip->bli_buf);
return true;
}
@@ -688,7 +681,7 @@
.iop_push = xfs_buf_item_push,
};
-STATIC int
+STATIC void
xfs_buf_item_get_format(
struct xfs_buf_log_item *bip,
int count)
@@ -698,14 +691,11 @@
if (count == 1) {
bip->bli_formats = &bip->__bli_format;
- return 0;
+ return;
}
bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
0);
- if (!bip->bli_formats)
- return -ENOMEM;
- return 0;
}
STATIC void
@@ -731,7 +721,6 @@
struct xfs_buf_log_item *bip = bp->b_log_item;
int chunks;
int map_size;
- int error;
int i;
/*
@@ -747,7 +736,7 @@
return 0;
}
- bip = kmem_zone_zalloc(xfs_buf_item_zone, 0);
+ bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
@@ -760,19 +749,22 @@
* Discontiguous buffer support follows the layout of the underlying
* buffer. This makes the implementation as simple as possible.
*/
- error = xfs_buf_item_get_format(bip, bp->b_map_count);
- ASSERT(error == 0);
- if (error) { /* to stop gcc throwing set-but-unused warnings */
- kmem_zone_free(xfs_buf_item_zone, bip);
- return error;
- }
-
+ xfs_buf_item_get_format(bip, bp->b_map_count);
for (i = 0; i < bip->bli_format_count; i++) {
chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
XFS_BLF_CHUNK);
map_size = DIV_ROUND_UP(chunks, NBWORD);
+ if (map_size > XFS_BLF_DATAMAP_SIZE) {
+ kmem_cache_free(xfs_buf_item_zone, bip);
+ xfs_err(mp,
+ "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
+ map_size,
+ BBTOB(bp->b_maps[i].bm_len));
+ return -EFSCORRUPTED;
+ }
+
bip->bli_formats[i].blf_type = XFS_LI_BUF;
bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn;
bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len;
@@ -805,6 +797,9 @@
uint end_bit;
uint mask;
+ ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
+ ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD);
+
/*
* Convert byte offsets to bit numbers.
*/
@@ -851,7 +846,7 @@
* first_bit and last_bit.
*/
while ((bits_to_set - bits_set) >= NBWORD) {
- *wordp |= 0xffffffff;
+ *wordp = 0xffffffff;
bits_set += NBWORD;
wordp++;
}
@@ -939,15 +934,11 @@
{
xfs_buf_item_free_format(bip);
kmem_free(bip->bli_item.li_lv_shadow);
- kmem_zone_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_zone, bip);
}
/*
- * This is called when the buf log item is no longer needed. It should
- * free the buf log item associated with the given buffer and clear
- * the buffer's pointer to the buf log item. If there are no more
- * items in the list, clear the b_iodone field of the buffer (see
- * xfs_buf_attach_iodone() below).
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
*/
void
xfs_buf_item_relse(
@@ -956,315 +947,31 @@
struct xfs_buf_log_item *bip = bp->b_log_item;
trace_xfs_buf_item_relse(bp, _RET_IP_);
- ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
bp->b_log_item = NULL;
- if (list_empty(&bp->b_li_list))
- bp->b_iodone = NULL;
-
xfs_buf_rele(bp);
xfs_buf_item_free(bip);
}
-
-/*
- * Add the given log item with its callback to the list of callbacks
- * to be called when the buffer's I/O completes. If it is not set
- * already, set the buffer's b_iodone() routine to be
- * xfs_buf_iodone_callbacks() and link the log item into the list of
- * items rooted at b_li_list.
- */
void
-xfs_buf_attach_iodone(
- struct xfs_buf *bp,
- void (*cb)(struct xfs_buf *, struct xfs_log_item *),
- struct xfs_log_item *lip)
-{
- ASSERT(xfs_buf_islocked(bp));
-
- lip->li_cb = cb;
- list_add_tail(&lip->li_bio_list, &bp->b_li_list);
-
- ASSERT(bp->b_iodone == NULL ||
- bp->b_iodone == xfs_buf_iodone_callbacks);
- bp->b_iodone = xfs_buf_iodone_callbacks;
-}
-
-/*
- * We can have many callbacks on a buffer. Running the callbacks individually
- * can cause a lot of contention on the AIL lock, so we allow for a single
- * callback to be able to scan the remaining items in bp->b_li_list for other
- * items of the same type and callback to be processed in the first call.
- *
- * As a result, the loop walking the callback list below will also modify the
- * list. it removes the first item from the list and then runs the callback.
- * The loop then restarts from the new first item int the list. This allows the
- * callback to scan and modify the list attached to the buffer and we don't
- * have to care about maintaining a next item pointer.
- */
-STATIC void
-xfs_buf_do_callbacks(
- struct xfs_buf *bp)
-{
- struct xfs_buf_log_item *blip = bp->b_log_item;
- struct xfs_log_item *lip;
-
- /* If there is a buf_log_item attached, run its callback */
- if (blip) {
- lip = &blip->bli_item;
- lip->li_cb(bp, lip);
- }
-
- while (!list_empty(&bp->b_li_list)) {
- lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
- li_bio_list);
-
- /*
- * Remove the item from the list, so we don't have any
- * confusion if the item is added to another buf.
- * Don't touch the log item after calling its
- * callback, because it could have freed itself.
- */
- list_del_init(&lip->li_bio_list);
- lip->li_cb(bp, lip);
- }
-}
-
-/*
- * Invoke the error state callback for each log item affected by the failed I/O.
- *
- * If a metadata buffer write fails with a non-permanent error, the buffer is
- * eventually resubmitted and so the completion callbacks are not run. The error
- * state may need to be propagated to the log items attached to the buffer,
- * however, so the next AIL push of the item knows hot to handle it correctly.
- */
-STATIC void
-xfs_buf_do_callbacks_fail(
- struct xfs_buf *bp)
-{
- struct xfs_log_item *lip;
- struct xfs_ail *ailp;
-
- /*
- * Buffer log item errors are handled directly by xfs_buf_item_push()
- * and xfs_buf_iodone_callback_error, and they have no IO error
- * callbacks. Check only for items in b_li_list.
- */
- if (list_empty(&bp->b_li_list))
- return;
-
- lip = list_first_entry(&bp->b_li_list, struct xfs_log_item,
- li_bio_list);
- ailp = lip->li_ailp;
- spin_lock(&ailp->ail_lock);
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (lip->li_ops->iop_error)
- lip->li_ops->iop_error(lip, bp);
- }
- spin_unlock(&ailp->ail_lock);
-}
-
-static bool
-xfs_buf_iodone_callback_error(
- struct xfs_buf *bp)
-{
- struct xfs_buf_log_item *bip = bp->b_log_item;
- struct xfs_log_item *lip;
- struct xfs_mount *mp;
- static ulong lasttime;
- static xfs_buftarg_t *lasttarg;
- struct xfs_error_cfg *cfg;
-
- /*
- * The failed buffer might not have a buf_log_item attached or the
- * log_item list might be empty. Get the mp from the available
- * xfs_log_item
- */
- lip = list_first_entry_or_null(&bp->b_li_list, struct xfs_log_item,
- li_bio_list);
- mp = lip ? lip->li_mountp : bip->bli_item.li_mountp;
-
- /*
- * If we've already decided to shutdown the filesystem because of
- * I/O errors, there's no point in giving this a retry.
- */
- if (XFS_FORCED_SHUTDOWN(mp))
- goto out_stale;
-
- if (bp->b_target != lasttarg ||
- time_after(jiffies, (lasttime + 5*HZ))) {
- lasttime = jiffies;
- xfs_buf_ioerror_alert(bp, __func__);
- }
- lasttarg = bp->b_target;
-
- /* synchronous writes will have callers process the error */
- if (!(bp->b_flags & XBF_ASYNC))
- goto out_stale;
-
- trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
- ASSERT(bp->b_iodone != NULL);
-
- cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
-
- /*
- * If the write was asynchronous then no one will be looking for the
- * error. If this is the first failure of this type, clear the error
- * state and write the buffer out again. This means we always retry an
- * async write failure at least once, but we also need to set the buffer
- * up to behave correctly now for repeated failures.
- */
- if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
- bp->b_last_error != bp->b_error) {
- bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
- bp->b_last_error = bp->b_error;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- !bp->b_first_retry_time)
- bp->b_first_retry_time = jiffies;
-
- xfs_buf_ioerror(bp, 0);
- xfs_buf_submit(bp);
- return true;
- }
-
- /*
- * Repeated failure on an async write. Take action according to the
- * error configuration we have been set up to use.
- */
-
- if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
- ++bp->b_retries > cfg->max_retries)
- goto permanent_error;
- if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
- time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
- goto permanent_error;
-
- /* At unmount we may treat errors differently */
- if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
- goto permanent_error;
-
- /*
- * Still a transient error, run IO completion failure callbacks and let
- * the higher layers retry the buffer.
- */
- xfs_buf_do_callbacks_fail(bp);
- xfs_buf_ioerror(bp, 0);
- xfs_buf_relse(bp);
- return true;
-
- /*
- * Permanent error - we need to trigger a shutdown if we haven't already
- * to indicate that inconsistency will result from this action.
- */
-permanent_error:
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-out_stale:
- xfs_buf_stale(bp);
- bp->b_flags |= XBF_DONE;
- trace_xfs_buf_error_relse(bp, _RET_IP_);
- return false;
-}
-
-/*
- * This is the iodone() function for buffers which have had callbacks attached
- * to them by xfs_buf_attach_iodone(). We need to iterate the items on the
- * callback list, mark the buffer as having no more callbacks and then push the
- * buffer through IO completion processing.
- */
-void
-xfs_buf_iodone_callbacks(
+xfs_buf_item_done(
struct xfs_buf *bp)
{
/*
- * If there is an error, process it. Some errors require us
- * to run callbacks after failure processing is done so we
- * detect that and take appropriate action.
- */
- if (bp->b_error && xfs_buf_iodone_callback_error(bp))
- return;
-
- /*
- * Successful IO or permanent error. Either way, we can clear the
- * retry state here in preparation for the next error that may occur.
- */
- bp->b_last_error = 0;
- bp->b_retries = 0;
- bp->b_first_retry_time = 0;
-
- xfs_buf_do_callbacks(bp);
- bp->b_log_item = NULL;
- list_del_init(&bp->b_li_list);
- bp->b_iodone = NULL;
- xfs_buf_ioend(bp);
-}
-
-/*
- * This is the iodone() function for buffers which have been
- * logged. It is called when they are eventually flushed out.
- * It should remove the buf item from the AIL, and free the buf item.
- * It is called by xfs_buf_iodone_callbacks() above which will take
- * care of cleaning up the buffer itself.
- */
-void
-xfs_buf_iodone(
- struct xfs_buf *bp,
- struct xfs_log_item *lip)
-{
- struct xfs_ail *ailp = lip->li_ailp;
-
- ASSERT(BUF_ITEM(lip)->bli_buf == bp);
-
- xfs_buf_rele(bp);
-
- /*
- * If we are forcibly shutting down, this may well be
- * off the AIL already. That's because we simulate the
- * log-committed callbacks to unpin these buffers. Or we may never
- * have put this item on AIL because of the transaction was
- * aborted forcibly. xfs_trans_ail_delete() takes care of these.
+ * If we are forcibly shutting down, this may well be off the AIL
+ * already. That's because we simulate the log-committed callbacks to
+ * unpin these buffers. Or we may never have put this item on AIL
+ * because the transaction was aborted forcibly.
+ * xfs_trans_ail_delete() takes care of these.
*
* Either way, AIL is useless if we're forcing a shutdown.
+ *
+ * Note that log recovery writes might have buffer items that are not on
+ * the AIL even when the file system is not shut down.
*/
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
- xfs_buf_item_free(BUF_ITEM(lip));
-}
-
-/*
- * Requeue a failed buffer for writeback.
- *
- * We clear the log item failed state here as well, but we have to be careful
- * about reference counts because the only active reference counts on the buffer
- * may be the failed log items. Hence if we clear the log item failed state
- * before queuing the buffer for IO we can release all active references to
- * the buffer and free it, leading to use after free problems in
- * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
- * order we process them in - the buffer is locked, and we own the buffer list
- * so nothing on them is going to change while we are performing this action.
- *
- * Hence we can safely queue the buffer for IO before we clear the failed log
- * item state, therefore always having an active reference to the buffer and
- * avoiding the transient zero-reference state that leads to use-after-free.
- *
- * Return true if the buffer was added to the buffer list, false if it was
- * already on the buffer list.
- */
-bool
-xfs_buf_resubmit_failed_buffers(
- struct xfs_buf *bp,
- struct list_head *buffer_list)
-{
- struct xfs_log_item *lip;
- bool ret;
-
- ret = xfs_buf_delwri_queue(bp, buffer_list);
-
- /*
- * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
- * function already have it acquired
- */
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
- xfs_clear_li_failed(lip);
-
- return ret;
+ xfs_trans_ail_delete(&bp->b_log_item->bli_item,
+ (bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
+ SHUTDOWN_CORRUPT_INCORE);
+ xfs_buf_item_relse(bp);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 4a054b1..50aa0f5 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -50,17 +50,26 @@
};
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void xfs_buf_item_done(struct xfs_buf *bp);
void xfs_buf_item_relse(struct xfs_buf *);
bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
-void xfs_buf_attach_iodone(struct xfs_buf *,
- void(*)(struct xfs_buf *, struct xfs_log_item *),
- struct xfs_log_item *);
-void xfs_buf_iodone_callbacks(struct xfs_buf *);
-void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
-bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
- struct list_head *);
+void xfs_buf_inode_iodone(struct xfs_buf *);
+void xfs_buf_inode_io_fail(struct xfs_buf *bp);
+#ifdef CONFIG_XFS_QUOTA
+void xfs_buf_dquot_iodone(struct xfs_buf *);
+void xfs_buf_dquot_io_fail(struct xfs_buf *bp);
+#else
+static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
+{
+}
+static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp)
+{
+}
+#endif /* CONFIG_XFS_QUOTA */
+void xfs_buf_iodone(struct xfs_buf *);
+bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
extern kmem_zone_t *xfs_buf_item_zone;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
new file mode 100644
index 0000000..d44e8b4
--- /dev/null
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -0,0 +1,984 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_error.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_quota.h"
+
+/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+ xfs_daddr_t bc_blkno;
+ uint bc_len;
+ int bc_refcount;
+ struct list_head bc_list;
+};
+
+static struct xfs_buf_cancel *
+xlog_find_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
+
+ if (!log->l_buf_cancel_table)
+ return NULL;
+
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+ return bcp;
+ }
+
+ return NULL;
+}
+
+static bool
+xlog_add_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct xfs_buf_cancel *bcp;
+
+ /*
+ * If we find an existing cancel record, this indicates that the buffer
+ * was cancelled multiple times. To ensure that during pass 2 we keep
+ * the record in the table until we reach its last occurrence in the
+ * log, a reference count is kept to tell how many times we expect to
+ * see this record during the second pass.
+ */
+ bcp = xlog_find_buffer_cancelled(log, blkno, len);
+ if (bcp) {
+ bcp->bc_refcount++;
+ return false;
+ }
+
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+ bcp->bc_blkno = blkno;
+ bcp->bc_len = len;
+ bcp->bc_refcount = 1;
+ list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno));
+ return true;
+}
+
+/*
+ * Check if there is an entry for blkno, len in the buffer cancel record table.
+ */
+bool
+xlog_is_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ return xlog_find_buffer_cancelled(log, blkno, len) != NULL;
+}
+
+/*
+ * Check if there is an entry for blkno, len in the buffer cancel record table,
+ * and decrement the reference count on it if there is one.
+ *
+ * Remove the cancel record once the refcount hits zero, so that if the same
+ * buffer is re-used again after its last cancellation we actually replay the
+ * changes made at that point.
+ */
+static bool
+xlog_put_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len)
+{
+ struct xfs_buf_cancel *bcp;
+
+ bcp = xlog_find_buffer_cancelled(log, blkno, len);
+ if (!bcp) {
+ ASSERT(0);
+ return false;
+ }
+
+ if (--bcp->bc_refcount == 0) {
+ list_del(&bcp->bc_list);
+ kmem_free(bcp);
+ }
+ return true;
+}
+
+/* log buffer item recovery */
+
+/*
+ * Sort buffer items for log recovery. Most buffer items should end up on the
+ * buffer list and are recovered first, with the following exceptions:
+ *
+ * 1. XFS_BLF_CANCEL buffers must be processed last because some log items
+ * might depend on the incore cancellation record, and replaying a cancelled
+ * buffer item can remove the incore record.
+ *
+ * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that
+ * we replay di_next_unlinked only after flushing the inode 'free' state
+ * to the inode buffer.
+ *
+ * See xlog_recover_reorder_trans for more details.
+ */
+STATIC enum xlog_recover_reorder
+xlog_recover_buf_reorder(
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+
+ if (buf_f->blf_flags & XFS_BLF_CANCEL)
+ return XLOG_REORDER_CANCEL_LIST;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ return XLOG_REORDER_INODE_BUFFER_LIST;
+ return XLOG_REORDER_BUFFER_LIST;
+}
+
+STATIC void
+xlog_recover_buf_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
+}
+
+/*
+ * Build up the table of buf cancel records so that we don't replay cancelled
+ * data in the second pass.
+ */
+static int
+xlog_recover_buf_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr;
+
+ if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
+ xfs_err(log->l_mp, "bad buffer log item size (%d)",
+ item->ri_buf[0].i_len);
+ return -EFSCORRUPTED;
+ }
+
+ if (!(bf->blf_flags & XFS_BLF_CANCEL))
+ trace_xfs_log_recover_buf_not_cancel(log, bf);
+ else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len))
+ trace_xfs_log_recover_buf_cancel_add(log, bf);
+ else
+ trace_xfs_log_recover_buf_cancel_ref_inc(log, bf);
+ return 0;
+}
+
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
+ char *warnmsg = NULL;
+
+ /*
+ * We can only do post recovery validation on items on CRC enabled
+ * filesystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_cntbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ case XFS_RMAP_CRC_MAGIC:
+ bp->b_ops = &xfs_rmapbt_buf_ops;
+ break;
+ case XFS_REFC_CRC_MAGIC:
+ bp->b_ops = &xfs_refcountbt_buf_ops;
+ break;
+ default:
+ warnmsg = "Bad btree block magic!";
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ warnmsg = "Bad AGF block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (magic32 != XFS_AGFL_MAGIC) {
+ warnmsg = "Bad AGFL block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ warnmsg = "Bad AGI block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ warnmsg = "Bad DQUOT block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ if (magic16 != XFS_DINODE_MAGIC) {
+ warnmsg = "Bad INODE block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ warnmsg = "Bad symlink block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ warnmsg = "Bad dir block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ warnmsg = "Bad dir data magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ warnmsg = "Bad dir3 free magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ warnmsg = "Bad dir leaf1 magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ warnmsg = "Bad dir leafn magic!";
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ warnmsg = "Bad da node magic!";
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ warnmsg = "Bad attr leaf magic!";
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ warnmsg = "Bad attr remote magic!";
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ warnmsg = "Bad SB block magic!";
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+#ifdef CONFIG_XFS_RT
+ case XFS_BLFT_RTBITMAP_BUF:
+ case XFS_BLFT_RTSUMMARY_BUF:
+ /* no magic numbers for verification of RT buffers */
+ bp->b_ops = &xfs_rtbuf_ops;
+ break;
+#endif /* CONFIG_XFS_RT */
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+
+ /*
+ * Nothing else to do in the case of a NULL current LSN as this means
+ * the buffer is more recent than the change in the log and will be
+ * skipped.
+ */
+ if (current_lsn == NULLCOMMITLSN)
+ return;
+
+ if (warnmsg) {
+ xfs_warn(mp, warnmsg);
+ ASSERT(0);
+ }
+
+ /*
+ * We must update the metadata LSN of the buffer as it is written out to
+ * ensure that older transactions never replay over this one and corrupt
+ * the buffer. This can occur if log recovery is interrupted at some
+ * point after the current transaction completes, at which point a
+ * subsequent mount starts recovery from the beginning.
+ *
+ * Write verifiers update the metadata LSN from log items attached to
+ * the buffer. Therefore, initialize a bli purely to carry the LSN to
+ * the verifier.
+ */
+ if (bp->b_ops) {
+ struct xfs_buf_log_item *bip;
+
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_item_init(bp, mp);
+ bip = bp->b_log_item;
+ bip->bli_item.li_lsn = current_lsn;
+ }
+}
+
+/*
+ * Perform a 'normal' buffer recovery. Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer. The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+STATIC void
+xlog_recover_do_reg_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ int i;
+ int bit;
+ int nbits;
+ xfs_failaddr_t fa;
+ const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
+
+ trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
+ bit = 0;
+ i = 1; /* 0 is the buf format structure */
+ while (1) {
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ if (bit == -1)
+ break;
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ ASSERT(item->ri_buf[i].i_addr != NULL);
+ ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(BBTOB(bp->b_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
+
+ /*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+
+ /*
+ * Do a sanity check if this is a dquot buffer. Just checking
+ * the first dquot in the buffer should do. XXXThis is
+ * probably a good thing to do for other buf types also.
+ */
+ fa = NULL;
+ if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ if (item->ri_buf[i].i_addr == NULL) {
+ xfs_alert(mp,
+ "XFS: NULL dquot in %s.", __func__);
+ goto next;
+ }
+ if (item->ri_buf[i].i_len < size_disk_dquot) {
+ xfs_alert(mp,
+ "XFS: dquot too small (%d) in %s.",
+ item->ri_buf[i].i_len, __func__);
+ goto next;
+ }
+ fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+ if (fa) {
+ xfs_alert(mp,
+ "dquot corrupt at %pS trying to replay into block 0x%llx",
+ fa, bp->b_bn);
+ goto next;
+ }
+ }
+
+ memcpy(xfs_buf_offset(bp,
+ (uint)bit << XFS_BLF_SHIFT), /* dest */
+ item->ri_buf[i].i_addr, /* source */
+ nbits<<XFS_BLF_SHIFT); /* length */
+ next:
+ i++;
+ bit += nbits;
+ }
+
+ /* Shouldn't be any more regions */
+ ASSERT(i == item->ri_total);
+
+ xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
+}
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ *
+ * Return false if the buffer was tossed and true if we recovered the buffer to
+ * indicate to the caller if the buffer needs writing.
+ */
+STATIC bool
+xlog_recover_do_dquot_buffer(
+ struct xfs_mount *mp,
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ uint type;
+
+ trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (!mp->m_qflags)
+ return false;
+
+ type = 0;
+ if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
+ type |= XFS_DQTYPE_USER;
+ if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
+ type |= XFS_DQTYPE_PROJ;
+ if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
+ type |= XFS_DQTYPE_GROUP;
+ /*
+ * This type of quotas was turned off, so ignore this buffer
+ */
+ if (log->l_quotaoffs_flag & type)
+ return false;
+
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
+ return true;
+}
+
+/*
+ * Perform recovery for a buffer full of inodes. In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures. The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
+ *
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes. In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
+{
+ int i;
+ int item_index = 0;
+ int bit = 0;
+ int nbits = 0;
+ int reg_buf_offset = 0;
+ int reg_buf_bytes = 0;
+ int next_unlinked_offset;
+ int inodes_per_buf;
+ xfs_agino_t *logged_nextp;
+ xfs_agino_t *buffer_nextp;
+
+ trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
+ /*
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
+ for (i = 0; i < inodes_per_buf; i++) {
+ next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+ offsetof(xfs_dinode_t, di_next_unlinked);
+
+ while (next_unlinked_offset >=
+ (reg_buf_offset + reg_buf_bytes)) {
+ /*
+ * The next di_next_unlinked field is beyond
+ * the current logged region. Find the next
+ * logged region that contains or is beyond
+ * the current di_next_unlinked field.
+ */
+ bit += nbits;
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+
+ /*
+ * If there are no more logged regions in the
+ * buffer, then we're done.
+ */
+ if (bit == -1)
+ return 0;
+
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
+ ASSERT(nbits > 0);
+ reg_buf_offset = bit << XFS_BLF_SHIFT;
+ reg_buf_bytes = nbits << XFS_BLF_SHIFT;
+ item_index++;
+ }
+
+ /*
+ * If the current logged region starts after the current
+ * di_next_unlinked field, then move on to the next
+ * di_next_unlinked field.
+ */
+ if (next_unlinked_offset < reg_buf_offset)
+ continue;
+
+ ASSERT(item->ri_buf[item_index].i_addr != NULL);
+ ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
+
+ /*
+ * The current logged region contains a copy of the
+ * current di_next_unlinked field. Extract its value
+ * and copy it to the buffer copy.
+ */
+ logged_nextp = item->ri_buf[item_index].i_addr +
+ next_unlinked_offset - reg_buf_offset;
+ if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
+ xfs_alert(mp,
+ "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
+ "Trying to replay bad (0) inode di_next_unlinked field.",
+ item, bp);
+ return -EFSCORRUPTED;
+ }
+
+ buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
+ *buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp,
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
+ }
+
+ return 0;
+}
+
+/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on its current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return a LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we need to extract the object's LSN and compare it to that
+ * which we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ uint32_t magic32;
+ uint16_t magic16;
+ uint16_t magicda;
+ void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
+
+ /* v4 filesystems always recover immediately */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ goto recover_immediately;
+
+ magic32 = be32_to_cpu(*(__be32 *)blk);
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ case XFS_RMAP_CRC_MAGIC:
+ case XFS_REFC_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
+ case XFS_AGF_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
+ case XFS_AGFL_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
+ case XFS_AGI_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
+ case XFS_SYMLINK_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
+ case XFS_DIR3_BLOCK_MAGIC:
+ case XFS_DIR3_DATA_MAGIC:
+ case XFS_DIR3_FREE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
+ case XFS_ATTR3_RMT_MAGIC:
+ /*
+ * Remote attr blocks are written synchronously, rather than
+ * being logged. That means they do not contain a valid LSN
+ * (i.e. transactionally ordered) in them, and hence any time we
+ * see a buffer to replay over the top of a remote attribute
+ * block we should simply do so.
+ */
+ goto recover_immediately;
+ case XFS_SB_MAGIC:
+ /*
+ * superblock uuids are magic. We may or may not have a
+ * sb_meta_uuid on disk, but it will be set in the in-core
+ * superblock. We set the uuid pointer for verification
+ * according to the superblock feature mask to ensure we check
+ * the relevant UUID in the superblock.
+ */
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb))
+ uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+ else
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+ switch (magicda) {
+ case XFS_DIR3_LEAF1_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ /*
+ * We do individual object checks on dquot and inode buffers as they
+ * have their own individual LSN records. Also, we could have a stale
+ * buffer here, so we have to at least recognise these buffer types.
+ *
+ * A notd complexity here is inode unlinked list processing - it logs
+ * the inode directly in the buffer, but we don't know which inodes have
+ * been modified, and there is no global buffer LSN. Hence we need to
+ * recover all inode buffer types immediately. This problem will be
+ * fixed by logical logging of the unlinked list modifications.
+ */
+ magic16 = be16_to_cpu(*(__be16 *)blk);
+ switch (magic16) {
+ case XFS_DQUOT_MAGIC:
+ case XFS_DINODE_MAGIC:
+ goto recover_immediately;
+ default:
+ break;
+ }
+
+ /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+ return (xfs_lsn_t)-1;
+
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently. Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields. This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery. During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table. See xlog_recover_buf_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_buf_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ int error;
+ uint buf_flags;
+ xfs_lsn_t lsn;
+
+ /*
+ * In this pass we only want to recover all the buffers which have
+ * not been cancelled and are not cancellation buffers themselves.
+ */
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
+ if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len))
+ goto cancelled;
+ } else {
+
+ if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len))
+ goto cancelled;
+ }
+
+ trace_xfs_log_recover_buf_recover(log, buf_f);
+
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
+
+ error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Recover the buffer only if we get an LSN from it and it's less than
+ * the lsn of the transaction we are replaying.
+ *
+ * Note that we have to be extremely careful of readahead here.
+ * Readahead does not attach verifiers to the buffers so if we don't
+ * actually do any replay after readahead because the LSN we found
+ * in the buffer is more recent than the current transaction then we
+ * need to attach the verifier directly. Failure to do so means that
+ * future recovery actions (e.g. EFI and unlinked list recovery) can
+ * operate on the buffers and they won't get the verifier attached. This
+ * can lead to blocks on disk having the correct content but a stale
+ * CRC.
+ *
+ * It is safe to assume these clean buffers are currently up to date.
+ * If the buffer is dirtied by a later transaction being replayed, then
+ * the verifier will be reset to match whatever recover turns that
+ * buffer into.
+ */
+ lsn = xlog_recover_get_buf_lsn(mp, bp);
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_buf_skip(log, buf_f);
+ xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+ goto out_release;
+ }
+
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+ if (error)
+ goto out_release;
+ } else if (buf_f->blf_flags &
+ (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
+ bool dirty;
+
+ dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+ if (!dirty)
+ goto out_release;
+ } else {
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+ }
+
+ /*
+ * Perform delayed write on the buffer. Asynchronous writes will be
+ * slower when taking into account all the buffers to be flushed.
+ *
+ * Also make sure that only inode buffers with good sizes stay in
+ * the buffer cache. The kernel moves inodes in buffers of 1 block
+ * or inode_cluster_size bytes, whichever is bigger. The inode
+ * buffers in the log can be a different size if the log was generated
+ * by an older kernel using unclustered inode buffers or a newer kernel
+ * running with a different inode cluster size. Regardless, if
+ * the inode buffer size isn't max(blocksize, inode_cluster_size)
+ * for *our* value of inode_cluster_size, then we need to keep
+ * the buffer out of the buffer cache so that the buffer won't
+ * overlap with future reads of those inodes.
+ */
+ if (XFS_DINODE_MAGIC ==
+ be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
+ (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
+ xfs_buf_stale(bp);
+ error = xfs_bwrite(bp);
+ } else {
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+
+out_release:
+ xfs_buf_relse(bp);
+ return error;
+cancelled:
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_buf_item_ops = {
+ .item_type = XFS_LI_BUF,
+ .reorder = xlog_recover_buf_reorder,
+ .ra_pass2 = xlog_recover_buf_ra_pass2,
+ .commit_pass1 = xlog_recover_buf_commit_pass1,
+ .commit_pass2 = xlog_recover_buf_commit_pass2,
+};
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 283df89..66deddd 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -17,6 +17,7 @@
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
+#include "xfs_error.h"
/*
* Directory file type support functions
@@ -47,6 +48,7 @@
{
int i; /* shortform entry number */
struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_mount *mp = dp->i_mount;
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
@@ -68,15 +70,15 @@
return 0;
/*
- * Precalculate offsets for . and .. as we will always need them.
- *
- * XXX(hch): the second argument is sometimes 0 and sometimes
- * geo->datablk
+ * Precalculate offsets for "." and ".." as we will always need them.
+ * This relies on the fact that directories always start with the
+ * entries for "." and "..".
*/
dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
- dp->d_ops->data_dot_offset);
+ geo->data_entry_offset);
dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
- dp->d_ops->data_dotdot_offset);
+ geo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, sizeof(".") - 1));
/*
* Put . entry unless we're starting past it.
@@ -91,7 +93,7 @@
* Put .. entry unless we're starting past it.
*/
if (ctx->pos <= dotdot_offset) {
- ino = dp->d_ops->sf_get_parent_ino(sfp);
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
ctx->pos = dotdot_offset & 0x7fffffff;
if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
return 0;
@@ -108,17 +110,21 @@
xfs_dir2_sf_get_offset(sfep));
if (ctx->pos > off) {
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
continue;
}
- ino = dp->d_ops->sf_get_ino(sfp, sfep);
- filetype = dp->d_ops->sf_get_ftype(sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
ctx->pos = off & 0x7fffffff;
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(sfep->name,
+ sfep->namelen)))
+ return -EFSCORRUPTED;
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
- xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ xfs_dir3_get_dtype(mp, filetype)))
return 0;
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
@@ -135,17 +141,14 @@
struct dir_context *ctx)
{
struct xfs_inode *dp = args->dp; /* incore directory inode */
- xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
- xfs_dir2_data_entry_t *dep; /* block data entry */
- xfs_dir2_data_unused_t *dup; /* block unused entry */
- char *endptr; /* end of the data entries */
int error; /* error return value */
- char *ptr; /* current data entry */
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
int lock_mode;
+ unsigned int offset, next_offset;
+ unsigned int end;
/*
* If the block number in the offset is out of range, we're done.
@@ -164,56 +167,57 @@
* We'll skip entries before this.
*/
wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
- hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
- /*
- * Set up values for the loop.
- */
- ptr = (char *)dp->d_ops->data_entry_p(hdr);
- endptr = xfs_dir3_data_endp(geo, hdr);
/*
* Loop over the data portion of the block.
* Each object is a real entry (dep) or an unused one (dup).
*/
- while (ptr < endptr) {
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ for (offset = geo->data_entry_offset;
+ offset < end;
+ offset = next_offset) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
uint8_t filetype;
- dup = (xfs_dir2_data_unused_t *)ptr;
/*
* Unused, skip it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += be16_to_cpu(dup->length);
+ next_offset = offset + be16_to_cpu(dup->length);
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
-
/*
* Bump pointer for the next iteration.
*/
- ptr += dp->d_ops->data_entsize(dep->namelen);
+ next_offset = offset +
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
+
/*
* The entry is before the desired starting point, skip it.
*/
- if ((char *)dep - (char *)hdr < wantoff)
+ if (offset < wantoff)
continue;
- cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
- (char *)dep - (char *)hdr);
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset);
ctx->pos = cook & 0x7fffffff;
- filetype = dp->d_ops->data_get_ftype(dep);
+ filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
/*
* If it didn't fit, set the final offset to here & return.
*/
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(dep->name,
+ dep->namelen))) {
+ error = -EFSCORRUPTED;
+ goto out_rele;
+ }
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
- xfs_dir3_get_dtype(dp->i_mount, filetype))) {
- xfs_trans_brelse(args->trans, bp);
- return 0;
- }
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ goto out_rele;
}
/*
@@ -222,8 +226,9 @@
*/
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
0x7fffffff;
+out_rele:
xfs_trans_brelse(args->trans, bp);
- return 0;
+ return error;
}
/*
@@ -276,7 +281,7 @@
new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
if (new_off > *cur_off)
*cur_off = new_off;
- error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp);
+ error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp);
if (error)
goto out;
@@ -311,7 +316,8 @@
break;
}
if (next_ra > *ra_blk) {
- xfs_dir3_data_readahead(dp, next_ra, -2);
+ xfs_dir3_data_readahead(dp, next_ra,
+ XFS_DABUF_MAP_HOLE_OK);
*ra_blk = next_ra;
}
ra_want -= geo->fsbcount;
@@ -343,17 +349,17 @@
size_t bufsize)
{
struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp = NULL; /* data block buffer */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
- char *ptr = NULL; /* pointer to current data */
struct xfs_da_geometry *geo = args->geo;
xfs_dablk_t rablk = 0; /* current readahead block */
xfs_dir2_off_t curoff; /* current overall offset */
int length; /* temporary length value */
int byteoff; /* offset in current block */
int lock_mode;
+ unsigned int offset = 0;
int error = 0; /* error return value */
/*
@@ -380,7 +386,7 @@
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
- if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+ if (!bp || offset >= geo->blksize) {
if (bp) {
xfs_trans_brelse(args->trans, bp);
bp = NULL;
@@ -393,36 +399,35 @@
if (error || !bp)
break;
- hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
*/
- ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ offset = geo->data_entry_offset;
byteoff = xfs_dir2_byte_to_off(geo, curoff);
/*
* Skip past the header.
*/
if (byteoff == 0)
- curoff += dp->d_ops->data_entry_offset;
+ curoff += geo->data_entry_offset;
/*
* Skip past entries until we reach our offset.
*/
else {
- while ((char *)ptr - (char *)hdr < byteoff) {
- dup = (xfs_dir2_data_unused_t *)ptr;
+ while (offset < byteoff) {
+ dup = bp->b_addr + offset;
if (be16_to_cpu(dup->freetag)
== XFS_DIR2_DATA_FREE_TAG) {
length = be16_to_cpu(dup->length);
- ptr += length;
+ offset += length;
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
- length =
- dp->d_ops->data_entsize(dep->namelen);
- ptr += length;
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp,
+ dep->namelen);
+ offset += length;
}
/*
* Now set our real offset.
@@ -430,32 +435,38 @@
curoff =
xfs_dir2_db_off_to_byte(geo,
xfs_dir2_byte_to_db(geo, curoff),
- (char *)ptr - (char *)hdr);
- if (ptr >= (char *)hdr + geo->blksize) {
+ offset);
+ if (offset >= geo->blksize)
continue;
- }
}
}
+
/*
- * We have a pointer to an entry.
- * Is it a live one?
+ * We have a pointer to an entry. Is it a live one?
*/
- dup = (xfs_dir2_data_unused_t *)ptr;
+ dup = bp->b_addr + offset;
+
/*
* No, it's unused, skip over it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
length = be16_to_cpu(dup->length);
- ptr += length;
+ offset += length;
curoff += length;
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
- length = dp->d_ops->data_entsize(dep->namelen);
- filetype = dp->d_ops->data_get_ftype(dep);
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp, dep->namelen);
+ filetype = xfs_dir2_data_get_ftype(mp, dep);
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(dep->name,
+ dep->namelen))) {
+ error = -EFSCORRUPTED;
+ break;
+ }
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype)))
@@ -464,7 +475,7 @@
/*
* Advance to next entry in the block.
*/
- ptr += length;
+ offset += length;
curoff += length;
/* bufsize may have just been a guess; don't go negative */
bufsize = bufsize > length ? bufsize - length : 0;
@@ -513,7 +524,7 @@
args.geo = dp->i_mount->m_dir_geo;
args.trans = tp;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+ if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8ec7aab..f979d0d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -13,6 +13,7 @@
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
+#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
@@ -30,6 +31,7 @@
struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
+ struct xfs_agf *agf;
struct xfs_perag *pag;
int error;
int i;
@@ -44,16 +46,16 @@
xfs_log_force(mp, XFS_LOG_SYNC);
error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
- if (error || !agbp)
+ if (error)
goto out_put_perag;
+ agf = agbp->b_addr;
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
/*
* Look up the longest btree in the AGF and start with it.
*/
- error = xfs_alloc_lookup_ge(cur, 0,
- be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
+ error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
if (error)
goto out_del_cursor;
@@ -70,8 +72,11 @@
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
goto out_del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
- ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_del_cursor;
+ }
+ ASSERT(flen <= be32_to_cpu(agf->agf_longest));
/*
* use daddr format for all range/len calculations as that is
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3cbf248..1d95ed3 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -23,6 +23,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
+#include "xfs_error.h"
/*
* Lock order:
@@ -48,7 +49,7 @@
*/
void
xfs_qm_dqdestroy(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
ASSERT(list_empty(&dqp->q_lru));
@@ -56,7 +57,7 @@
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
- kmem_zone_free(xfs_qm_dqzone, dqp);
+ kmem_cache_free(xfs_qm_dqzone, dqp);
}
/*
@@ -66,38 +67,82 @@
*/
void
xfs_qm_adjust_dqlimits(
- struct xfs_mount *mp,
struct xfs_dquot *dq)
{
+ struct xfs_mount *mp = dq->q_mount;
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_disk_dquot *d = &dq->q_core;
struct xfs_def_quota *defq;
int prealloc = 0;
- ASSERT(d->d_id);
- defq = xfs_get_defquota(dq, q);
+ ASSERT(dq->q_id);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dq));
- if (defq->bsoftlimit && !d->d_blk_softlimit) {
- d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
+ if (!dq->q_blk.softlimit) {
+ dq->q_blk.softlimit = defq->blk.soft;
prealloc = 1;
}
- if (defq->bhardlimit && !d->d_blk_hardlimit) {
- d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
+ if (!dq->q_blk.hardlimit) {
+ dq->q_blk.hardlimit = defq->blk.hard;
prealloc = 1;
}
- if (defq->isoftlimit && !d->d_ino_softlimit)
- d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
- if (defq->ihardlimit && !d->d_ino_hardlimit)
- d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
- if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
- d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
- if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
- d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
+ if (!dq->q_ino.softlimit)
+ dq->q_ino.softlimit = defq->ino.soft;
+ if (!dq->q_ino.hardlimit)
+ dq->q_ino.hardlimit = defq->ino.hard;
+ if (!dq->q_rtb.softlimit)
+ dq->q_rtb.softlimit = defq->rtb.soft;
+ if (!dq->q_rtb.hardlimit)
+ dq->q_rtb.hardlimit = defq->rtb.hard;
if (prealloc)
xfs_dquot_set_prealloc_limits(dq);
}
+/* Set the expiration time of a quota's grace period. */
+time64_t
+xfs_dquot_set_timeout(
+ struct xfs_mount *mp,
+ time64_t timeout)
+{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+
+ return clamp_t(time64_t, timeout, qi->qi_expiry_min,
+ qi->qi_expiry_max);
+}
+
+/* Set the length of the default grace period. */
+time64_t
+xfs_dquot_set_grace_period(
+ time64_t grace)
+{
+ return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX);
+}
+
+/*
+ * Determine if this quota counter is over either limit and set the quota
+ * timers as appropriate.
+ */
+static inline void
+xfs_qm_adjust_res_timer(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim)
+{
+ ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit);
+
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (res->timer == 0)
+ res->timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + qlim->time);
+ } else {
+ if (res->timer == 0)
+ res->warnings = 0;
+ else
+ res->timer = 0;
+ }
+}
+
/*
* Check the limits and timers of a dquot and start or reset timers
* if necessary.
@@ -113,91 +158,18 @@
*/
void
xfs_qm_adjust_dqtimers(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
+ struct xfs_dquot *dq)
{
- ASSERT(d->d_id);
+ struct xfs_mount *mp = dq->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
-#ifdef DEBUG
- if (d->d_blk_hardlimit)
- ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
- be64_to_cpu(d->d_blk_hardlimit));
- if (d->d_ino_hardlimit)
- ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
- be64_to_cpu(d->d_ino_hardlimit));
- if (d->d_rtb_hardlimit)
- ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
- be64_to_cpu(d->d_rtb_hardlimit));
-#endif
+ ASSERT(dq->q_id);
+ defq = xfs_get_defquota(qi, xfs_dquot_type(dq));
- if (!d->d_btimer) {
- if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >
- be64_to_cpu(d->d_blk_softlimit))) ||
- (d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >
- be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = cpu_to_be32(get_seconds() +
- mp->m_quotainfo->qi_btimelimit);
- } else {
- d->d_bwarns = 0;
- }
- } else {
- if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <=
- be64_to_cpu(d->d_blk_softlimit))) &&
- (!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <=
- be64_to_cpu(d->d_blk_hardlimit)))) {
- d->d_btimer = 0;
- }
- }
-
- if (!d->d_itimer) {
- if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >
- be64_to_cpu(d->d_ino_softlimit))) ||
- (d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >
- be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = cpu_to_be32(get_seconds() +
- mp->m_quotainfo->qi_itimelimit);
- } else {
- d->d_iwarns = 0;
- }
- } else {
- if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <=
- be64_to_cpu(d->d_ino_softlimit))) &&
- (!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <=
- be64_to_cpu(d->d_ino_hardlimit)))) {
- d->d_itimer = 0;
- }
- }
-
- if (!d->d_rtbtimer) {
- if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >
- be64_to_cpu(d->d_rtb_softlimit))) ||
- (d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >
- be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = cpu_to_be32(get_seconds() +
- mp->m_quotainfo->qi_rtbtimelimit);
- } else {
- d->d_rtbwarns = 0;
- }
- } else {
- if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <=
- be64_to_cpu(d->d_rtb_softlimit))) &&
- (!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <=
- be64_to_cpu(d->d_rtb_hardlimit)))) {
- d->d_rtbtimer = 0;
- }
- }
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino);
+ xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb);
}
/*
@@ -205,20 +177,40 @@
*/
STATIC void
xfs_qm_init_dquot_blk(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- xfs_buf_t *bp)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct xfs_buf *bp)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- xfs_dqblk_t *d;
- xfs_dqid_t curid;
- int i;
+ struct xfs_dqblk *d;
+ xfs_dqid_t curid;
+ unsigned int qflag;
+ unsigned int blftype;
+ int i;
ASSERT(tp);
ASSERT(xfs_buf_islocked(bp));
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ qflag = XFS_UQUOTA_CHKD;
+ blftype = XFS_BLF_UDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_PROJ:
+ qflag = XFS_PQUOTA_CHKD;
+ blftype = XFS_BLF_PDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_GROUP:
+ qflag = XFS_GQUOTA_CHKD;
+ blftype = XFS_BLF_GDQUOT_BUF;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
d = bp->b_addr;
/*
@@ -230,7 +222,9 @@
d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
d->dd_diskdq.d_id = cpu_to_be32(curid);
- d->dd_diskdq.d_flags = type;
+ d->dd_diskdq.d_type = type;
+ if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb))
+ d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
@@ -238,11 +232,28 @@
}
}
- xfs_trans_dquot_buf(tp, bp,
- (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
- ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
- XFS_BLF_GDQUOT_BUF)));
- xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+ xfs_trans_dquot_buf(tp, bp, blftype);
+
+ /*
+ * quotacheck uses delayed writes to update all the dquots on disk in an
+ * efficient manner instead of logging the individual dquot changes as
+ * they are made. However if we log the buffer allocated here and crash
+ * after quotacheck while the logged initialisation is still in the
+ * active region of the log, log recovery can replay the dquot buffer
+ * initialisation over the top of the checked dquots and corrupt quota
+ * accounting.
+ *
+ * To avoid this problem, quotacheck cannot log the initialised buffer.
+ * We must still dirty the buffer and write it back before the
+ * allocation transaction clears the log. Therefore, mark the buffer as
+ * ordered instead of logging it directly. This is safe for quotacheck
+ * because it detects and repairs allocated but uninitialized dquot blocks
+ * in the quota inodes.
+ */
+ if (!(mp->m_qflags & qflag))
+ xfs_trans_ordered_buf(tp, bp);
+ else
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
/*
@@ -255,8 +266,8 @@
{
uint64_t space;
- dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
- dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+ dqp->q_prealloc_hi_wmark = dqp->q_blk.hardlimit;
+ dqp->q_prealloc_lo_wmark = dqp->q_blk.softlimit;
if (!dqp->q_prealloc_lo_wmark) {
dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark;
do_div(dqp->q_prealloc_lo_wmark, 100);
@@ -286,14 +297,15 @@
struct xfs_trans *tp = *tpp;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *bp;
- struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+ xfs_dqtype_t qtype = xfs_dquot_type(dqp);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
int nmaps = 1;
int error;
trace_xfs_dqalloc(dqp);
xfs_ilock(quotip, XFS_ILOCK_EXCL);
- if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ if (!xfs_this_quota_on(dqp->q_mount, qtype)) {
/*
* Return if this type of quotas is turned off while we didn't
* have an inode lock
@@ -305,8 +317,8 @@
/* Create the block mapping. */
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
- XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
- XFS_QM_DQALLOC_SPACE_RES(mp), &map, &nmaps);
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
+ &nmaps);
if (error)
return error;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -320,18 +332,17 @@
dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
/* now we can just get the buffer (there's nothing to read yet) */
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0);
- if (!bp)
- return -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ return error;
bp->b_ops = &xfs_dquot_buf_ops;
/*
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
- dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+ xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
@@ -378,13 +389,14 @@
{
struct xfs_bmbt_irec map;
struct xfs_buf *bp;
- struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+ xfs_dqtype_t qtype = xfs_dquot_type(dqp);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, qtype);
uint lock_mode;
int nmaps = 1;
int error;
lock_mode = xfs_ilock_data_map_shared(quotip);
- if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
+ if (!xfs_this_quota_on(mp, qtype)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
@@ -436,14 +448,14 @@
xfs_dquot_alloc(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type)
+ xfs_dqtype_t type)
{
struct xfs_dquot *dqp;
- dqp = kmem_zone_zalloc(xfs_qm_dqzone, 0);
+ dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_type = type;
+ dqp->q_id = id;
dqp->q_mount = mp;
INIT_LIST_HEAD(&dqp->q_lru);
mutex_init(&dqp->q_qlock);
@@ -468,13 +480,13 @@
* quotas.
*/
switch (type) {
- case XFS_DQ_USER:
+ case XFS_DQTYPE_USER:
/* uses the default lock class */
break;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class);
break;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class);
break;
default:
@@ -489,26 +501,91 @@
}
/* Copy the in-core quota fields in from the on-disk buffer. */
-STATIC void
+STATIC int
xfs_dquot_from_disk(
struct xfs_dquot *dqp,
struct xfs_buf *bp)
{
struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
+ /*
+ * Ensure that we got the type and ID we were looking for.
+ * Everything else was checked by the dquot buffer verifier.
+ */
+ if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
+ be32_to_cpu(ddqp->d_id) != dqp->q_id) {
+ xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata corruption detected at %pS, quota %u",
+ __this_address, dqp->q_id);
+ xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+ return -EFSCORRUPTED;
+ }
+
/* copy everything from disk dquot to the incore dquot */
- memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+ dqp->q_type = ddqp->d_type;
+ dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+
+ dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount);
+ dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
+ dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
+
+ dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
+ dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
+ dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
+
+ dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
+ dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
+ dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
/*
* Reservation counters are defined as reservation plus current usage
* to avoid having to add every time.
*/
- dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
- dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
- dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+ dqp->q_blk.reserved = dqp->q_blk.count;
+ dqp->q_ino.reserved = dqp->q_ino.count;
+ dqp->q_rtb.reserved = dqp->q_rtb.count;
/* initialize the dquot speculative prealloc thresholds */
xfs_dquot_set_prealloc_limits(dqp);
+ return 0;
+}
+
+/* Copy the in-core quota fields into the on-disk buffer. */
+void
+xfs_dquot_to_disk(
+ struct xfs_disk_dquot *ddqp,
+ struct xfs_dquot *dqp)
+{
+ ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ ddqp->d_version = XFS_DQUOT_VERSION;
+ ddqp->d_type = dqp->q_type;
+ ddqp->d_id = cpu_to_be32(dqp->q_id);
+ ddqp->d_pad0 = 0;
+ ddqp->d_pad = 0;
+
+ ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit);
+ ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit);
+ ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit);
+ ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit);
+ ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit);
+ ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit);
+
+ ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count);
+ ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
+ ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
+
+ ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
+ ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
+ ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+
+ ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
+ ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
+ ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer);
}
/* Allocate and initialize the dquot buffer for this in-core dquot. */
@@ -557,7 +634,7 @@
xfs_qm_dqread(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
bool can_alloc,
struct xfs_dquot **dqpp)
{
@@ -582,9 +659,11 @@
* further.
*/
ASSERT(xfs_buf_islocked(bp));
- xfs_dquot_from_disk(dqp, bp);
-
+ error = xfs_dquot_from_disk(dqp, bp);
xfs_buf_relse(bp);
+ if (error)
+ goto err;
+
*dqpp = dqp;
return error;
@@ -603,7 +682,7 @@
static int
xfs_dq_get_next_id(
struct xfs_mount *mp,
- uint type,
+ xfs_dqtype_t type,
xfs_dqid_t *id)
{
struct xfs_inode *quotip = xfs_quota_inode(mp, type);
@@ -671,7 +750,7 @@
}
xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
+ if (dqp->q_flags & XFS_DQFLAG_FREEING) {
xfs_dqunlock(dqp);
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_freeing(dqp);
@@ -727,21 +806,21 @@
static int
xfs_qm_dqget_checks(
struct xfs_mount *mp,
- uint type)
+ xfs_dqtype_t type)
{
if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
return -ESRCH;
switch (type) {
- case XFS_DQ_USER:
+ case XFS_DQTYPE_USER:
if (!XFS_IS_UQUOTA_ON(mp))
return -ESRCH;
return 0;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
if (!XFS_IS_GQUOTA_ON(mp))
return -ESRCH;
return 0;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
if (!XFS_IS_PQUOTA_ON(mp))
return -ESRCH;
return 0;
@@ -752,14 +831,14 @@
}
/*
- * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked
- * dquot, doing an allocation (if requested) as needed.
+ * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a
+ * locked dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
bool can_alloc,
struct xfs_dquot **O_dqpp)
{
@@ -809,7 +888,7 @@
xfs_qm_dqget_uncached(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct xfs_dquot **dqpp)
{
int error;
@@ -825,15 +904,15 @@
xfs_dqid_t
xfs_qm_id_for_quotatype(
struct xfs_inode *ip,
- uint type)
+ xfs_dqtype_t type)
{
switch (type) {
- case XFS_DQ_USER:
- return ip->i_d.di_uid;
- case XFS_DQ_GROUP:
- return ip->i_d.di_gid;
- case XFS_DQ_PROJ:
- return xfs_get_projid(ip);
+ case XFS_DQTYPE_USER:
+ return i_uid_read(VFS_I(ip));
+ case XFS_DQTYPE_GROUP:
+ return i_gid_read(VFS_I(ip));
+ case XFS_DQTYPE_PROJ:
+ return ip->i_d.di_projid;
}
ASSERT(0);
return 0;
@@ -847,7 +926,7 @@
int
xfs_qm_dqget_inode(
struct xfs_inode *ip,
- uint type,
+ xfs_dqtype_t type,
bool can_alloc,
struct xfs_dquot **O_dqpp)
{
@@ -933,7 +1012,7 @@
xfs_qm_dqget_next(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct xfs_dquot **dqpp)
{
struct xfs_dquot *dqp;
@@ -989,7 +1068,7 @@
*/
void
xfs_qm_dqrele(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
if (!dqp)
return;
@@ -1013,14 +1092,14 @@
* from the AIL if it has not been re-logged, and unlocking the dquot's
* flush lock. This behavior is very similar to that of inodes..
*/
-STATIC void
+static void
xfs_qm_dqflush_done(
- struct xfs_buf *bp,
struct xfs_log_item *lip)
{
- xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
- xfs_dquot_t *dqp = qip->qli_dquot;
+ struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip;
+ struct xfs_dquot *dqp = qip->qli_dquot;
struct xfs_ail *ailp = lip->li_ailp;
+ xfs_lsn_t tail_lsn;
/*
* We only want to pull the item from the AIL if its
@@ -1034,16 +1113,13 @@
((lip->li_lsn == qip->qli_flush_lsn) ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->ail_lock);
+ xfs_clear_li_failed(lip);
if (lip->li_lsn == qip->qli_flush_lsn) {
- xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+ /* xfs_ail_update_finish() drops the AIL lock */
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
} else {
- /*
- * Clear the failed state since we are about to drop the
- * flush lock
- */
- xfs_clear_li_failed(lip);
spin_unlock(&ailp->ail_lock);
}
}
@@ -1054,6 +1130,68 @@
xfs_dqfunlock(dqp);
}
+void
+xfs_buf_dquot_iodone(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip, *n;
+
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ list_del_init(&lip->li_bio_list);
+ xfs_qm_dqflush_done(lip);
+ }
+}
+
+void
+xfs_buf_dquot_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ spin_lock(&bp->b_mount->m_ail->ail_lock);
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ xfs_set_li_failed(lip, bp);
+ spin_unlock(&bp->b_mount->m_ail->ail_lock);
+}
+
+/* Check incore dquot for errors before we flush. */
+static xfs_failaddr_t
+xfs_qm_dqflush_check(
+ struct xfs_dquot *dqp)
+{
+ xfs_dqtype_t type = xfs_dquot_type(dqp);
+
+ if (type != XFS_DQTYPE_USER &&
+ type != XFS_DQTYPE_GROUP &&
+ type != XFS_DQTYPE_PROJ)
+ return __this_address;
+
+ if (dqp->q_id == 0)
+ return NULL;
+
+ if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit &&
+ !dqp->q_blk.timer)
+ return __this_address;
+
+ if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit &&
+ !dqp->q_ino.timer)
+ return __this_address;
+
+ if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit &&
+ !dqp->q_rtb.timer)
+ return __this_address;
+
+ /* bigtime flag should never be set on root dquots */
+ if (dqp->q_type & XFS_DQTYPE_BIGTIME) {
+ if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb))
+ return __this_address;
+ if (dqp->q_id == 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
/*
* Write a modified dquot to disk.
* The dquot must be locked and the flush lock too taken by caller.
@@ -1068,9 +1206,9 @@
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
struct xfs_buf *bp;
- struct xfs_dqblk *dqb;
- struct xfs_disk_dquot *ddqp;
+ struct xfs_dqblk *dqblk;
xfs_failaddr_t fa;
int error;
@@ -1084,57 +1222,33 @@
xfs_qm_dqunpin_wait(dqp);
/*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this dquot
- * to disk, because the log record didn't make it to disk.
- *
- * We also have to remove the log item from the AIL in this case,
- * as we wait for an emptry AIL as part of the unmount process.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
- dqp->dq_flags &= ~XFS_DQ_DIRTY;
-
- xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE);
-
- error = -EIO;
- goto out_unlock;
- }
-
- /*
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp,
- &xfs_dquot_buf_ops);
- if (error)
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
+ if (error == -EAGAIN)
goto out_unlock;
+ if (error)
+ goto out_abort;
- /*
- * Calculate the location of the dquot inside the buffer.
- */
- dqb = bp->b_addr + dqp->q_bufoffset;
- ddqp = &dqb->dd_diskdq;
-
- /* sanity check the in-core structure before we flush */
- fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id),
- 0);
+ fa = xfs_qm_dqflush_check(dqp);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
- be32_to_cpu(dqp->q_core.d_id), fa);
+ dqp->q_id, fa);
xfs_buf_relse(bp);
- xfs_dqfunlock(dqp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return -EIO;
+ error = -EFSCORRUPTED;
+ goto out_abort;
}
- /* This is the only portion of data that needs to persist */
- memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+ /* Flush the incore dquot to the ondisk buffer. */
+ dqblk = bp->b_addr + dqp->q_bufoffset;
+ xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp);
/*
* Clear the dirty field and remember the flush lsn for later use.
*/
- dqp->dq_flags &= ~XFS_DQ_DIRTY;
+ dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
&dqp->q_logitem.qli_item.li_lsn);
@@ -1149,17 +1263,17 @@
* of a dquot without an up-to-date CRC getting to disk.
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
- xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
+ dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
+ xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
/*
- * Attach an iodone routine so that we can remove this dquot from the
- * AIL and release the flush lock once the dquot is synced to disk.
+ * Attach the dquot to the buffer so that we can remove this dquot from
+ * the AIL and release the flush lock once the dquot is synced to disk.
*/
- xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
- &dqp->q_logitem.qli_item);
+ bp->b_flags |= _XBF_DQUOTS;
+ list_add_tail(&dqp->q_logitem.qli_item.li_bio_list, &bp->b_li_list);
/*
* If the buffer is pinned then push on the log so we won't
@@ -1174,9 +1288,13 @@
*bpp = bp;
return 0;
+out_abort:
+ dqp->q_flags &= ~XFS_DQFLAG_DIRTY;
+ xfs_trans_ail_delete(lip, 0);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
out_unlock:
xfs_dqfunlock(dqp);
- return -EIO;
+ return error;
}
/*
@@ -1187,13 +1305,12 @@
*/
void
xfs_dqlock2(
- xfs_dquot_t *d1,
- xfs_dquot_t *d2)
+ struct xfs_dquot *d1,
+ struct xfs_dquot *d2)
{
if (d1 && d2) {
ASSERT(d1 != d2);
- if (be32_to_cpu(d1->q_core.d_id) >
- be32_to_cpu(d2->q_core.d_id)) {
+ if (d1->q_id > d2->q_id) {
mutex_lock(&d2->q_qlock);
mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
} else {
@@ -1210,20 +1327,22 @@
int __init
xfs_qm_init(void)
{
- xfs_qm_dqzone =
- kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+ xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
+ sizeof(struct xfs_dquot),
+ 0, 0, NULL);
if (!xfs_qm_dqzone)
goto out;
- xfs_qm_dqtrxzone =
- kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+ xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
+ sizeof(struct xfs_dquot_acct),
+ 0, 0, NULL);
if (!xfs_qm_dqtrxzone)
goto out_free_dqzone;
return 0;
out_free_dqzone:
- kmem_zone_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_qm_dqzone);
out:
return -ENOMEM;
}
@@ -1231,8 +1350,8 @@
void
xfs_qm_exit(void)
{
- kmem_zone_destroy(xfs_qm_dqtrxzone);
- kmem_zone_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_qm_dqtrxzone);
+ kmem_cache_destroy(xfs_qm_dqzone);
}
/*
@@ -1243,7 +1362,7 @@
int
xfs_qm_dqiterate(
struct xfs_mount *mp,
- uint dqtype,
+ xfs_dqtype_t type,
xfs_qm_dqiterate_fn iter_fn,
void *priv)
{
@@ -1252,16 +1371,15 @@
int error;
do {
- error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+ error = xfs_qm_dqget_next(mp, id, type, &dq);
if (error == -ENOENT)
return 0;
if (error)
return error;
- error = iter_fn(dq, dqtype, priv);
- id = be32_to_cpu(dq->q_core.d_id);
+ error = iter_fn(dq, type, priv);
+ id = dq->q_id + 1; /* resume past this dquot; wrap to 0 ends walk */
xfs_qm_dqput(dq);
- id++;
} while (error == 0 && id != 0);
return error;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4fe8570..f642884 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -27,36 +27,66 @@
XFS_QLOWSP_MAX
};
+struct xfs_dquot_res {
+ /* Total resources allocated and reserved. */
+ xfs_qcnt_t reserved;
+
+ /* Total resources allocated. */
+ xfs_qcnt_t count;
+
+ /* Absolute and preferred limits. */
+ xfs_qcnt_t hardlimit;
+ xfs_qcnt_t softlimit;
+
+ /*
+ * For root dquots, this is the default grace period, in seconds.
+ * Otherwise, this is when the quota grace period expires,
+ * in seconds since the Unix epoch.
+ */
+ time64_t timer;
+
+ /*
+ * For root dquots, this is the maximum number of warnings that will
+ * be issued for this quota type. Otherwise, this is the number of
+ * warnings issued against this quota. Note that none of this is
+ * implemented.
+ */
+ xfs_qwarncnt_t warnings;
+};
+
/*
* The incore dquot structure
*/
-typedef struct xfs_dquot {
- uint dq_flags; /* various flags (XFS_DQ_*) */
- struct list_head q_lru; /* global free list of dquots */
- struct xfs_mount*q_mount; /* filesystem this relates to */
- uint q_nrefs; /* # active refs from inodes */
- xfs_daddr_t q_blkno; /* blkno of dquot buffer */
- int q_bufoffset; /* off of dq in buffer (# dquots) */
- xfs_fileoff_t q_fileoffset; /* offset in quotas file */
+struct xfs_dquot {
+ struct list_head q_lru;
+ struct xfs_mount *q_mount;
+ xfs_dqtype_t q_type;
+ uint16_t q_flags;
+ xfs_dqid_t q_id;
+ uint q_nrefs;
+ int q_bufoffset;
+ xfs_daddr_t q_blkno;
+ xfs_fileoff_t q_fileoffset;
- xfs_disk_dquot_t q_core; /* actual usage & quotas */
- xfs_dq_logitem_t q_logitem; /* dquot log item */
- xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
- xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
- xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
- xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */
- xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */
- int64_t q_low_space[XFS_QLOWSP_MAX];
- struct mutex q_qlock; /* quota lock */
- struct completion q_flush; /* flush completion queue */
- atomic_t q_pincount; /* dquot pin count */
- wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
-} xfs_dquot_t;
+ struct xfs_dquot_res q_blk; /* regular blocks */
+ struct xfs_dquot_res q_ino; /* inodes */
+ struct xfs_dquot_res q_rtb; /* realtime blocks */
+
+ struct xfs_dq_logitem q_logitem;
+
+ xfs_qcnt_t q_prealloc_lo_wmark;
+ xfs_qcnt_t q_prealloc_hi_wmark;
+ int64_t q_low_space[XFS_QLOWSP_MAX];
+ struct mutex q_qlock;
+ struct completion q_flush;
+ atomic_t q_pincount;
+ struct wait_queue_head q_pinwait;
+};
/*
* Lock hierarchy for q_qlock:
* XFS_QLOCK_NORMAL is the implicit default,
- * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
*/
enum {
XFS_QLOCK_NORMAL = 0,
@@ -64,21 +94,21 @@
};
/*
- * Manage the q_flush completion queue embedded in the dquot. This completion
+ * Manage the q_flush completion queue embedded in the dquot. This completion
* queue synchronizes processes attempting to flush the in-core dquot back to
* disk.
*/
-static inline void xfs_dqflock(xfs_dquot_t *dqp)
+static inline void xfs_dqflock(struct xfs_dquot *dqp)
{
wait_for_completion(&dqp->q_flush);
}
-static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
+static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp)
{
return try_wait_for_completion(&dqp->q_flush);
}
-static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+static inline void xfs_dqfunlock(struct xfs_dquot *dqp)
{
complete(&dqp->q_flush);
}
@@ -98,34 +128,59 @@
mutex_unlock(&dqp->q_qlock);
}
-static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+static inline int
+xfs_dquot_type(const struct xfs_dquot *dqp)
{
- switch (type & XFS_DQ_ALLTYPES) {
- case XFS_DQ_USER:
+ return dqp->q_type & XFS_DQTYPE_REC_MASK;
+}
+
+static inline int xfs_this_quota_on(struct xfs_mount *mp, xfs_dqtype_t type)
+{
+ switch (type) {
+ case XFS_DQTYPE_USER:
return XFS_IS_UQUOTA_ON(mp);
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return XFS_IS_GQUOTA_ON(mp);
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return XFS_IS_PQUOTA_ON(mp);
default:
return 0;
}
}
-static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+static inline struct xfs_dquot *xfs_inode_dquot(
+ struct xfs_inode *ip,
+ xfs_dqtype_t type)
{
- switch (type & XFS_DQ_ALLTYPES) {
- case XFS_DQ_USER:
+ switch (type) {
+ case XFS_DQTYPE_USER:
return ip->i_udquot;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return ip->i_gdquot;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return ip->i_pdquot;
default:
return NULL;
}
}
+/* Decide if the dquot's limits are actually being enforced. */
+static inline bool
+xfs_dquot_is_enforced(
+ const struct xfs_dquot *dqp)
+{
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_USER:
+ return XFS_IS_UQUOTA_ENFORCED(dqp->q_mount);
+ case XFS_DQTYPE_GROUP:
+ return XFS_IS_GQUOTA_ENFORCED(dqp->q_mount);
+ case XFS_DQTYPE_PROJ:
+ return XFS_IS_PQUOTA_ENFORCED(dqp->q_mount);
+ }
+ ASSERT(0);
+ return false;
+}
+
/*
* Check whether a dquot is under low free space conditions. We assume the quota
* is enabled and enforced.
@@ -134,44 +189,40 @@
{
int64_t freesp;
- freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+ freesp = dqp->q_blk.hardlimit - dqp->q_blk.reserved;
if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
return true;
return false;
}
+void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
+
#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
-#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
-#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
-#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
+#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->q_flags & XFS_DQFLAG_DIRTY)
-extern void xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
-extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
- xfs_disk_dquot_t *);
-extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
- struct xfs_dquot *);
-extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
- uint type);
-extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
- uint type, bool can_alloc,
- struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
- bool can_alloc,
- struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
- uint type, struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_uncached(struct xfs_mount *mp,
- xfs_dqid_t id, uint type,
- struct xfs_dquot **dqpp);
-extern void xfs_qm_dqput(xfs_dquot_t *);
+void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
+int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp);
+void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
+void xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
+void xfs_qm_adjust_dqlimits(struct xfs_dquot *d);
+xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+ xfs_dqtype_t type);
+int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+ xfs_dqtype_t type, bool can_alloc,
+ struct xfs_dquot **dqpp);
+int xfs_qm_dqget_inode(struct xfs_inode *ip, xfs_dqtype_t type,
+ bool can_alloc, struct xfs_dquot **dqpp);
+int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+ xfs_dqtype_t type, struct xfs_dquot **dqpp);
+int xfs_qm_dqget_uncached(struct xfs_mount *mp,
+ xfs_dqid_t id, xfs_dqtype_t type,
+ struct xfs_dquot **dqpp);
+void xfs_qm_dqput(struct xfs_dquot *dqp);
-extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
@@ -181,9 +232,12 @@
return dqp;
}
-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
- void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
+ xfs_dqtype_t type, void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
xfs_qm_dqiterate_fn iter_fn, void *priv);
+time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
+time64_t xfs_dquot_set_grace_period(time64_t grace);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index d60647d..8c1fdf3 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -45,6 +45,7 @@
struct xfs_log_item *lip,
struct xfs_log_vec *lv)
{
+ struct xfs_disk_dquot ddq;
struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
struct xfs_log_iovec *vecp = NULL;
struct xfs_dq_logformat *qlf;
@@ -52,14 +53,15 @@
qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
qlf->qlf_type = XFS_LI_DQUOT;
qlf->qlf_size = 2;
- qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+ qlf->qlf_id = qlip->qli_dquot->q_id;
qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
qlf->qlf_len = 1;
qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
- xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
- &qlip->qli_dquot->q_core,
+ xfs_dquot_to_disk(&ddq, qlip->qli_dquot);
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, &ddq,
sizeof(struct xfs_disk_dquot));
}
@@ -113,23 +115,6 @@
wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
}
-/*
- * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
- * have been failed during writeback
- *
- * this informs the AIL that the dquot is already flush locked on the next push,
- * and acquires a hold on the buffer to ensure that it isn't reclaimed before
- * dirty data makes it to disk.
- */
-STATIC void
-xfs_dquot_item_error(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- ASSERT(!completion_done(&DQUOT_ITEM(lip)->qli_dquot->q_flush));
- xfs_set_li_failed(lip, bp);
-}
-
STATIC uint
xfs_qm_dquot_logitem_push(
struct xfs_log_item *lip,
@@ -145,21 +130,6 @@
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
- /*
- * The buffer containing this item failed to be written back
- * previously. Resubmit the buffer for IO
- */
- if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
- if (!xfs_buf_trylock(bp))
- return XFS_ITEM_LOCKED;
-
- if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list))
- rval = XFS_ITEM_FLUSHING;
-
- xfs_buf_unlock(bp);
- return rval;
- }
-
if (!xfs_dqlock_nowait(dqp))
return XFS_ITEM_LOCKED;
@@ -189,7 +159,8 @@
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -230,7 +201,6 @@
.iop_release = xfs_qm_dquot_logitem_release,
.iop_committing = xfs_qm_dquot_logitem_committing,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_error = xfs_dquot_item_error
};
/*
@@ -307,36 +277,62 @@
{
struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
- struct xfs_ail *ailp = qfs->qql_item.li_ailp;
- /*
- * Delete the qoff-start logitem from the AIL.
- * xfs_trans_ail_delete() drops the AIL lock.
- */
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_qm_qoff_logitem_relse(qfs);
- kmem_free(qfs->qql_item.li_lv_shadow);
kmem_free(lip->li_lv_shadow);
- kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
}
+STATIC void
+xfs_qm_qoff_logitem_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip);
+
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
+ if (qoff->qql_start_lip)
+ xfs_qm_qoff_logitem_relse(qoff->qql_start_lip);
+ xfs_qm_qoff_logitem_relse(qoff);
+ }
+}
+
static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
+ .iop_release = xfs_qm_qoff_logitem_release,
};
static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_push = xfs_qm_qoff_logitem_push,
+ .iop_release = xfs_qm_qoff_logitem_release,
};
/*
+ * Delete the quotaoff intent from the AIL and free it. On success,
+ * this should only be called for the start item. It can be used
+ * on either shutdown or abort.
+ */
+void
+xfs_qm_qoff_logitem_relse(
+ struct xfs_qoff_logitem *qoff)
+{
+ struct xfs_log_item *lip = &qoff->qql_item;
+
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) ||
+ test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
+ XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_trans_ail_delete(lip, 0);
+ kmem_free(lip->li_lv_shadow);
+ kmem_free(qoff);
+}
+
+/*
* Allocate and initialize an quotaoff item of the correct quota type(s).
*/
struct xfs_qoff_logitem *
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 1aed34c..2b86a43 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -11,25 +11,28 @@
struct xfs_mount;
struct xfs_qoff_logitem;
-typedef struct xfs_dq_logitem {
- struct xfs_log_item qli_item; /* common portion */
- struct xfs_dquot *qli_dquot; /* dquot ptr */
- xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
-} xfs_dq_logitem_t;
+struct xfs_dq_logitem {
+ struct xfs_log_item qli_item; /* common portion */
+ struct xfs_dquot *qli_dquot; /* dquot ptr */
+ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
+};
-typedef struct xfs_qoff_logitem {
- struct xfs_log_item qql_item; /* common portion */
- struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+struct xfs_qoff_logitem {
+ struct xfs_log_item qql_item; /* common portion */
+ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
unsigned int qql_flags;
-} xfs_qoff_logitem_t;
+};
-extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *);
-extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
- struct xfs_qoff_logitem *, uint);
-extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
- struct xfs_qoff_logitem *, uint);
-extern void xfs_trans_log_quotaoff_item(struct xfs_trans *,
- struct xfs_qoff_logitem *);
+void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
+struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
+ struct xfs_qoff_logitem *start,
+ uint flags);
+void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *);
+struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp,
+ struct xfs_qoff_logitem *startqoff,
+ uint flags);
+void xfs_trans_log_quotaoff_item(struct xfs_trans *tp,
+ struct xfs_qoff_logitem *qlp);
#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
new file mode 100644
index 0000000..5875c7e
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_disk_dquot *recddq;
+ struct xfs_dq_logformat *dq_f;
+ uint type;
+
+ if (mp->m_qflags == 0)
+ return;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL)
+ return;
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ return;
+
+ type = recddq->d_type & XFS_DQTYPE_REC_MASK;
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ ASSERT(dq_f->qlf_len == 1);
+
+ xlog_buf_readahead(log, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len),
+ &xfs_dquot_buf_ra_ops);
+}
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_dquot_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ struct xfs_disk_dquot *ddq, *recddq;
+ struct xfs_dq_logformat *dq_f;
+ xfs_failaddr_t fa;
+ int error;
+ uint type;
+
+ /*
+ * Filesystems are required to send in quota flags at mount time.
+ */
+ if (mp->m_qflags == 0)
+ return 0;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL) {
+ xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
+ return -EFSCORRUPTED;
+ }
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
+ xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
+ item->ri_buf[1].i_len, __func__);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * This type of quotas was turned off, so ignore this record.
+ */
+ type = recddq->d_type & XFS_DQTYPE_REC_MASK;
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return 0;
+
+ /*
+ * At this point we know that quota was _not_ turned off.
+ * Since the mount flags are not indicating to us otherwise, this
+ * must mean that quota is on, and the dquot needs to be replayed.
+ * Remember that we may not have fully recovered the superblock yet,
+ * so we can't do the usual trick of looking at the SB quota bits.
+ *
+ * The other possibility, of course, is that the quota subsystem was
+ * removed since the last mount - ENOSYS.
+ */
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
+ if (fa) {
+ xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
+ dq_f->qlf_id, fa);
+ return -EFSCORRUPTED;
+ }
+ ASSERT(dq_f->qlf_len == 1);
+
+ /*
+ * At this point we are assuming that the dquots have been allocated
+ * and hence the buffer has valid dquots stamped in it. It should,
+ * therefore, pass verifier validation. If the dquot is bad, then
+ * we'll return an error here, so we don't need to specifically check
+ * the dquot in the buffer after the verifier has run.
+ */
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ &xfs_dquot_buf_ops);
+ if (error)
+ return error;
+
+ ASSERT(bp);
+ ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+ /*
+ * If the dquot has an LSN in it, recover the dquot only if it's less
+ * than the lsn of the transaction we are replaying.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+ xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ goto out_release;
+ }
+ }
+
+ memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ ASSERT(dq_f->qlf_size == 2);
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_dquot_item_ops = {
+ .item_type = XFS_LI_DQUOT,
+ .ra_pass2 = xlog_recover_dquot_ra_pass2,
+ .commit_pass2 = xlog_recover_dquot_commit_pass2,
+};
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog
+ * structure, so that we know not to do any dquot item or dquot buffer recovery
+ * of that type.
+ */
+STATIC int
+xlog_recover_quotaoff_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr;
+ ASSERT(qoff_f);
+
+ /*
+ * The logitem format's flag tells us if this was user quotaoff,
+ * group/project quotaoff or both.
+ */
+ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_USER;
+ if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_PROJ;
+ if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+ log->l_quotaoffs_flag |= XFS_DQTYPE_GROUP;
+
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_quotaoff_item_ops = {
+ .item_type = XFS_LI_QUOTAOFF,
+ .commit_pass1 = xlog_recover_quotaoff_commit_pass1,
+ /* nothing to commit in pass2 */
+};
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 849fd44..7f6e208 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -53,6 +53,7 @@
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
XFS_RANDOM_IUNLINK_FALLBACK,
+ XFS_RANDOM_BUF_IOERROR,
};
struct xfs_errortag_attr {
@@ -162,6 +163,7 @@
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
+XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -199,6 +201,7 @@
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
+ XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
NULL,
};
@@ -257,7 +260,7 @@
xfs_warn_ratelimited(mp,
"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
- expression, file, line, mp->m_fsname);
+ expression, file, line, mp->m_super->s_id);
return true;
}
@@ -329,19 +332,43 @@
const char *tag,
int level,
struct xfs_mount *mp,
- void *buf,
+ const void *buf,
size_t bufsize,
const char *filename,
int linenum,
xfs_failaddr_t failaddr)
{
- if (level <= xfs_error_level)
+ if (buf && level <= xfs_error_level)
xfs_hex_dump(buf, bufsize);
xfs_error_report(tag, level, mp, filename, linenum, failaddr);
xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
/*
+ * Complain about the kinds of metadata corruption that we can't detect from a
+ * verifier, such as incorrect inter-block relationship data. Does not set
+ * bp->b_error.
+ *
+ * Call xfs_buf_mark_corrupt, not this function.
+ */
+void
+xfs_buf_corruption_error(
+	struct xfs_buf		*bp,
+	xfs_failaddr_t		fa)
+{
+	struct xfs_mount	*mount = bp->b_mount;
+
+	/* Report the failure address, the buffer ops name and the block. */
+	xfs_alert_tag(mount, XFS_PTAG_VERIFIER_ERROR,
+		  "Metadata corruption detected at %pS, %s block 0x%llx",
+		  fa, bp->b_ops->name, bp->b_bn);
+	xfs_alert(mount, "Unmount and run xfs_repair");
+
+	/* A stack trace is emitted only at XFS_ERRLEVEL_HIGH or above. */
+	if (xfs_error_level < XFS_ERRLEVEL_HIGH)
+		return;
+	xfs_stack_trace();
+}
+
+/*
* Warnings specifically for verifier errors. Differentiate CRC vs. invalid
* values, and omit the stack trace unless the error level is tuned high.
*/
@@ -350,7 +377,7 @@
struct xfs_buf *bp,
int error,
const char *name,
- void *buf,
+ const void *buf,
size_t bufsz,
xfs_failaddr_t failaddr)
{
@@ -402,7 +429,7 @@
struct xfs_inode *ip,
int error,
const char *name,
- void *buf,
+ const void *buf,
size_t bufsz,
xfs_failaddr_t failaddr)
{
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 602aa7d..1717b75 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -12,16 +12,17 @@
const char *filename, int linenum,
xfs_failaddr_t failaddr);
extern void xfs_corruption_error(const char *tag, int level,
- struct xfs_mount *mp, void *buf, size_t bufsize,
+ struct xfs_mount *mp, const void *buf, size_t bufsize,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
+void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
- const char *name, void *buf, size_t bufsz,
+ const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
extern void xfs_verifier_error(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
- const char *name, void *buf, size_t bufsz,
+ const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
@@ -37,32 +38,6 @@
/* Dump 128 bytes of any corrupt buffer */
#define XFS_CORRUPTION_DUMP_LEN (128)
-/*
- * Macros to set EFSCORRUPTED & return/branch.
- */
-#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \
- { \
- int fs_is_ok = (x); \
- ASSERT(fs_is_ok); \
- if (unlikely(!fs_is_ok)) { \
- XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
- XFS_ERRLEVEL_LOW, mp); \
- error = -EFSCORRUPTED; \
- goto l; \
- } \
- }
-
-#define XFS_WANT_CORRUPTED_RETURN(mp, x) \
- { \
- int fs_is_ok = (x); \
- ASSERT(fs_is_ok); \
- if (unlikely(!fs_is_ok)) { \
- XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
- XFS_ERRLEVEL_LOW, mp); \
- return -EFSCORRUPTED; \
- } \
- }
-
#ifdef DEBUG
extern int xfs_errortag_init(struct xfs_mount *mp);
extern void xfs_errortag_del(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f1372f9..465fd9e 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
@@ -57,7 +56,7 @@
fileid_type = FILEID_INO32_GEN_PARENT;
/*
- * If the the filesystem may contain 64bit inode numbers, we need
+ * If the filesystem may contain 64bit inode numbers, we need
* to use larger file handles that can represent them.
*
* While we only allocate inodes that do not fit into 32 bits any
@@ -221,18 +220,7 @@
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 2183d87..5c2695a 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -367,7 +367,7 @@
* If this is a metadata allocation, try to reuse the busy
* extent instead of trimming the allocation.
*/
- if (!xfs_alloc_is_userdata(args->datatype) &&
+ if (!(args->datatype & XFS_ALLOC_USERDATA) &&
!(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
if (!xfs_extent_busy_update_extent(args->mp, args->pag,
busyp, fbno, flen,
@@ -643,8 +643,8 @@
int
xfs_extent_busy_ag_cmp(
void *priv,
- struct list_head *l1,
- struct list_head *l2)
+ const struct list_head *l1,
+ const struct list_head *l2)
{
struct xfs_extent_busy *b1 =
container_of(l1, struct xfs_extent_busy, list);
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 990ab38..8aea071 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,7 +58,8 @@
xfs_extent_busy_wait_all(struct xfs_mount *mp);
int
-xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+xfs_extent_busy_ag_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b);
static inline void xfs_extent_busy_sort(struct list_head *list)
{
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e44efc4..5c03952 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -21,17 +21,21 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_trace.h"
-
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
kmem_zone_t *xfs_efi_zone;
kmem_zone_t *xfs_efd_zone;
+static const struct xfs_item_ops xfs_efi_item_ops;
+
static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_efi_log_item, efi_item);
}
-void
+STATIC void
xfs_efi_item_free(
struct xfs_efi_log_item *efip)
{
@@ -39,7 +43,7 @@
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
- kmem_zone_free(xfs_efi_zone, efip);
+ kmem_cache_free(xfs_efi_zone, efip);
}
/*
@@ -49,13 +53,13 @@
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the EFI.
*/
-void
+STATIC void
xfs_efi_release(
struct xfs_efi_log_item *efip)
{
ASSERT(atomic_read(&efip->efi_refcount) > 0);
if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
xfs_efi_item_free(efip);
}
}
@@ -139,18 +143,10 @@
xfs_efi_release(EFI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_efi_item_ops = {
- .iop_size = xfs_efi_item_size,
- .iop_format = xfs_efi_item_format,
- .iop_unpin = xfs_efi_item_unpin,
- .iop_release = xfs_efi_item_release,
-};
-
-
/*
* Allocate and initialize an efi item with the given number of extents.
*/
-struct xfs_efi_log_item *
+STATIC struct xfs_efi_log_item *
xfs_efi_init(
struct xfs_mount *mp,
uint nextents)
@@ -161,11 +157,12 @@
ASSERT(nextents > 0);
if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(xfs_efi_log_item_t) +
+ size = (uint)(sizeof(struct xfs_efi_log_item) +
((nextents - 1) * sizeof(xfs_extent_t)));
efip = kmem_zalloc(size, 0);
} else {
- efip = kmem_zone_zalloc(xfs_efi_zone, 0);
+ efip = kmem_cache_zalloc(xfs_efi_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
}
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
@@ -184,7 +181,7 @@
* one of which will be the native format for this kernel.
* It will handle the conversion of formats if necessary.
*/
-int
+STATIC int
xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
{
xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
@@ -228,6 +225,7 @@
}
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -243,7 +241,7 @@
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
- kmem_zone_free(xfs_efd_zone, efdp);
+ kmem_cache_free(xfs_efd_zone, efdp);
}
/*
@@ -335,7 +333,8 @@
(nextents - 1) * sizeof(struct xfs_extent),
0);
} else {
- efdp = kmem_zone_zalloc(xfs_efd_zone, 0);
+ efdp = kmem_cache_zalloc(xfs_efd_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
}
xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
@@ -398,8 +397,8 @@
static int
xfs_extent_free_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_extent_free_item *ra;
@@ -411,41 +410,16 @@
XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
}
-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_efi_log_item *efip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- efip = xfs_efi_init(tp->t_mountp, count);
- ASSERT(efip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efip->efi_item);
- return efip;
-}
-
/* Log a free extent to the intent item. */
STATIC void
xfs_extent_free_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_efi_log_item *efip,
+ struct xfs_extent_free_item *free)
{
- struct xfs_efi_log_item *efip = intent;
- struct xfs_extent_free_item *free;
uint next_extent;
struct xfs_extent *extp;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
@@ -461,29 +435,50 @@
extp->ext_len = free->xefi_blockcount;
}
+/*
+ * Build the EFI for a batch of deferred extent frees.
+ *
+ * An EFI sized for @count extents is allocated and added to @tp.  When @sort
+ * is set, @items is ordered with xfs_extent_free_diff_items() before the
+ * entries are logged.  Every xfs_extent_free_item on @items is then logged
+ * into the EFI.
+ *
+ * Returns the log item embedded in the new EFI.
+ */
+static struct xfs_log_item *
+xfs_extent_free_create_intent(
+	struct xfs_trans		*tp,
+	struct list_head		*items,
+	unsigned int			count,
+	bool				sort)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_efi_log_item		*efip;
+	struct xfs_extent_free_item	*xefi;
+
+	efip = xfs_efi_init(mp, count);
+	ASSERT(count > 0);
+
+	xfs_trans_add_item(tp, &efip->efi_item);
+
+	if (sort)
+		list_sort(mp, items, xfs_extent_free_diff_items);
+
+	list_for_each_entry(xefi, items, xefi_list) {
+		xfs_extent_free_log_item(tp, efip, xefi);
+	}
+
+	return &efip->efi_item;
+}
+
/* Get an EFD so we can process all the free extents. */
-STATIC void *
+static struct xfs_log_item *
xfs_extent_free_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_efd(tp, intent, count);
+ return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
}
/* Process a free extent. */
STATIC int
xfs_extent_free_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_extent_free_item *free;
int error;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- error = xfs_trans_free_extent(tp, done_item,
+ error = xfs_trans_free_extent(tp, EFD_ITEM(done),
free->xefi_startblock,
free->xefi_blockcount,
&free->xefi_oinfo, free->xefi_skip_discard);
@@ -494,9 +489,9 @@
/* Abort all pending EFIs. */
STATIC void
xfs_extent_free_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_efi_release(intent);
+ xfs_efi_release(EFI_ITEM(intent));
}
/* Cancel a free extent. */
@@ -512,10 +507,8 @@
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
.abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_extent_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
@@ -528,12 +521,12 @@
STATIC int
xfs_agfl_free_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_efd_log_item *efdp = done_item;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
struct xfs_extent_free_item *free;
struct xfs_extent *extp;
struct xfs_buf *agbp;
@@ -578,10 +571,8 @@
/* sub-type with special handling for AGFL deferred frees */
const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
.abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
@@ -591,19 +582,19 @@
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
-int
-xfs_efi_recover(
- struct xfs_mount *mp,
- struct xfs_efi_log_item *efip)
+STATIC int
+xfs_efi_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- struct xfs_efd_log_item *efdp;
- struct xfs_trans *tp;
- int i;
- int error = 0;
- xfs_extent_t *extp;
- xfs_fsblock_t startblock_fsb;
-
- ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
+ struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_efd_log_item *efdp;
+ struct xfs_trans *tp;
+ struct xfs_extent *extp;
+ xfs_fsblock_t startblock_fsb;
+ int i;
+ int error = 0;
/*
* First check the validity of the extents described by the
@@ -617,15 +608,8 @@
if (startblock_fsb == 0 ||
extp->ext_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
- extp->ext_len >= mp->m_sb.sb_agblocks) {
- /*
- * This will pull the EFI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- xfs_efi_release(efip);
- return -EIO;
- }
+ extp->ext_len >= mp->m_sb.sb_agblocks)
+ return -EFSCORRUPTED;
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -643,11 +627,128 @@
}
- set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_trans_cancel(tp);
return error;
}
+
+/* Return true if the EFI behind @lip carries the intent id @intent_id. */
+STATIC bool
+xfs_efi_item_match(
+	struct xfs_log_item	*lip,
+	uint64_t		intent_id)
+{
+	struct xfs_efi_log_item	*efip = EFI_ITEM(lip);
+
+	return efip->efi_format.efi_id == intent_id;
+}
+
+/*
+ * Relog an intent item to push the log tail forward.
+ *
+ * In transaction @tp, an EFD is obtained for the existing EFI @intent and
+ * filled with all of its extents, then a new EFI carrying the same extent
+ * array is allocated and added to @tp.  Both new items are marked dirty.
+ *
+ * Returns the log item embedded in the newly created EFI.
+ */
+static struct xfs_log_item *
+xfs_efi_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_extent *extp;
+ unsigned int count;
+
+ count = EFI_ITEM(intent)->efi_format.efi_nextents;
+ extp = EFI_ITEM(intent)->efi_format.efi_extents;
+
+ /* EFD for the old intent, listing every one of its extents. */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ efdp->efd_next_extent = count;
+ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ /* Replacement EFI with an identical extent list, added to @tp. */
+ efip = xfs_efi_init(tp->t_mountp, count);
+ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
+ atomic_set(&efip->efi_next_extent, count);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+ return &efip->efi_item;
+}
+
+/*
+ * Log item operations for EFIs.  The table is defined after the functions
+ * it points to; a forward declaration earlier in this file lets
+ * xfs_efi_init() reference it.
+ */
+static const struct xfs_item_ops xfs_efi_item_ops = {
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_release = xfs_efi_item_release,
+ .iop_recover = xfs_efi_item_recover,
+ .iop_match = xfs_efi_item_match,
+ .iop_relog = xfs_efi_item_relog,
+};
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ *
+ * @buffer_list is not used here.  Returns 0 on success; if
+ * xfs_efi_copy_format() fails, the new in-core efi is freed and that
+ * error is returned.
+ */
+STATIC int
+xlog_recover_efi_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
+ int error;
+
+ efi_formatp = item->ri_buf[0].i_addr;
+
+ /*
+ * NOTE(review): efi_nextents is taken from the recovered region before
+ * xfs_efi_copy_format() runs; confirm that a bogus count cannot reach
+ * xfs_efi_init().
+ */
+ efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
+ xfs_efi_item_free(efip);
+ return error;
+ }
+ /* Every extent slot of the recovered EFI is already populated. */
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn);
+ xfs_efi_release(efip);
+ return 0;
+}
+
+/*
+ * Log recovery dispatch entry for XFS_LI_EFI items.  Only the pass-2
+ * commit hook is registered.
+ */
+const struct xlog_recover_item_ops xlog_efi_item_ops = {
+ .item_type = XFS_LI_EFI,
+ .commit_pass2 = xlog_recover_efi_commit_pass2,
+};
+
+/*
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
+ *
+ * @buffer_list and @lsn are not used here.  Always returns 0.
+ */
+STATIC int
+xlog_recover_efd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_efd_log_format *efd_formatp;
+
+ efd_formatp = item->ri_buf[0].i_addr;
+ /*
+ * The region length must equal the size of either the
+ * xfs_efd_log_format_32_t or the xfs_efd_log_format_64_t layout
+ * holding efd_nextents extents.  This is checked by ASSERT only.
+ */
+ ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
+ ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
+ (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
+ ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
+
+ xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id);
+ return 0;
+}
+
+/*
+ * Log recovery dispatch entry for XFS_LI_EFD items.  Only the pass-2
+ * commit hook is registered.
+ */
+const struct xlog_recover_item_ops xlog_efd_item_ops = {
+ .item_type = XFS_LI_EFD,
+ .commit_pass2 = xlog_recover_efd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 16aaab0..cd2860c 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -17,11 +17,6 @@
#define XFS_EFI_MAX_FAST_EXTENTS 16
/*
- * Define EFI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_EFI_RECOVERED 1
-
-/*
* This is the "extent free intention" log item. It is used to log the fact
* that some extents need to be free. It is used in conjunction with the
* "extent free done" log item described below.
@@ -50,25 +45,24 @@
* of commit failure or log I/O errors. Note that the EFD is not inserted in the
* AIL, so at this point both the EFI and EFD are freed.
*/
-typedef struct xfs_efi_log_item {
+struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
- unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
-} xfs_efi_log_item_t;
+};
/*
* This is the "extent free done" log item. It is used to log
* the fact that some extents earlier mentioned in an efi item
* have been freed.
*/
-typedef struct xfs_efd_log_item {
+struct xfs_efd_log_item {
struct xfs_log_item efd_item;
- xfs_efi_log_item_t *efd_efip;
+ struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
xfs_efd_log_format_t efd_format;
-} xfs_efd_log_item_t;
+};
/*
* Max number of extents in fast allocation path.
@@ -78,13 +72,4 @@
extern struct kmem_zone *xfs_efi_zone;
extern struct kmem_zone *xfs_efd_zone;
-xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
-int xfs_efi_copy_format(xfs_log_iovec_t *buf,
- xfs_efi_log_format_t *dst_efi_fmt);
-void xfs_efi_item_free(xfs_efi_log_item_t *);
-void xfs_efi_release(struct xfs_efi_log_item *);
-
-int xfs_efi_recover(struct xfs_mount *mp,
- struct xfs_efi_log_item *efip);
-
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203065a..5b0f93f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -32,6 +32,39 @@
static const struct vm_operations_struct xfs_file_vm_ops;
+/*
+ * Report whether both @pos and @len are multiples of the allocation unit of
+ * @ip: the realtime extent size (in bytes) for realtime inodes, otherwise the
+ * filesystem block size.
+ *
+ * A realtime extent size that is not a power of two cannot be tested with a
+ * mask, so that case falls back to 64-bit remainder checks.
+ */
+static bool
+xfs_is_falloc_aligned(
+	struct xfs_inode	*ip,
+	loff_t			pos,
+	long long int		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint64_t		mask;
+	u64			rextbytes;
+	u32			rem;
+
+	/* Regular files: the unit is one filesystem block. */
+	if (!XFS_IS_REALTIME_INODE(ip)) {
+		mask = mp->m_sb.sb_blocksize - 1;
+		return ((pos | len) & mask) == 0;
+	}
+
+	rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
+
+	/* Power-of-two realtime extents can use the same mask test. */
+	if (is_power_of_2(mp->m_sb.sb_rextsize)) {
+		mask = rextbytes - 1;
+		return ((pos | len) & mask) == 0;
+	}
+
+	/* Otherwise both values must divide evenly by the extent size. */
+	div_u64_rem(pos, rextbytes, &rem);
+	if (rem)
+		return false;
+	div_u64_rem(len, rextbytes, &rem);
+	return rem == 0;
+}
+
int
xfs_update_prealloc_flags(
struct xfs_inode *ip,
@@ -80,19 +113,9 @@
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(ip);
}
STATIC int
@@ -104,6 +127,7 @@
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
struct xfs_mount *mp = ip->i_mount;
int error = 0;
int log_flushed = 0;
@@ -147,13 +171,15 @@
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = ip->i_itemp->ili_last_lsn;
+ (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ lsn = iip->ili_last_lsn;
}
if (lsn) {
error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- ip->i_itemp->ili_fsync_fields = 0;
+ spin_lock(&iip->ili_lock);
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
}
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -187,8 +213,14 @@
file_accessed(iocb->ki_filp);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -215,7 +247,7 @@
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
- ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
@@ -351,7 +383,7 @@
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
- NULL, &xfs_iomap_ops);
+ NULL, &xfs_buffered_write_iomap_ops);
if (error)
return error;
} else
@@ -486,8 +518,7 @@
int unaligned_io = 0;
int iolock;
size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
/* DIO must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
@@ -510,7 +541,7 @@
*/
if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
- return -EREMCHG;
+ return -ENOTBLK;
}
iolock = XFS_IOLOCK_EXCL;
} else {
@@ -547,21 +578,19 @@
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
- ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
-
/*
- * If unaligned, this is the only IO in-flight. If it has not yet
- * completed, wait on it before we release the iolock to prevent
- * subsequent overlapping IO.
+ * If unaligned, this is the only IO in-flight. Wait on it before we
+ * release the iolock to prevent subsequent overlapping IO.
*/
- if (ret == -EIOCBQUEUED && unaligned_io)
- inode_dio_wait(inode);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_io);
out:
xfs_iunlock(ip, iolock);
/*
- * No fallback to buffered IO on errors for XFS, direct IO will either
- * complete fully or fail.
+ * No fallback to buffered IO after short writes for XFS, direct I/O
+ * will either complete fully or return an error.
*/
ASSERT(ret < 0 || ret == count);
return ret;
@@ -594,7 +623,7 @@
count = iov_iter_count(from);
trace_xfs_file_dax_write(ip, count, pos);
- ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
@@ -641,7 +670,8 @@
current->backing_dev_info = inode_to_bdi(inode);
trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
- ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
@@ -720,7 +750,7 @@
* allow an operation to fall back to buffered mode.
*/
ret = xfs_file_dio_aio_write(iocb, from);
- if (ret != -EREMCHG)
+ if (ret != -ENOTBLK)
return ret;
}
@@ -853,9 +883,7 @@
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
-
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -875,10 +903,9 @@
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_INSERT_RANGE) {
- unsigned int blksize_mask = i_blocksize(inode) - 1;
loff_t isize = i_size_read(inode);
- if (offset & blksize_mask || len & blksize_mask) {
+ if (!xfs_is_falloc_aligned(ip, offset, len)) {
error = -EINVAL;
goto out_unlock;
}
@@ -911,16 +938,30 @@
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- error = xfs_zero_file_space(ip, offset, len);
+ /*
+ * Punch a hole and prealloc the range. We use a hole
+ * punch rather than unwritten extent conversion for two
+ * reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is
+ * still zero-valued by virtue of the hole punch.
+ */
+ unsigned int blksize = i_blocksize(inode);
+
+ trace_xfs_zero_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
error = xfs_reflink_unshare(ip, offset, len);
if (error)
goto out_unlock;
-
- if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len,
- XFS_BMAPI_PREALLOC);
- }
} else {
/*
* If always_cow mode we can't use preallocations and
@@ -930,12 +971,14 @@
error = -EOPNOTSUPP;
goto out_unlock;
}
+ }
+ if (!xfs_is_always_cow_inode(ip)) {
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
+ if (error)
+ goto out_unlock;
}
- if (error)
- goto out_unlock;
}
if (file->f_flags & O_DSYNC)
@@ -995,6 +1038,21 @@
return ret;
}
+/*
+ * Return true if writes through @filp are expected to be synchronous: the
+ * mount has XFS_MOUNT_WSYNC set, the file was opened with __O_SYNC or
+ * O_DSYNC, or IS_SYNC() holds for the inode.
+ */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+	struct inode	*inode = file_inode(filp);
+
+	return (XFS_I(inode)->i_mount->m_flags & XFS_MOUNT_WSYNC) ||
+	       (filp->f_flags & (__O_SYNC | O_DSYNC)) ||
+	       IS_SYNC(inode);
+}
+
STATIC loff_t
xfs_file_remap_range(
struct file *file_in,
@@ -1025,7 +1083,7 @@
/* Prepare and then clone file data. */
ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
&len, remap_flags);
- if (ret < 0 || len == 0)
+ if (ret || len == 0)
return ret;
trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
@@ -1049,9 +1107,13 @@
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+ xfs_log_force_inode(dest);
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return remapped > 0 ? remapped : ret;
@@ -1066,7 +1128,7 @@
return -EFBIG;
if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
return -EIO;
- file->f_mode |= FMODE_NOWAIT;
+ file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return 0;
}
@@ -1088,8 +1150,8 @@
* certain to have the next operation be a read there.
*/
mode = xfs_ilock_data_map_shared(ip);
- if (ip->i_d.di_nextents > 0)
- error = xfs_dir3_data_readahead(ip, 0, -1);
+ if (ip->i_df.if_nextents > 0)
+ error = xfs_dir3_data_readahead(ip, 0, 0);
xfs_iunlock(ip, mode);
return error;
}
@@ -1159,7 +1221,7 @@
* Locking for serialisation of IO during page faults. This results in a lock
* ordering of:
*
- * mmap_sem (MM)
+ * mmap_lock (MM)
* sb_start_pagefault(vfs, freeze)
* i_mmaplock (XFS - truncate serialisation)
* page_lock (MM)
@@ -1186,12 +1248,16 @@
if (IS_DAX(inode)) {
pfn_t pfn;
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_direct_write_iomap_ops :
+ &xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else {
if (write_fault)
- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+ ret = iomap_page_mkwrite(vmf,
+ &xfs_buffered_write_iomap_ops);
else
ret = filemap_fault(vmf);
}
@@ -1253,32 +1319,45 @@
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
+/*
+ * ->map_pages handler: run filemap_map_pages() with the inode's
+ * XFS_MMAPLOCK held shared for the duration of the call.
+ */
+static void
+xfs_filemap_map_pages(
+	struct vm_fault		*vmf,
+	pgoff_t			start_pgoff,
+	pgoff_t			end_pgoff)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(vmf->vma->vm_file));
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = filemap_map_pages,
+ .map_pages = xfs_filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
STATIC int
xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
+ struct file *file,
+ struct vm_area_struct *vma)
{
- struct dax_device *dax_dev;
+ struct inode *inode = file_inode(file);
+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
- dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, dax_dev))
+ if (!daxdev_mapping_supported(vma, target->bt_daxdev))
return -EOPNOTSUPP;
- file_accessed(filp);
+ file_accessed(file);
vma->vm_ops = &xfs_file_vm_ops;
- if (IS_DAX(file_inode(filp)))
+ if (IS_DAX(inode))
vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 574a7a8..db23e45 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,6 +18,7 @@
#include "xfs_trace.h"
#include "xfs_ag_resv.h"
#include "xfs_trans.h"
+#include "xfs_filestream.h"
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
@@ -32,39 +33,7 @@
/*
* Allocation group filestream associations are tracked with per-ag atomic
* counters. These counters allow xfs_filestream_pick_ag() to tell whether a
- * particular AG already has active filestreams associated with it. The mount
- * point's m_peraglock is used to protect these counters from per-ag array
- * re-allocation during a growfs operation. When xfs_growfs_data_private() is
- * about to reallocate the array, it calls xfs_filestream_flush() with the
- * m_peraglock held in write mode.
- *
- * Since xfs_mru_cache_flush() guarantees that all the free functions for all
- * the cache elements have finished executing before it returns, it's safe for
- * the free functions to use the atomic counters without m_peraglock protection.
- * This allows the implementation of xfs_fstrm_free_func() to be agnostic about
- * whether it was called with the m_peraglock held in read mode, write mode or
- * not held at all. The race condition this addresses is the following:
- *
- * - The work queue scheduler fires and pulls a filestream directory cache
- * element off the LRU end of the cache for deletion, then gets pre-empted.
- * - A growfs operation grabs the m_peraglock in write mode, flushes all the
- * remaining items from the cache and reallocates the mount point's per-ag
- * array, resetting all the counters to zero.
- * - The work queue thread resumes and calls the free function for the element
- * it started cleaning up earlier. In the process it decrements the
- * filestreams counter for an AG that now has no references.
- *
- * With a shrinkfs feature, the above scenario could panic the system.
- *
- * All other uses of the following macros should be protected by either the
- * m_peraglock held in read mode, or the cache's internal locking exposed by the
- * interval between a call to xfs_mru_cache_lookup() and a call to
- * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode
- * when new elements are added to the cache.
- *
- * Combined, these locking rules ensure that no associations will ever exist in
- * the cache that reference per-ag array elements that have since been
- * reallocated.
+ * particular AG already has active filestreams associated with it.
*/
int
xfs_filestream_peek_ag(
@@ -158,16 +127,15 @@
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
- if (err && !trylock) {
+ if (err) {
xfs_perag_put(pag);
- return err;
+ if (err != -EAGAIN)
+ return err;
+ /* Couldn't lock the AGF, skip this AG. */
+ continue;
}
}
- /* Might fail sometimes during the 1st pass with trylock set. */
- if (!pag->pagf_init)
- goto next_ag;
-
/* Keep track of the AG with the most free blocks. */
if (pag->pagf_freeblks > maxfree) {
maxfree = pag->pagf_freeblks;
@@ -374,7 +342,7 @@
startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
- if (xfs_alloc_is_userdata(ap->datatype))
+ if (ap->datatype & XFS_ALLOC_USERDATA)
flags |= XFS_PICK_USERDATA;
if (ap->tp->t_flags & XFS_TRANS_LOWMODE)
flags |= XFS_PICK_LOWSPACE;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 01c0933..9ce5e7d 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -146,6 +146,7 @@
dest->fmr_owner = XFS_FMR_OWN_FREE;
break;
default:
+ ASSERT(0);
return -EFSCORRUPTED;
}
return 0;
@@ -353,7 +354,7 @@
xfs_fsblock_t fsb;
xfs_daddr_t rec_daddr;
- fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock);
+ fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock);
rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
@@ -371,7 +372,7 @@
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
- rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+ rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno,
rec->ar_startblock);
irec.rm_startblock = rec->ar_startblock;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3e61d0c..ef1d5bb 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -504,10 +504,7 @@
} else if (logerror) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
"Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ } else {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
"I/O Error Detected. Shutting down filesystem");
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a1135b8..deb9930 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -22,6 +22,7 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
+#include "xfs_ialloc.h"
#include <linux/iversion.h>
@@ -36,15 +37,13 @@
struct xfs_inode *ip;
/*
- * if this didn't occur in transactions, we could use
- * KM_MAYFAIL and return NULL here on ENOMEM. Set the
- * code up to do this anyway.
+ * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
+ * and return NULL here on ENOMEM.
*/
- ip = kmem_zone_alloc(xfs_inode_zone, 0);
- if (!ip)
- return NULL;
+ ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
+
if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_zone_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_zone, ip);
return NULL;
}
@@ -53,7 +52,6 @@
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
/* initialise the xfs inode */
@@ -62,8 +60,6 @@
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
ip->i_afp = NULL;
ip->i_cowfp = NULL;
- ip->i_cnextents = 0;
- ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
memset(&ip->i_df, 0, sizeof(ip->i_df));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
@@ -88,15 +84,18 @@
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ xfs_idestroy_fork(&ip->i_df);
break;
}
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- if (ip->i_cowfp)
- xfs_idestroy_fork(ip, XFS_COW_FORK);
-
+ if (ip->i_afp) {
+ xfs_idestroy_fork(ip->i_afp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_afp);
+ }
+ if (ip->i_cowfp) {
+ xfs_idestroy_fork(ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
+ }
if (ip->i_itemp) {
ASSERT(!test_bit(XFS_LI_IN_AIL,
&ip->i_itemp->ili_item.li_flags));
@@ -104,7 +103,7 @@
ip->i_itemp = NULL;
}
- kmem_zone_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_zone, ip);
}
static void
@@ -113,6 +112,7 @@
{
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
@@ -122,7 +122,7 @@
xfs_inode_free(
struct xfs_inode *ip)
{
- ASSERT(!xfs_isiflocked(ip));
+ ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -139,11 +139,8 @@
}
/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
+ * Queue background inode reclaim work if there are reclaimable inodes and there
+ * isn't reclaim work already scheduled or in progress.
*/
static void
xfs_reclaim_work_queue(
@@ -158,24 +155,6 @@
rcu_read_unlock();
}
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_reclaim_work);
-
- xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_reclaim_work_queue(mp);
-}
-
static void
xfs_perag_set_reclaim_tag(
struct xfs_perag *pag)
@@ -289,6 +268,8 @@
uint64_t version = inode_peek_iversion(inode);
umode_t mode = inode->i_mode;
dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -297,6 +278,8 @@
inode_set_iversion_queried(inode, version);
inode->i_mode = mode;
inode->i_rdev = dev;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
return error;
}
@@ -419,6 +402,7 @@
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
if (error) {
bool wake;
@@ -452,9 +436,6 @@
ip->i_sick = 0;
ip->i_checked = 0;
- ASSERT(!rwsem_is_locked(&inode->i_rwsem));
- init_rwsem(&inode->i_rwsem);
-
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
} else {
@@ -475,7 +456,7 @@
xfs_ilock(ip, lock_flags);
if (!(flags & XFS_IGET_INCORE))
- xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ xfs_iflags_clear(ip, XFS_ISTALE);
XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -506,18 +487,42 @@
if (!ip)
return -ENOMEM;
- error = xfs_iread(mp, tp, ip, flags);
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
if (error)
goto out_destroy;
- if (!xfs_inode_verify_forks(ip)) {
- error = -EFSCORRUPTED;
- goto out_destroy;
+ /*
+ * For version 5 superblocks, if we are initialising a new inode and we
+ * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
+ * simply build the new inode core with a random generation number.
+ *
+ * For version 4 (and older) superblocks, log recovery is dependent on
+ * the di_flushiter field being initialised from the current on-disk
+ * value and hence we must also read the inode off disk even when
+ * initializing new inodes.
+ */
+ if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
+ (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ VFS_I(ip)->i_generation = prandom_u32();
+ } else {
+ struct xfs_dinode *dip;
+ struct xfs_buf *bp;
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
+ if (error)
+ goto out_destroy;
+
+ error = xfs_inode_from_disk(ip, dip);
+ if (!error)
+ xfs_buf_set_ref(bp, XFS_INO_REF);
+ xfs_trans_brelse(tp, bp);
+
+ if (error)
+ goto out_destroy;
}
trace_xfs_iget_miss(ip);
-
/*
* Check the inode free state is valid. This also detects lookup
* racing with unlinks.
@@ -557,7 +562,7 @@
*/
iflags = XFS_INEW;
if (flags & XFS_IGET_DONTCACHE)
- iflags |= XFS_IDONTCACHE;
+ d_mark_dontcache(VFS_I(ip));
ip->i_udquot = NULL;
ip->i_gdquot = NULL;
ip->i_pdquot = NULL;
@@ -590,48 +595,31 @@
}
/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
+ * Look up an inode by number in the given file system. The inode is looked up
+ * in the cache held in each AG. If the inode is found in the cache, initialise
+ * the vfs inode if necessary.
*
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
+ * If it is not in core, read it in from the file system's device, add it to the
+ * cache and initialise the vfs inode.
*
* The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
+ * Inode lookup is only done during metadata operations and not as part of the
+ * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
*/
int
xfs_iget(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp)
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ struct xfs_inode **ipp)
{
- xfs_inode_t *ip;
- int error;
- xfs_perag_t *pag;
- xfs_agino_t agino;
+ struct xfs_inode *ip;
+ struct xfs_perag *pag;
+ xfs_agino_t agino;
+ int error;
- /*
- * xfs_reclaim_inode() uses the ILOCK to ensure an inode
- * doesn't get freed while it's being referenced during a
- * radix tree traversal here. It assumes this function
- * aqcuires only the ILOCK (and therefore it has no need to
- * involve the IOLOCK in this synchronization).
- */
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
/* reject inode numbers outside existing AGs */
@@ -733,25 +721,22 @@
*/
#define XFS_LOOKUP_BATCH 32
-STATIC int
-xfs_inode_ag_walk_grab(
+/*
+ * Decide if the given @ip is eligible to be a part of the inode walk, and
+ * grab it if so. Returns true if it's ready to go or false if we should just
+ * ignore it.
+ */
+STATIC bool
+xfs_inode_walk_ag_grab(
struct xfs_inode *ip,
int flags)
{
struct inode *inode = VFS_I(ip);
- bool newinos = !!(flags & XFS_AGITER_INEW_WAIT);
+ bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);
ASSERT(rcu_read_lock_held());
- /*
- * check for stale RCU freed inode
- *
- * If the inode has been reallocated, it doesn't matter if it's not in
- * the AG we are walking - we are walking for writeback, so if it
- * passes all the "valid inode" checks and is dirty, then we'll write
- * it back anyway. If it has been reallocated and still being
- * initialised, the XFS_INEW check below will catch it.
- */
+ /* Check for stale RCU freed inode */
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino)
goto out_unlock_noent;
@@ -764,39 +749,41 @@
/* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EFSCORRUPTED;
+ return false;
/* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode))
- return -ENOENT;
+ return false;
/* inode is valid */
- return 0;
+ return true;
out_unlock_noent:
spin_unlock(&ip->i_flags_lock);
- return -ENOENT;
+ return false;
}
+/*
+ * For a given per-AG structure @pag, grab, @execute, and rele all incore
+ * inodes with the given radix tree @tag.
+ */
STATIC int
-xfs_inode_ag_walk(
- struct xfs_mount *mp,
+xfs_inode_walk_ag(
struct xfs_perag *pag,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
+ int iter_flags,
+ int (*execute)(struct xfs_inode *ip, void *args),
void *args,
- int tag,
- int iter_flags)
+ int tag)
{
+ struct xfs_mount *mp = pag->pag_mount;
uint32_t first_index;
int last_error = 0;
int skipped;
- int done;
+ bool done;
int nr_found;
restart:
- done = 0;
+ done = false;
skipped = 0;
first_index = 0;
nr_found = 0;
@@ -807,7 +794,7 @@
rcu_read_lock();
- if (tag == -1)
+ if (tag == XFS_ICI_NO_TAG)
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH);
@@ -829,7 +816,7 @@
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
+ if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
batch[i] = NULL;
/*
@@ -848,7 +835,7 @@
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
- done = 1;
+ done = true;
}
/* unlock now we've grabbed the inodes. */
@@ -857,10 +844,10 @@
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
+ if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
xfs_iflags_test(batch[i], XFS_INEW))
xfs_inew_wait(batch[i]);
- error = execute(batch[i], flags, args);
+ error = execute(batch[i], args);
xfs_irele(batch[i]);
if (error == -EAGAIN) {
skipped++;
@@ -885,6 +872,49 @@
return last_error;
}
+/* Fetch the next (possibly tagged) per-AG structure. */
+static inline struct xfs_perag *
+xfs_inode_walk_get_perag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ int tag)
+{
+ if (tag == XFS_ICI_NO_TAG)
+ return xfs_perag_get(mp, agno);
+ return xfs_perag_get_tag(mp, agno, tag);
+}
+
+/*
+ * Call the @execute function on all incore inodes matching the radix tree
+ * @tag.
+ */
+int
+xfs_inode_walk(
+ struct xfs_mount *mp,
+ int iter_flags,
+ int (*execute)(struct xfs_inode *ip, void *args),
+ void *args,
+ int tag)
+{
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
+
+ ag = 0;
+ while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == -EFSCORRUPTED)
+ break;
+ }
+ }
+ return last_error;
+}
+
/*
* Background scanning to trim post-EOF preallocated space. This is queued
* based on the 'speculative_prealloc_lifetime' tunable (5m by default).
@@ -948,233 +978,77 @@
xfs_queue_cowblocks(mp);
}
-int
-xfs_inode_ag_iterator_flags(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int iter_flags)
-{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
-
- ag = 0;
- while ((pag = xfs_perag_get(mp, ag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
- iter_flags);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args)
-{
- return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
-}
-
-int
-xfs_inode_ag_iterator_tag(
- struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int flags,
- void *args,
- int tag)
-{
- struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
-
- ag = 0;
- while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
- ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
- 0);
- xfs_perag_put(pag);
- if (error) {
- last_error = error;
- if (error == -EFSCORRUPTED)
- break;
- }
- }
- return last_error;
-}
-
/*
* Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
+ *
+ * We have found this inode via a lookup under RCU, so the inode may have
+ * already been freed, or it may be in the process of being recycled by
+ * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
+ * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
+ * will not be set. Hence we need to check for both these flag conditions to
+ * avoid inodes that are no longer reclaim candidates.
+ *
+ * Note: checking for other state flags here, under the i_flags_lock or not, is
+ * racy and should be avoided. Those races should be resolved only after we have
+ * ensured that we are able to reclaim this inode and the world can see that we
+ * are going to reclaim it.
+ *
+ * Return true if we grabbed it, false otherwise.
*/
-STATIC int
+static bool
xfs_reclaim_inode_grab(
- struct xfs_inode *ip,
- int flags)
+ struct xfs_inode *ip)
{
ASSERT(rcu_read_lock_held());
- /* quick check for stale RCU freed inode */
- if (!ip->i_ino)
- return 1;
-
- /*
- * If we are asked for non-blocking operation, do unlocked checks to
- * see if the inode already is being flushed or in reclaim to avoid
- * lock traffic.
- */
- if ((flags & SYNC_TRYLOCK) &&
- __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
- return 1;
-
- /*
- * The radix tree lock here protects a thread in xfs_iget from racing
- * with us starting reclaim on the inode. Once we have the
- * XFS_IRECLAIM flag set it will not touch us.
- *
- * Due to RCU lookup, we may find inodes that have been freed and only
- * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
- * aren't candidates for reclaim at all, so we must check the
- * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
- */
spin_lock(&ip->i_flags_lock);
if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock);
- return 1;
+ return false;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
- return 0;
+ return true;
}
/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
+ * Inode reclaim is non-blocking, so the default action if progress cannot be
+ * made is to "requeue" the inode for reclaim by unlocking it and clearing the
+ * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
+ * blocking anymore and hence we can wait for the inode to be unpinned so that
+ * we can reclaim it.
*
- * inode state iflush ret required action
- * --------------- ---------- ---------------
- * bad - reclaim
- * shutdown EIO unpin and reclaim
- * clean, unpinned 0 reclaim
- * stale, unpinned 0 reclaim
- * clean, pinned(*) 0 requeue
- * stale, pinned EAGAIN requeue
- * dirty, async - requeue
- * dirty, sync 0 reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies. Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting. For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- * bad => reclaim
- * shutdown => unpin and reclaim
- * pinned, async => requeue
- * pinned, sync => unpin
- * stale => reclaim
- * clean => reclaim
- * dirty, async => requeue
- * dirty, sync => flush, wait and reclaim
+ * We do no IO here - if callers require inodes to be cleaned they must push the
+ * AIL first to trigger writeback of dirty inodes. This enables writeback to be
+ * done in the background in a non-blocking manner, and enables memory reclaim
+ * to make progress without blocking.
*/
-STATIC int
+static void
xfs_reclaim_inode(
struct xfs_inode *ip,
- struct xfs_perag *pag,
- int sync_mode)
+ struct xfs_perag *pag)
{
- struct xfs_buf *bp = NULL;
xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
- int error;
-restart:
- error = 0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (!xfs_iflock_nowait(ip)) {
- if (!(sync_mode & SYNC_WAIT))
- goto out;
- xfs_iflock(ip);
- }
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ goto out;
+ if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
+ goto out_iunlock;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
- /* xfs_iflush_abort() drops the flush lock */
- xfs_iflush_abort(ip, false);
+ xfs_iflush_abort(ip);
goto reclaim;
}
- if (xfs_ipincount(ip)) {
- if (!(sync_mode & SYNC_WAIT))
- goto out_ifunlock;
- xfs_iunpin_wait(ip);
- }
- if (xfs_inode_clean(ip)) {
- xfs_ifunlock(ip);
- goto reclaim;
- }
+ if (xfs_ipincount(ip))
+ goto out_clear_flush;
+ if (!xfs_inode_clean(ip))
+ goto out_clear_flush;
- /*
- * Never flush out dirty data during non-blocking reclaim, as it would
- * just contend with AIL pushing trying to do the same job.
- */
- if (!(sync_mode & SYNC_WAIT))
- goto out_ifunlock;
-
- /*
- * Now we have an inode that needs flushing.
- *
- * Note that xfs_iflush will never block on the inode buffer lock, as
- * xfs_ifree_cluster() can lock the inode buffer before it locks the
- * ip->i_lock, and we are doing the exact opposite here. As a result,
- * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
- * result in an ABBA deadlock with xfs_ifree_cluster().
- *
- * As xfs_ifree_cluser() must gather all inodes that are active in the
- * cache to mark them stale, if we hit this case we don't actually want
- * to do IO here - we want the inode marked stale so we can simply
- * reclaim it. Hence if we get an EAGAIN error here, just unlock the
- * inode, back off and try again. Hopefully the next pass through will
- * see the stale flag set on the inode.
- */
- error = xfs_iflush(ip, &bp);
- if (error == -EAGAIN) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /* backoff longer than in xfs_ifree_cluster */
- delay(2);
- goto restart;
- }
-
- if (!error) {
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- }
-
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
- ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always appears
@@ -1222,21 +1096,14 @@
ASSERT(xfs_inode_clean(ip));
__xfs_inode_free(ip);
- return error;
+ return;
-out_ifunlock:
- xfs_ifunlock(ip);
+out_clear_flush:
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+out_iunlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
xfs_iflags_clear(ip, XFS_IRECLAIM);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * We could return -EAGAIN here to make reclaim rescan the inode tree in
- * a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and the reclaim work never goes back to
- * the idle state. Instead, return 0 to let the next scheduled
- * background reclaim attempt to reclaim the inode again.
- */
- return 0;
}
/*
@@ -1244,23 +1111,19 @@
* corrupted, we still want to try to reclaim all the inodes. If we don't,
* then a shut down during filesystem unmount reclaim walk leak all the
* unreclaimed inodes.
+ *
+ * Nothing is returned. Inodes that cannot be reclaimed in this pass stay
+ * tagged for reclaim, so callers that must wait for all inodes to be
+ * reclaimed loop while the tag is still set, as xfs_reclaim_inodes() does.
*/
-STATIC int
+static void
xfs_reclaim_inodes_ag(
struct xfs_mount *mp,
- int flags,
int *nr_to_scan)
{
struct xfs_perag *pag;
- int error = 0;
- int last_error = 0;
- xfs_agnumber_t ag;
- int trylock = flags & SYNC_TRYLOCK;
- int skipped;
+ xfs_agnumber_t ag = 0;
-restart:
- ag = 0;
- skipped = 0;
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
unsigned long first_index = 0;
int done = 0;
@@ -1268,16 +1131,7 @@
ag = pag->pag_agno + 1;
- if (trylock) {
- if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
- skipped++;
- xfs_perag_put(pag);
- continue;
- }
- first_index = pag->pag_ici_reclaim_cursor;
- } else
- mutex_lock(&pag->pag_ici_reclaim_lock);
-
+ first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
do {
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i;
@@ -1301,7 +1155,7 @@
for (i = 0; i < nr_found; i++) {
struct xfs_inode *ip = batch[i];
- if (done || xfs_reclaim_inode_grab(ip, flags))
+ if (done || !xfs_reclaim_inode_grab(ip))
batch[i] = NULL;
/*
@@ -1330,59 +1184,39 @@
rcu_read_unlock();
for (i = 0; i < nr_found; i++) {
- if (!batch[i])
- continue;
- error = xfs_reclaim_inode(batch[i], pag, flags);
- if (error && last_error != -EFSCORRUPTED)
- last_error = error;
+ if (batch[i])
+ xfs_reclaim_inode(batch[i], pag);
}
*nr_to_scan -= XFS_LOOKUP_BATCH;
-
cond_resched();
-
} while (nr_found && !done && *nr_to_scan > 0);
- if (trylock && !done)
- pag->pag_ici_reclaim_cursor = first_index;
- else
- pag->pag_ici_reclaim_cursor = 0;
- mutex_unlock(&pag->pag_ici_reclaim_lock);
+ if (done)
+ first_index = 0;
+ WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
xfs_perag_put(pag);
}
-
- /*
- * if we skipped any AG, and we still have scan count remaining, do
- * another pass this time using blocking reclaim semantics (i.e
- * waiting on the reclaim locks and ignoring the reclaim cursors). This
- * ensure that when we get more reclaimers than AGs we block rather
- * than spin trying to execute reclaim.
- */
- if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
- trylock = 0;
- goto restart;
- }
- return last_error;
}
-int
+void
xfs_reclaim_inodes(
- xfs_mount_t *mp,
- int mode)
+ struct xfs_mount *mp)
{
int nr_to_scan = INT_MAX;
- return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+ while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ }
}
/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
+ * The shrinker infrastructure determines how many inodes we should scan for
+ * reclaim. We want as many clean inodes ready to reclaim as possible, so we
+ * push the AIL here. We also want to proactively free up memory if we can to
+ * minimise the amount of work memory reclaim has to do so we kick the
+ * background reclaim if it isn't already scheduled.
*/
long
xfs_reclaim_inodes_nr(
@@ -1393,7 +1227,8 @@
xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
- return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+ xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ return 0;
}
/*
@@ -1416,59 +1251,108 @@
return reclaimable;
}
-STATIC int
+STATIC bool
xfs_inode_match_id(
struct xfs_inode *ip,
struct xfs_eofblocks *eofb)
{
if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
!uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
- return 0;
+ return false;
if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
!gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
- return 0;
+ return false;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- xfs_get_projid(ip) != eofb->eof_prid)
- return 0;
+ ip->i_d.di_projid != eofb->eof_prid)
+ return false;
- return 1;
+ return true;
}
/*
* A union-based inode filtering algorithm. Process the inode if any of the
* criteria match. This is for global/internal scans only.
*/
-STATIC int
+STATIC bool
xfs_inode_match_id_union(
struct xfs_inode *ip,
struct xfs_eofblocks *eofb)
{
if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
- return 1;
+ return true;
if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
- return 1;
+ return true;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- xfs_get_projid(ip) == eofb->eof_prid)
- return 1;
+ ip->i_d.di_projid == eofb->eof_prid)
+ return true;
- return 0;
+ return false;
+}
+
+/*
+ * Is this inode @ip eligible for eof/cow block reclamation, given some
+ * filtering parameters @eofb? The inode is eligible if @eofb is null or
+ * if the predicate functions match.
+ */
+static bool
+xfs_inode_matches_eofb(
+ struct xfs_inode *ip,
+ struct xfs_eofblocks *eofb)
+{
+ bool match;
+
+ if (!eofb)
+ return true;
+
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+ match = xfs_inode_match_id_union(ip, eofb);
+ else
+ match = xfs_inode_match_id(ip, eofb);
+ if (!match)
+ return false;
+
+ /* skip the inode if the file size is too small */
+ if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+ int nr_to_scan = INT_MAX;
+
+ xfs_reclaim_inodes_ag(mp, &nr_to_scan);
+ xfs_reclaim_work_queue(mp);
}
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- int flags,
void *args)
{
- int ret = 0;
- struct xfs_eofblocks *eofb = args;
- int match;
+ struct xfs_eofblocks *eofb = args;
+ bool wait;
+ int ret;
+
+ wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
if (!xfs_can_free_eofblocks(ip, false)) {
/* inode could be preallocated or append-only */
@@ -1481,62 +1365,34 @@
* If the mapping is dirty the operation can block and wait for some
* time. Unless we are waiting, skip it.
*/
- if (!(flags & SYNC_WAIT) &&
- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
return 0;
- if (eofb) {
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
- else
- match = xfs_inode_match_id(ip, eofb);
- if (!match)
- return 0;
-
- /* skip the inode if the file size is too small */
- if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
- return 0;
- }
+ if (!xfs_inode_matches_eofb(ip, eofb))
+ return 0;
/*
* If the caller is waiting, return -EAGAIN to keep the background
* scanner moving and revisit the inode in a subsequent pass.
*/
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- if (flags & SYNC_WAIT)
- ret = -EAGAIN;
- return ret;
+ if (wait)
+ return -EAGAIN;
+ return 0;
}
+
ret = xfs_free_eofblocks(ip);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
-static int
-__xfs_icache_free_eofblocks(
- struct xfs_mount *mp,
- struct xfs_eofblocks *eofb,
- int (*execute)(struct xfs_inode *ip, int flags,
- void *args),
- int tag)
-{
- int flags = SYNC_TRYLOCK;
-
- if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
- flags = SYNC_WAIT;
-
- return xfs_inode_ag_iterator_tag(mp, execute, flags,
- eofb, tag);
-}
-
int
xfs_icache_free_eofblocks(
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
{
- return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
+ return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
XFS_ICI_EOFBLOCKS_TAG);
}
@@ -1563,7 +1419,7 @@
eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQ_USER);
+ dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
if (dq && xfs_dquot_lowsp(dq)) {
eofb.eof_uid = VFS_I(ip)->i_uid;
eofb.eof_flags |= XFS_EOF_FLAGS_UID;
@@ -1572,7 +1428,7 @@
}
if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
- dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
+ dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
if (dq && xfs_dquot_lowsp(dq)) {
eofb.eof_gid = VFS_I(ip)->i_gid;
eofb.eof_flags |= XFS_EOF_FLAGS_GID;
@@ -1753,29 +1609,16 @@
STATIC int
xfs_inode_free_cowblocks(
struct xfs_inode *ip,
- int flags,
void *args)
{
struct xfs_eofblocks *eofb = args;
- int match;
int ret = 0;
if (!xfs_prep_free_cowblocks(ip))
return 0;
- if (eofb) {
- if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
- match = xfs_inode_match_id_union(ip, eofb);
- else
- match = xfs_inode_match_id(ip, eofb);
- if (!match)
- return 0;
-
- /* skip the inode if the file size is too small */
- if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
- XFS_ISIZE(ip) < eofb->eof_min_file_size)
- return 0;
- }
+ if (!xfs_inode_matches_eofb(ip, eofb))
+ return 0;
/* Free the CoW blocks */
xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -1799,7 +1642,7 @@
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
{
- return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
+ return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
XFS_ICI_COWBLOCKS_TAG);
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 48f1fd2..3a4c8b3 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -17,14 +17,11 @@
__u64 eof_min_file_size;
};
-#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
-#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
-
/*
* tags for inode radix tree
*/
#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
- in xfs_inode_ag_iterator */
+ in xfs_inode_walk */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */
@@ -40,7 +37,7 @@
/*
* flags for AG inode iterator
*/
-#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */
+#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);
@@ -51,7 +48,7 @@
void xfs_reclaim_worker(struct work_struct *work);
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+void xfs_reclaim_inodes(struct xfs_mount *mp);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
@@ -71,50 +68,9 @@
void xfs_cowblocks_worker(struct work_struct *);
void xfs_queue_cowblocks(struct xfs_mount *);
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags, void *args),
- int flags, void *args);
-int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags, void *args),
- int flags, void *args, int iter_flags);
-int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, int flags, void *args),
- int flags, void *args, int tag);
-
-static inline int
-xfs_fs_eofblocks_from_user(
- struct xfs_fs_eofblocks *src,
- struct xfs_eofblocks *dst)
-{
- if (src->eof_version != XFS_EOFBLOCKS_VERSION)
- return -EINVAL;
-
- if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
- return -EINVAL;
-
- if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
- memchr_inv(src->pad64, 0, sizeof(src->pad64)))
- return -EINVAL;
-
- dst->eof_flags = src->eof_flags;
- dst->eof_prid = src->eof_prid;
- dst->eof_min_file_size = src->eof_min_file_size;
-
- dst->eof_uid = INVALID_UID;
- if (src->eof_flags & XFS_EOF_FLAGS_UID) {
- dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
- if (!uid_valid(dst->eof_uid))
- return -EINVAL;
- }
-
- dst->eof_gid = INVALID_GID;
- if (src->eof_flags & XFS_EOF_FLAGS_GID) {
- dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
- if (!gid_valid(dst->eof_gid))
- return -EINVAL;
- }
- return 0;
-}
+int xfs_inode_walk(struct xfs_mount *mp, int iter_flags,
+ int (*execute)(struct xfs_inode *ip, void *args),
+ void *args, int tag);
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 3ebd1b7..9b3994b 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -6,11 +6,19 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_icreate_item.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_ialloc.h"
+#include "xfs_trace.h"
kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
@@ -55,7 +63,7 @@
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip));
+ kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
static const struct xfs_item_ops xfs_icreate_item_ops = {
@@ -89,7 +97,7 @@
{
struct xfs_icreate_item *icp;
- icp = kmem_zone_zalloc(xfs_icreate_zone, 0);
+ icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
&xfs_icreate_item_ops);
@@ -107,3 +115,147 @@
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
}
+
+static enum xlog_recover_reorder
+xlog_recover_icreate_reorder(
+ struct xlog_recover_item *item)
+{
+ /*
+ * Inode allocation buffers must be replayed before subsequent inode
+ * items try to modify those buffers. ICREATE items are the logical
+ * equivalent of logging a newly initialized inode buffer, so recover
+ * these at the same time that we recover logged buffers.
+ */
+ return XLOG_REORDER_BUFFER_LIST;
+}
+
+/*
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. Its purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
+ */
+STATIC int
+xlog_recover_icreate_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+ int bb_per_cluster;
+ int cancel_count;
+ int nbufs;
+ int i;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return -EINVAL;
+ }
+
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return -EINVAL;
+ }
+
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return -EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return -EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return -EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return -EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return -EINVAL;
+ }
+
+ /*
+ * The inode chunk is either full or sparse and we only support
+ * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
+ */
+ if (length != igeo->ialloc_blks &&
+ length != igeo->ialloc_min_blks) {
+ xfs_warn(log->l_mp,
+ "%s: unsupported chunk length", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ /* verify inode count is consistent with extent length */
+ if ((count >> mp->m_sb.sb_inopblog) != length) {
+ xfs_warn(log->l_mp,
+ "%s: inconsistent inode count and chunk length",
+ __FUNCTION__);
+ return -EINVAL;
+ }
+
+ /*
+ * The icreate transaction can cover multiple cluster buffers and these
+ * buffers could have been freed and reused. Check the individual
+ * buffers for cancellation so we don't overwrite anything written after
+ * a cancellation.
+ */
+ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ nbufs = length / igeo->blocks_per_cluster;
+ for (i = 0, cancel_count = 0; i < nbufs; i++) {
+ xfs_daddr_t daddr;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno,
+ agbno + i * igeo->blocks_per_cluster);
+ if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster))
+ cancel_count++;
+ }
+
+ /*
+ * We currently only use icreate for a single allocation at a time. This
+ * means we should expect either all or none of the buffers to be
+ * cancelled. Be conservative and skip replay if at least one buffer is
+ * cancelled, but warn the user that something is awry if the buffers
+ * are not consistent.
+ *
+ * XXX: This must be refined to only skip cancelled clusters once we use
+ * icreate for multiple chunk allocations.
+ */
+ ASSERT(!cancel_count || cancel_count == nbufs);
+ if (cancel_count) {
+ if (cancel_count != nbufs)
+ xfs_warn(mp,
+ "WARNING: partial inode chunk cancellation, skipped icreate.");
+ trace_xfs_log_recover_icreate_cancel(log, icl);
+ return 0;
+ }
+
+ trace_xfs_log_recover_icreate_recover(log, icl);
+ return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+ length, be32_to_cpu(icl->icl_gen));
+}
+
+const struct xlog_recover_item_ops xlog_icreate_item_ops = {
+ .item_type = XFS_LI_ICREATE,
+ .reorder = xlog_recover_icreate_reorder,
+ .commit_pass2 = xlog_recover_icreate_commit_pass2,
+};
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b339ff9..2bfbcf2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -44,7 +44,6 @@
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
-STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
@@ -55,6 +54,12 @@
xfs_get_extsz_hint(
struct xfs_inode *ip)
{
+ /*
+ * No point in aligning allocations if we need to COW to actually
+ * write to them.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ return 0;
if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
return ip->i_d.di_extsize;
if (XFS_IS_REALTIME_INODE(ip))
@@ -106,7 +111,7 @@
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE &&
(ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
@@ -119,7 +124,8 @@
{
uint lock_mode = XFS_ILOCK_SHARED;
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+ if (ip->i_afp &&
+ ip->i_afp->if_format == XFS_DINODE_FMT_BTREE &&
(ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
lock_mode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lock_mode);
@@ -138,17 +144,17 @@
*
* i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
*
- * mmap_sem locking order:
+ * mmap_lock locking order:
*
- * i_rwsem -> page lock -> mmap_sem
- * mmap_sem -> i_mmap_lock -> page_lock
+ * i_rwsem -> page lock -> mmap_lock
+ * mmap_lock -> i_mmap_lock -> page_lock
*
- * The difference in mmap_sem locking order mean that we cannot hold the
+ * The difference in mmap_lock locking order means that we cannot hold the
* i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
- * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_lock
* in get_user_pages() to map the user pages into the kernel address space for
* direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
- * page faults already hold the mmap_sem.
+ * page faults already hold the mmap_lock.
*
* Hence to serialise fully against both syscall and mmap based IO, we need to
* take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
@@ -445,7 +451,7 @@
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
* support an arbitrary depth of locking here, but absolute limits on
- * inodes depend on the the type of locking and the limits placed by
+ * inodes depend on the type of locking and the limits placed by
* lockdep annotations in xfs_lock_inumorder. These are all checked by
* the asserts.
*/
@@ -592,22 +598,6 @@
}
}
-void
-__xfs_iflock(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wq_entry);
-}
-
STATIC uint
_xfs_dic2xflags(
uint16_t di_flags,
@@ -708,6 +698,68 @@
return error;
}
+/* Propagate di_flags from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ unsigned int di_flags = 0;
+ umode_t mode = VFS_I(ip)->i_mode;
+
+ if (S_ISDIR(mode)) {
+ if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ } else if (S_ISREG(mode)) {
+ if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) &&
+ xfs_sb_version_hasrealtime(&ip->i_mount->m_sb))
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ ip->i_d.di_extsize = pip->i_d.di_extsize;
+ }
+ }
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
+ xfs_inherit_noatime)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
+ xfs_inherit_nodump)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
+ xfs_inherit_sync)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
+ xfs_inherit_nosymlinks)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+ xfs_inherit_nodefrag)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+
+ ip->i_d.di_flags |= di_flags;
+}
+
+/* Propagate di_flags2 from a parent inode to a child inode. */
+static void
+xfs_inode_inherit_flags2(
+ struct xfs_inode *ip,
+ const struct xfs_inode *pip)
+{
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+ }
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
+}
+
/*
* Allocate an inode on disk and return a copy of its in-core version.
* The in-core inode is locked exclusively. Set mode, nlink, and rdev
@@ -795,26 +847,18 @@
return error;
ASSERT(ip != NULL);
inode = VFS_I(ip);
-
- /*
- * We always convert v1 inodes to v2 now - we only support filesystems
- * with >= v2 inode capability, so there is no reason for ever leaving
- * an inode in v1 format.
- */
- if (ip->i_d.di_version == 1)
- ip->i_d.di_version = 2;
-
inode->i_mode = mode;
set_nlink(inode, nlink);
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
+ inode->i_uid = current_fsuid();
inode->i_rdev = rdev;
- xfs_set_projid(ip, prid);
+ ip->i_d.di_projid = prid;
if (pip && XFS_INHERIT_GID(pip)) {
- ip->i_d.di_gid = pip->i_d.di_gid;
+ inode->i_gid = VFS_I(pip)->i_gid;
if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
inode->i_mode |= S_ISGID;
+ } else {
+ inode->i_gid = current_fsgid();
}
/*
@@ -822,13 +866,12 @@
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if ((irix_sgid_inherit) &&
- (inode->i_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ if (irix_sgid_inherit &&
+ (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
- ip->i_d.di_nextents = 0;
+ ip->i_df.if_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
tv = current_time(inode);
@@ -841,85 +884,32 @@
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
inode_set_iversion(inode, 1);
- ip->i_d.di_flags2 = 0;
+ ip->i_d.di_flags2 = mp->m_ino_geo.new_diflags2;
ip->i_d.di_cowextsize = 0;
- ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
- ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+ ip->i_d.di_crtime = tv;
}
-
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
case S_IFSOCK:
- ip->i_d.di_format = XFS_DINODE_FMT_DEV;
+ ip->i_df.if_format = XFS_DINODE_FMT_DEV;
ip->i_df.if_flags = 0;
flags |= XFS_ILOG_DEV;
break;
case S_IFREG:
case S_IFDIR:
- if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint di_flags = 0;
-
- if (S_ISDIR(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(mode)) {
- if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_REALTIME;
- if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
- di_flags |= XFS_DIFLAG_EXTSIZE;
- ip->i_d.di_extsize = pip->i_d.di_extsize;
- }
- }
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
- xfs_inherit_noatime)
- di_flags |= XFS_DIFLAG_NOATIME;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
- xfs_inherit_nodump)
- di_flags |= XFS_DIFLAG_NODUMP;
- if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
- xfs_inherit_sync)
- di_flags |= XFS_DIFLAG_SYNC;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
- xfs_inherit_nosymlinks)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
- xfs_inherit_nodefrag)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
-
- ip->i_d.di_flags |= di_flags;
- }
- if (pip &&
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
- pip->i_d.di_version == 3 &&
- ip->i_d.di_version == 3) {
- uint64_t di_flags2 = 0;
-
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
- ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
- }
- if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
-
- ip->i_d.di_flags2 |= di_flags2;
- }
+ if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY))
+ xfs_inode_inherit_flags(ip, pip);
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY))
+ xfs_inode_inherit_flags2(ip, pip);
/* FALLTHROUGH */
case S_IFLNK:
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_flags = XFS_IFEXTENTS;
ip->i_df.if_bytes = 0;
ip->i_df.if_u1.if_root = NULL;
@@ -927,11 +917,6 @@
default:
ASSERT(0);
}
- /*
- * Attribute fork settings for new inode.
- */
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
- ip->i_d.di_anextents = 0;
/*
* Log the new values stuffed into the inode.
@@ -1117,7 +1102,6 @@
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -1153,8 +1137,7 @@
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1214,8 +1197,7 @@
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
- resblks ?
- resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+ resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
@@ -1304,8 +1286,7 @@
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1418,7 +1399,7 @@
* the tree quota mechanism could be circumvented.
*/
if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+ tdp->i_d.di_projid != sip->i_d.di_projid)) {
error = -EXDEV;
goto error_return;
}
@@ -1513,10 +1494,8 @@
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
xfs_fileoff_t first_unmap_block;
- xfs_fileoff_t last_block;
xfs_filblks_t unmap_len;
int error = 0;
- int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1536,41 +1515,35 @@
* the end of the file (in a crash where the space is allocated
* but the inode size is not yet updated), simply remove any
* blocks which show up between the new EOF and the maximum
- * possible file size. If the first block to be removed is
- * beyond the maximum file size (ie it is the same as last_block),
- * then there is nothing to do.
+ * possible file size.
+ *
+ * We have to free all the blocks to the bmbt maximum offset, even if
+ * the page cache can't scale that far.
*/
first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- if (first_unmap_block == last_block)
+ if (first_unmap_block >= XFS_MAX_FILEOFF) {
+ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
return 0;
+ }
- ASSERT(first_unmap_block < last_block);
- unmap_len = last_block - first_unmap_block + 1;
- while (!done) {
+ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
+ while (unmap_len > 0) {
ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
- XFS_ITRUNC_MAX_EXTENTS, &done);
+ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
+ flags, XFS_ITRUNC_MAX_EXTENTS);
if (error)
goto out;
- /*
- * Duplicate the transaction that has the permanent
- * reservation and commit the old transaction.
- */
+ /* free the just unmapped extents */
error = xfs_defer_finish(&tp);
if (error)
goto out;
-
- error = xfs_trans_roll_inode(&tp, ip);
- if (error)
- goto out;
}
if (whichfork == XFS_DATA_FORK) {
/* Remove all pending CoW reservations. */
error = xfs_reflink_cancel_cow_blocks(ip, &tp,
- first_unmap_block, last_block, true);
+ first_unmap_block, XFS_MAX_FILEOFF, true);
if (error)
goto out;
@@ -1651,7 +1624,7 @@
return 0;
/*
* If we can't get the iolock just skip truncating the blocks
- * past EOF because we could deadlock with the mmap_sem
+ * past EOF because we could deadlock with the mmap_lock
* otherwise. We'll get another chance to drop them once the
* last reference to the inode is dropped, so we'll never leak
* blocks permanently.
@@ -1703,7 +1676,7 @@
if (error)
goto error_trans_cancel;
- ASSERT(ip->i_d.di_nextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
error = xfs_trans_commit(tp);
if (error)
@@ -1872,7 +1845,7 @@
if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
- ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
error = xfs_qm_dqattach(ip);
@@ -1898,7 +1871,6 @@
}
ASSERT(!ip->i_afp);
- ASSERT(ip->i_d.di_anextents == 0);
ASSERT(ip->i_d.di_forkoff == 0);
/*
@@ -2134,7 +2106,7 @@
unsigned int bucket_index,
xfs_agino_t new_agino)
{
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ struct xfs_agi *agi = agibp->b_addr;
xfs_agino_t old_value;
int offset;
@@ -2149,8 +2121,10 @@
* passed in because either we're adding or removing ourselves from the
* head of the list.
*/
- if (old_value == new_agino)
+ if (old_value == new_agino) {
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
+ }
agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
offset = offsetof(struct xfs_agi, agi_unlinked) +
@@ -2186,7 +2160,6 @@
xfs_dinode_calc_crc(mp, dip);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
- xfs_inobp_check(mp, ibp);
}
/* Set an in-core inode's unlinked pointer and return the old value. */
@@ -2206,13 +2179,15 @@
ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0);
if (error)
return error;
/* Make sure the old pointer isn't garbage. */
old_value = be32_to_cpu(dip->di_next_unlinked);
if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
goto out;
}
@@ -2224,8 +2199,11 @@
*/
*old_next_agino = old_value;
if (old_value == next_agino) {
- if (next_agino != NULLAGINO)
+ if (next_agino != NULLAGINO) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
+ }
goto out;
}
@@ -2267,7 +2245,7 @@
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
/*
* Get the index into the agi hash table for the list this inode will
@@ -2276,11 +2254,12 @@
*/
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, agno, next_agino))
+ !xfs_verify_agino_or_null(mp, agno, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
+ }
if (next_agino != NULLAGINO) {
- struct xfs_perag *pag;
xfs_agino_t old_agino;
/*
@@ -2297,9 +2276,7 @@
* agino has been unlinked, add a backref from the next inode
* back to agino.
*/
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_add_backref(pag, agino, next_agino);
- xfs_perag_put(pag);
+ error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
if (error)
return error;
}
@@ -2329,7 +2306,7 @@
return error;
}
- error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0);
if (error) {
xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
__func__, error);
@@ -2435,7 +2412,6 @@
struct xfs_buf *agibp;
struct xfs_buf *last_ibp;
struct xfs_dinode *last_dip = NULL;
- struct xfs_perag *pag = NULL;
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
xfs_agino_t next_agino;
@@ -2449,7 +2425,7 @@
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
/*
* Get the index into the agi hash table for the list this inode will
@@ -2479,32 +2455,22 @@
* this inode's backref to point from the next inode.
*/
if (next_agino != NULLAGINO) {
- pag = xfs_perag_get(mp, agno);
- error = xfs_iunlink_change_backref(pag, next_agino,
+ error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
NULLAGINO);
if (error)
- goto out;
+ return error;
}
- if (head_agino == agino) {
- /* Point the head of the list to the next unlinked inode. */
- error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
- next_agino);
- if (error)
- goto out;
- } else {
+ if (head_agino != agino) {
struct xfs_imap imap;
xfs_agino_t prev_agino;
- if (!pag)
- pag = xfs_perag_get(mp, agno);
-
/* We need to search the list for the inode being freed. */
error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
&prev_agino, &imap, &last_dip, &last_ibp,
- pag);
+ agibp->b_pag);
if (error)
- goto out;
+ return error;
/* Point the previous inode on the list to the next inode. */
xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
@@ -2518,15 +2484,110 @@
* change_backref takes care of deleting the backref if
* next_agino is NULLAGINO.
*/
- error = xfs_iunlink_change_backref(pag, agino, next_agino);
- if (error)
- goto out;
+ return xfs_iunlink_change_backref(agibp->b_pag, agino,
+ next_agino);
}
-out:
- if (pag)
- xfs_perag_put(pag);
- return error;
+ /* Point the head of the list to the next unlinked inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+}
+
+/*
+ * Look up the inode number specified and if it is not already marked XFS_ISTALE
+ * mark it stale. We should only find clean inodes in this lookup that aren't
+ * already stale.
+ */
+static void
+xfs_ifree_mark_inode_stale(
+ struct xfs_buf *bp,
+ struct xfs_inode *free_ip,
+ xfs_ino_t inum)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_perag *pag = bp->b_pag;
+ struct xfs_inode_log_item *iip;
+ struct xfs_inode *ip;
+
+retry:
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
+
+ /* Inode not in memory, nothing to do */
+ if (!ip) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /*
+ * because this is an RCU protected lookup, we could find a recently
+ * freed or even reallocated inode during the lookup. We need to check
+ * under the i_flags_lock for a valid inode here. Skip it if it is not
+ * valid, the wrong inode or stale.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE))
+ goto out_iflags_unlock;
+
+ /*
+ * Don't try to lock/unlock the current inode, but we _cannot_ skip the
+ * other inodes that we did not find in the list attached to the buffer
+ * and are not already marked stale. If we can't lock it, back off and
+ * retry.
+ */
+ if (ip != free_ip) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ delay(1);
+ goto retry;
+ }
+ }
+ ip->i_flags |= XFS_ISTALE;
+
+ /*
+ * If the inode is flushing, it is already attached to the buffer. All
+ * we needed to do here is mark the inode stale so buffer IO completion
+ * will remove it from the AIL.
+ */
+ iip = ip->i_itemp;
+ if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
+ ASSERT(!list_empty(&iip->ili_item.li_bio_list));
+ ASSERT(iip->ili_last_fields);
+ goto out_iunlock;
+ }
+
+ /*
+ * Inodes not attached to the buffer can be released immediately.
+ * Everything else has to go through xfs_iflush_abort() on journal
+ * commit as the flock synchronises removal of the inode from the
+ * cluster buffer against inode reclaim.
+ */
+ if (!iip || list_empty(&iip->ili_item.li_bio_list))
+ goto out_iunlock;
+
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ /* we have a dirty inode in memory that has not yet been flushed. */
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = iip->ili_fields;
+ iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
+ spin_unlock(&iip->ili_lock);
+ ASSERT(iip->ili_last_fields);
+
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return;
+
+out_iunlock:
+ if (ip != free_ip)
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_iflags_unlock:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
}
/*
@@ -2536,25 +2597,20 @@
*/
STATIC int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
+ struct xfs_inode *free_ip,
+ struct xfs_trans *tp,
struct xfs_icluster *xic)
{
- xfs_mount_t *mp = free_ip->i_mount;
+ struct xfs_mount *mp = free_ip->i_mount;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ struct xfs_buf *bp;
+ xfs_daddr_t blkno;
+ xfs_ino_t inum = xic->first_ino;
int nbufs;
int i, j;
int ioffset;
- xfs_daddr_t blkno;
- xfs_buf_t *bp;
- xfs_inode_t *ip;
- xfs_inode_log_item_t *iip;
- struct xfs_log_item *lip;
- struct xfs_perag *pag;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_ino_t inum;
+ int error;
- inum = xic->first_ino;
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
@@ -2574,18 +2630,18 @@
/*
* We obtain and lock the backing buffer first in the process
- * here, as we have to ensure that any dirty inode that we
- * can't get the flush lock on is attached to the buffer.
+ * here to ensure dirty inodes attached to the buffer remain in
+ * the flushing state while we mark them stale.
+ *
* If we scan the in-memory inodes first, then buffer IO can
* complete before we get a lock on it, and hence we may fail
* to mark all the active inodes on the buffer stale.
*/
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * igeo->blocks_per_cluster,
- XBF_UNMAPPED);
-
- if (!bp)
- return -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
+ mp->m_bsize * igeo->blocks_per_cluster,
+ XBF_UNMAPPED, &bp);
+ if (error)
+ return error;
/*
* This buffer may not have been correctly initialised as we
@@ -2599,148 +2655,20 @@
bp->b_ops = &xfs_inode_buf_ops;
/*
- * Walk the inodes already attached to the buffer and mark them
- * stale. These will all have the flush locks held, so an
- * in-memory inode walk can't lock them. By marking them all
- * stale first, we will not attempt to lock them in the loop
- * below as the XFS_ISTALE flag will be set.
+ * Now we need to set all the cached clean inodes as XFS_ISTALE,
+ * too. This requires lookups, and will skip inodes that we've
+ * already marked XFS_ISTALE.
*/
- list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
- if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
- ASSERT(iip->ili_logged == 1);
- lip->li_cb = xfs_istale_done;
- xfs_trans_ail_copy_lsn(mp->m_ail,
- &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
- xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
- }
- }
-
-
- /*
- * For each inode in memory attempt to add it to the inode
- * buffer and set it up for being staled on buffer IO
- * completion. This is safe as we've locked out tail pushing
- * and flushing by locking the buffer.
- *
- * We have already marked every inode that was part of a
- * transaction stale above, which means there is no point in
- * even trying to lock them.
- */
- for (i = 0; i < igeo->inodes_per_cluster; i++) {
-retry:
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, (inum + i)));
-
- /* Inode not in memory, nothing to do */
- if (!ip) {
- rcu_read_unlock();
- continue;
- }
-
- /*
- * because this is an RCU protected lookup, we could
- * find a recently freed or even reallocated inode
- * during the lookup. We need to check under the
- * i_flags_lock for a valid inode here. Skip it if it
- * is not valid, the wrong inode or stale.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != inum + i ||
- __xfs_iflags_test(ip, XFS_ISTALE)) {
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- continue;
- }
- spin_unlock(&ip->i_flags_lock);
-
- /*
- * Don't try to lock/unlock the current inode, but we
- * _cannot_ skip the other inodes that we did not find
- * in the list attached to the buffer and are not
- * already marked stale. If we can't lock it, back off
- * and retry.
- */
- if (ip != free_ip) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- rcu_read_unlock();
- delay(1);
- goto retry;
- }
-
- /*
- * Check the inode number again in case we're
- * racing with freeing in xfs_reclaim_inode().
- * See the comments in that function for more
- * information as to why the initial check is
- * not sufficient.
- */
- if (ip->i_ino != inum + i) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- rcu_read_unlock();
- continue;
- }
- }
- rcu_read_unlock();
-
- xfs_iflock(ip);
- xfs_iflags_set(ip, XFS_ISTALE);
-
- /*
- * we don't need to attach clean inodes or those only
- * with unlogged changes (which we throw away, anyway).
- */
- iip = ip->i_itemp;
- if (!iip || xfs_inode_clean(ip)) {
- ASSERT(ip != free_ip);
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
-
- iip->ili_last_fields = iip->ili_fields;
- iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
- xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
- &iip->ili_item.li_lsn);
-
- xfs_buf_attach_iodone(bp, xfs_istale_done,
- &iip->ili_item);
-
- if (ip != free_ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
+ for (i = 0; i < igeo->inodes_per_cluster; i++)
+ xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
xfs_trans_stale_inode_buf(tp, bp);
xfs_trans_binval(tp, bp);
}
-
- xfs_perag_put(pag);
return 0;
}
/*
- * Free any local-format buffers sitting around before we reset to
- * extents format.
- */
-static inline void
-xfs_ifree_local_data(
- struct xfs_inode *ip,
- int whichfork)
-{
- struct xfs_ifork *ifp;
-
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
- return;
-
- ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
-}
-
-/*
* This is called to return an inode to the inode free list.
* The inode should already be truncated to 0 length and have
* no pages associated with it. This routine also assumes that
@@ -2757,11 +2685,11 @@
{
int error;
struct xfs_icluster xic = { 0 };
+ struct xfs_inode_log_item *iip = ip->i_itemp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(VFS_I(ip)->i_nlink == 0);
- ASSERT(ip->i_d.di_nextents == 0);
- ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
ASSERT(ip->i_d.di_nblocks == 0);
@@ -2776,19 +2704,28 @@
if (error)
return error;
- xfs_ifree_local_data(ip, XFS_DATA_FORK);
- xfs_ifree_local_data(ip, XFS_ATTR_FORK);
+ /*
+ * Free any local-format data sitting around before we reset the
+ * data fork to extents format. Note that the attr fork data has
+ * already been freed by xfs_attr_inactive.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
+ kmem_free(ip->i_df.if_u1.if_data);
+ ip->i_df.if_u1.if_data = NULL;
+ ip->i_df.if_bytes = 0;
+ }
VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
- ip->i_d.di_flags2 = 0;
+ ip->i_d.di_flags2 = ip->i_mount->m_ino_geo.new_diflags2;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
- ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
/* Don't attempt to replay owner changes for a deleted inode */
- ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
+ spin_lock(&iip->ili_lock);
+ iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
+ spin_unlock(&iip->ili_lock);
/*
* Bump the generation count so no one will be confused
@@ -3168,7 +3105,7 @@
/*
* xfs_rename_alloc_whiteout()
*
- * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * Return a referenced, unlinked, unlocked inode that can be used as a
* whiteout in a rename transaction. We use a tmpfile inode here so that if we
* crash between allocating the inode and linking it into the rename transaction
* recovery will free the inode and we won't leak it.
@@ -3215,6 +3152,7 @@
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
+ struct xfs_buf *agibp;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@ -3289,7 +3227,7 @@
* tree quota mechanism would be circumvented.
*/
if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+ target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
error = -EXDEV;
goto out_trans_cancel;
}
@@ -3346,7 +3284,6 @@
goto out_trans_cancel;
xfs_bumplink(tp, wip);
- xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
VFS_I(wip)->i_state &= ~I_LINKABLE;
}
@@ -3380,6 +3317,22 @@
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
+
+ /*
+ * Check whether the replace operation will need to allocate
+ * blocks. This happens when the shortform directory lacks
+ * space and we have to convert it to a block format directory.
+ * When more blocks are necessary, we must lock the AGI first
+ * to preserve locking order (AGI -> AGF).
+ */
+ if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
+ error = xfs_read_agi(mp, tp,
+ XFS_INO_TO_AGNO(mp, target_ip->i_ino),
+ &agibp);
+ if (error)
+ goto out_trans_cancel;
+ }
+
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
@@ -3479,374 +3432,76 @@
return error;
}
-STATIC int
-xfs_iflush_cluster(
- struct xfs_inode *ip,
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_perag *pag;
- unsigned long first_index, mask;
- int cilist_size;
- struct xfs_inode **cilist;
- struct xfs_inode *cip;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- int nr_found;
- int clcount = 0;
- int i;
-
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-
- cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
- cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
- if (!cilist)
- goto out_put;
-
- mask = ~(igeo->inodes_per_cluster - 1);
- first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
- rcu_read_lock();
- /* really need a gang lookup range call here */
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
- first_index, igeo->inodes_per_cluster);
- if (nr_found == 0)
- goto out_free;
-
- for (i = 0; i < nr_found; i++) {
- cip = cilist[i];
- if (cip == ip)
- continue;
-
- /*
- * because this is an RCU protected lookup, we could find a
- * recently freed or even reallocated inode during the lookup.
- * We need to check under the i_flags_lock for a valid inode
- * here. Skip it if it is not valid or the wrong inode.
- */
- spin_lock(&cip->i_flags_lock);
- if (!cip->i_ino ||
- __xfs_iflags_test(cip, XFS_ISTALE)) {
- spin_unlock(&cip->i_flags_lock);
- continue;
- }
-
- /*
- * Once we fall off the end of the cluster, no point checking
- * any more inodes in the list because they will also all be
- * outside the cluster.
- */
- if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
- spin_unlock(&cip->i_flags_lock);
- break;
- }
- spin_unlock(&cip->i_flags_lock);
-
- /*
- * Do an un-protected check to see if the inode is dirty and
- * is a candidate for flushing. These checks will be repeated
- * later after the appropriate locks are acquired.
- */
- if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
- continue;
-
- /*
- * Try to get locks. If any are unavailable or it is pinned,
- * then this inode cannot be flushed and is skipped.
- */
-
- if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
- continue;
- if (!xfs_iflock_nowait(cip)) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
- if (xfs_ipincount(cip)) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
-
- /*
- * Check the inode number again, just to be certain we are not
- * racing with freeing in xfs_reclaim_inode(). See the comments
- * in that function for more information as to why the initial
- * check is not sufficient.
- */
- if (!cip->i_ino) {
- xfs_ifunlock(cip);
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- continue;
- }
-
- /*
- * arriving here means that this inode can be flushed. First
- * re-check that it's dirty before flushing.
- */
- if (!xfs_inode_clean(cip)) {
- int error;
- error = xfs_iflush_int(cip, bp);
- if (error) {
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- goto cluster_corrupt_out;
- }
- clcount++;
- } else {
- xfs_ifunlock(cip);
- }
- xfs_iunlock(cip, XFS_ILOCK_SHARED);
- }
-
- if (clcount) {
- XFS_STATS_INC(mp, xs_icluster_flushcnt);
- XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
- }
-
-out_free:
- rcu_read_unlock();
- kmem_free(cilist);
-out_put:
- xfs_perag_put(pag);
- return 0;
-
-
-cluster_corrupt_out:
- /*
- * Corruption detected in the clustering loop. Invalidate the
- * inode buffer and shut down the filesystem.
- */
- rcu_read_unlock();
-
- /*
- * We'll always have an inode attached to the buffer for completion
- * process by the time we are called from xfs_iflush(). Hence we have
- * always need to do IO completion processing to abort the inodes
- * attached to the buffer. handle them just like the shutdown case in
- * xfs_buf_submit().
- */
- ASSERT(bp->b_iodone);
- bp->b_flags |= XBF_ASYNC;
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
- xfs_buf_ioerror(bp, -EIO);
- xfs_buf_ioend(bp);
-
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(cip, false);
- kmem_free(cilist);
- xfs_perag_put(pag);
- return -EFSCORRUPTED;
-}
-
-/*
- * Flush dirty inode metadata into the backing buffer.
- *
- * The caller must have the inode lock and the inode flush lock held. The
- * inode lock will still be held upon return to the caller, and the inode
- * flush lock will be released after the inode has reached the disk.
- *
- * The caller must write out the buffer returned in *bpp and release it.
- */
-int
+static int
xfs_iflush(
struct xfs_inode *ip,
- struct xfs_buf **bpp)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buf *bp = NULL;
- struct xfs_dinode *dip;
- int error;
-
- XFS_STATS_INC(mp, xs_iflush_count);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
-
- *bpp = NULL;
-
- xfs_iunpin_wait(ip);
-
- /*
- * For stale inodes we cannot rely on the backing buffer remaining
- * stale in cache for the remaining life of the stale inode and so
- * xfs_imap_to_bp() below may give us a buffer that no longer contains
- * inodes below. We have to check this after ensuring the inode is
- * unpinned so that it is safe to reclaim the stale inode after the
- * flush call.
- */
- if (xfs_iflags_test(ip, XFS_ISTALE)) {
- xfs_ifunlock(ip);
- return 0;
- }
-
- /*
- * This may have been unpinned because the filesystem is shutting
- * down forcibly. If that's the case we must not write this inode
- * to disk, because the log record didn't make it to disk.
- *
- * We also have to remove the log item from the AIL in this case,
- * as we wait for an empty AIL as part of the unmount process.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = -EIO;
- goto abort_out;
- }
-
- /*
- * Get the buffer containing the on-disk inode. We are doing a try-lock
- * operation here, so we may get an EAGAIN error. In that case, we
- * simply want to return with the inode still dirty.
- *
- * If we get any other error, we effectively have a corruption situation
- * and we cannot flush the inode, so we treat it the same as failing
- * xfs_iflush_int().
- */
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
- 0);
- if (error == -EAGAIN) {
- xfs_ifunlock(ip);
- return error;
- }
- if (error)
- goto corrupt_out;
-
- /*
- * First flush out the inode that xfs_iflush was called with.
- */
- error = xfs_iflush_int(ip, bp);
- if (error)
- goto corrupt_out;
-
- /*
- * If the buffer is pinned then push on the log now so we won't
- * get stuck waiting in the write for too long.
- */
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
-
- /*
- * inode clustering: try to gather other inodes into this write
- *
- * Note: Any error during clustering will result in the filesystem
- * being shut down and completion callbacks run on the cluster buffer.
- * As we have already flushed and attached this inode to the buffer,
- * it has already been aborted and released by xfs_iflush_cluster() and
- * so we have no further error handling to do here.
- */
- error = xfs_iflush_cluster(ip, bp);
- if (error)
- return error;
-
- *bpp = bp;
- return 0;
-
-corrupt_out:
- if (bp)
- xfs_buf_relse(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-abort_out:
- /* abort the corrupt inode, as it was not attached to the buffer */
- xfs_iflush_abort(ip, false);
- return error;
-}
-
-/*
- * If there are inline format data / attr forks attached to this inode,
- * make sure they're not corrupt.
- */
-bool
-xfs_inode_verify_forks(
- struct xfs_inode *ip)
-{
- struct xfs_ifork *ifp;
- xfs_failaddr_t fa;
-
- fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
- return false;
- }
-
- fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops);
- if (fa) {
- ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp ? ifp->if_u1.if_data : NULL,
- ifp ? ifp->if_bytes : 0, fa);
- return false;
- }
- return true;
-}
-
-STATIC int
-xfs_iflush_int(
- struct xfs_inode *ip,
struct xfs_buf *bp)
{
struct xfs_inode_log_item *iip = ip->i_itemp;
struct xfs_dinode *dip;
struct xfs_mount *mp = ip->i_mount;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(xfs_isiflocked(ip));
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
- ASSERT(iip != NULL && iip->ili_fields != 0);
- ASSERT(ip->i_d.di_version > 1);
+ ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
+ ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
+ ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
+ ASSERT(iip->ili_item.li_buf == bp);
- /* set *dip = inode's place in the buffer */
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ /*
+ * We don't flush the inode if any of the following checks fail, but we
+ * do still update the log item and attach to the backing buffer as if
+ * the flush happened. This is a formality to facilitate predictable
+ * error handling as the caller will shutdown and fail the buffer.
+ */
+ error = -EFSCORRUPTED;
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
- goto corrupt_out;
+ goto flush_out;
}
if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %Lu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
- goto corrupt_out;
+ goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
- (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
- (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+ ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %Lu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
- goto corrupt_out;
+ goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
+ if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %Lu, "
"total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
__func__, ip->i_ino,
- ip->i_d.di_nextents + ip->i_d.di_anextents,
+ ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
ip->i_d.di_nblocks, ip);
- goto corrupt_out;
+ goto flush_out;
}
if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
- goto corrupt_out;
+ goto flush_out;
}
/*
@@ -3858,12 +3513,19 @@
* backwards compatibility with old kernels that predate logging all
* inode changes.
*/
- if (ip->i_d.di_version < 3)
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
ip->i_d.di_flushiter++;
- /* Check the inline fork data before we write out. */
- if (!xfs_inode_verify_forks(ip))
- goto corrupt_out;
+ /*
+ * If there are inline format data / attr forks attached to this inode,
+ * make sure they are not corrupt.
+ */
+ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_data(ip))
+ goto flush_out;
+ if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL &&
+ xfs_ifork_verify_local_attr(ip))
+ goto flush_out;
/*
* Copy the dirty parts of the inode into the on-disk inode. We always
@@ -3879,7 +3541,6 @@
xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
if (XFS_IFORK_Q(ip))
xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
- xfs_inobp_check(mp, bp);
/*
* We've recorded everything logged in the inode, so we'd like to clear
@@ -3892,45 +3553,144 @@
*
* What we do is move the bits to the ili_last_fields field. When
* logging the inode, these bits are moved back to the ili_fields field.
- * In the xfs_iflush_done() routine we clear ili_last_fields, since we
- * know that the information those bits represent is permanently on
+ * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since
+ * we know that the information those bits represent is permanently on
* disk. As long as the flush completes before the inode is logged
* again, then both ili_fields and ili_last_fields will be cleared.
- *
- * We can play with the ili_fields bits here, because the inode lock
- * must be held exclusively in order to set bits there and the flush
- * lock protects the ili_last_fields bits. Set ili_logged so the flush
- * done routine can tell whether or not to look in the AIL. Also, store
- * the current LSN of the inode so that we can tell whether the item has
- * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
- * need the AIL lock, because it is a 64 bit value that cannot be read
- * atomically.
*/
+ error = 0;
+flush_out:
+ spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
- iip->ili_logged = 1;
+ spin_unlock(&iip->ili_lock);
+ /*
+ * Store the current LSN of the inode so that we can tell whether the
+ * item has moved in the AIL from xfs_buf_inode_iodone().
+ */
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
- /*
- * Attach the function xfs_iflush_done to the inode's
- * buffer. This will remove the inode from the AIL
- * and unlock the inode's flush lock when the inode is
- * completely written to disk.
- */
- xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
-
/* generate the checksum. */
xfs_dinode_calc_crc(mp, dip);
+ return error;
+}
- ASSERT(!list_empty(&bp->b_li_list));
- ASSERT(bp->b_iodone != NULL);
+/*
+ * Non-blocking flush of dirty inode metadata into the backing buffer.
+ *
+ * The caller must have a reference to the inode and hold the cluster buffer
+ * locked. The function will walk across all the inodes on the cluster buffer it
+ * can find and lock without blocking, and flush them to the cluster buffer.
+ *
+ * On successful flushing of at least one inode, the caller must write out the
+ * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
+ * the caller needs to release the buffer. On failure, the filesystem will be
+ * shut down, the buffer will have been unlocked and released, and -EIO or
+ * -EFSCORRUPTED will be returned.
+ */
+int
+xfs_iflush_cluster(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_log_item *lip, *n;
+ struct xfs_inode *ip;
+ struct xfs_inode_log_item *iip;
+ int clcount = 0;
+ int error = 0;
+
+ /*
+ * We must use the safe variant here as on shutdown xfs_iflush_abort()
+ * can remove itself from the list.
+ */
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ iip = (struct xfs_inode_log_item *)lip;
+ ip = iip->ili_inode;
+
+ /*
+ * Quick and dirty check to avoid locks if possible.
+ */
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING))
+ continue;
+ if (xfs_ipincount(ip))
+ continue;
+
+ /*
+ * The inode is still attached to the buffer, which means it is
+ * dirty but reclaim might try to grab it. Check carefully for
+ * that, and grab the ilock while still holding the i_flags_lock
+ * to guarantee reclaim will not be able to reclaim this inode
+ * once we drop the i_flags_lock.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
+ if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+
+ /*
+ * ILOCK will pin the inode against reclaim and prevent
+ * concurrent transactions modifying the inode while we are
+ * flushing the inode. If we get the lock, set the flushing
+ * state before we drop the i_flags_lock.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+ spin_unlock(&ip->i_flags_lock);
+ continue;
+ }
+ __xfs_iflags_set(ip, XFS_IFLUSHING);
+ spin_unlock(&ip->i_flags_lock);
+
+ /*
+ * Abort flushing this inode if we are shut down because the
+ * inode may not currently be in the AIL. This can occur when
+ * log I/O failure unpins the inode without inserting into the
+ * AIL, leaving a dirty/unpinned inode attached to the buffer
+ * that otherwise looks like it should be flushed.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_iunpin_wait(ip);
+ xfs_iflush_abort(ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = -EIO;
+ continue;
+ }
+
+ /* don't block waiting on a log force to unpin dirty inodes */
+ if (xfs_ipincount(ip)) {
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ continue;
+ }
+
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, bp);
+ else
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (error)
+ break;
+ clcount++;
+ }
+
+ if (error) {
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ if (!clcount)
+ return -EAGAIN;
+
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
return 0;
-corrupt_out:
- return -EFSCORRUPTED;
}
/* Release an inode. */
@@ -3941,3 +3701,115 @@
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all committed transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
+
+/*
+ * Grab the exclusive iolock for a data copy from src to dest, making sure to
+ * abide vfs locking order (lowest pointer value goes first) and breaking the
+ * layout leases before proceeding. The loop is needed because we cannot call
+ * the blocking break_layout() with the iolocks held, and therefore have to
+ * back out both locks.
+ */
+static int
+xfs_iolock_two_inodes_and_break_layout(
+ struct inode *src,
+ struct inode *dest)
+{
+ int error;
+
+ if (src > dest)
+ swap(src, dest);
+
+retry:
+ /* Wait to break both inodes' layouts before we start locking. */
+ error = break_layout(src, true);
+ if (error)
+ return error;
+ if (src != dest) {
+ error = break_layout(dest, true);
+ if (error)
+ return error;
+ }
+
+ /* Lock one inode and make sure nobody got in and leased it. */
+ inode_lock(src);
+ error = break_layout(src, false);
+ if (error) {
+ inode_unlock(src);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ if (src == dest)
+ return 0;
+
+ /* Lock the other inode and make sure nobody got in and leased it. */
+ inode_lock_nested(dest, I_MUTEX_NONDIR2);
+ error = break_layout(dest, false);
+ if (error) {
+ inode_unlock(src);
+ inode_unlock(dest);
+ if (error == -EWOULDBLOCK)
+ goto retry;
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
+ * mmap activity.
+ */
+int
+xfs_ilock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ int ret;
+
+ ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
+ if (ret)
+ return ret;
+ if (ip1 == ip2)
+ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
+ else
+ xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
+ ip2, XFS_MMAPLOCK_EXCL);
+ return 0;
+}
+
+/* Unlock both inodes to allow IO and mmap activity. */
+void
+xfs_iunlock2_io_mmap(
+ struct xfs_inode *ip1,
+ struct xfs_inode *ip2)
+{
+ bool same_inode = (ip1 == ip2);
+
+ xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
+ if (!same_inode)
+ xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
+ inode_unlock(VFS_I(ip2));
+ if (!same_inode)
+ inode_unlock(VFS_I(ip1));
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 558173f..751a3d1 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -37,9 +37,6 @@
struct xfs_ifork *i_cowfp; /* copy on write extents */
struct xfs_ifork i_df; /* data fork */
- /* operations vectors */
- const struct xfs_dir_ops *d_ops; /* directory ops vector */
-
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
@@ -60,9 +57,6 @@
struct xfs_icdinode i_d; /* most of ondisk inode */
- xfs_extnum_t i_cnextents; /* # of extents in cow fork */
- unsigned int i_cformat; /* format of cow fork */
-
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -177,30 +171,11 @@
return ret;
}
-/*
- * Project quota id helpers (previously projid was 16bit only
- * and using two 16bit values to hold new 32bit projid was chosen
- * to retain compatibility with "old" filesystems).
- */
-static inline prid_t
-xfs_get_projid(struct xfs_inode *ip)
-{
- return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
-}
-
-static inline void
-xfs_set_projid(struct xfs_inode *ip,
- prid_t projid)
-{
- ip->i_d.di_projid_hi = (uint16_t) (projid >> 16);
- ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff);
-}
-
static inline prid_t
xfs_get_initial_prid(struct xfs_inode *dp)
{
if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- return xfs_get_projid(dp);
+ return dp->i_d.di_projid;
return XFS_PROJID_DEFAULT;
}
@@ -219,6 +194,18 @@
return ip->i_cowfp && ip->i_cowfp->if_bytes;
}
+static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
+{
+ return ip->i_d.di_flags2 & XFS_DIFLAG2_BIGTIME;
+}
+
+/*
+ * Return the buftarg used for data allocations on a given inode.
+ */
+#define xfs_inode_buftarg(ip) \
+ (XFS_IS_REALTIME_INODE(ip) ? \
+ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+
/*
* In-core inode flags.
*/
@@ -229,12 +216,10 @@
#define XFS_INEW (1 << __XFS_INEW_BIT)
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
-#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
-#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define XFS_IFLUSHING (1 << 7) /* inode is being flushed */
#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
-#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */
-#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */
+#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */
/*
* If this unlinked inode is in the middle of recovery, don't let drop_inode
* truncate and free the inode. This can happen if we iget the inode during
@@ -253,36 +238,6 @@
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
/*
- * Synchronize processes attempting to flush the in-core inode back to disk.
- */
-
-static inline int xfs_isiflocked(struct xfs_inode *ip)
-{
- return xfs_iflags_test(ip, XFS_IFLOCK);
-}
-
-extern void __xfs_iflock(struct xfs_inode *ip);
-
-static inline int xfs_iflock_nowait(struct xfs_inode *ip)
-{
- return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
-}
-
-static inline void xfs_iflock(struct xfs_inode *ip)
-{
- if (!xfs_iflock_nowait(ip))
- __xfs_iflock(ip);
-}
-
-static inline void xfs_ifunlock(struct xfs_inode *ip)
-{
- ASSERT(xfs_isiflocked(ip));
- xfs_iflags_clear(ip, XFS_IFLOCK);
- smp_mb();
- wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
-}
-
-/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -441,10 +396,11 @@
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
-int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
+int xfs_iflush_cluster(struct xfs_buf *);
void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode,
struct xfs_inode *ip1, uint ip1_mode);
@@ -481,6 +437,7 @@
/* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
+extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
/*
* When setting up a newly allocated inode, we need to call
@@ -511,11 +468,12 @@
/* The default CoW extent size hint. */
#define XFS_DEFAULT_COWEXTSZ_HINT 32
-bool xfs_inode_verify_forks(struct xfs_inode *ip);
-
int xfs_iunlink_init(struct xfs_perag *pag);
void xfs_iunlink_destroy(struct xfs_perag *pag);
void xfs_end_io(struct work_struct *work);
+int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index bb8f076..17e20a6 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_error.h"
#include <linux/iversion.h>
@@ -35,10 +36,10 @@
{
struct xfs_inode *ip = iip->ili_inode;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
- ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
/* worst case, doesn't subtract delalloc extents */
*nbytes += XFS_IFORK_DSIZE(ip);
@@ -76,10 +77,10 @@
{
struct xfs_inode *ip = iip->ili_inode;
- switch (ip->i_d.di_aformat) {
+ switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_d.di_anextents > 0 &&
+ ip->i_afp->if_nextents > 0 &&
ip->i_afp->if_bytes > 0) {
/* worst case, doesn't subtract unused space */
*nbytes += XFS_IFORK_ASIZE(ip);
@@ -124,7 +125,7 @@
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_log_dinode_size(ip->i_d.di_version);
+ xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
if (XFS_IFORK_Q(ip))
@@ -141,13 +142,13 @@
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
- switch (ip->i_d.di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
- ip->i_d.di_nextents > 0 &&
+ ip->i_df.if_nextents > 0 &&
ip->i_df.if_bytes > 0) {
struct xfs_bmbt_rec *p;
@@ -190,7 +191,7 @@
ip->i_df.if_bytes > 0) {
/*
* Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
+ * The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
@@ -226,18 +227,18 @@
struct xfs_inode *ip = iip->ili_inode;
size_t data_bytes;
- switch (ip->i_d.di_aformat) {
+ switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_EXTENTS:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
- ip->i_d.di_anextents > 0 &&
+ ip->i_afp->if_nextents > 0 &&
ip->i_afp->if_bytes > 0) {
struct xfs_bmbt_rec *p;
ASSERT(xfs_iext_count(ip->i_afp) ==
- ip->i_d.di_anextents);
+ ip->i_afp->if_nextents);
p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
@@ -274,7 +275,7 @@
ip->i_afp->if_bytes > 0) {
/*
* Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed to
+ * The underlying memory is guaranteed
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
@@ -294,6 +295,28 @@
}
}
+/*
+ * Convert an incore timestamp to a log timestamp. Note that the log format
+ * specifies host endian format!
+ */
+static inline xfs_ictimestamp_t
+xfs_inode_to_log_dinode_ts(
+ struct xfs_inode *ip,
+ const struct timespec64 tv)
+{
+ struct xfs_legacy_ictimestamp *lits;
+ xfs_ictimestamp_t its;
+
+ if (xfs_inode_has_bigtime(ip))
+ return xfs_inode_encode_bigtime(tv);
+
+ lits = (struct xfs_legacy_ictimestamp *)&its;
+ lits->t_sec = tv.tv_sec;
+ lits->t_nsec = tv.tv_nsec;
+
+ return its;
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -304,22 +327,17 @@
struct inode *inode = VFS_I(ip);
to->di_magic = XFS_DINODE_MAGIC;
-
- to->di_version = from->di_version;
- to->di_format = from->di_format;
- to->di_uid = from->di_uid;
- to->di_gid = from->di_gid;
- to->di_projid_lo = from->di_projid_lo;
- to->di_projid_hi = from->di_projid_hi;
+ to->di_format = xfs_ifork_format(&ip->i_df);
+ to->di_uid = i_uid_read(inode);
+ to->di_gid = i_gid_read(inode);
+ to->di_projid_lo = from->di_projid & 0xffff;
+ to->di_projid_hi = from->di_projid >> 16;
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
- to->di_atime.t_sec = inode->i_atime.tv_sec;
- to->di_atime.t_nsec = inode->i_atime.tv_nsec;
- to->di_mtime.t_sec = inode->i_mtime.tv_sec;
- to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
- to->di_ctime.t_sec = inode->i_ctime.tv_sec;
- to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+ to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
+ to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
@@ -327,10 +345,10 @@
to->di_size = from->di_size;
to->di_nblocks = from->di_nblocks;
to->di_extsize = from->di_extsize;
- to->di_nextents = from->di_nextents;
- to->di_anextents = from->di_anextents;
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = from->di_forkoff;
- to->di_aformat = from->di_aformat;
+ to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_dmevmask = from->di_dmevmask;
to->di_dmstate = from->di_dmstate;
to->di_flags = from->di_flags;
@@ -338,10 +356,10 @@
/* log a dummy value to ensure log structure is fully initialised */
to->di_next_unlinked = NULLAGINO;
- if (from->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
- to->di_crtime.t_sec = from->di_crtime.t_sec;
- to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+ to->di_crtime = xfs_inode_to_log_dinode_ts(ip, from->di_crtime);
to->di_flags2 = from->di_flags2;
to->di_cowextsize = from->di_cowextsize;
to->di_ino = ip->i_ino;
@@ -350,6 +368,7 @@
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
+ to->di_version = 2;
to->di_flushiter = from->di_flushiter;
}
}
@@ -369,7 +388,7 @@
dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
- xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
}
/*
@@ -394,8 +413,6 @@
struct xfs_log_iovec *vecp = NULL;
struct xfs_inode_log_format *ilf;
- ASSERT(ip->i_d.di_version > 1);
-
ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
ilf->ilf_type = XFS_LI_INODE;
ilf->ilf_ino = ip->i_ino;
@@ -440,6 +457,7 @@
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(lip->li_buf);
trace_xfs_inode_pin(ip, _RET_IP_);
atomic_inc(&ip->i_pincount);
@@ -451,6 +469,12 @@
* item which was previously pinned with a call to xfs_inode_item_pin().
*
* Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
+ *
+ * Note that unpin can race with inode cluster buffer freeing marking the buffer
+ * stale. In that case, flush completions are run from the buffer unpin call,
+ * which may happen before the inode is unpinned. If we lose the race, there
+ * will be no buffer attached to the log item, but the inode will be marked
+ * XFS_ISTALE.
*/
STATIC void
xfs_inode_item_unpin(
@@ -460,28 +484,12 @@
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
+ ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
-/*
- * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
- * have been failed during writeback
- *
- * This informs the AIL that the inode is already flush locked on the next push,
- * and acquires a hold on the buffer to ensure that it isn't reclaimed before
- * dirty data makes it to disk.
- */
-STATIC void
-xfs_inode_item_error(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
- xfs_set_li_failed(lip, bp);
-}
-
STATIC uint
xfs_inode_item_push(
struct xfs_log_item *lip,
@@ -495,69 +503,43 @@
uint rval = XFS_ITEM_SUCCESS;
int error;
- if (xfs_ipincount(ip) > 0)
+ ASSERT(iip->ili_item.li_buf);
+
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) ||
+ (ip->i_flags & XFS_ISTALE))
return XFS_ITEM_PINNED;
- /*
- * The buffer containing this item failed to be written back
- * previously. Resubmit the buffer for IO.
- */
- if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
- if (!xfs_buf_trylock(bp))
- return XFS_ITEM_LOCKED;
+ if (xfs_iflags_test(ip, XFS_IFLUSHING))
+ return XFS_ITEM_FLUSHING;
- if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list))
- rval = XFS_ITEM_FLUSHING;
-
- xfs_buf_unlock(bp);
- return rval;
- }
-
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+ if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
- /*
- * Re-check the pincount now that we stabilized the value by
- * taking the ilock.
- */
- if (xfs_ipincount(ip) > 0) {
- rval = XFS_ITEM_PINNED;
- goto out_unlock;
- }
-
- /*
- * Stale inode items should force out the iclog.
- */
- if (ip->i_flags & XFS_ISTALE) {
- rval = XFS_ITEM_PINNED;
- goto out_unlock;
- }
-
- /*
- * Someone else is already flushing the inode. Nothing we can do
- * here but wait for the flush to finish and remove the item from
- * the AIL.
- */
- if (!xfs_iflock_nowait(ip)) {
- rval = XFS_ITEM_FLUSHING;
- goto out_unlock;
- }
-
- ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
- ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
-
spin_unlock(&lip->li_ailp->ail_lock);
- error = xfs_iflush(ip, &bp);
+ /*
+ * We need to hold a reference for flushing the cluster buffer as it may
+ * fail the buffer without IO submission. In that case we need a
+ * reference for that completion, because otherwise we don't get a
+ * reference for IO until we queue the buffer for delwri submission.
+ */
+ xfs_buf_hold(bp);
+ error = xfs_iflush_cluster(bp);
if (!error) {
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
+ } else {
+ /*
+ * Release the buffer if we were unable to flush anything. On
+ * any other error, the buffer has already been released.
+ */
+ if (error == -EAGAIN)
+ xfs_buf_relse(bp);
+ rval = XFS_ITEM_LOCKED;
}
spin_lock(&lip->li_ailp->ail_lock);
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
return rval;
}
@@ -636,7 +618,6 @@
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
.iop_committing = xfs_inode_item_committing,
- .iop_error = xfs_inode_item_error
};
@@ -651,9 +632,11 @@
struct xfs_inode_log_item *iip;
ASSERT(ip->i_itemp == NULL);
- iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, 0);
+ iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
iip->ili_inode = ip;
+ spin_lock_init(&iip->ili_lock);
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
&xfs_inode_item_ops);
}
@@ -663,158 +646,180 @@
*/
void
xfs_inode_item_destroy(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
- kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ ASSERT(iip->ili_item.li_buf == NULL);
+
+ ip->i_itemp = NULL;
+ kmem_free(iip->ili_item.li_lv_shadow);
+ kmem_cache_free(xfs_ili_zone, iip);
}
/*
- * This is the inode flushing I/O completion routine. It is called
- * from interrupt level when the buffer containing the inode is
- * flushed to disk. It is responsible for removing the inode item
- * from the AIL if it has not been re-logged, and unlocking the inode's
- * flush lock.
- *
- * To reduce AIL lock traffic as much as possible, we scan the buffer log item
- * list for other inodes that will run this function. We remove them from the
- * buffer list so we can process all the inode IO completions in one AIL lock
- * traversal.
+ * We only want to pull the item from the AIL if it is actually there
+ * and its location in the log has not changed since we started the
+ * flush. Thus, we only bother if the inode's lsn has not changed.
*/
-void
-xfs_iflush_done(
- struct xfs_buf *bp,
- struct xfs_log_item *lip)
+static void
+xfs_iflush_ail_updates(
+ struct xfs_ail *ailp,
+ struct list_head *list)
{
- struct xfs_inode_log_item *iip;
- struct xfs_log_item *blip, *n;
- struct xfs_ail *ailp = lip->li_ailp;
- int need_ail = 0;
- LIST_HEAD(tmp);
+ struct xfs_log_item *lip;
+ xfs_lsn_t tail_lsn = 0;
- /*
- * Scan the buffer IO completions for other inodes being completed and
- * attach them to the current inode log item.
- */
+ /* this is an opencoded batch version of xfs_trans_ail_delete */
+ spin_lock(&ailp->ail_lock);
+ list_for_each_entry(lip, list, li_bio_list) {
+ xfs_lsn_t lsn;
- list_add_tail(&lip->li_bio_list, &tmp);
-
- list_for_each_entry_safe(blip, n, &bp->b_li_list, li_bio_list) {
- if (lip->li_cb != xfs_iflush_done)
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
+ if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
continue;
- list_move_tail(&blip->li_bio_list, &tmp);
- /*
- * while we have the item, do the unlocked check for needing
- * the AIL lock.
- */
- iip = INODE_ITEM(blip);
- if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
- test_bit(XFS_LI_FAILED, &blip->li_flags))
- need_ail++;
+ lsn = xfs_ail_delete_one(ailp, lip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
}
-
- /* make sure we capture the state of the initial inode. */
- iip = INODE_ITEM(lip);
- if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
- test_bit(XFS_LI_FAILED, &lip->li_flags))
- need_ail++;
-
- /*
- * We only want to pull the item from the AIL if it is
- * actually there and its location in the log has not
- * changed since we started the flush. Thus, we only bother
- * if the ili_logged flag is set and the inode's lsn has not
- * changed. First we check the lsn outside
- * the lock since it's cheaper, and then we recheck while
- * holding the lock before removing the inode from the AIL.
- */
- if (need_ail) {
- bool mlip_changed = false;
-
- /* this is an opencoded batch version of xfs_trans_ail_delete */
- spin_lock(&ailp->ail_lock);
- list_for_each_entry(blip, &tmp, li_bio_list) {
- if (INODE_ITEM(blip)->ili_logged &&
- blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
- mlip_changed |= xfs_ail_delete_one(ailp, blip);
- else {
- xfs_clear_li_failed(blip);
- }
- }
-
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
- spin_unlock(&ailp->ail_lock);
-
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
- }
-
- /*
- * clean up and unlock the flush lock now we are done. We can clear the
- * ili_last_fields bits now that we know that the data corresponding to
- * them is safely on disk.
- */
- list_for_each_entry_safe(blip, n, &tmp, li_bio_list) {
- list_del_init(&blip->li_bio_list);
- iip = INODE_ITEM(blip);
- iip->ili_logged = 0;
- iip->ili_last_fields = 0;
- xfs_ifunlock(iip->ili_inode);
- }
- list_del(&tmp);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
/*
- * This is the inode flushing abort routine. It is called from xfs_iflush when
+ * Walk the list of inodes that have completed their IOs. If they are clean
+ * remove them from the list and dissociate them from the buffer. Inodes that
+ * are still dirty remain linked to the buffer and on the list. Caller must
+ * handle them appropriately.
+ */
+static void
+xfs_iflush_finish(
+ struct xfs_buf *bp,
+ struct list_head *list)
+{
+ struct xfs_log_item *lip, *n;
+
+ list_for_each_entry_safe(lip, n, list, li_bio_list) {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ bool drop_buffer = false;
+
+ spin_lock(&iip->ili_lock);
+
+ /*
+ * Remove the reference to the cluster buffer if the inode is
+ * clean in memory and drop the buffer reference once we've
+ * dropped the locks we hold.
+ */
+ ASSERT(iip->ili_item.li_buf == bp);
+ if (!iip->ili_fields) {
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&lip->li_bio_list);
+ drop_buffer = true;
+ }
+ iip->ili_last_fields = 0;
+ iip->ili_flush_lsn = 0;
+ spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
+ if (drop_buffer)
+ xfs_buf_rele(bp);
+ }
+}
+
+/*
+ * Inode buffer IO completion routine. It is responsible for removing inodes
+ * attached to the buffer from the AIL if they have not been re-logged and
+ * completing the inode flush.
+ */
+void
+xfs_buf_inode_iodone(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip, *n;
+ LIST_HEAD(flushed_inodes);
+ LIST_HEAD(ail_updates);
+
+ /*
+ * Pull the attached inodes from the buffer one at a time and take the
+ * appropriate action on them.
+ */
+ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
+ xfs_iflush_abort(iip->ili_inode);
+ continue;
+ }
+ if (!iip->ili_last_fields)
+ continue;
+
+ /* Do an unlocked check for needing the AIL lock. */
+ if (iip->ili_flush_lsn == lip->li_lsn ||
+ test_bit(XFS_LI_FAILED, &lip->li_flags))
+ list_move_tail(&lip->li_bio_list, &ail_updates);
+ else
+ list_move_tail(&lip->li_bio_list, &flushed_inodes);
+ }
+
+ if (!list_empty(&ail_updates)) {
+ xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
+ list_splice_tail(&ail_updates, &flushed_inodes);
+ }
+
+ xfs_iflush_finish(bp, &flushed_inodes);
+ if (!list_empty(&flushed_inodes))
+ list_splice_tail(&flushed_inodes, &bp->b_li_list);
+}
+
+void
+xfs_buf_inode_io_fail(
+ struct xfs_buf *bp)
+{
+ struct xfs_log_item *lip;
+
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
+ set_bit(XFS_LI_FAILED, &lip->li_flags);
+}
+
+/*
+ * This is the inode flushing abort routine. It is called when
* the filesystem is shutting down to clean up the inode state. It is
* responsible for removing the inode item from the AIL if it has not been
- * re-logged, and unlocking the inode's flush lock.
+ * re-logged and clearing the inode's flush state.
*/
void
xfs_iflush_abort(
- xfs_inode_t *ip,
- bool stale)
+ struct xfs_inode *ip)
{
- xfs_inode_log_item_t *iip = ip->i_itemp;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_buf *bp = NULL;
if (iip) {
- if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
- xfs_trans_ail_remove(&iip->ili_item,
- stale ? SHUTDOWN_LOG_IO_ERROR :
- SHUTDOWN_CORRUPT_INCORE);
- }
- iip->ili_logged = 0;
/*
- * Clear the ili_last_fields bits now that we know that the
- * data corresponding to them is safely on disk.
+ * Clear the failed bit before removing the item from the AIL so
+ * xfs_trans_ail_delete() doesn't try to clear and release the
+ * buffer attached to the log item before we are done with it.
*/
- iip->ili_last_fields = 0;
+ clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
+ xfs_trans_ail_delete(&iip->ili_item, 0);
+
/*
* Clear the inode logging fields so no more flushes are
* attempted.
*/
+ spin_lock(&iip->ili_lock);
+ iip->ili_last_fields = 0;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
+ iip->ili_flush_lsn = 0;
+ bp = iip->ili_item.li_buf;
+ iip->ili_item.li_buf = NULL;
+ list_del_init(&iip->ili_item.li_bio_list);
+ spin_unlock(&iip->ili_lock);
}
- /*
- * Release the inode's flush lock since we're done with it.
- */
- xfs_ifunlock(ip);
-}
-
-void
-xfs_istale_done(
- struct xfs_buf *bp,
- struct xfs_log_item *lip)
-{
- xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true);
+ xfs_iflags_clear(ip, XFS_IFLUSHING);
+ if (bp)
+ xfs_buf_rele(bp);
}
/*
@@ -828,8 +833,10 @@
{
struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
- if (buf->i_len != sizeof(*in_f32))
+ if (buf->i_len != sizeof(*in_f32)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
+ }
in_f->ilf_type = in_f32->ilf_type;
in_f->ilf_size = in_f32->ilf_size;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 07a60e7..4b926e3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -13,28 +13,37 @@
struct xfs_inode;
struct xfs_mount;
-typedef struct xfs_inode_log_item {
+struct xfs_inode_log_item {
struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
- xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
- xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
- unsigned short ili_lock_flags; /* lock flags */
- unsigned short ili_logged; /* flushed logged data */
+ unsigned short ili_lock_flags; /* inode lock flags */
+ /*
+ * The ili_lock protects the interactions between the dirty state and
+ * the flush state of the inode log item. This allows us to do atomic
+ * modifications of multiple state fields without having to hold a
+ * specific inode lock to serialise them.
+ *
+ * We need atomic changes between inode dirtying, inode flushing and
+ * inode completion, but these all hold different combinations of
+ * ILOCK and IFLUSHING and hence we need some other method of
+ * serialising updates to the flush state.
+ */
+ spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
unsigned int ili_fsync_fields; /* logged since last fsync */
-} xfs_inode_log_item_t;
+ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
+};
-static inline int xfs_inode_clean(xfs_inode_t *ip)
+static inline int xfs_inode_clean(struct xfs_inode *ip)
{
return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
}
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
-extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *);
-extern void xfs_iflush_abort(struct xfs_inode *, bool);
+extern void xfs_iflush_abort(struct xfs_inode *);
extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
struct xfs_inode_log_format *);
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
new file mode 100644
index 0000000..cb44f76
--- /dev/null
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
+ &xfs_inode_buf_ra_ops);
+ } else {
+ struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
+
+ xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
+ &xfs_inode_buf_ra_ops);
+ }
+}
+
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback, we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return -ENOMEM;
+
+ /* instantiate the inode */
+ ASSERT(dip->di_version >= 3);
+
+ error = xfs_inode_from_disk(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
+static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
+{
+ return ld->di_version >= 3 &&
+ (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
+}
+
+/* Convert a log timestamp to an ondisk timestamp. */
+static inline xfs_timestamp_t
+xfs_log_dinode_to_disk_ts(
+ struct xfs_log_dinode *from,
+ const xfs_ictimestamp_t its)
+{
+ struct xfs_legacy_timestamp *lts;
+ struct xfs_legacy_ictimestamp *lits;
+ xfs_timestamp_t ts;
+
+ if (xfs_log_dinode_has_bigtime(from))
+ return cpu_to_be64(its);
+
+ lts = (struct xfs_legacy_timestamp *)&ts;
+ lits = (struct xfs_legacy_ictimestamp *)&its;
+ lts->t_sec = cpu_to_be32(lits->t_sec);
+ lts->t_nsec = cpu_to_be32(lits->t_nsec);
+
+ return ts;
+}
+
+STATIC void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ to->di_magic = cpu_to_be16(from->di_magic);
+ to->di_mode = cpu_to_be16(from->di_mode);
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_onlink = 0;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_nlink = cpu_to_be32(from->di_nlink);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+
+ to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
+ to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
+ to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+ to->di_gen = cpu_to_be32(from->di_gen);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(from->di_changecount);
+ to->di_crtime = xfs_log_dinode_to_disk_ts(from,
+ from->di_crtime);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+ to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
+ to->di_ino = cpu_to_be64(from->di_ino);
+ to->di_lsn = cpu_to_be64(from->di_lsn);
+ memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ }
+}
+
+STATIC int
+xlog_recover_inode_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_inode_log_format *in_f;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ int len;
+ char *src;
+ char *dest;
+ int error;
+ int attr_index;
+ uint fields;
+ struct xfs_log_dinode *ldip;
+ uint isize;
+ int need_free = 0;
+
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ in_f = item->ri_buf[0].i_addr;
+ } else {
+ in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+ need_free = 1;
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+ if (error)
+ goto error;
+ }
+
+ /*
+ * Inode buffers can be freed; if this one was cancelled,
+ * do not replay the inode.
+ */
+ if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) {
+ error = 0;
+ trace_xfs_log_recover_inode_cancel(log, in_f);
+ goto error;
+ }
+ trace_xfs_log_recover_inode_recover(log, in_f);
+
+ error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+ 0, &bp, &xfs_inode_buf_ops);
+ if (error)
+ goto error;
+ ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+ dip = xfs_buf_offset(bp, in_f->ilf_boffset);
+
+ /*
+ * Make sure the place we're flushing out to really looks
+ * like an inode!
+ */
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
+ xfs_alert(mp,
+ "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
+ __func__, dip, bp, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ ldip = item->ri_buf[1].i_addr;
+ if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
+ __func__, item, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
+ /*
+ * If the inode has an LSN in it, recover the inode only if that LSN is
+ * less than the lsn of the transaction we are replaying. Note: we still
+ * need to replay an owner change even though the inode is more recent
+ * than the transaction as there is no guarantee that all the btree
+ * blocks are more recent than this transaction, too.
+ */
+ if (dip->di_version >= 3) {
+ xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_owner_change;
+ }
+ }
+
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+ * more accurately by the LSN field in the V3 inode core. Don't trust
+ * the inode versions, as we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one.
+ */
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb) &&
+ ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * Deal with the wrap case: DI_MAX_FLUSH on disk counts as
+ * older than small values in the log, so replay anyway.
+ */
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ /* do nothing */
+ } else {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_release;
+ }
+ }
+
+ /* Take the opportunity to reset the flush iteration count */
+ ldip->di_flushiter = 0;
+
+ if (unlikely(S_ISREG(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
+ XFS_ERRLEVEL_LOW, mp, ldip,
+ sizeof(*ldip));
+ xfs_alert(mp,
+ "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
+ "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ } else if (unlikely(S_ISDIR(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+ (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
+ XFS_ERRLEVEL_LOW, mp, ldip,
+ sizeof(*ldip));
+ xfs_alert(mp,
+ "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
+ "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ }
+ if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
+ XFS_ERRLEVEL_LOW, mp, ldip,
+ sizeof(*ldip));
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
+ "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino,
+ ldip->di_nextents + ldip->di_anextents,
+ ldip->di_nblocks);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
+ XFS_ERRLEVEL_LOW, mp, ldip,
+ sizeof(*ldip));
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
+ "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
+ item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ isize = xfs_log_dinode_size(mp);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
+ XFS_ERRLEVEL_LOW, mp, ldip,
+ sizeof(*ldip));
+ xfs_alert(mp,
+ "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
+ __func__, item->ri_buf[1].i_len, item);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+
+ /* recover the log dinode into the on disk inode */
+ xfs_log_dinode_to_disk(ldip, dip);
+
+ fields = in_f->ilf_fields;
+ if (fields & XFS_ILOG_DEV)
+ xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
+
+ if (in_f->ilf_size == 2)
+ goto out_owner_change;
+ len = item->ri_buf[2].i_len;
+ src = item->ri_buf[2].i_addr;
+ ASSERT(in_f->ilf_size <= 4);
+ ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+ ASSERT(!(fields & XFS_ILOG_DFORK) ||
+ (len == in_f->ilf_dsize));
+
+ switch (fields & XFS_ILOG_DFORK) {
+ case XFS_ILOG_DDATA:
+ case XFS_ILOG_DEXT:
+ memcpy(XFS_DFORK_DPTR(dip), src, len);
+ break;
+
+ case XFS_ILOG_DBROOT:
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+ (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
+ XFS_DFORK_DSIZE(dip, mp));
+ break;
+
+ default:
+ /*
+ * There are no data fork flags set.
+ */
+ ASSERT((fields & XFS_ILOG_DFORK) == 0);
+ break;
+ }
+
+ /*
+ * If we logged any attribute data, recover it. There may or
+ * may not have been any other non-core data logged in this
+ * transaction.
+ */
+ if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+ attr_index = 3;
+ } else {
+ attr_index = 2;
+ }
+ len = item->ri_buf[attr_index].i_len;
+ src = item->ri_buf[attr_index].i_addr;
+ ASSERT(len == in_f->ilf_asize);
+
+ switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+ case XFS_ILOG_ADATA:
+ case XFS_ILOG_AEXT:
+ dest = XFS_DFORK_APTR(dip);
+ ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+ memcpy(dest, src, len);
+ break;
+
+ case XFS_ILOG_ABROOT:
+ dest = XFS_DFORK_APTR(dip);
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+ len, (struct xfs_bmdr_block *)dest,
+ XFS_DFORK_ASIZE(dip, mp));
+ break;
+
+ default:
+ xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_release;
+ }
+ }
+
+out_owner_change:
+ /* Recover the swapext owner change unless inode has been deleted */
+ if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
+ (dip->di_mode != 0))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
+ ASSERT(bp->b_mount == mp);
+ bp->b_flags |= _XBF_LOGRECOVERY;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
+error:
+ if (need_free)
+ kmem_free(in_f);
+ return error;
+}
+
+const struct xlog_recover_item_ops xlog_inode_item_ops = {
+ .item_type = XFS_LI_INODE,
+ .ra_pass2 = xlog_recover_inode_ra_pass2,
+ .commit_pass2 = xlog_recover_inode_commit_pass2,
+};
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index b3021d9..646735a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -33,6 +33,10 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_reflink.h"
+#include "xfs_ioctl.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -290,138 +294,173 @@
return error;
}
-int
-xfs_set_dmattrs(
- xfs_inode_t *ip,
- uint evmask,
- uint16_t state)
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+static void
+xfs_ioc_attr_put_listent(
+ struct xfs_attr_list_context *context,
+ int flags,
+ unsigned char *name,
+ int namelen,
+ int valuelen)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_trans_t *tp;
- int error;
+ struct xfs_attrlist *alist = context->buffer;
+ struct xfs_attrlist_ent *aep;
+ int arraytop;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ASSERT(!context->seen_enough);
+ ASSERT(context->count >= 0);
+ ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+ ASSERT(context->firstu >= sizeof(*alist));
+ ASSERT(context->firstu <= context->bufsize);
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
- if (error)
- return error;
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ /* decrement by the actual bytes used by the attr */
+ context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) +
+ namelen + 1, sizeof(uint32_t));
+ if (context->firstu < arraytop) {
+ trace_xfs_attr_list_full(context);
+ alist->al_more = 1;
+ context->seen_enough = 1;
+ return;
+ }
- ip->i_d.di_dmevmask = evmask;
- ip->i_d.di_dmstate = state;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp);
-
- return error;
+ aep = context->buffer + context->firstu;
+ aep->a_valuelen = valuelen;
+ memcpy(aep->a_name, name, namelen);
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
+ trace_xfs_attr_list_add(context);
}
-STATIC int
-xfs_fssetdm_by_handle(
- struct file *parfilp,
- void __user *arg)
+static unsigned int
+xfs_attr_filter(
+ u32 ioc_flags)
{
- int error;
- struct fsdmidata fsd;
- xfs_fsop_setdm_handlereq_t dmhreq;
- struct dentry *dentry;
+ if (ioc_flags & XFS_IOC_ATTR_ROOT)
+ return XFS_ATTR_ROOT;
+ if (ioc_flags & XFS_IOC_ATTR_SECURE)
+ return XFS_ATTR_SECURE;
+ return 0;
+}
- if (!capable(CAP_MKNOD))
- return -EPERM;
- if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
+static unsigned int
+xfs_attr_flags(
+ u32 ioc_flags)
+{
+ if (ioc_flags & XFS_IOC_ATTR_CREATE)
+ return XATTR_CREATE;
+ if (ioc_flags & XFS_IOC_ATTR_REPLACE)
+ return XATTR_REPLACE;
+ return 0;
+}
+
+int
+xfs_ioc_attr_list(
+ struct xfs_inode *dp,
+ void __user *ubuf,
+ int bufsize,
+ int flags,
+ struct xfs_attrlist_cursor __user *ucursor)
+{
+ struct xfs_attr_list_context context = { };
+ struct xfs_attrlist *alist;
+ void *buffer;
+ int error;
+
+ if (bufsize < sizeof(struct xfs_attrlist) ||
+ bufsize > XFS_XATTR_LIST_MAX)
+ return -EINVAL;
+
+ /*
+ * Reject flags, only allow namespaces.
+ */
+ if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+ if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ /*
+ * Validate the cursor.
+ */
+ if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor)))
return -EFAULT;
+ if (context.cursor.pad1 || context.cursor.pad2)
+ return -EINVAL;
+ if (!context.cursor.initted &&
+ (context.cursor.hashval || context.cursor.blkno ||
+ context.cursor.offset))
+ return -EINVAL;
- error = mnt_want_write_file(parfilp);
+ buffer = kvzalloc(bufsize, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ /*
+ * Initialize the output buffer.
+ */
+ context.dp = dp;
+ context.resynch = 1;
+ context.attr_filter = xfs_attr_filter(flags);
+ context.buffer = buffer;
+ context.bufsize = round_down(bufsize, sizeof(uint32_t));
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_ioc_attr_put_listent;
+
+ alist = context.buffer;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
+
+ error = xfs_attr_list(&context);
if (error)
- return error;
+ goto out_free;
- dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
- if (IS_ERR(dentry)) {
- mnt_drop_write_file(parfilp);
- return PTR_ERR(dentry);
- }
-
- if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
- error = -EPERM;
- goto out;
- }
-
- if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
+ if (copy_to_user(ubuf, buffer, bufsize) ||
+ copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
error = -EFAULT;
- goto out;
- }
-
- error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
- fsd.fsd_dmstate);
-
- out:
- mnt_drop_write_file(parfilp);
- dput(dentry);
+out_free:
+ kmem_free(buffer);
return error;
}
STATIC int
xfs_attrlist_by_handle(
struct file *parfilp,
- void __user *arg)
+ struct xfs_fsop_attrlist_handlereq __user *p)
{
- int error = -ENOMEM;
- attrlist_cursor_kern_t *cursor;
- struct xfs_fsop_attrlist_handlereq __user *p = arg;
- xfs_fsop_attrlist_handlereq_t al_hreq;
+ struct xfs_fsop_attrlist_handlereq al_hreq;
struct dentry *dentry;
- char *kbuf;
+ int error = -ENOMEM;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
return -EFAULT;
- if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XFS_XATTR_LIST_MAX)
- return -EINVAL;
-
- /*
- * Reject flags, only allow namespaces.
- */
- if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
- return -EINVAL;
dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
- if (!kbuf)
- goto out_dput;
-
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
- al_hreq.flags, cursor);
- if (error)
- goto out_kfree;
-
- if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
- error = -EFAULT;
- goto out_kfree;
- }
-
- if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
- error = -EFAULT;
-
-out_kfree:
- kmem_free(kbuf);
-out_dput:
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer,
+ al_hreq.buflen, al_hreq.flags, &p->pos);
dput(dentry);
return error;
}
-int
+static int
xfs_attrmulti_attr_get(
struct inode *inode,
unsigned char *name,
@@ -429,28 +468,33 @@
uint32_t *len,
uint32_t flags)
{
- unsigned char *kbuf;
- int error = -EFAULT;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ .valuelen = *len,
+ };
+ int error;
if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
- kbuf = kmem_zalloc_large(*len, 0);
- if (!kbuf)
- return -ENOMEM;
- error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags);
+ error = xfs_attr_get(&args);
if (error)
goto out_kfree;
- if (copy_to_user(ubuf, kbuf, *len))
+ *len = args.valuelen;
+ if (copy_to_user(ubuf, args.value, args.valuelen))
error = -EFAULT;
out_kfree:
- kmem_free(kbuf);
+ kmem_free(args.value);
return error;
}
-int
+static int
xfs_attrmulti_attr_set(
struct inode *inode,
unsigned char *name,
@@ -458,38 +502,75 @@
uint32_t len,
uint32_t flags)
{
- unsigned char *kbuf;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = xfs_attr_filter(flags),
+ .attr_flags = xfs_attr_flags(flags),
+ .name = name,
+ .namelen = strlen(name),
+ };
int error;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XFS_XATTR_SIZE_MAX)
- return -EINVAL;
- kbuf = memdup_user(ubuf, len);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
+ if (ubuf) {
+ if (len > XFS_XATTR_SIZE_MAX)
+ return -EINVAL;
+ args.value = memdup_user(ubuf, len);
+ if (IS_ERR(args.value))
+ return PTR_ERR(args.value);
+ args.valuelen = len;
+ }
- error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
- if (!error)
- xfs_forget_acl(inode, name, flags);
- kfree(kbuf);
+ error = xfs_attr_set(&args);
+ if (!error && (flags & XFS_IOC_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
+ kfree(args.value);
return error;
}
int
-xfs_attrmulti_attr_remove(
+xfs_ioc_attrmulti_one(
+ struct file *parfilp,
struct inode *inode,
- unsigned char *name,
+ uint32_t opcode,
+ void __user *uname,
+ void __user *value,
+ uint32_t *len,
uint32_t flags)
{
+ unsigned char *name;
int error;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- error = xfs_attr_remove(XFS_I(inode), name, flags);
- if (!error)
- xfs_forget_acl(inode, name, flags);
+ if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE))
+ return -EINVAL;
+
+ name = strndup_user(uname, MAXNAMELEN);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ switch (opcode) {
+ case ATTR_OP_GET:
+ error = xfs_attrmulti_attr_get(inode, name, value, len, flags);
+ break;
+ case ATTR_OP_REMOVE:
+ value = NULL;
+ *len = 0;
+ /* fall through */
+ case ATTR_OP_SET:
+ error = mnt_want_write_file(parfilp);
+ if (error)
+ break;
+ error = xfs_attrmulti_attr_set(inode, name, value, *len, flags);
+ mnt_drop_write_file(parfilp);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ kfree(name);
return error;
}
@@ -503,7 +584,6 @@
xfs_fsop_attrmulti_handlereq_t am_hreq;
struct dentry *dentry;
unsigned int i, size;
- unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -529,58 +609,17 @@
goto out_dput;
}
- error = -ENOMEM;
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
- ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
-
- ops[i].am_error = strncpy_from_user((char *)attr_name,
- ops[i].am_attrname, MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(
- d_inode(dentry), attr_name,
- ops[i].am_attrvalue, &ops[i].am_length,
- ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_set(
- d_inode(dentry), attr_name,
- ops[i].am_attrvalue, ops[i].am_length,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_remove(
- d_inode(dentry), attr_name,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- ops[i].am_error = -EINVAL;
- }
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
+ ops[i].am_attrname, ops[i].am_attrvalue,
+ &ops[i].am_length, ops[i].am_flags);
}
if (copy_to_user(am_hreq.ops, ops, size))
error = -EFAULT;
- kfree(attr_name);
- out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
@@ -590,13 +629,12 @@
int
xfs_ioc_space(
struct file *filp,
- unsigned int cmd,
xfs_flock64_t *bf)
{
struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
- enum xfs_prealloc_flags flags = 0;
+ enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
@@ -609,6 +647,9 @@
if (!S_ISREG(inode->i_mode))
return -EINVAL;
+ if (xfs_is_always_cow_inode(ip))
+ return -EOPNOTSUPP;
+
if (filp->f_flags & O_DSYNC)
flags |= XFS_PREALLOC_SYNC;
if (filp->f_mode & FMODE_NOCMTIME)
@@ -622,6 +663,7 @@
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;
+ inode_dio_wait(inode);
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
@@ -637,98 +679,22 @@
goto out_unlock;
}
- /*
- * length of <= 0 for resv/unresv/zero is invalid. length for
- * alloc/free is ignored completely and we have no idea what userspace
- * might have set it to, so set it to zero to allow range
- * checks to pass.
- */
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- if (bf->l_len <= 0) {
- error = -EINVAL;
- goto out_unlock;
- }
- break;
- default:
- bf->l_len = 0;
- break;
- }
-
- if (bf->l_start < 0 ||
- bf->l_start > inode->i_sb->s_maxbytes ||
- bf->l_start + bf->l_len < 0 ||
- bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
+ if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) {
error = -EINVAL;
goto out_unlock;
}
- /*
- * Must wait for all AIO to complete before we continue as AIO can
- * change the file size on completion without holding any locks we
- * currently hold. We must do this first because AIO can update both
- * the on disk and in memory inode sizes, and the operations that follow
- * require the in-memory size to be fully up-to-date.
- */
- inode_dio_wait(inode);
-
- /*
- * Now that AIO and DIO has drained we can flush and (if necessary)
- * invalidate the cached range over the first operation we are about to
- * run. We include zero range here because it starts with a hole punch
- * over the target range.
- */
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- error = xfs_flush_unmap_range(ip, bf->l_start, bf->l_len);
+ if (bf->l_start > XFS_ISIZE(ip)) {
+ error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
+ bf->l_start - XFS_ISIZE(ip),
+ XFS_BMAPI_PREALLOC);
if (error)
goto out_unlock;
- break;
}
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- flags |= XFS_PREALLOC_SET;
- error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
- break;
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- flags |= XFS_PREALLOC_SET;
- error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
- XFS_BMAPI_PREALLOC);
- break;
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- error = xfs_free_file_space(ip, bf->l_start, bf->l_len);
- break;
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP:
- case XFS_IOC_FREESP64:
- flags |= XFS_PREALLOC_CLEAR;
- if (bf->l_start > XFS_ISIZE(ip)) {
- error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
- bf->l_start - XFS_ISIZE(ip), 0);
- if (error)
- goto out_unlock;
- }
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = bf->l_start;
-
- error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
- }
-
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = bf->l_start;
+ error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
if (error)
goto out_unlock;
@@ -1110,13 +1076,18 @@
xflags |= FS_XFLAG_NODUMP;
else
xflags &= ~FS_XFLAG_NODUMP;
+ if (flags & FS_DAX_FL)
+ xflags |= FS_XFLAG_DAX;
+ else
+ xflags &= ~FS_XFLAG_DAX;
return xflags;
}
STATIC unsigned int
xfs_di2lxflags(
- uint16_t di_flags)
+ uint16_t di_flags,
+ uint64_t di_flags2)
{
unsigned int flags = 0;
@@ -1130,6 +1101,9 @@
flags |= FS_NOATIME_FL;
if (di_flags & XFS_DIFLAG_NODUMP)
flags |= FS_NODUMP_FL;
+ if (di_flags2 & XFS_DIFLAG2_DAX) {
+ flags |= FS_DAX_FL;
+ }
return flags;
}
@@ -1139,26 +1113,17 @@
bool attr,
struct fsxattr *fa)
{
+ struct xfs_ifork *ifp = attr ? ip->i_afp : &ip->i_df;
+
simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
ip->i_mount->m_sb.sb_blocklog;
- fa->fsx_projid = xfs_get_projid(ip);
-
- if (attr) {
- if (ip->i_afp) {
- if (ip->i_afp->if_flags & XFS_IFEXTENTS)
- fa->fsx_nextents = xfs_iext_count(ip->i_afp);
- else
- fa->fsx_nextents = ip->i_d.di_anextents;
- } else
- fa->fsx_nextents = 0;
- } else {
- if (ip->i_df.if_flags & XFS_IFEXTENTS)
- fa->fsx_nextents = xfs_iext_count(&ip->i_df);
- else
- fa->fsx_nextents = ip->i_d.di_nextents;
- }
+ fa->fsx_projid = ip->i_d.di_projid;
+ if (ifp && (ifp->if_flags & XFS_IFEXTENTS))
+ fa->fsx_nextents = xfs_iext_count(ifp);
+ else
+ fa->fsx_nextents = xfs_ifork_nextents(ifp);
}
STATIC int
@@ -1226,7 +1191,8 @@
unsigned int xflags)
{
uint64_t di_flags2 =
- (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+ (ip->i_d.di_flags2 & (XFS_DIFLAG2_REFLINK |
+ XFS_DIFLAG2_BIGTIME));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
@@ -1236,37 +1202,6 @@
return di_flags2;
}
-STATIC void
-xfs_diflags_to_linux(
- struct xfs_inode *ip)
-{
- struct inode *inode = VFS_I(ip);
- unsigned int xflags = xfs_ip2xflags(ip);
-
- if (xflags & FS_XFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (xflags & FS_XFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (xflags & FS_XFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (xflags & FS_XFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
-#if 0 /* disabled until the flag switching races are sorted out */
- if (xflags & FS_XFLAG_DAX)
- inode->i_flags |= S_DAX;
- else
- inode->i_flags &= ~S_DAX;
-#endif
-}
-
static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
@@ -1277,7 +1212,7 @@
uint64_t di_flags2;
/* Can't change realtime flag if any extents are allocated. */
- if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+ if ((ip->i_df.if_nextents || ip->i_delayed_blks) &&
XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
return -EINVAL;
@@ -1298,78 +1233,39 @@
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- if (di_flags2 && ip->i_d.di_version < 3)
+ if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb))
return -EINVAL;
ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
ip->i_d.di_flags2 = di_flags2;
- xfs_diflags_to_linux(ip);
+ xfs_diflags_to_iflags(ip, false);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
return 0;
}
-/*
- * If we are changing DAX flags, we have to ensure the file is clean and any
- * cached objects in the address space are invalidated and removed. This
- * requires us to lock out other IO and page faults similar to a truncate
- * operation. The locks need to be held until the transaction has been committed
- * so that the cache invalidation is atomic with respect to the DAX flag
- * manipulation.
- */
-static int
-xfs_ioctl_setattr_dax_invalidate(
+static void
+xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fsxattr *fa,
- int *join_flags)
+ struct fsxattr *fa)
{
- struct inode *inode = VFS_I(ip);
- struct super_block *sb = inode->i_sb;
- int error;
-
- *join_flags = 0;
-
- /*
- * It is only valid to set the DAX flag on regular files and
- * directories on filesystems where the block size is equal to the page
- * size. On directories it serves as an inherited hint so we don't
- * have to check the device for dax support or flush pagecache.
- */
- if (fa->fsx_xflags & FS_XFLAG_DAX) {
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
- return -EINVAL;
- if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
- sb->s_blocksize))
- return -EINVAL;
- }
-
- /* If the DAX state is not changing, we have nothing to do here. */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
- return 0;
- if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
- return 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
if (S_ISDIR(inode->i_mode))
- return 0;
+ return;
- /* lock, flush and invalidate mapping in preparation for flag change */
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
- error = filemap_write_and_wait(inode->i_mapping);
- if (error)
- goto out_unlock;
- error = invalidate_inode_pages2(inode->i_mapping);
- if (error)
- goto out_unlock;
+ if ((mp->m_flags & XFS_MOUNT_DAX_ALWAYS) ||
+ (mp->m_flags & XFS_MOUNT_DAX_NEVER))
+ return;
- *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
- return 0;
-
-out_unlock:
- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
- return error;
-
+ if (((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) ||
+ (!(fa->fsx_xflags & FS_XFLAG_DAX) &&
+ (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)))
+ d_mark_dontcache(inode);
}
/*
@@ -1377,17 +1273,10 @@
* have permission to do so. On success, return a clean transaction and the
* inode locked exclusively ready for further operation specific checks. On
* failure, return an error without modifying or locking the inode.
- *
- * The inode might already be IO locked on call. If this is the case, it is
- * indicated in @join_flags and we take full responsibility for ensuring they
- * are unlocked from now on. Hence if we have an error here, we still have to
- * unlock them. Otherwise, once they are joined to the transaction, they will
- * be unlocked on commit/cancel.
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct xfs_inode *ip,
- int join_flags)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -1404,8 +1293,7 @@
goto out_unlock;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
- join_flags = 0;
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
* CAP_FOWNER overrides the following restrictions:
@@ -1426,8 +1314,6 @@
out_cancel:
xfs_trans_cancel(tp);
out_unlock:
- if (join_flags)
- xfs_iunlock(ip, join_flags);
return ERR_PTR(error);
}
@@ -1456,7 +1342,7 @@
xfs_extlen_t size;
xfs_fsblock_t extsize_fsb;
- if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
+ if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
@@ -1509,8 +1395,7 @@
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
- if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) ||
- ip->i_d.di_version != 3)
+ if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb))
return -EINVAL;
if (fa->fsx_cowextsize == 0)
@@ -1550,11 +1435,9 @@
struct fsxattr old_fa;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- struct xfs_dquot *udqp = NULL;
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
- int join_flags = 0;
trace_xfs_ioctl_setattr(ip);
@@ -1571,33 +1454,24 @@
* because the i_*dquot fields will get updated anyway.
*/
if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
- ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
+ code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ VFS_I(ip)->i_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp);
if (code)
return code;
}
- /*
- * Changing DAX config may require inode locking for mapping
- * invalidation. These need to be held all the way to transaction commit
- * or cancel time, so need to be passed through to
- * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
- * appropriately.
- */
- code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
- if (code)
- goto error_free_dquots;
+ xfs_ioctl_setattr_prepare_dax(ip, fa);
- tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
+ tp = xfs_ioctl_setattr_get_trans(ip);
if (IS_ERR(tp)) {
code = PTR_ERR(tp);
goto error_free_dquots;
}
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- xfs_get_projid(ip) != fa->fsx_projid) {
- code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
+ ip->i_d.di_projid != fa->fsx_projid) {
+ code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp,
capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
if (code) /* out of quota */
goto error_trans_cancel;
@@ -1633,13 +1507,12 @@
VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
- if (xfs_get_projid(ip) != fa->fsx_projid) {
+ if (ip->i_d.di_projid != fa->fsx_projid) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
- ASSERT(ip->i_d.di_version > 1);
- xfs_set_projid(ip, fa->fsx_projid);
+ ip->i_d.di_projid = fa->fsx_projid;
}
/*
@@ -1651,7 +1524,7 @@
ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
else
ip->i_d.di_extsize = 0;
- if (ip->i_d.di_version == 3 &&
+ if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
(ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
mp->m_sb.sb_blocklog;
@@ -1664,7 +1537,6 @@
* Release any dquot(s) the inode had kept before chown.
*/
xfs_qm_dqrele(olddquot);
- xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
return code;
@@ -1672,7 +1544,6 @@
error_trans_cancel:
xfs_trans_cancel(tp);
error_free_dquots:
- xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
return code;
}
@@ -1704,7 +1575,7 @@
{
unsigned int flags;
- flags = xfs_di2lxflags(ip->i_d.di_flags);
+ flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_flags2);
if (copy_to_user(arg, &flags, sizeof(flags)))
return -EFAULT;
return 0;
@@ -1720,7 +1591,6 @@
struct fsxattr fa;
struct fsxattr old_fa;
unsigned int flags;
- int join_flags = 0;
int error;
if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1728,7 +1598,7 @@
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
- FS_SYNC_FL))
+ FS_SYNC_FL | FS_DAX_FL))
return -EOPNOTSUPP;
fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
@@ -1737,18 +1607,9 @@
if (error)
return error;
- /*
- * Changing DAX config may require inode locking for mapping
- * invalidation. These need to be held all the way to transaction commit
- * or cancel time, so need to be passed through to
- * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
- * appropriately.
- */
- error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
- if (error)
- goto out_drop_write;
+ xfs_ioctl_setattr_prepare_dax(ip, &fa);
- tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
+ tp = xfs_ioctl_setattr_get_trans(ip);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
@@ -1831,7 +1692,7 @@
if (bmx.bmv_count > ULONG_MAX / recsize)
return -ENOMEM;
- buf = kmem_zalloc_large(bmx.bmv_count * sizeof(*buf), 0);
+ buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -2172,6 +2033,41 @@
return error;
}
+static inline int
+xfs_fs_eofblocks_from_user(
+ struct xfs_fs_eofblocks *src,
+ struct xfs_eofblocks *dst)
+{
+ if (src->eof_version != XFS_EOFBLOCKS_VERSION)
+ return -EINVAL;
+
+ if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return -EINVAL;
+
+ if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
+ memchr_inv(src->pad64, 0, sizeof(src->pad64)))
+ return -EINVAL;
+
+ dst->eof_flags = src->eof_flags;
+ dst->eof_prid = src->eof_prid;
+ dst->eof_min_file_size = src->eof_min_file_size;
+
+ dst->eof_uid = INVALID_UID;
+ if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+ dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+ if (!uid_valid(dst->eof_uid))
+ return -EINVAL;
+ }
+
+ dst->eof_gid = INVALID_GID;
+ if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+ dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+ if (!gid_valid(dst->eof_gid))
+ return -EINVAL;
+ }
+ return 0;
+}
+
/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
@@ -2201,24 +2097,17 @@
return xfs_ioc_setlabel(filp, mp, arg);
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- case XFS_IOC_ZERO_RANGE: {
+ case XFS_IOC_FREESP64: {
xfs_flock64_t bf;
if (copy_from_user(&bf, arg, sizeof(bf)))
return -EFAULT;
- return xfs_ioc_space(filp, cmd, &bf);
+ return xfs_ioc_space(filp, &bf);
}
case XFS_IOC_DIOINFO: {
- struct dioattr da;
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct dioattr da;
da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
@@ -2262,22 +2151,6 @@
case XFS_IOC_SETXFLAGS:
return xfs_ioc_setxflags(ip, filp, arg);
- case XFS_IOC_FSSETDM: {
- struct fsdmidata dmi;
-
- if (copy_from_user(&dmi, arg, sizeof(dmi)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
- dmi.fsd_dmstate);
- mnt_drop_write_file(filp);
- return error;
- }
-
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
case XFS_IOC_GETBMAPX:
@@ -2305,8 +2178,6 @@
return -EFAULT;
return xfs_open_by_handle(filp, &hreq);
}
- case XFS_IOC_FSSETDM_BY_HANDLE:
- return xfs_fssetdm_by_handle(filp, arg);
case XFS_IOC_READLINK_BY_HANDLE: {
xfs_fsop_handlereq_t hreq;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 654c0bb..bab6a5a 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -6,10 +6,14 @@
#ifndef __XFS_IOCTL_H__
#define __XFS_IOCTL_H__
+struct xfs_bstat;
+struct xfs_ibulk;
+struct xfs_inogrp;
+
+
extern int
xfs_ioc_space(
struct file *filp,
- unsigned int cmd,
xfs_flock64_t *bf);
int
@@ -31,27 +35,11 @@
struct file *parfilp,
xfs_fsop_handlereq_t *hreq);
-extern int
-xfs_attrmulti_attr_get(
- struct inode *inode,
- unsigned char *name,
- unsigned char __user *ubuf,
- uint32_t *len,
- uint32_t flags);
-
-extern int
-xfs_attrmulti_attr_set(
- struct inode *inode,
- unsigned char *name,
- const unsigned char __user *ubuf,
- uint32_t len,
- uint32_t flags);
-
-extern int
-xfs_attrmulti_attr_remove(
- struct inode *inode,
- unsigned char *name,
- uint32_t flags);
+int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
+ uint32_t opcode, void __user *uname, void __user *value,
+ uint32_t *len, uint32_t flags);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize,
+ int flags, struct xfs_attrlist_cursor __user *ucursor);
extern struct dentry *
xfs_handle_to_dentry(
@@ -71,16 +59,6 @@
unsigned int cmd,
unsigned long arg);
-extern int
-xfs_set_dmattrs(
- struct xfs_inode *ip,
- uint evmask,
- uint16_t state);
-
-struct xfs_ibulk;
-struct xfs_bstat;
-struct xfs_inogrp;
-
int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
const struct xfs_bulkstat *bstat);
int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index e61cc41..c1771e7 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -107,7 +107,7 @@
xfs_bstime_t *bstime,
compat_xfs_bstime_t __user *bstime32)
{
- compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
+ old_time32_t sec32; /* tv_sec differs on 64 vs. 32 */
if (get_user(sec32, &bstime32->tv_sec) ||
get_user(bstime->tv_nsec, &bstime32->tv_nsec))
@@ -352,56 +352,24 @@
STATIC int
xfs_compat_attrlist_by_handle(
struct file *parfilp,
- void __user *arg)
+ compat_xfs_fsop_attrlist_handlereq_t __user *p)
{
- int error;
- attrlist_cursor_kern_t *cursor;
- compat_xfs_fsop_attrlist_handlereq_t __user *p = arg;
compat_xfs_fsop_attrlist_handlereq_t al_hreq;
struct dentry *dentry;
- char *kbuf;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (copy_from_user(&al_hreq, arg,
- sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+ if (copy_from_user(&al_hreq, p, sizeof(al_hreq)))
return -EFAULT;
- if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XFS_XATTR_LIST_MAX)
- return -EINVAL;
-
- /*
- * Reject flags, only allow namespaces.
- */
- if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
- return -EINVAL;
dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- error = -ENOMEM;
- kbuf = kmem_zalloc_large(al_hreq.buflen, 0);
- if (!kbuf)
- goto out_dput;
-
- cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
- al_hreq.flags, cursor);
- if (error)
- goto out_kfree;
-
- if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) {
- error = -EFAULT;
- goto out_kfree;
- }
-
- if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
- error = -EFAULT;
-
-out_kfree:
- kmem_free(kbuf);
-out_dput:
+ error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)),
+ compat_ptr(al_hreq.buffer), al_hreq.buflen,
+ al_hreq.flags, &p->pos);
dput(dentry);
return error;
}
@@ -416,7 +384,6 @@
compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
struct dentry *dentry;
unsigned int i, size;
- unsigned char *attr_name;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -443,103 +410,24 @@
goto out_dput;
}
- error = -ENOMEM;
- attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
- if (!attr_name)
- goto out_kfree_ops;
-
error = 0;
for (i = 0; i < am_hreq.opcount; i++) {
- ops[i].am_flags &= ~ATTR_KERNEL_FLAGS;
-
- ops[i].am_error = strncpy_from_user((char *)attr_name,
+ ops[i].am_error = xfs_ioc_attrmulti_one(parfilp,
+ d_inode(dentry), ops[i].am_opcode,
compat_ptr(ops[i].am_attrname),
- MAXNAMELEN);
- if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
- if (ops[i].am_error < 0)
- break;
-
- switch (ops[i].am_opcode) {
- case ATTR_OP_GET:
- ops[i].am_error = xfs_attrmulti_attr_get(
- d_inode(dentry), attr_name,
- compat_ptr(ops[i].am_attrvalue),
- &ops[i].am_length, ops[i].am_flags);
- break;
- case ATTR_OP_SET:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_set(
- d_inode(dentry), attr_name,
- compat_ptr(ops[i].am_attrvalue),
- ops[i].am_length, ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- case ATTR_OP_REMOVE:
- ops[i].am_error = mnt_want_write_file(parfilp);
- if (ops[i].am_error)
- break;
- ops[i].am_error = xfs_attrmulti_attr_remove(
- d_inode(dentry), attr_name,
- ops[i].am_flags);
- mnt_drop_write_file(parfilp);
- break;
- default:
- ops[i].am_error = -EINVAL;
- }
+ compat_ptr(ops[i].am_attrvalue),
+ &ops[i].am_length, ops[i].am_flags);
}
if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
error = -EFAULT;
- kfree(attr_name);
- out_kfree_ops:
kfree(ops);
out_dput:
dput(dentry);
return error;
}
-STATIC int
-xfs_compat_fssetdm_by_handle(
- struct file *parfilp,
- void __user *arg)
-{
- int error;
- struct fsdmidata fsd;
- compat_xfs_fsop_setdm_handlereq_t dmhreq;
- struct dentry *dentry;
-
- if (!capable(CAP_MKNOD))
- return -EPERM;
- if (copy_from_user(&dmhreq, arg,
- sizeof(compat_xfs_fsop_setdm_handlereq_t)))
- return -EFAULT;
-
- dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
- error = -EPERM;
- goto out;
- }
-
- if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
- error = -EFAULT;
- goto out;
- }
-
- error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
- fsd.fsd_dmstate);
-
-out:
- dput(dentry);
- return error;
-}
-
long
xfs_file_compat_ioctl(
struct file *filp,
@@ -559,18 +447,13 @@
case XFS_IOC_ALLOCSP_32:
case XFS_IOC_FREESP_32:
case XFS_IOC_ALLOCSP64_32:
- case XFS_IOC_FREESP64_32:
- case XFS_IOC_RESVSP_32:
- case XFS_IOC_UNRESVSP_32:
- case XFS_IOC_RESVSP64_32:
- case XFS_IOC_UNRESVSP64_32:
- case XFS_IOC_ZERO_RANGE_32: {
+ case XFS_IOC_FREESP64_32: {
struct xfs_flock64 bf;
if (xfs_compat_flock64_copyin(&bf, arg))
return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(filp, cmd, &bf);
+ return xfs_ioc_space(filp, &bf);
}
case XFS_IOC_FSGEOMETRY_V1_32:
return xfs_compat_ioc_fsgeometry_v1(mp, arg);
@@ -653,8 +536,6 @@
return xfs_compat_attrlist_by_handle(filp, arg);
case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
return xfs_compat_attrmulti_by_handle(filp, arg);
- case XFS_IOC_FSSETDM_BY_HANDLE_32:
- return xfs_compat_fssetdm_by_handle(filp, arg);
default:
/* try the native version */
return xfs_file_ioctl(filp, cmd, (unsigned long)arg);
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 7985344..053de7d 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -32,7 +32,7 @@
#endif
typedef struct compat_xfs_bstime {
- compat_time_t tv_sec; /* seconds */
+ old_time32_t tv_sec; /* seconds */
__s32 tv_nsec; /* and nanoseconds */
} compat_xfs_bstime_t;
@@ -99,7 +99,7 @@
_IOWR('X', 108, struct compat_xfs_fsop_handlereq)
/* The bstat field in the swapext struct needs translation */
-typedef struct compat_xfs_swapext {
+struct compat_xfs_swapext {
int64_t sx_version; /* version */
int64_t sx_fdtarget; /* fd of target file */
int64_t sx_fdtmp; /* fd of tmp file */
@@ -107,7 +107,7 @@
xfs_off_t sx_length; /* leng from offset */
char sx_pad[16]; /* pad space, unused */
struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */
-} __compat_packed compat_xfs_swapext_t;
+} __compat_packed;
#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
@@ -143,15 +143,6 @@
#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
_IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
-typedef struct compat_xfs_fsop_setdm_handlereq {
- struct compat_xfs_fsop_handlereq hreq; /* handle information */
- /* ptr to struct fsdmidata */
- compat_uptr_t data; /* DMAPI data */
-} compat_xfs_fsop_setdm_handlereq_t;
-
-#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
- _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
-
#ifdef BROKEN_X86_ALIGNMENT
/* on ia32 l_start is on a 32-bit boundary */
typedef struct compat_xfs_flock64 {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 239c954..7b9ff82 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -29,8 +29,8 @@
#include "xfs_reflink.h"
-#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
- << mp->m_writeio_log)
+#define XFS_ALLOC_ALIGN(mp, off) \
+ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
static int
xfs_alert_fsblock_zero(
@@ -54,9 +54,10 @@
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
- bool shared)
+ u16 flags)
{
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
return xfs_alert_fsblock_zero(ip, imap);
@@ -77,14 +78,13 @@
}
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
- iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+ iomap->bdev = target->bt_bdev;
+ iomap->dax_dev = target->bt_daxdev;
+ iomap->flags = flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
return 0;
}
@@ -95,18 +95,30 @@
xfs_fileoff_t offset_fsb,
xfs_fileoff_t end_fsb)
{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
- iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
- iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+ iomap->bdev = target->bt_bdev;
+ iomap->dax_dev = target->bt_daxdev;
}
-xfs_extlen_t
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+ struct xfs_mount *mp,
+ loff_t offset,
+ loff_t count)
+{
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ return min(XFS_B_TO_FSB(mp, offset + count),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+}
+
+static xfs_extlen_t
xfs_eof_alignment(
- struct xfs_inode *ip,
- xfs_extlen_t extsize)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
xfs_extlen_t align = 0;
@@ -129,111 +141,80 @@
align = 0;
}
- /*
- * Always round up the allocation request to an extent boundary
- * (when file on a real-time subvolume or has di_extsize hint).
- */
- if (extsize) {
- if (align)
- align = roundup_64(align, extsize);
- else
- align = extsize;
- }
-
return align;
}
-STATIC int
+/*
+ * Check if last_fsb is outside the last extent, and if so grow it to the next
+ * stripe unit boundary.
+ */
+xfs_fileoff_t
xfs_iomap_eof_align_last_fsb(
struct xfs_inode *ip,
- xfs_extlen_t extsize,
- xfs_fileoff_t *last_fsb)
+ xfs_fileoff_t end_fsb)
{
- xfs_extlen_t align = xfs_eof_alignment(ip, extsize);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+ xfs_extlen_t align = xfs_eof_alignment(ip);
+ struct xfs_bmbt_irec irec;
+ struct xfs_iext_cursor icur;
+
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+
+ /*
+ * Always round up the allocation request to the extent hint boundary.
+ */
+ if (extsz) {
+ if (align)
+ align = roundup_64(align, extsz);
+ else
+ align = extsz;
+ }
if (align) {
- xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
- int eof, error;
+ xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align);
- error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
- if (error)
- return error;
- if (eof)
- *last_fsb = new_last_fsb;
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
+ aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
+ return aligned_end_fsb;
}
- return 0;
+
+ return end_fsb;
}
int
xfs_iomap_write_direct(
- xfs_inode_t *ip,
- xfs_off_t offset,
- size_t count,
- xfs_bmbt_irec_t *imap,
- int nmaps)
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb,
+ struct xfs_bmbt_irec *imap)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t last_fsb;
- xfs_filblks_t count_fsb, resaligned;
- xfs_extlen_t extsz;
- int nimaps;
- int quota_flag;
- int rt;
- xfs_trans_t *tp;
- uint qblocks, resblks, resrtextents;
- int error;
- int lockmode;
- int bmapi_flags = XFS_BMAPI_PREALLOC;
- uint tflags = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_filblks_t resaligned;
+ int nimaps;
+ int quota_flag;
+ uint qblocks, resblks;
+ unsigned int resrtextents = 0;
+ int error;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
+ uint tflags = 0;
- rt = XFS_IS_REALTIME_INODE(ip);
- extsz = xfs_get_extsz_hint(ip);
- lockmode = XFS_ILOCK_SHARED; /* locked by caller */
-
- ASSERT(xfs_isilocked(ip, lockmode));
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > XFS_ISIZE(ip)) {
- /*
- * Assert that the in-core extent list is present since this can
- * call xfs_iread_extents() and we only have the ilock shared.
- * This should be safe because the lock was held around a bmapi
- * call in the caller and we only need it to access the in-core
- * list.
- */
- ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
- XFS_IFEXTENTS);
- error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
- if (error)
- goto out_unlock;
- } else {
- if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
- last_fsb = min(last_fsb, (xfs_fileoff_t)
- imap->br_blockcount +
- imap->br_startoff);
- }
- count_fsb = last_fsb - offset_fsb;
ASSERT(count_fsb > 0);
- resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz);
- if (unlikely(rt)) {
+ resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ xfs_get_extsz_hint(ip));
+ if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
resrtextents = qblocks = resaligned;
resrtextents /= mp->m_sb.sb_rextsize;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
quota_flag = XFS_QMOPT_RES_RTBLKS;
} else {
- resrtextents = 0;
resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
quota_flag = XFS_QMOPT_RES_REGBLKS;
}
- /*
- * Drop the shared lock acquired by the caller, attach the dquot if
- * necessary and move on to transaction setup.
- */
- xfs_iunlock(ip, lockmode);
error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -263,8 +244,7 @@
if (error)
return error;
- lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockmode);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
@@ -277,8 +257,8 @@
* caller gave to us.
*/
nimaps = 1;
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- bmapi_flags, resblks, imap, &nimaps);
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
+ imap, &nimaps);
if (error)
goto out_res_cancel;
@@ -301,7 +281,7 @@
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, lockmode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_res_cancel:
@@ -313,11 +293,11 @@
STATIC bool
xfs_quota_need_throttle(
- struct xfs_inode *ip,
- int type,
- xfs_fsblock_t alloc_blocks)
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_fsblock_t alloc_blocks)
{
- struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
if (!dq || !xfs_this_quota_on(ip->i_mount, type))
return false;
@@ -327,7 +307,7 @@
return false;
/* under the lo watermark, no throttle */
- if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
+ if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark)
return false;
return true;
@@ -335,24 +315,24 @@
STATIC void
xfs_quota_calc_throttle(
- struct xfs_inode *ip,
- int type,
- xfs_fsblock_t *qblocks,
- int *qshift,
- int64_t *qfreesp)
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ xfs_fsblock_t *qblocks,
+ int *qshift,
+ int64_t *qfreesp)
{
- int64_t freesp;
- int shift = 0;
- struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
+ int64_t freesp;
+ int shift = 0;
/* no dq, or over hi wmark, squash the prealloc completely */
- if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+ if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) {
*qblocks = 0;
*qfreesp = 0;
return;
}
- freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
+ freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved;
if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
shift = 2;
if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
@@ -372,22 +352,10 @@
}
/*
- * If we are doing a write at the end of the file and there are no allocations
- * past this one, then extend the allocation out to the file system's write
- * iosize.
- *
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
- * filesystem is to full, the smaller the maximum prealocation.
- *
- * As an exception we don't do any preallocation at all if the file is smaller
- * than the minimum preallocation and we are using the default dynamic
- * preallocation scheme, as it is likely this is the only write to the file that
- * is going to be done.
- *
- * We clean up any extra space left over when the file is closed in
- * xfs_inactive().
+ * filesystem is to being full, the smaller the maximum preallocation.
*/
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
@@ -397,63 +365,70 @@
loff_t count,
struct xfs_iext_cursor *icur)
{
+ struct xfs_iext_cursor ncur = *icur;
+ struct xfs_bmbt_irec prev, got;
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- struct xfs_bmbt_irec prev;
- int shift = 0;
int64_t freesp;
xfs_fsblock_t qblocks;
- int qshift = 0;
xfs_fsblock_t alloc_blocks = 0;
+ xfs_extlen_t plen;
+ int shift = 0;
+ int qshift = 0;
- if (offset + count <= XFS_ISIZE(ip))
- return 0;
-
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
- (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+ /*
+ * As an exception we don't do any preallocation at all if the file is
+ * smaller than the minimum preallocation and we are using the default
+ * dynamic preallocation scheme, as it is likely this is the only write
+ * to the file that is going to be done.
+ */
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))
return 0;
/*
- * If an explicit allocsize is set, the file is small, or we
- * are writing behind a hole, then use the minimum prealloc:
+ * Use the minimum preallocation size for small files or if we are
+ * writing right after a hole.
*/
- if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
- XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
- !xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
+ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
+ !xfs_iext_prev_extent(ifp, &ncur, &prev) ||
prev.br_startoff + prev.br_blockcount < offset_fsb)
- return mp->m_writeio_blocks;
+ return mp->m_allocsize_blocks;
/*
- * Determine the initial size of the preallocation. We are beyond the
- * current EOF here, but we need to take into account whether this is
- * a sparse write or an extending write when determining the
- * preallocation size. Hence we need to look up the extent that ends
- * at the current write offset and use the result to determine the
- * preallocation size.
- *
- * If the extent is a hole, then preallocation is essentially disabled.
- * Otherwise we take the size of the preceding data extent as the basis
- * for the preallocation size. If the size of the extent is greater than
- * half the maximum extent length, then use the current offset as the
- * basis. This ensures that for large files the preallocation size
- * always extends to MAXEXTLEN rather than falling short due to things
- * like stripe unit/width alignment of real extents.
+ * Take the size of the preceding data extents as the basis for the
+ * preallocation size. Note that we don't care if the previous extents
+ * are written or not.
*/
- if (prev.br_blockcount <= (MAXEXTLEN >> 1))
- alloc_blocks = prev.br_blockcount << 1;
- else
+ plen = prev.br_blockcount;
+ while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
+ if (plen > MAXEXTLEN / 2 ||
+ isnullstartblock(got.br_startblock) ||
+ got.br_startoff + got.br_blockcount != prev.br_startoff ||
+ got.br_startblock + got.br_blockcount != prev.br_startblock)
+ break;
+ plen += got.br_blockcount;
+ prev = got;
+ }
+
+ /*
+ * If the size of the extents is greater than half the maximum extent
+ * length, then use the current offset as the basis. This ensures that
+ * for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width
+ * alignment of real extents.
+ */
+ alloc_blocks = plen * 2;
+ if (alloc_blocks > MAXEXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
- if (!alloc_blocks)
- goto check_writeio;
qblocks = alloc_blocks;
/*
* MAXEXTLEN is not a power of two value but we round the prealloc down
* to the nearest power of two value after throttling. To prevent the
- * round down from unconditionally reducing the maximum supported prealloc
- * size, we round up first, apply appropriate throttling, round down and
- * cap the value to MAXEXTLEN.
+ * round down from unconditionally reducing the maximum supported
+ * prealloc size, we round up first, apply appropriate throttling,
+ * round down and cap the value to MAXEXTLEN.
*/
alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
alloc_blocks);
@@ -475,14 +450,14 @@
* Check each quota to cap the prealloc size, provide a shift value to
* throttle with and adjust amount of available space.
*/
- if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift,
&freesp);
- if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift,
&freesp);
- if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
- xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
+ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks))
+ xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift,
&freesp);
/*
@@ -514,220 +489,13 @@
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
-check_writeio:
- if (alloc_blocks < mp->m_writeio_blocks)
- alloc_blocks = mp->m_writeio_blocks;
+ if (alloc_blocks < mp->m_allocsize_blocks)
+ alloc_blocks = mp->m_allocsize_blocks;
trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
- mp->m_writeio_blocks);
+ mp->m_allocsize_blocks);
return alloc_blocks;
}
-static int
-xfs_file_iomap_begin_delay(
- struct inode *inode,
- loff_t offset,
- loff_t count,
- unsigned flags,
- struct iomap *iomap)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- xfs_fileoff_t maxbytes_fsb =
- XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- xfs_fileoff_t end_fsb;
- struct xfs_bmbt_irec imap, cmap;
- struct xfs_iext_cursor icur, ccur;
- xfs_fsblock_t prealloc_blocks = 0;
- bool eof = false, cow_eof = false, shared = false;
- int whichfork = XFS_DATA_FORK;
- int error = 0;
-
- ASSERT(!XFS_IS_REALTIME_INODE(ip));
- ASSERT(!xfs_get_extsz_hint(ip));
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto out_unlock;
- }
-
- XFS_STATS_INC(mp, xs_blk_mapw);
-
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
- }
-
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
- /*
- * Search the data fork fork first to look up our source mapping. We
- * always need the data fork map, as we have to return it to the
- * iomap code so that the higher level write code can read data in to
- * perform read-modify-write cycles for unaligned writes.
- */
- eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
- if (eof)
- imap.br_startoff = end_fsb; /* fake hole until the end */
-
- /* We never need to allocate blocks for zeroing a hole. */
- if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
- goto out_unlock;
- }
-
- /*
- * Search the COW fork extent list even if we did not find a data fork
- * extent. This serves two purposes: first this implements the
- * speculative preallocation using cowextsize, so that we also unshare
- * block adjacent to shared blocks instead of just the shared blocks
- * themselves. Second the lookup in the extent list is generally faster
- * than going out to the shared extent tree.
- */
- if (xfs_is_cow_inode(ip)) {
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
- cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
- &ccur, &cmap);
- if (!cow_eof && cmap.br_startoff <= offset_fsb) {
- trace_xfs_reflink_cow_found(ip, &cmap);
- whichfork = XFS_COW_FORK;
- goto done;
- }
- }
-
- if (imap.br_startoff <= offset_fsb) {
- /*
- * For reflink files we may need a delalloc reservation when
- * overwriting shared extents. This includes zeroing of
- * existing extents that contain data.
- */
- if (!xfs_is_cow_inode(ip) ||
- ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
- trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
- &imap);
- goto done;
- }
-
- xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
-
- /* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_inode_need_cow(ip, &imap, &shared);
- if (error)
- goto out_unlock;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared) {
- trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
- &imap);
- goto done;
- }
-
- /*
- * Fork all the shared blocks from our write offset until the
- * end of the extent.
- */
- whichfork = XFS_COW_FORK;
- end_fsb = imap.br_startoff + imap.br_blockcount;
- } else {
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
- * pages to keep the chunks of work done where somewhat
- * symmetric with the work writeback does. This is a completely
- * arbitrary number pulled out of thin air.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
- if (xfs_is_always_cow_inode(ip))
- whichfork = XFS_COW_FORK;
- }
-
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_unlock;
-
- if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
- count, &icur);
- if (prealloc_blocks) {
- xfs_extlen_t align;
- xfs_off_t end_offset;
- xfs_fileoff_t p_end_fsb;
-
- end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
- p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
- prealloc_blocks;
-
- align = xfs_eof_alignment(ip, 0);
- if (align)
- p_end_fsb = roundup_64(p_end_fsb, align);
-
- p_end_fsb = min(p_end_fsb, maxbytes_fsb);
- ASSERT(p_end_fsb > offset_fsb);
- prealloc_blocks = p_end_fsb - end_fsb;
- }
- }
-
-retry:
- error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks,
- whichfork == XFS_DATA_FORK ? &imap : &cmap,
- whichfork == XFS_DATA_FORK ? &icur : &ccur,
- whichfork == XFS_DATA_FORK ? eof : cow_eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
- trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc_blocks) {
- prealloc_blocks = 0;
- goto retry;
- }
- /*FALLTHRU*/
- default:
- goto out_unlock;
- }
-
- /*
- * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
- * them out if the write happens to fail.
- */
- iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, whichfork,
- whichfork == XFS_DATA_FORK ? &imap : &cmap);
-done:
- if (whichfork == XFS_COW_FORK) {
- if (imap.br_startoff > offset_fsb) {
- xfs_trim_extent(&cmap, offset_fsb,
- imap.br_startoff - offset_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
- goto out_unlock;
- }
- /* ensure we only report blocks we have a reservation for */
- xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
- shared = true;
- }
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
int
xfs_iomap_write_unwritten(
xfs_inode_t *ip,
@@ -765,6 +533,11 @@
*/
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ /* Attach dquots so that bmbt splits are accounted correctly. */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
do {
/*
* Set up a transaction to convert the range of extents
@@ -783,6 +556,11 @@
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+ XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES);
+ if (error)
+ goto error_on_bmapi_transaction;
+
/*
* Modify the unwritten extent state of the buffer.
*/
@@ -840,23 +618,42 @@
static inline bool
imap_needs_alloc(
struct inode *inode,
+ unsigned flags,
struct xfs_bmbt_irec *imap,
int nimaps)
{
- return !nimaps ||
- imap->br_startblock == HOLESTARTBLOCK ||
- imap->br_startblock == DELAYSTARTBLOCK ||
- (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
+ /* don't allocate blocks when just zeroing */
+ if (flags & IOMAP_ZERO)
+ return false;
+ if (!nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK)
+ return true;
+ /* we convert unwritten extents before copying the data for DAX */
+ if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+ return true;
+ return false;
}
static inline bool
-needs_cow_for_zeroing(
+imap_needs_cow(
+ struct xfs_inode *ip,
+ unsigned int flags,
struct xfs_bmbt_irec *imap,
int nimaps)
{
- return nimaps &&
- imap->br_startblock != HOLESTARTBLOCK &&
- imap->br_state != XFS_EXT_UNWRITTEN;
+ if (!xfs_is_cow_inode(ip))
+ return false;
+
+ /* when zeroing we don't have to COW holes or unwritten extents */
+ if (flags & IOMAP_ZERO) {
+ if (!nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_state == XFS_EXT_UNWRITTEN)
+ return false;
+ }
+
+ return true;
}
static int
@@ -872,15 +669,8 @@
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_cow_inode(ip) && is_write) {
- /*
- * FIXME: It could still overwrite on unshared extents and not
- * need allocation.
- */
- if (flags & IOMAP_NOWAIT)
- return -EAGAIN;
+ if (xfs_is_cow_inode(ip) && is_write)
mode = XFS_ILOCK_EXCL;
- }
/*
* Extents not yet cached requires exclusive access, don't block. This
@@ -916,119 +706,104 @@
return 0;
}
+/*
+ * Check that the imap we are going to return to the caller spans the entire
+ * range that the caller requested for the IO.
+ */
+static bool
+imap_spans_range(
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ if (imap->br_startoff > offset_fsb)
+ return false;
+ if (imap->br_startoff + imap->br_blockcount < end_fsb)
+ return false;
+ return true;
+}
+
static int
-xfs_file_iomap_begin(
+xfs_direct_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap;
- xfs_fileoff_t offset_fsb, end_fsb;
+ struct xfs_bmbt_irec imap, cmap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
+ u16 iomap_flags = 0;
unsigned lockmode;
+ ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
+
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
- !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
- /* Reserve delalloc blocks for regular writeback. */
- return xfs_file_iomap_begin_delay(inode, offset, length, flags,
- iomap);
- }
-
/*
- * Lock the inode in the manner required for the specified operation and
- * check for as many conditions that would result in blocking as
- * possible. This removes most of the non-blocking checks from the
- * mapping code below.
+ * Writes that span EOF might trigger an IO size update on completion,
+ * so consider them to be dirty for the purposes of O_DSYNC even if
+ * there are no other metadata changes pending or have been made here.
*/
+ if (offset + length > i_size_read(inode))
+ iomap_flags |= IOMAP_F_DIRTY;
+
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
- ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset > mp->m_super->s_maxbytes - length)
- length = mp->m_super->s_maxbytes - offset;
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- end_fsb = XFS_B_TO_FSB(mp, offset + length);
-
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
if (error)
goto out_unlock;
- if (flags & IOMAP_REPORT) {
- /* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
- if (error)
+ if (imap_needs_cow(ip, flags, &imap, nimaps)) {
+ error = -EAGAIN;
+ if (flags & IOMAP_NOWAIT)
goto out_unlock;
- }
-
- /* Non-modifying mapping requested, so we are done */
- if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
- goto out_found;
-
- /*
- * Break shared extents if necessary. Checks for non-blocking IO have
- * been done up front, so we don't need to do them here.
- */
- if (xfs_is_cow_inode(ip)) {
- struct xfs_bmbt_irec cmap;
- bool directio = (flags & IOMAP_DIRECT);
-
- /* if zeroing doesn't need COW allocation, then we are done. */
- if ((flags & IOMAP_ZERO) &&
- !needs_cow_for_zeroing(&imap, nimaps))
- goto out_found;
/* may drop and re-acquire the ilock */
- cmap = imap;
- error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
- directio);
+ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+ &lockmode, flags & IOMAP_DIRECT);
if (error)
goto out_unlock;
-
- /*
- * For buffered writes we need to report the address of the
- * previous block (if there was any) so that the higher level
- * write code can perform read-modify-write operations; we
- * won't need the CoW fork mapping until writeback. For direct
- * I/O, which must be block aligned, we need to report the
- * newly allocated address. If the data fork has a hole, copy
- * the COW fork mapping to avoid allocating to the data fork.
- *
- * Otherwise, ensure that the imap range does not extend past
- * the range allocated/found in cmap.
- */
- if (directio || imap.br_startblock == HOLESTARTBLOCK)
- imap = cmap;
- else
- xfs_trim_extent(&imap, cmap.br_startoff,
- cmap.br_blockcount);
-
+ if (shared)
+ goto out_found_cow;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- /* Don't need to allocate over holes when doing zeroing operations. */
- if (flags & IOMAP_ZERO)
- goto out_found;
+ if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ goto allocate_blocks;
- if (!imap_needs_alloc(inode, &imap, nimaps))
- goto out_found;
-
- /* If nowait is set bail since we are going to make allocations. */
- if (flags & IOMAP_NOWAIT) {
+ /*
+ * NOWAIT IO needs to span the entire requested IO with a single map so
+ * that we avoid partial IO failures due to the rest of the IO range not
+ * covered by this map triggering an EAGAIN condition when it is
+ * subsequently mapped and aborting the IO.
+ */
+ if ((flags & IOMAP_NOWAIT) &&
+ !imap_spans_range(&imap, offset_fsb, end_fsb)) {
error = -EAGAIN;
goto out_unlock;
}
+ xfs_iunlock(ip, lockmode);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
+
+allocate_blocks:
+ error = -EAGAIN;
+ if (flags & IOMAP_NOWAIT)
+ goto out_unlock;
+
/*
* We cap the maximum length we map to a sane size to keep the chunks
* of work done where somewhat symmetric with the work writeback does.
@@ -1039,48 +814,280 @@
* lower level functions are updated.
*/
length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+ end_fsb = xfs_iomap_end_fsb(mp, offset, length);
- /*
- * xfs_iomap_write_direct() expects the shared lock. It is unlocked on
- * return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
+ if (offset + length > XFS_ISIZE(ip))
+ end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+ else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ xfs_iunlock(ip, lockmode);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap);
if (error)
return error;
- iomap->flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
-out_finish:
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
-
-out_found:
- ASSERT(nimaps);
+out_found_cow:
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- goto out_finish;
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ if (imap.br_startblock != HOLESTARTBLOCK) {
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ if (error)
+ return error;
+ }
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
}
+const struct iomap_ops xfs_direct_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+};
+
static int
-xfs_file_iomap_end_delalloc(
- struct xfs_inode *ip,
+xfs_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
+ xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int allocfork = XFS_DATA_FORK;
+ int error = 0;
+
+ /* we can't use delayed allocations when using extent size hints */
+ if (xfs_get_extsz_hint(ip))
+ return xfs_direct_write_iomap_begin(inode, offset, count,
+ flags, iomap, srcmap);
+
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * Search the data fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
+ if (eof)
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
+
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * blocks adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ goto found_cow;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
+ /*
+ * For reflink files we may need a delalloc reservation when
+ * overwriting shared extents. This includes zeroing of
+ * existing extents that contain data.
+ */
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto found_imap;
+ }
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_bmap_trim_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto found_imap;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ allocfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done here somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the value needs to be less than 32-bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+
+ if (xfs_is_always_cow_inode(ip))
+ allocfork = XFS_COW_FORK;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error)
+ goto out_unlock;
+
+ if (eof && offset + count > XFS_ISIZE(ip)) {
+ /*
+ * Determine the initial size of the preallocation.
+ * We clean up any extra preallocation when the file is closed.
+ */
+ if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ prealloc_blocks = mp->m_allocsize_blocks;
+ else
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork,
+ offset, count, &icur);
+ if (prealloc_blocks) {
+ xfs_extlen_t align;
+ xfs_off_t end_offset;
+ xfs_fileoff_t p_end_fsb;
+
+ end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
+ p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
+
+ align = xfs_eof_alignment(ip);
+ if (align)
+ p_end_fsb = roundup_64(p_end_fsb, align);
+
+ p_end_fsb = min(p_end_fsb,
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ ASSERT(p_end_fsb > offset_fsb);
+ prealloc_blocks = p_end_fsb - end_fsb;
+ }
+ }
+
+retry:
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ allocfork == XFS_DATA_FORK ? &imap : &cmap,
+ allocfork == XFS_DATA_FORK ? &icur : &ccur,
+ allocfork == XFS_DATA_FORK ? eof : cow_eof);
+ switch (error) {
+ case 0:
+ break;
+ case -ENOSPC:
+ case -EDQUOT:
+ /* retry without any preallocation */
+ trace_xfs_delalloc_enospc(ip, offset, count);
+ if (prealloc_blocks) {
+ prealloc_blocks = 0;
+ goto retry;
+ }
+ /*FALLTHRU*/
+ default:
+ goto out_unlock;
+ }
+
+ if (allocfork == XFS_COW_FORK) {
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
+ goto found_cow;
+ }
+
+ /*
+ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+ * them out if the write happens to fail.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+
+found_imap:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+
+found_cow:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (imap.br_startoff <= offset_fsb) {
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ if (error)
+ return error;
+ } else {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ }
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+static int
+xfs_buffered_write_iomap_end(
+ struct inode *inode,
loff_t offset,
loff_t length,
ssize_t written,
+ unsigned flags,
struct iomap *iomap)
{
+ struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t start_fsb;
xfs_fileoff_t end_fsb;
int error = 0;
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
/*
* Behave as if the write failed if drop writes is enabled. Set the NEW
* flag to force delalloc cleanup.
@@ -1125,24 +1132,51 @@
return 0;
}
+const struct iomap_ops xfs_buffered_write_iomap_ops = {
+ .iomap_begin = xfs_buffered_write_iomap_begin,
+ .iomap_end = xfs_buffered_write_iomap_end,
+};
+
static int
-xfs_file_iomap_end(
+xfs_read_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
- ssize_t written,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
- if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
- return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
- length, written, iomap);
- return 0;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ int nimaps = 1, error = 0;
+ bool shared = false;
+ unsigned lockmode;
+
+ ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
+ if (!error && (flags & IOMAP_REPORT))
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+ xfs_iunlock(ip, lockmode);
+
+ if (error)
+ return error;
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
}
-const struct iomap_ops xfs_iomap_ops = {
- .iomap_begin = xfs_file_iomap_begin,
- .iomap_end = xfs_file_iomap_end,
+const struct iomap_ops xfs_read_iomap_ops = {
+ .iomap_begin = xfs_read_iomap_begin,
};
static int
@@ -1151,7 +1185,8 @@
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1184,8 +1219,7 @@
/*
* Fake a hole until the end of the file.
*/
- data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
- XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ data_fsb = xfs_iomap_end_fsb(mp, offset, length);
}
/*
@@ -1199,7 +1233,7 @@
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1221,7 +1255,7 @@
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1237,7 +1271,8 @@
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1253,12 +1288,12 @@
lockmode = xfs_ilock_attr_map_shared(ip);
/* if there are no attribute fork or extents, return ENOENT */
- if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) {
error = -ENOENT;
goto out_unlock;
}
- ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
+ ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
@@ -1267,7 +1302,7 @@
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 5c2f6aa..7d37035 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -11,13 +11,14 @@
struct xfs_inode;
struct xfs_bmbt_irec;
-int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
- struct xfs_bmbt_irec *, int);
+int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
+ xfs_fileoff_t end_fsb);
int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *, bool shared);
-xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
+ struct xfs_bmbt_irec *, u16);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -39,7 +40,9 @@
return count_fsb;
}
-extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_buffered_write_iomap_ops;
+extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca8c763..b7f7b31 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -20,18 +20,19 @@
#include "xfs_symlink.h"
#include "xfs_dir2.h"
#include "xfs_iomap.h"
+#include "xfs_error.h"
-#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
#include <linux/iversion.h>
+#include <linux/fiemap.h>
/*
- * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * Directories have different lock order w.r.t. mmap_lock compared to regular
* files. This is due to readdir potentially triggering page faults on a user
* buffer inside filldir(), and this happens with the ilock on the directory
* held. For regular files, the lock order is the other way around - the
- * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * mmap_lock is taken during the page fault, and then we lock the ilock to do
* block mapping. Hence we need a different class for the directory ilock so
* that lockdep can tell them apart.
*/
@@ -49,8 +50,15 @@
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name, xattr->value,
- xattr->value_len, ATTR_SECURE);
+ struct xfs_da_args args = {
+ .dp = ip,
+ .attr_filter = XFS_ATTR_SECURE,
+ .name = xattr->name,
+ .namelen = strlen(xattr->name),
+ .value = xattr->value,
+ .valuelen = xattr->value_len,
+ };
+ error = xfs_attr_set(&args);
if (error < 0)
break;
}
@@ -229,7 +237,7 @@
umode_t mode,
bool flags)
{
- return xfs_vn_mknod(dir, dentry, mode, 0);
+ return xfs_generic_create(dir, dentry, mode, 0, false);
}
STATIC int
@@ -238,7 +246,7 @@
struct dentry *dentry,
umode_t mode)
{
- return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+ return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false);
}
STATIC struct dentry *
@@ -470,20 +478,57 @@
struct inode *inode,
struct delayed_call *done)
{
+ struct xfs_inode *ip = XFS_I(inode);
char *link;
- ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+ ASSERT(ip->i_df.if_flags & XFS_IFINLINE);
/*
* The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
* if_data is junk.
*/
- link = XFS_I(inode)->i_df.if_u1.if_data;
- if (!link)
+ link = ip->i_df.if_u1.if_data;
+ if (XFS_IS_CORRUPT(ip->i_mount, !link))
return ERR_PTR(-EFSCORRUPTED);
return link;
}
+static uint32_t
+xfs_stat_blksize(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If the file blocks are being allocated from a realtime volume, then
+ * always return the realtime extent size.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+
+ /*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used.
+ *
+ * If compatibility mode is specified, simply return the basic unit of
+ * caching so that we don't get inefficient read/modify/write I/O from
+ * user apps. Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in
+ * bytes as the recommended I/O size. It is not a stripe and we've set a
+ * default buffered I/O size, return that, otherwise return the compat
+ * default.
+ */
+ if (mp->m_flags & XFS_MOUNT_LARGEIO) {
+ if (mp->m_swidth)
+ return mp->m_swidth << mp->m_sb.sb_blocklog;
+ if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ return 1U << mp->m_allocsize_log;
+ }
+
+ return PAGE_SIZE;
+}
+
STATIC int
xfs_vn_getattr(
const struct path *path,
@@ -513,11 +558,10 @@
stat->blocks =
XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
- stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
- stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec;
+ stat->btime = ip->i_d.di_crtime;
}
}
@@ -543,16 +587,7 @@
stat->rdev = inode->i_rdev;
break;
default:
- if (XFS_IS_REALTIME_INODE(ip)) {
- /*
- * If the file blocks are being allocated from a
- * realtime volume, then return the inode's realtime
- * extent size or the realtime volume's extent size.
- */
- stat->blksize =
- xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
- } else
- stat->blksize = xfs_preferred_iosize(mp);
+ stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
break;
}
@@ -662,9 +697,7 @@
*/
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
- error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
- xfs_kgid_to_gid(gid),
- xfs_get_projid(ip),
+ error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
qflags, &udqp, &gdqp, NULL);
if (error)
return error;
@@ -706,12 +739,7 @@
if (error) /* out of quota */
goto out_cancel;
}
- }
- /*
- * Change file ownership. Must be the owner or privileged.
- */
- if (mask & (ATTR_UID|ATTR_GID)) {
/*
* CAP_FSETID overrides the following restrictions:
*
@@ -733,7 +761,6 @@
olddquot1 = xfs_qm_vop_chown(tp, ip,
&ip->i_udquot, udqp);
}
- ip->i_d.di_uid = xfs_kuid_to_uid(uid);
inode->i_uid = uid;
}
if (!gid_eq(igid, gid)) {
@@ -745,7 +772,6 @@
olddquot2 = xfs_qm_vop_chown(tp, ip,
&ip->i_gdquot, gdqp);
}
- ip->i_d.di_gid = xfs_kgid_to_gid(gid);
inode->i_gid = gid;
}
}
@@ -847,7 +873,7 @@
/*
* Short circuit the truncate case for zero length files.
*/
- if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
+ if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) {
if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
return 0;
@@ -883,7 +909,7 @@
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_iomap_ops);
+ &did_zeroing, &xfs_buffered_write_iomap_ops);
} else {
/*
* iomap won't detect a dirty page over an unwritten block (or a
@@ -896,7 +922,7 @@
if (error)
return error;
error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_iomap_ops);
+ &xfs_buffered_write_iomap_ops);
}
if (error)
@@ -1124,7 +1150,7 @@
&xfs_xattr_iomap_ops);
} else {
error = iomap_fiemap(inode, fieinfo, start, length,
- &xfs_iomap_ops);
+ &xfs_read_iomap_ops);
}
xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
@@ -1223,13 +1249,12 @@
{
struct xfs_mount *mp = ip->i_mount;
- /* Only supported on non-reflinked files. */
- if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip))
+ /* Only supported on regular files. */
+ if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
- /* DAX mount option or DAX iflag must be set. */
- if (!(mp->m_flags & XFS_MOUNT_DAX) &&
- !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+ /* Only supported on non-reflinked files. */
+ if (xfs_is_reflink_inode(ip))
return false;
/* Block size must match page size */
@@ -1237,29 +1262,54 @@
return false;
/* Device has to support DAX too. */
- return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL;
+ return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
}
-STATIC void
-xfs_diflags_to_iflags(
- struct inode *inode,
- struct xfs_inode *ip)
+static bool
+xfs_inode_should_enable_dax(
+ struct xfs_inode *ip)
{
- uint16_t flags = ip->i_d.di_flags;
+ if (!IS_ENABLED(CONFIG_FS_DAX))
+ return false;
+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER)
+ return false;
+ if (!xfs_inode_supports_dax(ip))
+ return false;
+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS)
+ return true;
+ if (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ return true;
+ return false;
+}
- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
- S_NOATIME | S_DAX);
+void
+xfs_diflags_to_iflags(
+ struct xfs_inode *ip,
+ bool init)
+{
+ struct inode *inode = VFS_I(ip);
+ unsigned int xflags = xfs_ip2xflags(ip);
+ unsigned int flags = 0;
- if (flags & XFS_DIFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- if (flags & XFS_DIFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- if (flags & XFS_DIFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- if (flags & XFS_DIFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- if (xfs_inode_supports_dax(ip))
- inode->i_flags |= S_DAX;
+ ASSERT(!(IS_DAX(inode) && init));
+
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ flags |= S_IMMUTABLE;
+ if (xflags & FS_XFLAG_APPEND)
+ flags |= S_APPEND;
+ if (xflags & FS_XFLAG_SYNC)
+ flags |= S_SYNC;
+ if (xflags & FS_XFLAG_NOATIME)
+ flags |= S_NOATIME;
+ if (init && xfs_inode_should_enable_dax(ip))
+ flags |= S_DAX;
+
+ /*
+ * S_DAX can only be set during inode initialization and is never set by
+ * the VFS, so we cannot mask off S_DAX in i_flags.
+ */
+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME);
+ inode->i_flags |= flags;
}
/*
@@ -1284,11 +1334,8 @@
/* make the inode look hashed for the writeback code */
inode_fake_hash(inode);
- inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
- inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
-
i_size_write(inode, ip->i_d.di_size);
- xfs_diflags_to_iflags(inode, ip);
+ xfs_diflags_to_iflags(ip, true);
if (S_ISDIR(inode->i_mode)) {
/*
@@ -1300,9 +1347,7 @@
lockdep_set_class(&inode->i_rwsem,
&inode->i_sb->s_type->i_mutex_dir_key);
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
- ip->d_ops = ip->i_mount->m_dir_inode_ops;
} else {
- ip->d_ops = ip->i_mount->m_nondir_inode_ops;
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 884950a..16ca97a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -84,10 +84,10 @@
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_projectid = xfs_get_projid(ip);
+ buf->bs_projectid = ip->i_d.di_projid;
buf->bs_ino = ino;
- buf->bs_uid = dic->di_uid;
- buf->bs_gid = dic->di_gid;
+ buf->bs_uid = i_uid_read(inode);
+ buf->bs_gid = i_gid_read(inode);
buf->bs_size = dic->di_size;
buf->bs_nlink = inode->i_nlink;
@@ -97,25 +97,25 @@
buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
buf->bs_ctime = inode->i_ctime.tv_sec;
buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
- buf->bs_btime = dic->di_crtime.t_sec;
- buf->bs_btime_nsec = dic->di_crtime.t_nsec;
+ buf->bs_btime = dic->di_crtime.tv_sec;
+ buf->bs_btime_nsec = dic->di_crtime.tv_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize_blks = dic->di_extsize;
- buf->bs_extents = dic->di_nextents;
+ buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
xfs_bulkstat_health(ip, buf);
- buf->bs_aextents = dic->di_anextents;
+ buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
- if (dic->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
buf->bs_cowextsize_blks = dic->di_cowextsize;
}
- switch (dic->di_format) {
+ switch (ip->i_df.if_format) {
case XFS_DINODE_FMT_DEV:
buf->bs_rdev = sysv_encode_dev(inode->i_rdev);
buf->bs_blksize = BLKDEV_IOSIZE;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index aa375cf..2a45138 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -55,6 +55,9 @@
/* Where do we start the traversal? */
xfs_ino_t startino;
+ /* What was the last inode number we saw when iterating the inobt? */
+ xfs_ino_t lastino;
+
/* Array of inobt records we cache. */
struct xfs_inobt_rec_incore *recs;
@@ -298,7 +301,11 @@
error = xfs_inobt_get_rec(*curpp, irec, has_more);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+ if (XFS_IS_CORRUPT(mp, *has_more != 1))
+ return -EFSCORRUPTED;
+
+ iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
+ irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
/*
* If the LE lookup yielded an inobt record before the cursor position,
@@ -346,15 +353,17 @@
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
struct xfs_inobt_rec_incore *irec;
- xfs_agino_t restart;
+ xfs_agino_t next_agino;
int error;
+ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
+
ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */
xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
irec = &iwag->recs[iwag->nr_recs - 1];
- restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+ ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
error = xfs_iwalk_ag_recs(iwag);
if (error)
@@ -371,7 +380,7 @@
if (error)
return error;
- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
@@ -395,6 +404,7 @@
while (!error && has_more) {
struct xfs_inobt_rec_incore *irec;
+ xfs_ino_t rec_fsino;
cond_resched();
if (xfs_pwork_want_abort(&iwag->pwork))
@@ -406,6 +416,15 @@
if (error || !has_more)
break;
+ /* Make sure that we always move forward. */
+ rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
+ if (iwag->lastino != NULLFSINO &&
+ XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
+
/* No allocated inodes in this chunk; skip it. */
if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
error = xfs_btree_increment(cur, 0, &has_more);
@@ -534,6 +553,7 @@
.trim_start = 1,
.skip_empty = 1,
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -622,6 +642,7 @@
iwag->data = data;
iwag->startino = startino;
iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ iwag->lastino = NULLFSINO;
xfs_pwork_queue(&pctl, &iwag->pwork);
startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
@@ -695,6 +716,7 @@
.startino = startino,
.sz_recs = xfs_inobt_walk_prefetch(inobt_records),
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ca15105..5b7a1e2 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -60,6 +60,7 @@
#include <linux/list_sort.h>
#include <linux/ratelimit.h>
#include <linux/rhashtable.h>
+#include <linux/xattr.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -101,12 +102,8 @@
#define xfs_cowb_secs xfs_params.cowb_timer.val
#define current_cpu() (raw_smp_processor_id())
-#define current_pid() (current->pid)
-#define current_test_flags(f) (current->flags & (f))
#define current_set_flags_nested(sp, f) \
(*(sp) = current->flags, current->flags |= (f))
-#define current_clear_flags_nested(sp, f) \
- (*(sp) = current->flags, current->flags &= ~(f))
#define current_restore_flags_nested(sp, f) \
(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
@@ -126,7 +123,6 @@
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
-#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
/*
@@ -163,32 +159,6 @@
extern struct xstats xfsstats;
-/* Kernel uid/gid conversion. These are used to convert to/from the on disk
- * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
- * The conversion here is type only, the value will remain the same since we
- * are converting to the init_user_ns. The uid is later mapped to a particular
- * user namespace value when crossing the kernel/user boundary.
- */
-static inline uint32_t xfs_kuid_to_uid(kuid_t uid)
-{
- return from_kuid(&init_user_ns, uid);
-}
-
-static inline kuid_t xfs_uid_to_kuid(uint32_t uid)
-{
- return make_kuid(&init_user_ns, uid);
-}
-
-static inline uint32_t xfs_kgid_to_gid(kgid_t gid)
-{
- return from_kgid(&init_user_ns, gid);
-}
-
-static inline kgid_t xfs_gid_to_kgid(uint32_t gid)
-{
- return make_kgid(&init_user_ns, gid);
-}
-
static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
{
return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
@@ -205,6 +175,12 @@
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
+/*
+ * Round @x down to a multiple of @y. do_div() performs the 64-by-32 bit
+ * division and leaves the quotient in @x, which is then scaled back up by @y.
+ * @y must be non-zero.
+ */
+static inline uint64_t rounddown_64(uint64_t x, uint32_t y)
+{
+ do_div(x, y);
+ return x * y;
+}
+
static inline uint64_t roundup_64(uint64_t x, uint32_t y)
{
x += y - 1;
@@ -223,26 +199,32 @@
char *data, unsigned int op);
#define ASSERT_ALWAYS(expr) \
- (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
#ifdef DEBUG
#define ASSERT(expr) \
- (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
#else /* !DEBUG */
#ifdef XFS_WARN
#define ASSERT(expr) \
- (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : asswarn(NULL, #expr, __FILE__, __LINE__))
#else /* !DEBUG && !XFS_WARN */
-#define ASSERT(expr) ((void)0)
+#define ASSERT(expr) ((void)0)
#endif /* XFS_WARN */
#endif /* DEBUG */
+/*
+ * Corruption check usable directly as an if () condition. When @expr is true,
+ * report it through xfs_corruption_error() (tagged with the stringified
+ * expression, file, line and calling address) against mount @mp and evaluate
+ * to true; otherwise evaluate to false without reporting anything.
+ */
+#define XFS_IS_CORRUPT(mp, expr) \
+ (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \
+ NULL, 0, __FILE__, __LINE__, \
+ __this_address), \
+ true : false)
+
#define STATIC static noinline
#ifdef CONFIG_XFS_RT
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7b0d9ad..fa2d05e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,13 +24,6 @@
kmem_zone_t *xfs_log_ticket_zone;
/* Local miscellaneous function prototypes */
-STATIC int
-xlog_commit_record(
- struct xlog *log,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp);
-
STATIC struct xlog *
xlog_alloc_log(
struct xfs_mount *mp,
@@ -47,8 +40,7 @@
/* local state machine functions */
STATIC void xlog_state_done_syncing(
- struct xlog_in_core *iclog,
- bool aborted);
+ struct xlog_in_core *iclog);
STATIC int
xlog_state_get_iclog_space(
struct xlog *log,
@@ -57,33 +49,19 @@
struct xlog_ticket *ticket,
int *continued_write,
int *logoffsetp);
-STATIC int
-xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog);
STATIC void
xlog_state_switch_iclogs(
struct xlog *log,
struct xlog_in_core *iclog,
int eventual_size);
STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
-
-STATIC void
xlog_grant_push_ail(
struct xlog *log,
int need_bytes);
STATIC void
-xlog_regrant_reserve_log_space(
+xlog_sync(
struct xlog *log,
- struct xlog_ticket *ticket);
-STATIC void
-xlog_ungrant_log_space(
- struct xlog *log,
- struct xlog_ticket *ticket);
-
+ struct xlog_in_core *iclog);
#if defined(DEBUG)
STATIC void
xlog_verify_dest_ptr(
@@ -455,7 +433,7 @@
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 0);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -485,86 +463,69 @@
return error;
}
-
-/*
- * NOTES:
- *
- * 1. currblock field gets updated at startup and after in-core logs
- * marked as with WANT_SYNC.
- */
-
-/*
- * This routine is called when a user of a log manager ticket is done with
- * the reservation. If the ticket was ever used, then a commit record for
- * the associated transaction is written out as a log operation header with
- * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
- * a given ticket. If the ticket was one with a permanent reservation, then
- * a few operations are done differently. Permanent reservation tickets by
- * default don't release the reservation. They just commit the current
- * transaction with the belief that the reservation is still needed. A flag
- * must be passed in before permanent reservations are actually released.
- * When these type of tickets are not released, they need to be set into
- * the inited state again. By doing this, a start record will be written
- * out when the next write occurs.
- */
-xfs_lsn_t
-xfs_log_done(
- struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant)
-{
- struct xlog *log = mp->m_log;
- xfs_lsn_t lsn = 0;
-
- if (XLOG_FORCED_SHUTDOWN(log) ||
- /*
- * If nothing was ever written, don't write out commit record.
- * If we get an error, just continue and give back the log ticket.
- */
- (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
- (xlog_commit_record(log, ticket, iclog, &lsn)))) {
- lsn = (xfs_lsn_t) -1;
- regrant = false;
- }
-
-
- if (!regrant) {
- trace_xfs_log_done_nonperm(log, ticket);
-
- /*
- * Release ticket if not permanent reservation or a specific
- * request has been made to release a permanent reservation.
- */
- xlog_ungrant_log_space(log, ticket);
- } else {
- trace_xfs_log_done_perm(log, ticket);
-
- xlog_regrant_reserve_log_space(log, ticket);
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- ticket->t_flags |= XLOG_TIC_INITED;
- }
-
- xfs_log_ticket_put(ticket);
- return lsn;
-}
-
-int
-xfs_log_release_iclog(
- struct xfs_mount *mp,
+static bool
+__xlog_state_release_iclog(
+ struct xlog *log,
struct xlog_in_core *iclog)
{
- if (xlog_state_release_iclog(mp->m_log, iclog)) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ lockdep_assert_held(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ /* update tail before writing to iclog */
+ xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog, tail_lsn);
+ /* cycle incremented when incrementing curr_block */
+ return true;
+ }
+
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return false;
+}
+
+/*
+ * Drop a reference to the iclog and flush it to disk if this was the last
+ * reference and it is in the WANT_SYNC state.
+ *
+ * The caller must hold l_icloglock; the lock is dropped around the call to
+ * xlog_sync() and reacquired before returning. Returns -EIO, without touching
+ * the reference count, if the iclog is in the IOERROR state, otherwise 0.
+ */
+static int
+xlog_state_release_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+{
+ lockdep_assert_held(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
+
+ if (atomic_dec_and_test(&iclog->ic_refcnt) &&
+ __xlog_state_release_iclog(log, iclog)) {
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog);
+ spin_lock(&log->l_icloglock);
}
return 0;
}
+/*
+ * Drop a reference to @iclog from a context that does not already hold
+ * l_icloglock. atomic_dec_and_lock() only takes the lock when the reference
+ * count reaches zero; in that case, unless the iclog is in the IOERROR state,
+ * __xlog_state_release_iclog() decides whether the iclog must be written, and
+ * xlog_sync() is then called after the lock has been released.
+ */
+void
+xfs_log_release_iclog(
+ struct xlog_in_core *iclog)
+{
+ struct xlog *log = iclog->ic_log;
+ bool sync = false;
+
+ if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
+ if (iclog->ic_state != XLOG_STATE_IOERROR)
+ sync = __xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ /* Issue the write only after l_icloglock has been dropped. */
+ if (sync)
+ xlog_sync(log, iclog);
+}
+
/*
* Mount a log filesystem
*
@@ -801,32 +762,69 @@
}
/*
- * Final log writes as part of unmount.
- *
- * Mark the filesystem clean as unmount happens. Note that during relocation
- * this routine needs to be executed as part of source-bag while the
- * deallocation must not be done until source-end.
+ * Wait for the iclog to be written to disk, or return an error if the log has
+ * been shut down.
*/
-
-/* Actually write the unmount record to disk. */
-static void
-xfs_log_write_unmount_record(
- struct xfs_mount *mp)
+static int
+xlog_wait_on_iclog(
+ struct xlog_in_core *iclog)
+ __releases(iclog->ic_log->l_icloglock)
{
- /* the data section must be 32 bit size aligned */
- struct xfs_unmount_log_format magic = {
+ struct xlog *log = iclog->ic_log;
+
+ if (!XLOG_FORCED_SHUTDOWN(log) &&
+ iclog->ic_state != XLOG_STATE_ACTIVE &&
+ iclog->ic_state != XLOG_STATE_DIRTY) {
+ XFS_STATS_INC(log->l_mp, xs_log_force_sleep);
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ } else {
+ spin_unlock(&log->l_icloglock);
+ }
+
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
+ return 0;
+}
+
+/*
+ * Write out an unmount record using the ticket provided. We have to account for
+ * the data space used in the unmount ticket as this write is not done from a
+ * transaction context that has already done the accounting for us.
+ */
+static int
+xlog_write_unmount_record(
+ struct xlog *log,
+ struct xlog_ticket *ticket,
+ xfs_lsn_t *lsn,
+ uint flags)
+{
+ struct xfs_unmount_log_format ulf = {
.magic = XLOG_UNMOUNT_TYPE,
};
struct xfs_log_iovec reg = {
- .i_addr = &magic,
- .i_len = sizeof(magic),
+ .i_addr = &ulf,
+ .i_len = sizeof(ulf),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
- struct xlog *log = mp->m_log;
+
+ /* account for space used by record data */
+ ticket->t_curr_res -= sizeof(ulf);
+ return xlog_write(log, &vec, ticket, lsn, NULL, flags, false);
+}
+
+/*
+ * Mark the filesystem clean by writing an unmount record to the head of the
+ * log.
+ */
+static void
+xlog_unmount_write(
+ struct xlog *log)
+{
+ struct xfs_mount *mp = log->l_mp;
struct xlog_in_core *iclog;
struct xlog_ticket *tic = NULL;
xfs_lsn_t lsn;
@@ -837,23 +835,7 @@
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
- /* remove inited flag, and account for space used */
- tic->t_flags = 0;
- tic->t_curr_res -= sizeof(magic);
- error = xlog_write(log, &vec, tic, &lsn, NULL, flags);
+ error = xlog_write_unmount_record(log, tic, &lsn, flags);
/*
* At this point, we're umounting anyway, so there's no point in
* transitioning log state to IOERROR. Just continue...
@@ -865,31 +847,32 @@
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
error = xlog_state_release_iclog(log, iclog);
-
- spin_lock(&log->l_icloglock);
- switch (iclog->ic_state) {
- default:
- if (!XLOG_FORCED_SHUTDOWN(log)) {
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- break;
- }
- /* fall through */
- case XLOG_STATE_ACTIVE:
- case XLOG_STATE_DIRTY:
- spin_unlock(&log->l_icloglock);
- break;
- }
+ xlog_wait_on_iclog(iclog);
if (tic) {
trace_xfs_log_umount_write(log, tic);
- xlog_ungrant_log_space(log, tic);
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
}
}
+/*
+ * Sanity check run before the unmount record is written: walk the ring of
+ * iclogs starting at log->l_iclog (following ic_next until we are back at the
+ * start) and assert that every one is in the ACTIVE state with nothing written
+ * into it (ic_offset == 0). The checks are ASSERT()s only, so this function
+ * has no effect on builds where ASSERT() is compiled out.
+ */
+static void
+xfs_log_unmount_verify_iclog(
+ struct xlog *log)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ ASSERT(iclog->ic_offset == 0);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
/*
* Unmount record used to have a string "Unmount filesystem--" in the
* data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
@@ -897,16 +880,11 @@
* currently architecture converted and "Unmount" is a bit foo.
* As far as I know, there weren't any dependencies on the old behaviour.
*/
-
-static int
-xfs_log_unmount_write(xfs_mount_t *mp)
+static void
+xfs_log_unmount_write(
+ struct xfs_mount *mp)
{
- struct xlog *log = mp->m_log;
- xlog_in_core_t *iclog;
-#ifdef DEBUG
- xlog_in_core_t *first_iclog;
-#endif
- int error;
+ struct xlog *log = mp->m_log;
/*
* Don't write out unmount record on norecovery mounts or ro devices.
@@ -915,61 +893,30 @@
if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
xfs_readonly_buftarg(log->l_targ)) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
- return 0;
+ return;
}
- error = xfs_log_force(mp, XFS_LOG_SYNC);
- ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+ xfs_log_force(mp, XFS_LOG_SYNC);
-#ifdef DEBUG
- first_iclog = iclog = log->l_iclog;
- do {
- if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
- ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
- ASSERT(iclog->ic_offset == 0);
- }
- iclog = iclog->ic_next;
- } while (iclog != first_iclog);
-#endif
- if (! (XLOG_FORCED_SHUTDOWN(log))) {
- xfs_log_write_unmount_record(mp);
- } else {
- /*
- * We're already in forced_shutdown mode, couldn't
- * even attempt to write out the unmount transaction.
- *
- * Go through the motions of sync'ing and releasing
- * the iclog, even though no I/O will actually happen,
- * we need to wait for other log I/Os that may already
- * be in progress. Do this as a separate section of
- * code so we'll know if we ever get stuck here that
- * we're in this odd situation of trying to unmount
- * a file system that went into forced_shutdown as
- * the result of an unmount..
- */
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- atomic_inc(&iclog->ic_refcnt);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return;
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
- error = xlog_state_release_iclog(log, iclog);
-
- spin_lock(&log->l_icloglock);
-
- if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
- || iclog->ic_state == XLOG_STATE_DIRTY
- || iclog->ic_state == XLOG_STATE_IOERROR) ) {
-
- xlog_wait(&iclog->ic_force_wait,
- &log->l_icloglock);
- } else {
- spin_unlock(&log->l_icloglock);
- }
+ /*
+ * If we think the summary counters are bad, avoid writing the unmount
+ * record to force log recovery at next mount, after which the summary
+ * counters will be recalculated. Refer to xlog_check_unmount_rec for
+ * more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
+ XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp, "%s: will fix summary counters at next mount",
+ __func__);
+ return;
}
- return error;
-} /* xfs_log_unmount_write */
+ xfs_log_unmount_verify_iclog(log);
+ xlog_unmount_write(log);
+}
/*
* Empty the log for unmount/freeze.
@@ -1232,7 +1179,6 @@
struct xlog_in_core *iclog =
container_of(work, struct xlog_in_core, ic_end_io_work);
struct xlog *log = iclog->ic_log;
- bool aborted = false;
int error;
error = blk_status_to_errno(iclog->ic_bio.bi_status);
@@ -1248,17 +1194,9 @@
if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
- /*
- * This flag will be propagated to the trans-committed
- * callback routines to let them know that the log-commit
- * didn't succeed.
- */
- aborted = true;
- } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
- aborted = true;
}
- xlog_state_done_syncing(iclog, aborted);
+ xlog_state_done_syncing(iclog);
bio_uninit(&iclog->ic_bio);
/*
@@ -1479,7 +1417,7 @@
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
- mp->m_fsname);
+ mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -1504,20 +1442,17 @@
return ERR_PTR(error);
} /* xlog_alloc_log */
-
/*
* Write out the commit record of a transaction associated with the given
- * ticket. Return the lsn of the commit record.
+ * ticket to close off a running log write. Return the lsn of the commit record.
*/
-STATIC int
+int
xlog_commit_record(
struct xlog *log,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- xfs_lsn_t *commitlsnp)
+ xfs_lsn_t *lsn)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
struct xfs_log_iovec reg = {
.i_addr = NULL,
.i_len = 0,
@@ -1527,24 +1462,27 @@
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
+ int error;
- ASSERT_ALWAYS(iclog);
- error = xlog_write(log, &vec, ticket, commitlsnp, iclog,
- XLOG_COMMIT_TRANS);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ return -EIO;
+
+ error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS,
+ false);
if (error)
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
return error;
}
/*
- * Push on the buffer cache code if we ever use more than 75% of the on-disk
- * log space. This code pushes on the lsn which would supposedly free up
- * the 25% which we want to leave free. We may need to adopt a policy which
- * pushes on an lsn which is further along in the log once we reach the high
- * water mark. In this manner, we would be creating a low water mark.
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
*/
-STATIC void
-xlog_grant_push_ail(
+xfs_lsn_t
+xlog_grant_push_threshold(
struct xlog *log,
int need_bytes)
{
@@ -1570,7 +1508,7 @@
free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
free_threshold = max(free_threshold, 256);
if (free_blocks >= free_threshold)
- return;
+ return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
&threshold_block);
@@ -1590,13 +1528,33 @@
if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
threshold_lsn = last_sync_lsn;
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
+ return;
+
/*
* Get the transaction layer to kick the dirty buffers out to
* disk asynchronously. No point in trying to do this if
* the filesystem is shutting down.
*/
- if (!XLOG_FORCED_SHUTDOWN(log))
- xfs_ail_push(log->l_ailp, threshold_lsn);
+ xfs_ail_push(log->l_ailp, threshold_lsn);
}
/*
@@ -1666,9 +1624,7 @@
int i;
int xheads;
- xheads = size / XLOG_HEADER_CYCLE_SIZE;
- if (size % XLOG_HEADER_CYCLE_SIZE)
- xheads++;
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
@@ -1692,7 +1648,7 @@
&iclog->ic_end_io_work);
}
-static void
+static int
xlog_map_iclog_data(
struct bio *bio,
void *data,
@@ -1703,11 +1659,14 @@
unsigned int off = offset_in_page(data);
size_t len = min_t(size_t, count, PAGE_SIZE - off);
- WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+ if (bio_add_page(bio, page, len, off) != len)
+ return -EIO;
data += len;
count -= len;
} while (count);
+
+ return 0;
}
STATIC void
@@ -1729,7 +1688,7 @@
* across the log IO to archieve that.
*/
down(&iclog->ic_sema);
- if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
+ if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
@@ -1737,25 +1696,34 @@
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
+ xlog_state_done_syncing(iclog);
up(&iclog->ic_sema);
return;
}
- iclog->ic_io_size = count;
-
bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
iclog->ic_bio.bi_end_io = xlog_bio_end_io;
iclog->ic_bio.bi_private = iclog;
- iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+
+ /*
+ * We use REQ_SYNC | REQ_IDLE here to tell the block layer there are
+ * more IOs coming immediately after this one. This prevents the block
+ * layer writeback throttle from throttling log writes behind background
+ * metadata writeback and causing priority inversions.
+ */
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC |
+ REQ_IDLE | REQ_FUA;
if (need_flush)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
- xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
+ if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ return;
+ }
if (is_vmalloc_addr(iclog->ic_data))
- flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);
+ flush_kernel_vmap_range(iclog->ic_data, count);
/*
* If this log buffer would straddle the end of the log we will have
@@ -1966,12 +1934,11 @@
log->l_mp->m_log = NULL;
destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
-} /* xlog_dealloc_log */
+}
/*
* Update counters atomically now that memcpy is done.
*/
-/* ARGSUSED */
static inline void
xlog_state_finish_copy(
struct xlog *log,
@@ -1979,16 +1946,11 @@
int record_cnt,
int copy_bytes)
{
- spin_lock(&log->l_icloglock);
+ lockdep_assert_held(&log->l_icloglock);
be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
iclog->ic_offset += copy_bytes;
-
- spin_unlock(&log->l_icloglock);
-} /* xlog_state_finish_copy */
-
-
-
+}
/*
* print out info relating to regions written which consume
@@ -2109,23 +2071,21 @@
}
/*
- * Calculate the potential space needed by the log vector. Each region gets
- * its own xlog_op_header_t and may need to be double word aligned.
+ * Calculate the potential space needed by the log vector. We may need a start
+ * record, and each region gets its own struct xlog_op_header and may need to be
+ * double word aligned.
*/
static int
xlog_write_calc_vec_length(
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector)
+ struct xfs_log_vec *log_vector,
+ bool need_start_rec)
{
struct xfs_log_vec *lv;
- int headers = 0;
+ int headers = need_start_rec ? 1 : 0;
int len = 0;
int i;
- /* acct for start rec of xact */
- if (ticket->t_flags & XLOG_TIC_INITED)
- headers++;
-
for (lv = log_vector; lv; lv = lv->lv_next) {
/* we don't write ordered log vectors */
if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
@@ -2147,27 +2107,16 @@
return len;
}
-/*
- * If first write for transaction, insert start record We can't be trying to
- * commit if we are inited. We can't have any "partial_copy" if we are inited.
- */
-static int
+static void
xlog_write_start_rec(
struct xlog_op_header *ophdr,
struct xlog_ticket *ticket)
{
- if (!(ticket->t_flags & XLOG_TIC_INITED))
- return 0;
-
ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
ophdr->oh_clientid = ticket->t_clientid;
ophdr->oh_len = 0;
ophdr->oh_flags = XLOG_START_TRANS;
ophdr->oh_res2 = 0;
-
- ticket->t_flags &= ~XLOG_TIC_INITED;
-
- return sizeof(struct xlog_op_header);
}
static xlog_op_header_t *
@@ -2265,15 +2214,18 @@
int log_offset,
struct xlog_in_core **commit_iclog)
{
+ int error;
+
if (*partial_copy) {
/*
* This iclog has already been marked WANT_SYNC by
* xlog_state_get_iclog_space.
*/
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
*record_cnt = 0;
*data_cnt = 0;
- return xlog_state_release_iclog(log, iclog);
+ goto release_iclog;
}
*partial_copy = 0;
@@ -2281,21 +2233,29 @@
if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
/* no more space in this iclog - push it. */
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
*record_cnt = 0;
*data_cnt = 0;
- spin_lock(&log->l_icloglock);
- xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
-
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, iclog, 0);
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
if (!commit_iclog)
- return xlog_state_release_iclog(log, iclog);
+ goto release_iclog;
+ spin_unlock(&log->l_icloglock);
ASSERT(flags & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
}
return 0;
+
+release_iclog:
+ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ return error;
}
/*
@@ -2345,39 +2305,28 @@
struct xlog_ticket *ticket,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
- uint flags)
+ uint flags,
+ bool need_start_rec)
{
struct xlog_in_core *iclog = NULL;
- struct xfs_log_iovec *vecp;
- struct xfs_log_vec *lv;
+ struct xfs_log_vec *lv = log_vector;
+ struct xfs_log_iovec *vecp = lv->lv_iovecp;
+ int index = 0;
int len;
- int index;
int partial_copy = 0;
int partial_copy_len = 0;
int contwr = 0;
int record_cnt = 0;
int data_cnt = 0;
- int error;
-
- *start_lsn = 0;
-
- len = xlog_write_calc_vec_length(ticket, log_vector);
+ int error = 0;
/*
- * Region headers and bytes are already accounted for.
- * We only need to take into account start records and
- * split regions in this function.
+ * If this is a commit or unmount transaction, we don't need a start
+ * record to be written. We do, however, have to account for the
+ * commit or unmount header that gets written. Hence we always have
+ * to account for an extra xlog_op_header here.
*/
- if (ticket->t_flags & XLOG_TIC_INITED)
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
- /*
- * Commit record headers need to be accounted for. These
- * come in as separate writes so are easy to detect.
- */
- if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
- ticket->t_curr_res -= sizeof(xlog_op_header_t);
-
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2385,9 +2334,8 @@
xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
}
- index = 0;
- lv = log_vector;
- vecp = lv->lv_iovecp;
+ len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec);
+ *start_lsn = 0;
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
void *ptr;
int log_offset;
@@ -2411,7 +2359,6 @@
while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
struct xfs_log_iovec *reg;
struct xlog_op_header *ophdr;
- int start_rec_copy;
int copy_len;
int copy_off;
bool ordered = false;
@@ -2427,11 +2374,15 @@
ASSERT(reg->i_len % sizeof(int32_t) == 0);
ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
- start_rec_copy = xlog_write_start_rec(ptr, ticket);
- if (start_rec_copy) {
- record_cnt++;
+ /*
+ * Before we start formatting log vectors, we need to
+ * write a start record. Only do this for the first
+ * iclog we write to.
+ */
+ if (need_start_rec) {
+ xlog_write_start_rec(ptr, ticket);
xlog_write_adv_cnt(&ptr, &len, &log_offset,
- start_rec_copy);
+ sizeof(struct xlog_op_header));
}
ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
@@ -2463,8 +2414,13 @@
xlog_write_adv_cnt(&ptr, &len, &log_offset,
copy_len);
}
- copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+ copy_len += sizeof(struct xlog_op_header);
record_cnt++;
+ if (need_start_rec) {
+ copy_len += sizeof(struct xlog_op_header);
+ record_cnt++;
+ need_start_rec = false;
+ }
data_cnt += contwr ? copy_len : 0;
error = xlog_write_copy_finish(log, iclog, flags,
@@ -2508,128 +2464,119 @@
ASSERT(len == 0);
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (!commit_iclog)
- return xlog_state_release_iclog(log, iclog);
+ if (commit_iclog) {
+ ASSERT(flags & XLOG_COMMIT_TRANS);
+ *commit_iclog = iclog;
+ } else {
+ error = xlog_state_release_iclog(log, iclog);
+ }
+ spin_unlock(&log->l_icloglock);
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- return 0;
+ return error;
}
+static void
+xlog_state_activate_iclog(
+ struct xlog_in_core *iclog,
+ int *iclogs_changed)
+{
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
-/*****************************************************************************
- *
- * State Machine functions
- *
- *****************************************************************************
- */
+ /*
+ * If the number of ops in this iclog indicates it just contains the
+ * dummy transaction, we can change state into IDLE (the second time
+ * around). Otherwise we should change the state into NEED a dummy.
+ * We don't need to cover the dummy.
+ */
+ if (*iclogs_changed == 0 &&
+ iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
+ *iclogs_changed = 1;
+ } else {
+ /*
+ * We have two dirty iclogs so start over. This could also be
+ * num of ops indicating this is not the dummy going out.
+ */
+ *iclogs_changed = 2;
+ }
+
+ iclog->ic_state = XLOG_STATE_ACTIVE;
+ iclog->ic_offset = 0;
+ iclog->ic_header.h_num_logops = 0;
+ memset(iclog->ic_header.h_cycle_data, 0,
+ sizeof(iclog->ic_header.h_cycle_data));
+ iclog->ic_header.h_lsn = 0;
+}
/*
- * An iclog has just finished IO completion processing, so we need to update
- * the iclog state and propagate that up into the overall log state. Hence we
- * prepare the iclog for cleaning, and then clean all the pending dirty iclogs
- * starting from the head, and then wake up any threads that are waiting for the
- * iclog to be marked clean.
- *
- * The ordering of marking iclogs ACTIVE must be maintained, so an iclog
- * doesn't become ACTIVE beyond one that is SYNCING. This is also required to
- * maintain the notion that we use a ordered wait queue to hold off would be
- * writers to the log when every iclog is trying to sync to disk.
- *
- * Caller must hold the icloglock before calling us.
- *
- * State Change: !IOERROR -> DIRTY -> ACTIVE
+ * Loop through all iclogs and mark all iclogs currently marked DIRTY as
+ * ACTIVE after iclog I/O has completed.
*/
+static void
+xlog_state_activate_iclogs(
+ struct xlog *log,
+ int *iclogs_changed)
+{
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ do {
+ if (iclog->ic_state == XLOG_STATE_DIRTY)
+ xlog_state_activate_iclog(iclog, iclogs_changed);
+ /*
+ * The ordering of marking iclogs ACTIVE must be maintained, so
+ * an iclog doesn't become ACTIVE beyond one that is SYNCING.
+ */
+ else if (iclog->ic_state != XLOG_STATE_ACTIVE)
+ break;
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+}
+
+static int
+xlog_covered_state(
+ int prev_state,
+ int iclogs_changed)
+{
+ /*
+ * We usually go to NEED. But we go to NEED2 if iclogs_changed indicates
+ * we are done writing the dummy record. If we are done with the second
+ * dummy record (DONE2), then we go to IDLE.
+ */
+ switch (prev_state) {
+ case XLOG_STATE_COVER_IDLE:
+ case XLOG_STATE_COVER_NEED:
+ case XLOG_STATE_COVER_NEED2:
+ break;
+ case XLOG_STATE_COVER_DONE:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_NEED2;
+ break;
+ case XLOG_STATE_COVER_DONE2:
+ if (iclogs_changed == 1)
+ return XLOG_STATE_COVER_IDLE;
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ return XLOG_STATE_COVER_NEED;
+}
+
STATIC void
xlog_state_clean_iclog(
struct xlog *log,
struct xlog_in_core *dirty_iclog)
{
- struct xlog_in_core *iclog;
- int changed = 0;
+ int iclogs_changed = 0;
- /* Prepare the completed iclog. */
- if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
- dirty_iclog->ic_state = XLOG_STATE_DIRTY;
+ dirty_iclog->ic_state = XLOG_STATE_DIRTY;
- /* Walk all the iclogs to update the ordered active state. */
- iclog = log->l_iclog;
- do {
- if (iclog->ic_state == XLOG_STATE_DIRTY) {
- iclog->ic_state = XLOG_STATE_ACTIVE;
- iclog->ic_offset = 0;
- ASSERT(list_empty_careful(&iclog->ic_callbacks));
- /*
- * If the number of ops in this iclog indicate it just
- * contains the dummy transaction, we can
- * change state into IDLE (the second time around).
- * Otherwise we should change the state into
- * NEED a dummy.
- * We don't need to cover the dummy.
- */
- if (!changed &&
- (be32_to_cpu(iclog->ic_header.h_num_logops) ==
- XLOG_COVER_OPS)) {
- changed = 1;
- } else {
- /*
- * We have two dirty iclogs so start over
- * This could also be num of ops indicates
- * this is not the dummy going out.
- */
- changed = 2;
- }
- iclog->ic_header.h_num_logops = 0;
- memset(iclog->ic_header.h_cycle_data, 0,
- sizeof(iclog->ic_header.h_cycle_data));
- iclog->ic_header.h_lsn = 0;
- } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
- /* do nothing */;
- else
- break; /* stop cleaning */
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
-
-
- /*
- * Wake up threads waiting in xfs_log_force() for the dirty iclog
- * to be cleaned.
- */
+ xlog_state_activate_iclogs(log, &iclogs_changed);
wake_up_all(&dirty_iclog->ic_force_wait);
- /*
- * Change state for the dummy log recording.
- * We usually go to NEED. But we go to NEED2 if the changed indicates
- * we are done writing the dummy record.
- * If we are done with the second dummy recored (DONE2), then
- * we go to IDLE.
- */
- if (changed) {
- switch (log->l_covered_state) {
- case XLOG_STATE_COVER_IDLE:
- case XLOG_STATE_COVER_NEED:
- case XLOG_STATE_COVER_NEED2:
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
-
- case XLOG_STATE_COVER_DONE:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_NEED2;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
-
- case XLOG_STATE_COVER_DONE2:
- if (changed == 1)
- log->l_covered_state = XLOG_STATE_COVER_IDLE;
- else
- log->l_covered_state = XLOG_STATE_COVER_NEED;
- break;
-
- default:
- ASSERT(0);
- }
+ if (iclogs_changed) {
+ log->l_covered_state = xlog_covered_state(log->l_covered_state,
+ iclogs_changed);
}
}
@@ -2641,7 +2588,8 @@
xfs_lsn_t lowest_lsn = 0, lsn;
do {
- if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
continue;
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
@@ -2701,61 +2649,48 @@
xlog_state_iodone_process_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- struct xlog_in_core *completed_iclog,
bool *ioerror)
{
xfs_lsn_t lowest_lsn;
xfs_lsn_t header_lsn;
- /* Skip all iclogs in the ACTIVE & DIRTY states */
- if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
+ case XLOG_STATE_DIRTY:
+ /*
+ * Skip all iclogs in the ACTIVE & DIRTY states:
+ */
return false;
-
- /*
- * Between marking a filesystem SHUTDOWN and stopping the log, we do
- * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
- * want things to go smoothly in case of just a SHUTDOWN w/o a
- * LOG_IO_ERROR.
- */
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ case XLOG_STATE_IOERROR:
+ /*
+ * Between marking a filesystem SHUTDOWN and stopping the log,
+ * we do flush all iclogs to disk (if there wasn't a log I/O
+ * error). So, we do want things to go smoothly in case of just
+ * a SHUTDOWN w/o a LOG_IO_ERROR.
+ */
*ioerror = true;
return false;
- }
-
- /*
- * Can only perform callbacks in order. Since this iclog is not in the
- * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
- * up. If we set our iclog to DO_CALLBACK, we will not process it when
- * we retry since a previous iclog is in the CALLBACK and the state
- * cannot change since we are holding the l_icloglock.
- */
- if (!(iclog->ic_state &
- (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
- if (completed_iclog &&
- (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
- completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
- }
+ case XLOG_STATE_DONE_SYNC:
+ /*
+ * Now that we have an iclog that is in the DONE_SYNC state, do
+ * one more check here to see if we have chased our tail around.
+ * If this is not the lowest lsn iclog, then we will leave it
+ * for another completion to process.
+ */
+ header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+ return false;
+ xlog_state_set_callback(log, iclog, header_lsn);
+ return false;
+ default:
+ /*
+ * Can only perform callbacks in order. Since this iclog is not
+ * in the DONE_SYNC state, we skip the rest and just try to
+ * clean up.
+ */
return true;
}
-
- /*
- * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
- * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
- * by the above if and are going to clean (i.e. we aren't doing their
- * callbacks) see the above if.
- *
- * We will do one more check here to see if we have chased our tail
- * around. If this is not the lowest lsn iclog, then we will leave it
- * for another completion to process.
- */
- header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- lowest_lsn = xlog_get_lowest_lsn(log);
- if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
- return false;
-
- xlog_state_set_callback(log, iclog, header_lsn);
- return false;
-
}
/*
@@ -2770,8 +2705,9 @@
static void
xlog_state_do_iclog_callbacks(
struct xlog *log,
- struct xlog_in_core *iclog,
- bool aborted)
+ struct xlog_in_core *iclog)
+ __releases(&log->l_icloglock)
+ __acquires(&log->l_icloglock)
{
spin_unlock(&log->l_icloglock);
spin_lock(&iclog->ic_callback_lock);
@@ -2781,7 +2717,7 @@
list_splice_init(&iclog->ic_callbacks, &tmp);
spin_unlock(&iclog->ic_callback_lock);
- xlog_cil_process_committed(&tmp, aborted);
+ xlog_cil_process_committed(&tmp);
spin_lock(&iclog->ic_callback_lock);
}
@@ -2794,57 +2730,12 @@
spin_unlock(&iclog->ic_callback_lock);
}
-#ifdef DEBUG
-/*
- * Make one last gasp attempt to see if iclogs are being left in limbo. If the
- * above loop finds an iclog earlier than the current iclog and in one of the
- * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
- * are deferred to the completion of the earlier iclog. Walk the iclogs in order
- * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
- * one of the syncing states.
- *
- * Note that SYNCING|IOERROR is a valid state so we cannot just check for
- * ic_state == SYNCING.
- */
-static void
-xlog_state_callback_check_state(
- struct xlog *log)
-{
- struct xlog_in_core *first_iclog = log->l_iclog;
- struct xlog_in_core *iclog = first_iclog;
-
- do {
- ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
- /*
- * Terminate the loop if iclogs are found in states
- * which will cause other threads to clean up iclogs.
- *
- * SYNCING - i/o completion will go through logs
- * DONE_SYNC - interrupt thread should be waiting for
- * l_icloglock
- * IOERROR - give up hope all ye who enter here
- */
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state & XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_DONE_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR )
- break;
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
-}
-#else
-#define xlog_state_callback_check_state(l) ((void)0)
-#endif
-
STATIC void
xlog_state_do_callback(
- struct xlog *log,
- bool aborted,
- struct xlog_in_core *ciclog)
+ struct xlog *log)
{
struct xlog_in_core *iclog;
struct xlog_in_core *first_iclog;
- bool did_callbacks = false;
bool cycled_icloglock;
bool ioerror;
int flushcnt = 0;
@@ -2868,11 +2759,11 @@
do {
if (xlog_state_iodone_process_iclog(log, iclog,
- ciclog, &ioerror))
+ &ioerror))
break;
- if (!(iclog->ic_state &
- (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
+ if (iclog->ic_state != XLOG_STATE_CALLBACK &&
+ iclog->ic_state != XLOG_STATE_IOERROR) {
iclog = iclog->ic_next;
continue;
}
@@ -2882,14 +2773,14 @@
* we'll have to run at least one more complete loop.
*/
cycled_icloglock = true;
- xlog_state_do_iclog_callbacks(log, iclog, aborted);
-
- xlog_state_clean_iclog(log, iclog);
+ xlog_state_do_iclog_callbacks(log, iclog);
+ if (XLOG_FORCED_SHUTDOWN(log))
+ wake_up_all(&iclog->ic_force_wait);
+ else
+ xlog_state_clean_iclog(log, iclog);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
- did_callbacks |= cycled_icloglock;
-
if (repeats > 5000) {
flushcnt += repeats;
repeats = 0;
@@ -2899,10 +2790,8 @@
}
} while (!ioerror && cycled_icloglock);
- if (did_callbacks)
- xlog_state_callback_check_state(log);
-
- if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+ if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE ||
+ log->l_iclog->ic_state == XLOG_STATE_IOERROR)
wake_up_all(&log->l_flush_wait);
spin_unlock(&log->l_icloglock);
@@ -2924,25 +2813,22 @@
*/
STATIC void
xlog_state_done_syncing(
- struct xlog_in_core *iclog,
- bool aborted)
+ struct xlog_in_core *iclog)
{
struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
-
- ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_IOERROR);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/*
* If we got an error, either on the first buffer, or in the case of
- * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
- * and none should ever be attempted to be written to disk
- * again.
+ * split log writes, on the second, we shut down the file system and
+ * no iclogs should ever be attempted to be written to disk again.
*/
- if (iclog->ic_state != XLOG_STATE_IOERROR)
+ if (!XLOG_FORCED_SHUTDOWN(log)) {
+ ASSERT(iclog->ic_state == XLOG_STATE_SYNCING);
iclog->ic_state = XLOG_STATE_DONE_SYNC;
+ }
/*
* Someone could be sleeping prior to writing out the next
@@ -2951,9 +2837,8 @@
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
-} /* xlog_state_done_syncing */
-
+ xlog_state_do_callback(log);
+}
/*
* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
@@ -2985,7 +2870,6 @@
int log_offset;
xlog_rec_header_t *head;
xlog_in_core_t *iclog;
- int error;
restart:
spin_lock(&log->l_icloglock);
@@ -3034,24 +2918,22 @@
* can fit into remaining data section.
*/
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ int error = 0;
+
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
/*
- * If I'm the only one writing to this iclog, sync it to disk.
- * We need to do an atomic compare and decrement here to avoid
- * racing with concurrent atomic_dec_and_lock() calls in
+ * If we are the only one writing to this iclog, sync it to
+ * disk. We need to do an atomic compare and decrement here to
+ * avoid racing with concurrent atomic_dec_and_lock() calls in
* xlog_state_release_iclog() when there is more than one
* reference to the iclog.
*/
- if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
- /* we are the only one */
- spin_unlock(&log->l_icloglock);
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
error = xlog_state_release_iclog(log, iclog);
- if (error)
- return error;
- } else {
- spin_unlock(&log->l_icloglock);
- }
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
goto restart;
}
@@ -3075,21 +2957,21 @@
*logoffsetp = log_offset;
return 0;
-} /* xlog_state_get_iclog_space */
+}
-/* The first cnt-1 times through here we don't need to
- * move the grant write head because the permanent
- * reservation has reserved cnt times the unit amount.
- * Release part of current permanent unit reservation and
- * reset current reservation to be one units worth. Also
- * move grant reservation head forward.
+/*
+ * The first cnt-1 times a ticket goes through here we don't need to move the
+ * grant write head because the permanent reservation has reserved cnt times the
+ * unit amount. Release part of current permanent unit reservation and reset
+ * current reservation to be one unit's worth. Also move grant reservation head
+ * forward.
*/
-STATIC void
-xlog_regrant_reserve_log_space(
+void
+xfs_log_ticket_regrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- trace_xfs_log_regrant_reserve_enter(log, ticket);
+ trace_xfs_log_ticket_regrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
@@ -3101,21 +2983,20 @@
ticket->t_curr_res = ticket->t_unit_res;
xlog_tic_reset_res(ticket);
- trace_xfs_log_regrant_reserve_sub(log, ticket);
+ trace_xfs_log_ticket_regrant_sub(log, ticket);
/* just return if we still have some of the pre-reserved space */
- if (ticket->t_cnt > 0)
- return;
+ if (!ticket->t_cnt) {
+ xlog_grant_add_space(log, &log->l_reserve_head.grant,
+ ticket->t_unit_res);
+ trace_xfs_log_ticket_regrant_exit(log, ticket);
- xlog_grant_add_space(log, &log->l_reserve_head.grant,
- ticket->t_unit_res);
+ ticket->t_curr_res = ticket->t_unit_res;
+ xlog_tic_reset_res(ticket);
+ }
- trace_xfs_log_regrant_reserve_exit(log, ticket);
-
- ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
-} /* xlog_regrant_reserve_log_space */
-
+ xfs_log_ticket_put(ticket);
+}
/*
* Give back the space left from a reservation.
@@ -3131,18 +3012,19 @@
* space, the count will stay at zero and the only space remaining will be
* in the current reservation field.
*/
-STATIC void
-xlog_ungrant_log_space(
+void
+xfs_log_ticket_ungrant(
struct xlog *log,
struct xlog_ticket *ticket)
{
- int bytes;
+ int bytes;
+
+ trace_xfs_log_ticket_ungrant(log, ticket);
if (ticket->t_cnt > 0)
ticket->t_cnt--;
- trace_xfs_log_ungrant_enter(log, ticket);
- trace_xfs_log_ungrant_sub(log, ticket);
+ trace_xfs_log_ticket_ungrant_sub(log, ticket);
/*
* If this is a permanent reservation ticket, we may be able to free
@@ -3157,71 +3039,15 @@
xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
- trace_xfs_log_ungrant_exit(log, ticket);
+ trace_xfs_log_ticket_ungrant_exit(log, ticket);
xfs_log_space_wake(log->l_mp);
+ xfs_log_ticket_put(ticket);
}
/*
- * Flush iclog to disk if this is the last reference to the given iclog and
- * the WANT_SYNC bit is set.
- *
- * When this function is entered, the iclog is not necessarily in the
- * WANT_SYNC state. It may be sitting around waiting to get filled.
- *
- *
- */
-STATIC int
-xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- int sync = 0; /* do we sync? */
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
-
- ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
- if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
- return 0;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
- }
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_WANT_SYNC);
-
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
- /* update tail before writing to iclog */
- xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
- sync++;
- iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
- xlog_verify_tail_lsn(log, iclog, tail_lsn);
- /* cycle incremented when incrementing curr_block */
- }
- spin_unlock(&log->l_icloglock);
-
- /*
- * We let the log lock go, so it's possible that we hit a log I/O
- * error or some other SHUTDOWN condition that marks the iclog
- * as XLOG_STATE_IOERROR before the bwrite. However, we know that
- * this iclog has consistent data, so we ignore IOERROR
- * flags after this point.
- */
- if (sync)
- xlog_sync(log, iclog);
- return 0;
-} /* xlog_state_release_iclog */
-
-
-/*
- * This routine will mark the current iclog in the ring as WANT_SYNC
- * and move the current iclog pointer to the next iclog in the ring.
- * When this routine is called from xlog_state_get_iclog_space(), the
- * exact size of the iclog has not yet been determined. All we know is
- * that every data block. We have run out of space in this log record.
+ * This routine will mark the current iclog in the ring as WANT_SYNC and move
+ * the current iclog pointer to the next iclog in the ring.
*/
STATIC void
xlog_state_switch_iclogs(
@@ -3230,6 +3056,8 @@
int eventual_size)
{
ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ assert_spin_locked(&log->l_icloglock);
+
if (!eventual_size)
eventual_size = iclog->ic_offset;
iclog->ic_state = XLOG_STATE_WANT_SYNC;
@@ -3264,7 +3092,7 @@
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
-} /* xlog_state_switch_iclogs */
+}
/*
* Write out all data in the in-core log as of this exact moment in time.
@@ -3309,7 +3137,7 @@
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
if (iclog->ic_state == XLOG_STATE_DIRTY ||
@@ -3324,9 +3152,6 @@
* previous iclog and go to sleep.
*/
iclog = iclog->ic_prev;
- if (iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
} else if (iclog->ic_state == XLOG_STATE_ACTIVE) {
if (atomic_read(&iclog->ic_refcnt) == 0) {
/*
@@ -3339,14 +3164,10 @@
atomic_inc(&iclog->ic_refcnt);
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
-
if (xlog_state_release_iclog(log, iclog))
- return -EIO;
+ goto out_error;
- spin_lock(&log->l_icloglock);
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn ||
- iclog->ic_state == XLOG_STATE_DIRTY)
+ if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
goto out_unlock;
} else {
/*
@@ -3366,17 +3187,8 @@
;
}
- if (!(flags & XFS_LOG_SYNC))
- goto out_unlock;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- goto out_error;
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3398,7 +3210,7 @@
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
@@ -3407,9 +3219,6 @@
goto out_unlock;
}
- if (iclog->ic_state == XLOG_STATE_DIRTY)
- goto out_unlock;
-
if (iclog->ic_state == XLOG_STATE_ACTIVE) {
/*
* We sleep here if we haven't already slept (e.g. this is the
@@ -3427,10 +3236,8 @@
* will go out then.
*/
if (!already_slept &&
- (iclog->ic_prev->ic_state &
- (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
- ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
-
+ (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
@@ -3439,27 +3246,14 @@
}
atomic_inc(&iclog->ic_refcnt);
xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
if (xlog_state_release_iclog(log, iclog))
- return -EIO;
+ goto out_error;
if (log_flushed)
*log_flushed = 1;
- spin_lock(&log->l_icloglock);
}
- if (!(flags & XFS_LOG_SYNC) ||
- (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)))
- goto out_unlock;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- goto out_error;
-
- XFS_STATS_INC(mp, xs_log_force_sleep);
- xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
- return 0;
-
+ if (flags & XFS_LOG_SYNC)
+ return xlog_wait_on_iclog(iclog);
out_unlock:
spin_unlock(&log->l_icloglock);
return 0;
@@ -3506,33 +3300,6 @@
}
/*
- * Called when we want to mark the current iclog as being ready to sync to
- * disk.
- */
-STATIC void
-xlog_state_want_sync(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- assert_spin_locked(&log->l_icloglock);
-
- if (iclog->ic_state == XLOG_STATE_ACTIVE) {
- xlog_state_switch_iclogs(log, iclog, 0);
- } else {
- ASSERT(iclog->ic_state &
- (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
- }
-}
-
-
-/*****************************************************************************
- *
- * TICKET functions
- *
- *****************************************************************************
- */
-
-/*
* Free a used ticket when its refcount falls to zero.
*/
void
@@ -3541,7 +3308,7 @@
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_zone_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_zone, ticket);
}
xlog_ticket_t *
@@ -3659,15 +3426,12 @@
int unit_bytes,
int cnt,
char client,
- bool permanent,
- xfs_km_flags_t alloc_flags)
+ bool permanent)
{
struct xlog_ticket *tic;
int unit_res;
- tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
- if (!tic)
- return NULL;
+ tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL);
unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
@@ -3680,7 +3444,6 @@
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
tic->t_clientid = client;
- tic->t_flags = XLOG_TIC_INITED;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
@@ -3689,13 +3452,6 @@
return tic;
}
-
-/******************************************************************************
- *
- * Log debug routines
- *
- ******************************************************************************
- */
#if defined(DEBUG)
/*
* Make sure that the destination ptr is within the valid data region of
@@ -3781,7 +3537,7 @@
if (blocks < BTOBB(iclog->ic_offset) + 1)
xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
}
-} /* xlog_verify_tail_lsn */
+}
/*
* Perform a number of checks on the iclog before writing to disk.
@@ -3884,7 +3640,7 @@
}
ptr += sizeof(xlog_op_header_t) + op_len;
}
-} /* xlog_verify_iclog */
+}
#endif
/*
@@ -3897,7 +3653,7 @@
xlog_in_core_t *iclog, *ic;
iclog = log->l_iclog;
- if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+ if (iclog->ic_state != XLOG_STATE_IOERROR) {
/*
* Mark all the incore logs IOERROR.
* From now on, no log flushes will result.
@@ -3957,7 +3713,7 @@
* Somebody could've already done the hard work for us.
* No need to get locks for this.
*/
- if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+ if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
ASSERT(XLOG_FORCED_SHUTDOWN(log));
return 1;
}
@@ -4008,21 +3764,8 @@
spin_lock(&log->l_cilp->xc_push_lock);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
- xlog_state_do_callback(log, true, NULL);
+ xlog_state_do_callback(log);
-#ifdef XFSERRORDEBUG
- {
- xlog_in_core_t *iclog;
-
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- do {
- ASSERT(iclog->ic_callback == 0);
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
- spin_unlock(&log->l_icloglock);
- }
-#endif
/* return non-zero if log IOERROR transition had already happened */
return retval;
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e0680..58c3fcb 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -105,10 +105,6 @@
struct xfs_item_ops;
struct xfs_trans;
-xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
- struct xlog_ticket *ticket,
- struct xlog_in_core **iclog,
- bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
int *log_forced);
@@ -121,8 +117,7 @@
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_release_iclog(struct xfs_mount *mp,
- struct xlog_in_core *iclog);
+void xfs_log_release_iclog(struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
int length,
int count,
@@ -138,7 +133,7 @@
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, bool regrant);
-void xlog_cil_process_committed(struct list_head *list, bool aborted);
+void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
@@ -146,4 +141,6 @@
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
bool xfs_log_in_recovery(struct xfs_mount *);
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ef652ab..b0ef071 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,8 +37,7 @@
{
struct xlog_ticket *tic;
- tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
- KM_NOFS);
+ tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
/*
* set the current reservation to zero so we know to steal the basic
@@ -179,7 +178,7 @@
/*
* We free and allocate here as a realloc would copy
- * unecessary data. We don't use kmem_zalloc() for the
+ * unnecessary data. We don't use kmem_zalloc() for the
* same reason - we don't need to zero the data area in
* the buffer, only the log vector header and the iovec
* storage.
@@ -240,7 +239,7 @@
* this CIL context and so we need to pin it. If we are replacing the
* old_lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
- * shadow buffer, so update the the pointer to it appropriately.
+ * shadow buffer, so update the pointer to it appropriately.
*/
if (!old_lv) {
if (lv->lv_item->li_ops->iop_pin)
@@ -574,10 +573,10 @@
*/
static void
xlog_cil_committed(
- struct xfs_cil_ctx *ctx,
- bool abort)
+ struct xfs_cil_ctx *ctx)
{
struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+ bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log);
/*
* If the I/O failed, we're aborting the commit and already shutdown.
@@ -613,37 +612,38 @@
void
xlog_cil_process_committed(
- struct list_head *list,
- bool aborted)
+ struct list_head *list)
{
struct xfs_cil_ctx *ctx;
while ((ctx = list_first_entry_or_null(list,
struct xfs_cil_ctx, iclog_entry))) {
list_del(&ctx->iclog_entry);
- xlog_cil_committed(ctx, aborted);
+ xlog_cil_committed(ctx);
}
}
/*
- * Push the Committed Item List to the log. If @push_seq flag is zero, then it
- * is a background flush and so we can chose to ignore it. Otherwise, if the
- * current sequence is the same as @push_seq we need to do a flush. If
- * @push_seq is less than the current sequence, then it has already been
+ * Push the Committed Item List to the log.
+ *
+ * If the current sequence is the same as xc_push_seq we need to do a flush. If
+ * xc_push_seq is less than the current sequence, then it has already been
* flushed and we don't need to do anything - the caller will wait for it to
* complete if necessary.
*
- * @push_seq is a value rather than a flag because that allows us to do an
- * unlocked check of the sequence number for a match. Hence we can allows log
- * forces to run racily and not issue pushes for the same sequence twice. If we
- * get a race between multiple pushes for the same sequence they will block on
- * the first one and then abort, hence avoiding needless pushes.
+ * xc_push_seq is checked unlocked against the sequence number for a match.
+ * Hence we can allow log forces to run racily and not issue pushes for the
+ * same sequence twice. If we get a race between multiple pushes for the same
+ * sequence they will block on the first one and then abort, hence avoiding
+ * needless pushes.
*/
-STATIC int
-xlog_cil_push(
- struct xlog *log)
+static void
+xlog_cil_push_work(
+ struct work_struct *work)
{
- struct xfs_cil *cil = log->l_cilp;
+ struct xfs_cil *cil =
+ container_of(work, struct xfs_cil, xc_push_work);
+ struct xlog *log = cil->xc_log;
struct xfs_log_vec *lv;
struct xfs_cil_ctx *ctx;
struct xfs_cil_ctx *new_ctx;
@@ -657,9 +657,6 @@
xfs_lsn_t commit_lsn;
xfs_lsn_t push_seq;
- if (!cil)
- return 0;
-
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -671,6 +668,12 @@
ASSERT(push_seq <= ctx->sequence);
/*
+ * Wake up any background push waiters now this context is being pushed.
+ */
+ if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ wake_up_all(&cil->xc_push_wait);
+
+ /*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
@@ -682,7 +685,7 @@
}
- /* check for a previously pushed seqeunce */
+ /* check for a previously pushed sequence */
if (push_seq < cil->xc_ctx->sequence) {
spin_unlock(&cil->xc_push_lock);
goto out_skip;
@@ -740,7 +743,7 @@
/*
* initialise the new context and attach it to the CIL. Then attach
- * the current context to the CIL committing lsit so it can be found
+ * the current context to the CIL committing list so it can be found
* during log forces to extract the commit lsn of the sequence that
* needs to be forced.
*/
@@ -803,7 +806,7 @@
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
- error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+ error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
if (error)
goto out_abort_free_ticket;
@@ -841,13 +844,14 @@
}
spin_unlock(&cil->xc_push_lock);
- /* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
- if (commit_lsn == -1)
- goto out_abort;
+ error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
+ if (error)
+ goto out_abort_free_ticket;
+
+ xfs_log_ticket_ungrant(log, tic);
spin_lock(&commit_iclog->ic_callback_lock);
- if (commit_iclog->ic_state & XLOG_STATE_IOERROR) {
+ if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
spin_unlock(&commit_iclog->ic_callback_lock);
goto out_abort;
}
@@ -867,28 +871,20 @@
spin_unlock(&cil->xc_push_lock);
/* release the hounds! */
- return xfs_log_release_iclog(log->l_mp, commit_iclog);
+ xfs_log_release_iclog(commit_iclog);
+ return;
out_skip:
up_write(&cil->xc_ctx_lock);
xfs_log_ticket_put(new_ctx->ticket);
kmem_free(new_ctx);
- return 0;
+ return;
out_abort_free_ticket:
- xfs_log_ticket_put(tic);
+ xfs_log_ticket_ungrant(log, tic);
out_abort:
- xlog_cil_committed(ctx, true);
- return -EIO;
-}
-
-static void
-xlog_cil_push_work(
- struct work_struct *work)
-{
- struct xfs_cil *cil = container_of(work, struct xfs_cil,
- xc_push_work);
- xlog_cil_push(cil->xc_log);
+ ASSERT(XLOG_FORCED_SHUTDOWN(log));
+ xlog_cil_committed(ctx);
}
/*
@@ -900,7 +896,7 @@
*/
static void
xlog_cil_push_background(
- struct xlog *log)
+ struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
@@ -914,14 +910,36 @@
* don't do a background push if we haven't used up all the
* space available yet.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ up_read(&cil->xc_ctx_lock);
return;
+ }
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun.
+ */
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
+ return;
+ }
+
spin_unlock(&cil->xc_push_lock);
}
@@ -1017,7 +1035,10 @@
if (commit_lsn)
*commit_lsn = xc_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ if (regrant && !XLOG_FORCED_SHUTDOWN(log))
+ xfs_log_ticket_regrant(log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
@@ -1038,9 +1059,9 @@
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
- xlog_cil_push_background(log);
- up_read(&cil->xc_ctx_lock);
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+ xlog_cil_push_background(log);
}
/*
@@ -1194,6 +1215,7 @@
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
+ init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_commit_wait);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b880c23..1c6fdbf 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -40,26 +40,22 @@
/*
* In core log state
*/
-#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
-#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
-#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
-#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
-#define XLOG_STATE_DO_CALLBACK \
- 0x0010 /* Process callback functions */
-#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
-#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
-#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
-#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
+enum xlog_iclog_state {
+ XLOG_STATE_ACTIVE, /* Current IC log being written to */
+ XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */
+ XLOG_STATE_SYNCING, /* This IC log is syncing */
+ XLOG_STATE_DONE_SYNC, /* Done syncing to disk */
+ XLOG_STATE_CALLBACK, /* Callback functions now */
+ XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */
+ XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */
+};
/*
- * Flags to log ticket
+ * Log ticket flags
*/
-#define XLOG_TIC_INITED 0x1 /* has been initialized */
-#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
#define XLOG_TIC_FLAGS \
- { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
@@ -179,8 +175,6 @@
* - ic_next is the pointer to the next iclog in the ring.
* - ic_log is a pointer back to the global log structure.
* - ic_size is the full size of the log buffer, minus the cycle headers.
- * - ic_io_size is the size of the currently pending log buffer write, which
- * might be smaller than ic_size
* - ic_offset is the current number of bytes written to in this iclog.
* - ic_refcnt is bumped when someone is writing to the log.
* - ic_state is the state of the iclog.
@@ -205,9 +199,8 @@
struct xlog_in_core *ic_prev;
struct xlog *ic_log;
u32 ic_size;
- u32 ic_io_size;
u32 ic_offset;
- unsigned short ic_state;
+ enum xlog_iclog_state ic_state;
char *ic_datap; /* pointer to iclog data */
/* Callback structures need their own cacheline */
@@ -280,6 +273,7 @@
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
struct work_struct xc_push_work;
+ wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
/*
@@ -323,13 +317,53 @@
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yield the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
*/
-#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
@@ -399,8 +433,6 @@
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
void *l_iclog_bak[XLOG_MAX_ICLOGS];
- /* log record crc error injection factor */
- uint32_t l_badcrc_factor;
#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
@@ -409,7 +441,8 @@
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE))
-#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
+#define XLOG_FORCED_SHUTDOWN(log) \
+ (unlikely((log)->l_flags & XLOG_IO_ERROR))
/* common routines */
extern int
@@ -431,9 +464,7 @@
int unit_bytes,
int count,
char client,
- bool permanent,
- xfs_km_flags_t alloc_flags);
-
+ bool permanent);
static inline void
xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -445,14 +476,14 @@
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
-int
-xlog_write(
- struct xlog *log,
- struct xfs_log_vec *log_vector,
- struct xlog_ticket *tic,
- xfs_lsn_t *start_lsn,
- struct xlog_in_core **commit_iclog,
- uint flags);
+int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
+ struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+ struct xlog_in_core **commit_iclog, uint flags,
+ bool need_start_rec);
+int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
+ struct xlog_in_core **iclog, xfs_lsn_t *lsn);
+void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
+void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
@@ -532,17 +563,15 @@
}
/*
- * Unmount record type is used as a pseudo transaction type for the ticket.
- * It's value must be outside the range of XFS_TRANS_* values.
- */
-#define XLOG_UNMOUNT_REC_TYPE (-1U)
-
-/*
* Wrapper function for waiting on a wait queue serialised against wakeups
* by a spinlock. This matches the semantics of all the wait queues used in the
* log code.
*/
-static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+static inline void
+xlog_wait(
+ struct wait_queue_head *wq,
+ struct spinlock *lock)
+ __releases(lock)
{
DECLARE_WAITQUEUE(wait, current);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c1a514f..87886b7 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -18,21 +18,13 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
-#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_bmap_btree.h"
#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_rmap_item.h"
#include "xfs_buf_item.h"
-#include "xfs_refcount_item.h"
-#include "xfs_bmap_item.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -56,17 +48,6 @@
struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
/*
- * This structure is used during recovery to record the buf log items which
- * have been canceled and should not be replayed.
- */
-struct xfs_buf_cancel {
- xfs_daddr_t bc_blkno;
- uint bc_len;
- int bc_refcount;
- struct list_head bc_list;
-};
-
-/*
* Sector aligned buffer routines for buffer create/read/write/access
*/
@@ -103,10 +84,9 @@
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
*/
- if (!xlog_verify_bno(log, 0, nbblks)) {
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return NULL;
}
@@ -152,11 +132,10 @@
{
int error;
- if (!xlog_verify_bno(log, blk_no, nbblks)) {
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
xfs_warn(log->l_mp,
"Invalid log block/length (0x%llx, 0x%x) for buffer",
blk_no, nbblks);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return -EFSCORRUPTED;
}
@@ -244,19 +223,17 @@
* (XLOG_FMT_UNKNOWN). This stops us from trying to recover
* a dirty log created in IRIX.
*/
- if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
+ if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
xfs_warn(mp,
"dirty log written in incompatible format - can't recover");
xlog_header_check_dump(mp, head);
- XFS_ERROR_REPORT("xlog_header_check_recover(1)",
- XFS_ERRLEVEL_HIGH, mp);
return -EFSCORRUPTED;
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+ }
+ if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
+ &head->h_fs_uuid))) {
xfs_warn(mp,
"dirty log entry has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
- XFS_ERROR_REPORT("xlog_header_check_recover(2)",
- XFS_ERRLEVEL_HIGH, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -279,43 +256,15 @@
* by IRIX and continue.
*/
xfs_warn(mp, "null uuid in log - IRIX style log");
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+ } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
+ &head->h_fs_uuid))) {
xfs_warn(mp, "log has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
- XFS_ERROR_REPORT("xlog_header_check_mount",
- XFS_ERRLEVEL_HIGH, mp);
return -EFSCORRUPTED;
}
return 0;
}
-STATIC void
-xlog_recover_iodone(
- struct xfs_buf *bp)
-{
- if (bp->b_error) {
- /*
- * We're not going to bother about retrying
- * this during recovery. One strike!
- */
- if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
- }
- }
-
- /*
- * On v5 supers, a bli could be attached to update the metadata LSN.
- * Clean it up.
- */
- if (bp->b_log_item)
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
-
- bp->b_iodone = NULL;
- xfs_buf_ioend(bp);
-}
-
/*
* This routine finds (to an approximation) the first block in the physical
* log which contains the given cycle. It uses a binary search algorithm.
@@ -422,6 +371,19 @@
return error;
}
+static inline int
+xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
+{
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ int h_size = be32_to_cpu(rh->h_size);
+
+ if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
+ h_size > XLOG_HEADER_CYCLE_SIZE)
+ return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
+ }
+ return 1;
+}
+
/*
* Potentially backup over partial log record write.
*
@@ -471,7 +433,7 @@
xfs_warn(log->l_mp,
"Log inconsistent (didn't find previous header)");
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto out;
}
@@ -514,15 +476,7 @@
* reset last_blk. Only when last_blk points in the middle of a log
* record do we update last_blk.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- uint h_size = be32_to_cpu(head->h_size);
-
- xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- } else {
- xhdrs = 1;
- }
+ xhdrs = xlog_logrec_hblks(log, head);
if (*last_blk - i + extra_bblks !=
BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
@@ -1125,7 +1079,7 @@
*
* Note that xlog_find_tail() clears the blocks at the new head
* (i.e., the records with invalid CRC) if the cycle number
- * matches the the current cycle.
+ * matches the current cycle.
*/
found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
buffer, rhead_blk, rhead, wrapped);
@@ -1209,22 +1163,7 @@
* below. We won't want to clear the unmount record if there is one, so
* we pass the lsn of the unmount record rather than the block after it.
*/
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- int h_size = be32_to_cpu(rhead->h_size);
- int h_version = be32_to_cpu(rhead->h_version);
-
- if ((h_version & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
- } else {
- hblks = 1;
- }
- } else {
- hblks = 1;
- }
-
+ hblks = xlog_logrec_hblks(log, rhead);
after_umount_blk = xlog_wrap_logbno(log,
rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
@@ -1347,10 +1286,11 @@
error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
&rhead_blk, &rhead, &wrapped);
if (error < 0)
- return error;
+ goto done;
if (!error) {
xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- return -EIO;
+ error = -EFSCORRUPTED;
+ goto done;
}
*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
@@ -1699,11 +1639,10 @@
* the distance from the beginning of the log to the
* tail.
*/
- if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ head_block < tail_block ||
+ head_block >= log->l_logBBsize))
return -EFSCORRUPTED;
- }
tail_distance = tail_block + (log->l_logBBsize - head_block);
} else {
/*
@@ -1711,11 +1650,10 @@
* so the distance from the head to the tail is just
* the tail block minus the head block.
*/
- if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ head_block >= tail_block ||
+ head_cycle != tail_cycle + 1))
return -EFSCORRUPTED;
- }
tail_distance = tail_block - head_block;
}
@@ -1785,12 +1723,72 @@
return 0;
}
+/*
+ * Release the recovered intent item in the AIL that matches the given intent
+ * type and intent id.
+ */
+void
+xlog_recover_release_intent(
+ struct xlog *log,
+ unsigned short intent_type,
+ uint64_t intent_id)
+{
+ struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
+ struct xfs_ail *ailp = log->l_ailp;
+
+ spin_lock(&ailp->ail_lock);
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ if (lip->li_type != intent_type)
+ continue;
+ if (!lip->li_ops->iop_match(lip, intent_id))
+ continue;
+
+ spin_unlock(&ailp->ail_lock);
+ lip->li_ops->iop_release(lip);
+ spin_lock(&ailp->ail_lock);
+ break;
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+}
+
/******************************************************************************
*
* Log recover routines
*
******************************************************************************
*/
+static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
+ &xlog_buf_item_ops,
+ &xlog_inode_item_ops,
+ &xlog_dquot_item_ops,
+ &xlog_quotaoff_item_ops,
+ &xlog_icreate_item_ops,
+ &xlog_efi_item_ops,
+ &xlog_efd_item_ops,
+ &xlog_rui_item_ops,
+ &xlog_rud_item_ops,
+ &xlog_cui_item_ops,
+ &xlog_cud_item_ops,
+ &xlog_bui_item_ops,
+ &xlog_bud_item_ops,
+};
+
+static const struct xlog_recover_item_ops *
+xlog_find_item_ops(
+ struct xlog_recover_item *item)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
+ if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
+ return xlog_recover_item_ops[i];
+
+ return NULL;
+}
/*
* Sort the log items in the transaction.
@@ -1847,54 +1845,23 @@
struct xlog_recover *trans,
int pass)
{
- xlog_recover_item_t *item, *n;
+ struct xlog_recover_item *item, *n;
int error = 0;
LIST_HEAD(sort_list);
LIST_HEAD(cancel_list);
LIST_HEAD(buffer_list);
LIST_HEAD(inode_buffer_list);
- LIST_HEAD(inode_list);
+ LIST_HEAD(item_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+ enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
- switch (ITEM_TYPE(item)) {
- case XFS_LI_ICREATE:
- list_move_tail(&item->ri_list, &buffer_list);
- break;
- case XFS_LI_BUF:
- if (buf_f->blf_flags & XFS_BLF_CANCEL) {
- trace_xfs_log_recover_item_reorder_head(log,
- trans, item, pass);
- list_move(&item->ri_list, &cancel_list);
- break;
- }
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
- list_move(&item->ri_list, &inode_buffer_list);
- break;
- }
- list_move_tail(&item->ri_list, &buffer_list);
- break;
- case XFS_LI_INODE:
- case XFS_LI_DQUOT:
- case XFS_LI_QUOTAOFF:
- case XFS_LI_EFD:
- case XFS_LI_EFI:
- case XFS_LI_RUI:
- case XFS_LI_RUD:
- case XFS_LI_CUI:
- case XFS_LI_CUD:
- case XFS_LI_BUI:
- case XFS_LI_BUD:
- trace_xfs_log_recover_item_reorder_tail(log,
- trans, item, pass);
- list_move_tail(&item->ri_list, &inode_list);
- break;
- default:
+ item->ri_ops = xlog_find_item_ops(item);
+ if (!item->ri_ops) {
xfs_warn(log->l_mp,
- "%s: unrecognized type of log operation",
- __func__);
+ "%s: unrecognized type of log operation (%d)",
+ __func__, ITEM_TYPE(item));
ASSERT(0);
/*
* return the remaining items back to the transaction
@@ -1902,16 +1869,38 @@
*/
if (!list_empty(&sort_list))
list_splice_init(&sort_list, &trans->r_itemq);
- error = -EIO;
- goto out;
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ if (item->ri_ops->reorder)
+ fate = item->ri_ops->reorder(item);
+
+ switch (fate) {
+ case XLOG_REORDER_BUFFER_LIST:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
+ case XLOG_REORDER_CANCEL_LIST:
+ trace_xfs_log_recover_item_reorder_head(log,
+ trans, item, pass);
+ list_move(&item->ri_list, &cancel_list);
+ break;
+ case XLOG_REORDER_INODE_BUFFER_LIST:
+ list_move(&item->ri_list, &inode_buffer_list);
+ break;
+ case XLOG_REORDER_ITEM_LIST:
+ trace_xfs_log_recover_item_reorder_tail(log,
+ trans, item, pass);
+ list_move_tail(&item->ri_list, &item_list);
+ break;
}
}
-out:
+
ASSERT(list_empty(&sort_list));
if (!list_empty(&buffer_list))
list_splice(&buffer_list, &trans->r_itemq);
- if (!list_empty(&inode_list))
- list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&item_list))
+ list_splice_tail(&item_list, &trans->r_itemq);
if (!list_empty(&inode_buffer_list))
list_splice_tail(&inode_buffer_list, &trans->r_itemq);
if (!list_empty(&cancel_list))
@@ -1919,2155 +1908,15 @@
return error;
}
-/*
- * Build up the table of buf cancel records so that we don't replay
- * cancelled data in the second pass. For buffer records that are
- * not cancel records, there is nothing to do here so we just return.
- *
- * If we get a cancel record which is already in the table, this indicates
- * that the buffer was cancelled multiple times. In order to ensure
- * that during pass 2 we keep the record in the table until we reach its
- * last occurrence in the log, we keep a reference count in the cancel
- * record in the table to tell us how many times we expect to see this
- * record during the second pass.
- */
-STATIC int
-xlog_recover_buffer_pass1(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
- struct list_head *bucket;
- struct xfs_buf_cancel *bcp;
-
- /*
- * If this isn't a cancel buffer item, then just return.
- */
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
- trace_xfs_log_recover_buf_not_cancel(log, buf_f);
- return 0;
- }
-
- /*
- * Insert an xfs_buf_cancel record into the hash table of them.
- * If there is already an identical record, bump its reference count.
- */
- bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
- list_for_each_entry(bcp, bucket, bc_list) {
- if (bcp->bc_blkno == buf_f->blf_blkno &&
- bcp->bc_len == buf_f->blf_len) {
- bcp->bc_refcount++;
- trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
- return 0;
- }
- }
-
- bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
- bcp->bc_blkno = buf_f->blf_blkno;
- bcp->bc_len = buf_f->blf_len;
- bcp->bc_refcount = 1;
- list_add_tail(&bcp->bc_list, bucket);
-
- trace_xfs_log_recover_buf_cancel_add(log, buf_f);
- return 0;
-}
-
-/*
- * Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table. If it is, return the cancel
- * buffer structure to the caller.
- */
-STATIC struct xfs_buf_cancel *
-xlog_peek_buffer_cancelled(
+void
+xlog_buf_readahead(
struct xlog *log,
xfs_daddr_t blkno,
uint len,
- unsigned short flags)
+ const struct xfs_buf_ops *ops)
{
- struct list_head *bucket;
- struct xfs_buf_cancel *bcp;
-
- if (!log->l_buf_cancel_table) {
- /* empty table means no cancelled buffers in the log */
- ASSERT(!(flags & XFS_BLF_CANCEL));
- return NULL;
- }
-
- bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
- list_for_each_entry(bcp, bucket, bc_list) {
- if (bcp->bc_blkno == blkno && bcp->bc_len == len)
- return bcp;
- }
-
- /*
- * We didn't find a corresponding entry in the table, so return 0 so
- * that the buffer is NOT cancelled.
- */
- ASSERT(!(flags & XFS_BLF_CANCEL));
- return NULL;
-}
-
-/*
- * If the buffer is being cancelled then return 1 so that it will be cancelled,
- * otherwise return 0. If the buffer is actually a buffer cancel item
- * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
- * table and remove it from the table if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its last
- * occurrence in the log so that if the same buffer is re-used again after its
- * last cancellation we actually replay the changes made at that point.
- */
-STATIC int
-xlog_check_buffer_cancelled(
- struct xlog *log,
- xfs_daddr_t blkno,
- uint len,
- unsigned short flags)
-{
- struct xfs_buf_cancel *bcp;
-
- bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
- if (!bcp)
- return 0;
-
- /*
- * We've go a match, so return 1 so that the recovery of this buffer
- * is cancelled. If this buffer is actually a buffer cancel log
- * item, then decrement the refcount on the one in the table and
- * remove it if this is the last reference.
- */
- if (flags & XFS_BLF_CANCEL) {
- if (--bcp->bc_refcount == 0) {
- list_del(&bcp->bc_list);
- kmem_free(bcp);
- }
- }
- return 1;
-}
-
-/*
- * Perform recovery for a buffer full of inodes. In these buffers, the only
- * data which should be recovered is that which corresponds to the
- * di_next_unlinked pointers in the on disk inode structures. The rest of the
- * data for the inodes is always logged through the inodes themselves rather
- * than the inode buffer and is recovered in xlog_recover_inode_pass2().
- *
- * The only time when buffers full of inodes are fully recovered is when the
- * buffer is full of newly allocated inodes. In this case the buffer will
- * not be marked as an inode buffer and so will be sent to
- * xlog_recover_do_reg_buffer() below during recovery.
- */
-STATIC int
-xlog_recover_do_inode_buffer(
- struct xfs_mount *mp,
- xlog_recover_item_t *item,
- struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f)
-{
- int i;
- int item_index = 0;
- int bit = 0;
- int nbits = 0;
- int reg_buf_offset = 0;
- int reg_buf_bytes = 0;
- int next_unlinked_offset;
- int inodes_per_buf;
- xfs_agino_t *logged_nextp;
- xfs_agino_t *buffer_nextp;
-
- trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
-
- /*
- * Post recovery validation only works properly on CRC enabled
- * filesystems.
- */
- if (xfs_sb_version_hascrc(&mp->m_sb))
- bp->b_ops = &xfs_inode_buf_ops;
-
- inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
- for (i = 0; i < inodes_per_buf; i++) {
- next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- while (next_unlinked_offset >=
- (reg_buf_offset + reg_buf_bytes)) {
- /*
- * The next di_next_unlinked field is beyond
- * the current logged region. Find the next
- * logged region that contains or is beyond
- * the current di_next_unlinked field.
- */
- bit += nbits;
- bit = xfs_next_bit(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
-
- /*
- * If there are no more logged regions in the
- * buffer, then we're done.
- */
- if (bit == -1)
- return 0;
-
- nbits = xfs_contig_bits(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
- ASSERT(nbits > 0);
- reg_buf_offset = bit << XFS_BLF_SHIFT;
- reg_buf_bytes = nbits << XFS_BLF_SHIFT;
- item_index++;
- }
-
- /*
- * If the current logged region starts after the current
- * di_next_unlinked field, then move on to the next
- * di_next_unlinked field.
- */
- if (next_unlinked_offset < reg_buf_offset)
- continue;
-
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
-
- /*
- * The current logged region contains a copy of the
- * current di_next_unlinked field. Extract its value
- * and copy it to the buffer copy.
- */
- logged_nextp = item->ri_buf[item_index].i_addr +
- next_unlinked_offset - reg_buf_offset;
- if (unlikely(*logged_nextp == 0)) {
- xfs_alert(mp,
- "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
- "Trying to replay bad (0) inode di_next_unlinked field.",
- item, bp);
- XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
- *buffer_nextp = *logged_nextp;
-
- /*
- * If necessary, recalculate the CRC in the on-disk inode. We
- * have to leave the inode in a consistent state for whoever
- * reads it next....
- */
- xfs_dinode_calc_crc(mp,
- xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
-
- }
-
- return 0;
-}
-
-/*
- * V5 filesystems know the age of the buffer on disk being recovered. We can
- * have newer objects on disk than we are replaying, and so for these cases we
- * don't want to replay the current change as that will make the buffer contents
- * temporarily invalid on disk.
- *
- * The magic number might not match the buffer type we are going to recover
- * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
- * extract the LSN of the existing object in the buffer based on it's current
- * magic number. If we don't recognise the magic number in the buffer, then
- * return a LSN of -1 so that the caller knows it was an unrecognised block and
- * so can recover the buffer.
- *
- * Note: we cannot rely solely on magic number matches to determine that the
- * buffer has a valid LSN - we also need to verify that it belongs to this
- * filesystem, so we need to extract the object's LSN and compare it to that
- * which we read from the superblock. If the UUIDs don't match, then we've got a
- * stale metadata block from an old filesystem instance that we need to recover
- * over the top of.
- */
-static xfs_lsn_t
-xlog_recover_get_buf_lsn(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- uint32_t magic32;
- uint16_t magic16;
- uint16_t magicda;
- void *blk = bp->b_addr;
- uuid_t *uuid;
- xfs_lsn_t lsn = -1;
-
- /* v4 filesystems always recover immediately */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- goto recover_immediately;
-
- magic32 = be32_to_cpu(*(__be32 *)blk);
- switch (magic32) {
- case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
- case XFS_ABTB_MAGIC:
- case XFS_ABTC_MAGIC:
- case XFS_RMAP_CRC_MAGIC:
- case XFS_REFC_CRC_MAGIC:
- case XFS_IBT_CRC_MAGIC:
- case XFS_IBT_MAGIC: {
- struct xfs_btree_block *btb = blk;
-
- lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
- uuid = &btb->bb_u.s.bb_uuid;
- break;
- }
- case XFS_BMAP_CRC_MAGIC:
- case XFS_BMAP_MAGIC: {
- struct xfs_btree_block *btb = blk;
-
- lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
- uuid = &btb->bb_u.l.bb_uuid;
- break;
- }
- case XFS_AGF_MAGIC:
- lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
- uuid = &((struct xfs_agf *)blk)->agf_uuid;
- break;
- case XFS_AGFL_MAGIC:
- lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
- uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
- break;
- case XFS_AGI_MAGIC:
- lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
- uuid = &((struct xfs_agi *)blk)->agi_uuid;
- break;
- case XFS_SYMLINK_MAGIC:
- lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
- uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
- break;
- case XFS_DIR3_BLOCK_MAGIC:
- case XFS_DIR3_DATA_MAGIC:
- case XFS_DIR3_FREE_MAGIC:
- lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
- uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
- break;
- case XFS_ATTR3_RMT_MAGIC:
- /*
- * Remote attr blocks are written synchronously, rather than
- * being logged. That means they do not contain a valid LSN
- * (i.e. transactionally ordered) in them, and hence any time we
- * see a buffer to replay over the top of a remote attribute
- * block we should simply do so.
- */
- goto recover_immediately;
- case XFS_SB_MAGIC:
- /*
- * superblock uuids are magic. We may or may not have a
- * sb_meta_uuid on disk, but it will be set in the in-core
- * superblock. We set the uuid pointer for verification
- * according to the superblock feature mask to ensure we check
- * the relevant UUID in the superblock.
- */
- lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
- if (xfs_sb_version_hasmetauuid(&mp->m_sb))
- uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
- else
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
- break;
- default:
- break;
- }
-
- if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
- goto recover_immediately;
- return lsn;
- }
-
- magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
- switch (magicda) {
- case XFS_DIR3_LEAF1_MAGIC:
- case XFS_DIR3_LEAFN_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
- uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
- break;
- default:
- break;
- }
-
- if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
- goto recover_immediately;
- return lsn;
- }
-
- /*
- * We do individual object checks on dquot and inode buffers as they
- * have their own individual LSN records. Also, we could have a stale
- * buffer here, so we have to at least recognise these buffer types.
- *
- * A notd complexity here is inode unlinked list processing - it logs
- * the inode directly in the buffer, but we don't know which inodes have
- * been modified, and there is no global buffer LSN. Hence we need to
- * recover all inode buffer types immediately. This problem will be
- * fixed by logical logging of the unlinked list modifications.
- */
- magic16 = be16_to_cpu(*(__be16 *)blk);
- switch (magic16) {
- case XFS_DQUOT_MAGIC:
- case XFS_DINODE_MAGIC:
- goto recover_immediately;
- default:
- break;
- }
-
- /* unknown buffer contents, recover immediately */
-
-recover_immediately:
- return (xfs_lsn_t)-1;
-
-}
-
-/*
- * Validate the recovered buffer is of the correct type and attach the
- * appropriate buffer operations to them for writeback. Magic numbers are in a
- * few places:
- * the first 16 bits of the buffer (inode buffer, dquot buffer),
- * the first 32 bits of the buffer (most blocks),
- * inside a struct xfs_da_blkinfo at the start of the buffer.
- */
-static void
-xlog_recover_validate_buf_type(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f,
- xfs_lsn_t current_lsn)
-{
- struct xfs_da_blkinfo *info = bp->b_addr;
- uint32_t magic32;
- uint16_t magic16;
- uint16_t magicda;
- char *warnmsg = NULL;
-
- /*
- * We can only do post recovery validation on items on CRC enabled
- * fielsystems as we need to know when the buffer was written to be able
- * to determine if we should have replayed the item. If we replay old
- * metadata over a newer buffer, then it will enter a temporarily
- * inconsistent state resulting in verification failures. Hence for now
- * just avoid the verification stage for non-crc filesystems
- */
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return;
-
- magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
- magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
- magicda = be16_to_cpu(info->magic);
- switch (xfs_blft_from_flags(buf_f)) {
- case XFS_BLFT_BTREE_BUF:
- switch (magic32) {
- case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTB_MAGIC:
- bp->b_ops = &xfs_bnobt_buf_ops;
- break;
- case XFS_ABTC_CRC_MAGIC:
- case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_cntbt_buf_ops;
- break;
- case XFS_IBT_CRC_MAGIC:
- case XFS_IBT_MAGIC:
- bp->b_ops = &xfs_inobt_buf_ops;
- break;
- case XFS_FIBT_CRC_MAGIC:
- case XFS_FIBT_MAGIC:
- bp->b_ops = &xfs_finobt_buf_ops;
- break;
- case XFS_BMAP_CRC_MAGIC:
- case XFS_BMAP_MAGIC:
- bp->b_ops = &xfs_bmbt_buf_ops;
- break;
- case XFS_RMAP_CRC_MAGIC:
- bp->b_ops = &xfs_rmapbt_buf_ops;
- break;
- case XFS_REFC_CRC_MAGIC:
- bp->b_ops = &xfs_refcountbt_buf_ops;
- break;
- default:
- warnmsg = "Bad btree block magic!";
- break;
- }
- break;
- case XFS_BLFT_AGF_BUF:
- if (magic32 != XFS_AGF_MAGIC) {
- warnmsg = "Bad AGF block magic!";
- break;
- }
- bp->b_ops = &xfs_agf_buf_ops;
- break;
- case XFS_BLFT_AGFL_BUF:
- if (magic32 != XFS_AGFL_MAGIC) {
- warnmsg = "Bad AGFL block magic!";
- break;
- }
- bp->b_ops = &xfs_agfl_buf_ops;
- break;
- case XFS_BLFT_AGI_BUF:
- if (magic32 != XFS_AGI_MAGIC) {
- warnmsg = "Bad AGI block magic!";
- break;
- }
- bp->b_ops = &xfs_agi_buf_ops;
- break;
- case XFS_BLFT_UDQUOT_BUF:
- case XFS_BLFT_PDQUOT_BUF:
- case XFS_BLFT_GDQUOT_BUF:
-#ifdef CONFIG_XFS_QUOTA
- if (magic16 != XFS_DQUOT_MAGIC) {
- warnmsg = "Bad DQUOT block magic!";
- break;
- }
- bp->b_ops = &xfs_dquot_buf_ops;
-#else
- xfs_alert(mp,
- "Trying to recover dquots without QUOTA support built in!");
- ASSERT(0);
-#endif
- break;
- case XFS_BLFT_DINO_BUF:
- if (magic16 != XFS_DINODE_MAGIC) {
- warnmsg = "Bad INODE block magic!";
- break;
- }
- bp->b_ops = &xfs_inode_buf_ops;
- break;
- case XFS_BLFT_SYMLINK_BUF:
- if (magic32 != XFS_SYMLINK_MAGIC) {
- warnmsg = "Bad symlink block magic!";
- break;
- }
- bp->b_ops = &xfs_symlink_buf_ops;
- break;
- case XFS_BLFT_DIR_BLOCK_BUF:
- if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
- magic32 != XFS_DIR3_BLOCK_MAGIC) {
- warnmsg = "Bad dir block magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_block_buf_ops;
- break;
- case XFS_BLFT_DIR_DATA_BUF:
- if (magic32 != XFS_DIR2_DATA_MAGIC &&
- magic32 != XFS_DIR3_DATA_MAGIC) {
- warnmsg = "Bad dir data magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_data_buf_ops;
- break;
- case XFS_BLFT_DIR_FREE_BUF:
- if (magic32 != XFS_DIR2_FREE_MAGIC &&
- magic32 != XFS_DIR3_FREE_MAGIC) {
- warnmsg = "Bad dir3 free magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_free_buf_ops;
- break;
- case XFS_BLFT_DIR_LEAF1_BUF:
- if (magicda != XFS_DIR2_LEAF1_MAGIC &&
- magicda != XFS_DIR3_LEAF1_MAGIC) {
- warnmsg = "Bad dir leaf1 magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_leaf1_buf_ops;
- break;
- case XFS_BLFT_DIR_LEAFN_BUF:
- if (magicda != XFS_DIR2_LEAFN_MAGIC &&
- magicda != XFS_DIR3_LEAFN_MAGIC) {
- warnmsg = "Bad dir leafn magic!";
- break;
- }
- bp->b_ops = &xfs_dir3_leafn_buf_ops;
- break;
- case XFS_BLFT_DA_NODE_BUF:
- if (magicda != XFS_DA_NODE_MAGIC &&
- magicda != XFS_DA3_NODE_MAGIC) {
- warnmsg = "Bad da node magic!";
- break;
- }
- bp->b_ops = &xfs_da3_node_buf_ops;
- break;
- case XFS_BLFT_ATTR_LEAF_BUF:
- if (magicda != XFS_ATTR_LEAF_MAGIC &&
- magicda != XFS_ATTR3_LEAF_MAGIC) {
- warnmsg = "Bad attr leaf magic!";
- break;
- }
- bp->b_ops = &xfs_attr3_leaf_buf_ops;
- break;
- case XFS_BLFT_ATTR_RMT_BUF:
- if (magic32 != XFS_ATTR3_RMT_MAGIC) {
- warnmsg = "Bad attr remote magic!";
- break;
- }
- bp->b_ops = &xfs_attr3_rmt_buf_ops;
- break;
- case XFS_BLFT_SB_BUF:
- if (magic32 != XFS_SB_MAGIC) {
- warnmsg = "Bad SB block magic!";
- break;
- }
- bp->b_ops = &xfs_sb_buf_ops;
- break;
-#ifdef CONFIG_XFS_RT
- case XFS_BLFT_RTBITMAP_BUF:
- case XFS_BLFT_RTSUMMARY_BUF:
- /* no magic numbers for verification of RT buffers */
- bp->b_ops = &xfs_rtbuf_ops;
- break;
-#endif /* CONFIG_XFS_RT */
- default:
- xfs_warn(mp, "Unknown buffer type %d!",
- xfs_blft_from_flags(buf_f));
- break;
- }
-
- /*
- * Nothing else to do in the case of a NULL current LSN as this means
- * the buffer is more recent than the change in the log and will be
- * skipped.
- */
- if (current_lsn == NULLCOMMITLSN)
- return;
-
- if (warnmsg) {
- xfs_warn(mp, warnmsg);
- ASSERT(0);
- }
-
- /*
- * We must update the metadata LSN of the buffer as it is written out to
- * ensure that older transactions never replay over this one and corrupt
- * the buffer. This can occur if log recovery is interrupted at some
- * point after the current transaction completes, at which point a
- * subsequent mount starts recovery from the beginning.
- *
- * Write verifiers update the metadata LSN from log items attached to
- * the buffer. Therefore, initialize a bli purely to carry the LSN to
- * the verifier. We'll clean it up in our ->iodone() callback.
- */
- if (bp->b_ops) {
- struct xfs_buf_log_item *bip;
-
- ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_item_init(bp, mp);
- bip = bp->b_log_item;
- bip->bli_item.li_lsn = current_lsn;
- }
-}
-
-/*
- * Perform a 'normal' buffer recovery. Each logged region of the
- * buffer should be copied over the corresponding region in the
- * given buffer. The bitmap in the buf log format structure indicates
- * where to place the logged data.
- */
-STATIC void
-xlog_recover_do_reg_buffer(
- struct xfs_mount *mp,
- xlog_recover_item_t *item,
- struct xfs_buf *bp,
- xfs_buf_log_format_t *buf_f,
- xfs_lsn_t current_lsn)
-{
- int i;
- int bit;
- int nbits;
- xfs_failaddr_t fa;
-
- trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
-
- bit = 0;
- i = 1; /* 0 is the buf format structure */
- while (1) {
- bit = xfs_next_bit(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
- if (bit == -1)
- break;
- nbits = xfs_contig_bits(buf_f->blf_data_map,
- buf_f->blf_map_size, bit);
- ASSERT(nbits > 0);
- ASSERT(item->ri_buf[i].i_addr != NULL);
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(BBTOB(bp->b_length) >=
- ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
-
- /*
- * The dirty regions logged in the buffer, even though
- * contiguous, may span multiple chunks. This is because the
- * dirty region may span a physical page boundary in a buffer
- * and hence be split into two separate vectors for writing into
- * the log. Hence we need to trim nbits back to the length of
- * the current region being copied out of the log.
- */
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
-
- /*
- * Do a sanity check if this is a dquot buffer. Just checking
- * the first dquot in the buffer should do. XXXThis is
- * probably a good thing to do for other buf types also.
- */
- fa = NULL;
- if (buf_f->blf_flags &
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- if (item->ri_buf[i].i_addr == NULL) {
- xfs_alert(mp,
- "XFS: NULL dquot in %s.", __func__);
- goto next;
- }
- if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
- xfs_alert(mp,
- "XFS: dquot too small (%d) in %s.",
- item->ri_buf[i].i_len, __func__);
- goto next;
- }
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
- -1, 0);
- if (fa) {
- xfs_alert(mp,
- "dquot corrupt at %pS trying to replay into block 0x%llx",
- fa, bp->b_bn);
- goto next;
- }
- }
-
- memcpy(xfs_buf_offset(bp,
- (uint)bit << XFS_BLF_SHIFT), /* dest */
- item->ri_buf[i].i_addr, /* source */
- nbits<<XFS_BLF_SHIFT); /* length */
- next:
- i++;
- bit += nbits;
- }
-
- /* Shouldn't be any more regions */
- ASSERT(i == item->ri_total);
-
- xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
-}
-
-/*
- * Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF log item of the same type
- * (ie. USR or GRP), then just toss this buffer away; don't recover it.
- * Else, treat it as a regular buffer and do recovery.
- *
- * Return false if the buffer was tossed and true if we recovered the buffer to
- * indicate to the caller if the buffer needs writing.
- */
-STATIC bool
-xlog_recover_do_dquot_buffer(
- struct xfs_mount *mp,
- struct xlog *log,
- struct xlog_recover_item *item,
- struct xfs_buf *bp,
- struct xfs_buf_log_format *buf_f)
-{
- uint type;
-
- trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
-
- /*
- * Filesystems are required to send in quota flags at mount time.
- */
- if (!mp->m_qflags)
- return false;
-
- type = 0;
- if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
- type |= XFS_DQ_USER;
- if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
- type |= XFS_DQ_PROJ;
- if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
- type |= XFS_DQ_GROUP;
- /*
- * This type of quotas was turned off, so ignore this buffer
- */
- if (log->l_quotaoffs_flag & type)
- return false;
-
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
- return true;
-}
-
-/*
- * This routine replays a modification made to a buffer at runtime.
- * There are actually two types of buffer, regular and inode, which
- * are handled differently. Inode buffers are handled differently
- * in that we only recover a specific set of data from them, namely
- * the inode di_next_unlinked fields. This is because all other inode
- * data is actually logged via inode records and any data we replay
- * here which overlaps that may be stale.
- *
- * When meta-data buffers are freed at run time we log a buffer item
- * with the XFS_BLF_CANCEL bit set to indicate that previous copies
- * of the buffer in the log should not be replayed at recovery time.
- * This is so that if the blocks covered by the buffer are reused for
- * file data before we crash we don't end up replaying old, freed
- * meta-data into a user's file.
- *
- * To handle the cancellation of buffer log items, we make two passes
- * over the log during recovery. During the first we build a table of
- * those buffers which have been cancelled, and during the second we
- * only replay those buffers which do not have corresponding cancel
- * records in the table. See xlog_recover_buffer_pass[1,2] above
- * for more details on the implementation of the table of cancel records.
- */
-STATIC int
-xlog_recover_buffer_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- struct xlog_recover_item *item,
- xfs_lsn_t current_lsn)
-{
- xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
- xfs_mount_t *mp = log->l_mp;
- xfs_buf_t *bp;
- int error;
- uint buf_flags;
- xfs_lsn_t lsn;
-
- /*
- * In this pass we only want to recover all the buffers which have
- * not been cancelled and are not cancellation buffers themselves.
- */
- if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
- buf_f->blf_len, buf_f->blf_flags)) {
- trace_xfs_log_recover_buf_cancel(log, buf_f);
- return 0;
- }
-
- trace_xfs_log_recover_buf_recover(log, buf_f);
-
- buf_flags = 0;
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
- buf_flags |= XBF_UNMAPPED;
-
- bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags, NULL);
- if (!bp)
- return -ENOMEM;
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
- goto out_release;
- }
-
- /*
- * Recover the buffer only if we get an LSN from it and it's less than
- * the lsn of the transaction we are replaying.
- *
- * Note that we have to be extremely careful of readahead here.
- * Readahead does not attach verfiers to the buffers so if we don't
- * actually do any replay after readahead because of the LSN we found
- * in the buffer if more recent than that current transaction then we
- * need to attach the verifier directly. Failure to do so can lead to
- * future recovery actions (e.g. EFI and unlinked list recovery) can
- * operate on the buffers and they won't get the verifier attached. This
- * can lead to blocks on disk having the correct content but a stale
- * CRC.
- *
- * It is safe to assume these clean buffers are currently up to date.
- * If the buffer is dirtied by a later transaction being replayed, then
- * the verifier will be reset to match whatever recover turns that
- * buffer into.
- */
- lsn = xlog_recover_get_buf_lsn(mp, bp);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- trace_xfs_log_recover_buf_skip(log, buf_f);
- xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
- goto out_release;
- }
-
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
- error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
- if (error)
- goto out_release;
- } else if (buf_f->blf_flags &
- (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- bool dirty;
-
- dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
- if (!dirty)
- goto out_release;
- } else {
- xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
- }
-
- /*
- * Perform delayed write on the buffer. Asynchronous writes will be
- * slower when taking into account all the buffers to be flushed.
- *
- * Also make sure that only inode buffers with good sizes stay in
- * the buffer cache. The kernel moves inodes in buffers of 1 block
- * or inode_cluster_size bytes, whichever is bigger. The inode
- * buffers in the log can be a different size if the log was generated
- * by an older kernel using unclustered inode buffers or a newer kernel
- * running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't max(blocksize, inode_cluster_size)
- * for *our* value of inode_cluster_size, then we need to keep
- * the buffer out of the buffer cache so that the buffer won't
- * overlap with future reads of those inodes.
- */
- if (XFS_DINODE_MAGIC ==
- be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
- xfs_buf_stale(bp);
- error = xfs_bwrite(bp);
- } else {
- ASSERT(bp->b_mount == mp);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp, buffer_list);
- }
-
-out_release:
- xfs_buf_relse(bp);
- return error;
-}
-
-/*
- * Inode fork owner changes
- *
- * If we have been told that we have to reparent the inode fork, it's because an
- * extent swap operation on a CRC enabled filesystem has been done and we are
- * replaying it. We need to walk the BMBT of the appropriate fork and change the
- * owners of it.
- *
- * The complexity here is that we don't have an inode context to work with, so
- * after we've replayed the inode we need to instantiate one. This is where the
- * fun begins.
- *
- * We are in the middle of log recovery, so we can't run transactions. That
- * means we cannot use cache coherent inode instantiation via xfs_iget(), as
- * that will result in the corresponding iput() running the inode through
- * xfs_inactive(). If we've just replayed an inode core that changes the link
- * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
- * transactions (bad!).
- *
- * So, to avoid this, we instantiate an inode directly from the inode core we've
- * just recovered. We have the buffer still locked, and all we really need to
- * instantiate is the inode core and the forks being modified. We can do this
- * manually, then run the inode btree owner change, and then tear down the
- * xfs_inode without having to run any transactions at all.
- *
- * Also, because we don't have a transaction context available here but need to
- * gather all the buffers we modify for writeback so we pass the buffer_list
- * instead for the operation to use.
- */
-
-STATIC int
-xfs_recover_inode_owner_change(
- struct xfs_mount *mp,
- struct xfs_dinode *dip,
- struct xfs_inode_log_format *in_f,
- struct list_head *buffer_list)
-{
- struct xfs_inode *ip;
- int error;
-
- ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
-
- ip = xfs_inode_alloc(mp, in_f->ilf_ino);
- if (!ip)
- return -ENOMEM;
-
- /* instantiate the inode */
- xfs_inode_from_disk(ip, dip);
- ASSERT(ip->i_d.di_version >= 3);
-
- error = xfs_iformat_fork(ip, dip);
- if (error)
- goto out_free_ip;
-
- if (!xfs_inode_verify_forks(ip)) {
- error = -EFSCORRUPTED;
- goto out_free_ip;
- }
-
- if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
- ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
- error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
- ip->i_ino, buffer_list);
- if (error)
- goto out_free_ip;
- }
-
- if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
- ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
- error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
- ip->i_ino, buffer_list);
- if (error)
- goto out_free_ip;
- }
-
-out_free_ip:
- xfs_inode_free(ip);
- return error;
-}
-
-STATIC int
-xlog_recover_inode_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- struct xlog_recover_item *item,
- xfs_lsn_t current_lsn)
-{
- struct xfs_inode_log_format *in_f;
- xfs_mount_t *mp = log->l_mp;
- xfs_buf_t *bp;
- xfs_dinode_t *dip;
- int len;
- char *src;
- char *dest;
- int error;
- int attr_index;
- uint fields;
- struct xfs_log_dinode *ldip;
- uint isize;
- int need_free = 0;
-
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- in_f = item->ri_buf[0].i_addr;
- } else {
- in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
- need_free = 1;
- error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
- if (error)
- goto error;
- }
-
- /*
- * Inode buffers can be freed, look out for it,
- * and do not replay the inode.
- */
- if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
- in_f->ilf_len, 0)) {
- error = 0;
- trace_xfs_log_recover_inode_cancel(log, in_f);
- goto error;
- }
- trace_xfs_log_recover_inode_recover(log, in_f);
-
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
- &xfs_inode_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error;
- }
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
- goto out_release;
- }
- ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
- dip = xfs_buf_offset(bp, in_f->ilf_boffset);
-
- /*
- * Make sure the place we're flushing out to really looks
- * like an inode!
- */
- if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
- xfs_alert(mp,
- "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
- __func__, dip, bp, in_f->ilf_ino);
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
- XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- ldip = item->ri_buf[1].i_addr;
- if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
- __func__, item, in_f->ilf_ino);
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
- XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto out_release;
- }
-
- /*
- * If the inode has an LSN in it, recover the inode only if it's less
- * than the lsn of the transaction we are replaying. Note: we still
- * need to replay an owner change even though the inode is more recent
- * than the transaction as there is no guarantee that all the btree
- * blocks are more recent than this transaction, too.
- */
- if (dip->di_version >= 3) {
- xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
-
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_owner_change;
- }
- }
-
- /*
- * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
- * are transactional and if ordering is necessary we can determine that
- * more accurately by the LSN field in the V3 inode core. Don't trust
- * the inode versions we might be changing them here - use the
- * superblock flag to determine whether we need to look at di_flushiter
- * to skip replay when the on disk inode is newer than the log one
- */
- if (!xfs_sb_version_hascrc(&mp->m_sb) &&
- ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
- /*
- * Deal with the wrap case, DI_MAX_FLUSH is less
- * than smaller numbers
- */
- if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
- /* do nothing */
- } else {
- trace_xfs_log_recover_inode_skip(log, in_f);
- error = 0;
- goto out_release;
- }
- }
-
- /* Take the opportunity to reset the flush iteration count */
- ldip->di_flushiter = 0;
-
- if (unlikely(S_ISREG(ldip->di_mode))) {
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- } else if (unlikely(S_ISDIR(ldip->di_mode))) {
- if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
- (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- }
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
- __func__, item, dip, bp, in_f->ilf_ino,
- ldip->di_nextents + ldip->di_anextents,
- ldip->di_nblocks);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
- error = -EFSCORRUPTED;
- goto out_release;
- }
- isize = xfs_log_dinode_size(ldip->di_version);
- if (unlikely(item->ri_buf[1].i_len > isize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
- __func__, item->ri_buf[1].i_len, item);
- error = -EFSCORRUPTED;
- goto out_release;
- }
-
- /* recover the log dinode inode into the on disk inode */
- xfs_log_dinode_to_disk(ldip, dip);
-
- fields = in_f->ilf_fields;
- if (fields & XFS_ILOG_DEV)
- xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
-
- if (in_f->ilf_size == 2)
- goto out_owner_change;
- len = item->ri_buf[2].i_len;
- src = item->ri_buf[2].i_addr;
- ASSERT(in_f->ilf_size <= 4);
- ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
- ASSERT(!(fields & XFS_ILOG_DFORK) ||
- (len == in_f->ilf_dsize));
-
- switch (fields & XFS_ILOG_DFORK) {
- case XFS_ILOG_DDATA:
- case XFS_ILOG_DEXT:
- memcpy(XFS_DFORK_DPTR(dip), src, len);
- break;
-
- case XFS_ILOG_DBROOT:
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
- (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
- XFS_DFORK_DSIZE(dip, mp));
- break;
-
- default:
- /*
- * There are no data fork flags set.
- */
- ASSERT((fields & XFS_ILOG_DFORK) == 0);
- break;
- }
-
- /*
- * If we logged any attribute data, recover it. There may or
- * may not have been any other non-core data logged in this
- * transaction.
- */
- if (in_f->ilf_fields & XFS_ILOG_AFORK) {
- if (in_f->ilf_fields & XFS_ILOG_DFORK) {
- attr_index = 3;
- } else {
- attr_index = 2;
- }
- len = item->ri_buf[attr_index].i_len;
- src = item->ri_buf[attr_index].i_addr;
- ASSERT(len == in_f->ilf_asize);
-
- switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
- case XFS_ILOG_ADATA:
- case XFS_ILOG_AEXT:
- dest = XFS_DFORK_APTR(dip);
- ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
- memcpy(dest, src, len);
- break;
-
- case XFS_ILOG_ABROOT:
- dest = XFS_DFORK_APTR(dip);
- xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
- len, (xfs_bmdr_block_t*)dest,
- XFS_DFORK_ASIZE(dip, mp));
- break;
-
- default:
- xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
- ASSERT(0);
- error = -EIO;
- goto out_release;
- }
- }
-
-out_owner_change:
- /* Recover the swapext owner change unless inode has been deleted */
- if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
- (dip->di_mode != 0))
- error = xfs_recover_inode_owner_change(mp, dip, in_f,
- buffer_list);
- /* re-generate the checksum. */
- xfs_dinode_calc_crc(log->l_mp, dip);
-
- ASSERT(bp->b_mount == mp);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp, buffer_list);
-
-out_release:
- xfs_buf_relse(bp);
-error:
- if (need_free)
- kmem_free(in_f);
- return error;
-}
-
-/*
- * Recover QUOTAOFF records. We simply make a note of it in the xlog
- * structure, so that we know not to do any dquot item or dquot buffer recovery,
- * of that type.
- */
-STATIC int
-xlog_recover_quotaoff_pass1(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
- ASSERT(qoff_f);
-
- /*
- * The logitem format's flag tells us if this was user quotaoff,
- * group/project quotaoff or both.
- */
- if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
- log->l_quotaoffs_flag |= XFS_DQ_USER;
- if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
- log->l_quotaoffs_flag |= XFS_DQ_PROJ;
- if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
- log->l_quotaoffs_flag |= XFS_DQ_GROUP;
-
- return 0;
-}
-
-/*
- * Recover a dquot record
- */
-STATIC int
-xlog_recover_dquot_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- struct xlog_recover_item *item,
- xfs_lsn_t current_lsn)
-{
- xfs_mount_t *mp = log->l_mp;
- xfs_buf_t *bp;
- struct xfs_disk_dquot *ddq, *recddq;
- xfs_failaddr_t fa;
- int error;
- xfs_dq_logformat_t *dq_f;
- uint type;
-
-
- /*
- * Filesystems are required to send in quota flags at mount time.
- */
- if (mp->m_qflags == 0)
- return 0;
-
- recddq = item->ri_buf[1].i_addr;
- if (recddq == NULL) {
- xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
- return -EIO;
- }
- if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
- item->ri_buf[1].i_len, __func__);
- return -EIO;
- }
-
- /*
- * This type of quotas was turned off, so ignore this record.
- */
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
- ASSERT(type);
- if (log->l_quotaoffs_flag & type)
- return 0;
-
- /*
- * At this point we know that quota was _not_ turned off.
- * Since the mount flags are not indicating to us otherwise, this
- * must mean that quota is on, and the dquot needs to be replayed.
- * Remember that we may not have fully recovered the superblock yet,
- * so we can't do the usual trick of looking at the SB quota bits.
- *
- * The other possibility, of course, is that the quota subsystem was
- * removed since the last mount - ENOSYS.
- */
- dq_f = item->ri_buf[0].i_addr;
- ASSERT(dq_f);
- fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
- if (fa) {
- xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
- dq_f->qlf_id, fa);
- return -EIO;
- }
- ASSERT(dq_f->qlf_len == 1);
-
- /*
- * At this point we are assuming that the dquots have been allocated
- * and hence the buffer has valid dquots stamped in it. It should,
- * therefore, pass verifier validation. If the dquot is bad, then the
- * we'll return an error here, so we don't need to specifically check
- * the dquot in the buffer after the verifier has run.
- */
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
- &xfs_dquot_buf_ops);
- if (error)
- return error;
-
- ASSERT(bp);
- ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
-
- /*
- * If the dquot has an LSN in it, recover the dquot only if it's less
- * than the lsn of the transaction we are replaying.
- */
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
- xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
-
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
- goto out_release;
- }
- }
-
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
- XFS_DQUOT_CRC_OFF);
- }
-
- ASSERT(dq_f->qlf_size == 2);
- ASSERT(bp->b_mount == mp);
- bp->b_iodone = xlog_recover_iodone;
- xfs_buf_delwri_queue(bp, buffer_list);
-
-out_release:
- xfs_buf_relse(bp);
- return 0;
-}
-
-/*
- * This routine is called to create an in-core extent free intent
- * item from the efi format structure which was logged on disk.
- * It allocates an in-core efi, copies the extents from the format
- * structure into it, and adds the efi to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_efi_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_efi_log_item *efip;
- struct xfs_efi_log_format *efi_formatp;
-
- efi_formatp = item->ri_buf[0].i_addr;
-
- efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
- error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
- if (error) {
- xfs_efi_item_free(efip);
- return error;
- }
- atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The EFI has two references. One for the EFD and one for EFI to ensure
- * it makes it into the AIL. Insert the EFI into the AIL directly and
- * drop the EFI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
- xfs_efi_release(efip);
- return 0;
-}
-
-
-/*
- * This routine is called when an EFD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding EFI if it
- * was still in the log. To do this it searches the AIL for the EFI with an id
- * equal to that in the EFD format structure. If we find it we drop the EFD
- * reference, which removes the EFI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_efd_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- xfs_efd_log_format_t *efd_formatp;
- xfs_efi_log_item_t *efip = NULL;
- struct xfs_log_item *lip;
- uint64_t efi_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- efd_formatp = item->ri_buf[0].i_addr;
- ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
- (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
- ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
- efi_id = efd_formatp->efd_efi_id;
-
- /*
- * Search for the EFI with the id in the EFD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_EFI) {
- efip = (xfs_efi_log_item_t *)lip;
- if (efip->efi_format.efi_id == efi_id) {
- /*
- * Drop the EFD reference to the EFI. This
- * removes the EFI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_efi_release(efip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * This routine is called to create an in-core extent rmap update
- * item from the rui format structure which was logged on disk.
- * It allocates an in-core rui, copies the extents from the format
- * structure into it, and adds the rui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_rui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_rui_log_item *ruip;
- struct xfs_rui_log_format *rui_formatp;
-
- rui_formatp = item->ri_buf[0].i_addr;
-
- ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
- error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
- if (error) {
- xfs_rui_item_free(ruip);
- return error;
- }
- atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The RUI has two references. One for the RUD and one for RUI to ensure
- * it makes it into the AIL. Insert the RUI into the AIL directly and
- * drop the RUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn);
- xfs_rui_release(ruip);
- return 0;
-}
-
-
-/*
- * This routine is called when an RUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding RUI if it
- * was still in the log. To do this it searches the AIL for the RUI with an id
- * equal to that in the RUD format structure. If we find it we drop the RUD
- * reference, which removes the RUI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_rud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_rud_log_format *rud_formatp;
- struct xfs_rui_log_item *ruip = NULL;
- struct xfs_log_item *lip;
- uint64_t rui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- rud_formatp = item->ri_buf[0].i_addr;
- ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
- rui_id = rud_formatp->rud_rui_id;
-
- /*
- * Search for the RUI with the id in the RUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_RUI) {
- ruip = (struct xfs_rui_log_item *)lip;
- if (ruip->rui_format.rui_id == rui_id) {
- /*
- * Drop the RUD reference to the RUI. This
- * removes the RUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_rui_release(ruip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * Copy an CUI format buffer from the given buf, and into the destination
- * CUI format structure. The CUI/CUD items were designed not to need any
- * special alignment handling.
- */
-static int
-xfs_cui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_cui_log_format *dst_cui_fmt)
-{
- struct xfs_cui_log_format *src_cui_fmt;
- uint len;
-
- src_cui_fmt = buf->i_addr;
- len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
-
- if (buf->i_len == len) {
- memcpy(dst_cui_fmt, src_cui_fmt, len);
- return 0;
- }
- return -EFSCORRUPTED;
-}
-
-/*
- * This routine is called to create an in-core extent refcount update
- * item from the cui format structure which was logged on disk.
- * It allocates an in-core cui, copies the extents from the format
- * structure into it, and adds the cui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_cui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_cui_log_item *cuip;
- struct xfs_cui_log_format *cui_formatp;
-
- cui_formatp = item->ri_buf[0].i_addr;
-
- cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
- error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
- if (error) {
- xfs_cui_item_free(cuip);
- return error;
- }
- atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The CUI has two references. One for the CUD and one for CUI to ensure
- * it makes it into the AIL. Insert the CUI into the AIL directly and
- * drop the CUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
- xfs_cui_release(cuip);
- return 0;
-}
-
-
-/*
- * This routine is called when an CUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding CUI if it
- * was still in the log. To do this it searches the AIL for the CUI with an id
- * equal to that in the CUD format structure. If we find it we drop the CUD
- * reference, which removes the CUI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_cud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_cud_log_format *cud_formatp;
- struct xfs_cui_log_item *cuip = NULL;
- struct xfs_log_item *lip;
- uint64_t cui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
- return -EFSCORRUPTED;
- cui_id = cud_formatp->cud_cui_id;
-
- /*
- * Search for the CUI with the id in the CUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_CUI) {
- cuip = (struct xfs_cui_log_item *)lip;
- if (cuip->cui_format.cui_id == cui_id) {
- /*
- * Drop the CUD reference to the CUI. This
- * removes the CUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_cui_release(cuip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * Copy an BUI format buffer from the given buf, and into the destination
- * BUI format structure. The BUI/BUD items were designed not to need any
- * special alignment handling.
- */
-static int
-xfs_bui_copy_format(
- struct xfs_log_iovec *buf,
- struct xfs_bui_log_format *dst_bui_fmt)
-{
- struct xfs_bui_log_format *src_bui_fmt;
- uint len;
-
- src_bui_fmt = buf->i_addr;
- len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
-
- if (buf->i_len == len) {
- memcpy(dst_bui_fmt, src_bui_fmt, len);
- return 0;
- }
- return -EFSCORRUPTED;
-}
-
-/*
- * This routine is called to create an in-core extent bmap update
- * item from the bui format structure which was logged on disk.
- * It allocates an in-core bui, copies the extents from the format
- * structure into it, and adds the bui to the AIL with the given
- * LSN.
- */
-STATIC int
-xlog_recover_bui_pass2(
- struct xlog *log,
- struct xlog_recover_item *item,
- xfs_lsn_t lsn)
-{
- int error;
- struct xfs_mount *mp = log->l_mp;
- struct xfs_bui_log_item *buip;
- struct xfs_bui_log_format *bui_formatp;
-
- bui_formatp = item->ri_buf[0].i_addr;
-
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
- return -EFSCORRUPTED;
- buip = xfs_bui_init(mp);
- error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
- if (error) {
- xfs_bui_item_free(buip);
- return error;
- }
- atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
-
- spin_lock(&log->l_ailp->ail_lock);
- /*
- * The RUI has two references. One for the RUD and one for RUI to ensure
- * it makes it into the AIL. Insert the RUI into the AIL directly and
- * drop the RUI reference. Note that xfs_trans_ail_update() drops the
- * AIL lock.
- */
- xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
- xfs_bui_release(buip);
- return 0;
-}
-
-
-/*
- * This routine is called when an BUD format structure is found in a committed
- * transaction in the log. Its purpose is to cancel the corresponding BUI if it
- * was still in the log. To do this it searches the AIL for the BUI with an id
- * equal to that in the BUD format structure. If we find it we drop the BUD
- * reference, which removes the BUI from the AIL and frees it.
- */
-STATIC int
-xlog_recover_bud_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_bud_log_format *bud_formatp;
- struct xfs_bui_log_item *buip = NULL;
- struct xfs_log_item *lip;
- uint64_t bui_id;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp = log->l_ailp;
-
- bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
- return -EFSCORRUPTED;
- bui_id = bud_formatp->bud_bui_id;
-
- /*
- * Search for the BUI with the id in the BUD format structure in the
- * AIL.
- */
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (lip->li_type == XFS_LI_BUI) {
- buip = (struct xfs_bui_log_item *)lip;
- if (buip->bui_format.bui_id == bui_id) {
- /*
- * Drop the BUD reference to the BUI. This
- * removes the BUI from the AIL and frees it.
- */
- spin_unlock(&ailp->ail_lock);
- xfs_bui_release(buip);
- spin_lock(&ailp->ail_lock);
- break;
- }
- }
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
- }
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
-
- return 0;
-}
-
-/*
- * This routine is called when an inode create format structure is found in a
- * committed transaction in the log. It's purpose is to initialise the inodes
- * being allocated on disk. This requires us to get inode cluster buffers that
- * match the range to be initialised, stamped with inode templates and written
- * by delayed write so that subsequent modifications will hit the cached buffer
- * and only need writing out at the end of recovery.
- */
-STATIC int
-xlog_recover_do_icreate_pass2(
- struct xlog *log,
- struct list_head *buffer_list,
- xlog_recover_item_t *item)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_icreate_log *icl;
- struct xfs_ino_geometry *igeo = M_IGEO(mp);
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- unsigned int count;
- unsigned int isize;
- xfs_agblock_t length;
- int bb_per_cluster;
- int cancel_count;
- int nbufs;
- int i;
-
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
- if (icl->icl_type != XFS_LI_ICREATE) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
- return -EINVAL;
- }
-
- if (icl->icl_size != 1) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
- return -EINVAL;
- }
-
- agno = be32_to_cpu(icl->icl_ag);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
- return -EINVAL;
- }
- agbno = be32_to_cpu(icl->icl_agbno);
- if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
- return -EINVAL;
- }
- isize = be32_to_cpu(icl->icl_isize);
- if (isize != mp->m_sb.sb_inodesize) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
- return -EINVAL;
- }
- count = be32_to_cpu(icl->icl_count);
- if (!count) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
- return -EINVAL;
- }
- length = be32_to_cpu(icl->icl_length);
- if (!length || length >= mp->m_sb.sb_agblocks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
- return -EINVAL;
- }
-
- /*
- * The inode chunk is either full or sparse and we only support
- * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
- */
- if (length != igeo->ialloc_blks &&
- length != igeo->ialloc_min_blks) {
- xfs_warn(log->l_mp,
- "%s: unsupported chunk length", __FUNCTION__);
- return -EINVAL;
- }
-
- /* verify inode count is consistent with extent length */
- if ((count >> mp->m_sb.sb_inopblog) != length) {
- xfs_warn(log->l_mp,
- "%s: inconsistent inode count and chunk length",
- __FUNCTION__);
- return -EINVAL;
- }
-
- /*
- * The icreate transaction can cover multiple cluster buffers and these
- * buffers could have been freed and reused. Check the individual
- * buffers for cancellation so we don't overwrite anything written after
- * a cancellation.
- */
- bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
- nbufs = length / igeo->blocks_per_cluster;
- for (i = 0, cancel_count = 0; i < nbufs; i++) {
- xfs_daddr_t daddr;
-
- daddr = XFS_AGB_TO_DADDR(mp, agno,
- agbno + i * igeo->blocks_per_cluster);
- if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
- cancel_count++;
- }
-
- /*
- * We currently only use icreate for a single allocation at a time. This
- * means we should expect either all or none of the buffers to be
- * cancelled. Be conservative and skip replay if at least one buffer is
- * cancelled, but warn the user that something is awry if the buffers
- * are not consistent.
- *
- * XXX: This must be refined to only skip cancelled clusters once we use
- * icreate for multiple chunk allocations.
- */
- ASSERT(!cancel_count || cancel_count == nbufs);
- if (cancel_count) {
- if (cancel_count != nbufs)
- xfs_warn(mp,
- "WARNING: partial inode chunk cancellation, skipped icreate.");
- trace_xfs_log_recover_icreate_cancel(log, icl);
- return 0;
- }
-
- trace_xfs_log_recover_icreate_recover(log, icl);
- return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
- length, be32_to_cpu(icl->icl_gen));
-}
-
-STATIC void
-xlog_recover_buffer_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
- struct xfs_mount *mp = log->l_mp;
-
- if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
- buf_f->blf_len, buf_f->blf_flags)) {
- return;
- }
-
- xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
- buf_f->blf_len, NULL);
-}
-
-STATIC void
-xlog_recover_inode_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_inode_log_format ilf_buf;
- struct xfs_inode_log_format *ilfp;
- struct xfs_mount *mp = log->l_mp;
- int error;
-
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- ilfp = item->ri_buf[0].i_addr;
- } else {
- ilfp = &ilf_buf;
- memset(ilfp, 0, sizeof(*ilfp));
- error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
- if (error)
- return;
- }
-
- if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
- return;
-
- xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
- ilfp->ilf_len, &xfs_inode_buf_ra_ops);
-}
-
-STATIC void
-xlog_recover_dquot_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- struct xfs_mount *mp = log->l_mp;
- struct xfs_disk_dquot *recddq;
- struct xfs_dq_logformat *dq_f;
- uint type;
- int len;
-
-
- if (mp->m_qflags == 0)
- return;
-
- recddq = item->ri_buf[1].i_addr;
- if (recddq == NULL)
- return;
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
- return;
-
- type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
- ASSERT(type);
- if (log->l_quotaoffs_flag & type)
- return;
-
- dq_f = item->ri_buf[0].i_addr;
- ASSERT(dq_f);
- ASSERT(dq_f->qlf_len == 1);
-
- len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
- if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
- return;
-
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
- &xfs_dquot_buf_ra_ops);
-}
-
-STATIC void
-xlog_recover_ra_pass2(
- struct xlog *log,
- struct xlog_recover_item *item)
-{
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- xlog_recover_buffer_ra_pass2(log, item);
- break;
- case XFS_LI_INODE:
- xlog_recover_inode_ra_pass2(log, item);
- break;
- case XFS_LI_DQUOT:
- xlog_recover_dquot_ra_pass2(log, item);
- break;
- case XFS_LI_EFI:
- case XFS_LI_EFD:
- case XFS_LI_QUOTAOFF:
- case XFS_LI_RUI:
- case XFS_LI_RUD:
- case XFS_LI_CUI:
- case XFS_LI_CUD:
- case XFS_LI_BUI:
- case XFS_LI_BUD:
- default:
- break;
- }
-}
-
-STATIC int
-xlog_recover_commit_pass1(
- struct xlog *log,
- struct xlog_recover *trans,
- struct xlog_recover_item *item)
-{
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
-
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- return xlog_recover_buffer_pass1(log, item);
- case XFS_LI_QUOTAOFF:
- return xlog_recover_quotaoff_pass1(log, item);
- case XFS_LI_INODE:
- case XFS_LI_EFI:
- case XFS_LI_EFD:
- case XFS_LI_DQUOT:
- case XFS_LI_ICREATE:
- case XFS_LI_RUI:
- case XFS_LI_RUD:
- case XFS_LI_CUI:
- case XFS_LI_CUD:
- case XFS_LI_BUI:
- case XFS_LI_BUD:
- /* nothing to do in pass 1 */
- return 0;
- default:
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
- __func__, ITEM_TYPE(item));
- ASSERT(0);
- return -EIO;
- }
-}
-
-STATIC int
-xlog_recover_commit_pass2(
- struct xlog *log,
- struct xlog_recover *trans,
- struct list_head *buffer_list,
- struct xlog_recover_item *item)
-{
- trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
-
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- return xlog_recover_buffer_pass2(log, buffer_list, item,
- trans->r_lsn);
- case XFS_LI_INODE:
- return xlog_recover_inode_pass2(log, buffer_list, item,
- trans->r_lsn);
- case XFS_LI_EFI:
- return xlog_recover_efi_pass2(log, item, trans->r_lsn);
- case XFS_LI_EFD:
- return xlog_recover_efd_pass2(log, item);
- case XFS_LI_RUI:
- return xlog_recover_rui_pass2(log, item, trans->r_lsn);
- case XFS_LI_RUD:
- return xlog_recover_rud_pass2(log, item);
- case XFS_LI_CUI:
- return xlog_recover_cui_pass2(log, item, trans->r_lsn);
- case XFS_LI_CUD:
- return xlog_recover_cud_pass2(log, item);
- case XFS_LI_BUI:
- return xlog_recover_bui_pass2(log, item, trans->r_lsn);
- case XFS_LI_BUD:
- return xlog_recover_bud_pass2(log, item);
- case XFS_LI_DQUOT:
- return xlog_recover_dquot_pass2(log, buffer_list, item,
- trans->r_lsn);
- case XFS_LI_ICREATE:
- return xlog_recover_do_icreate_pass2(log, buffer_list, item);
- case XFS_LI_QUOTAOFF:
- /* nothing to do in pass2 */
- return 0;
- default:
- xfs_warn(log->l_mp, "%s: invalid item type (%d)",
- __func__, ITEM_TYPE(item));
- ASSERT(0);
- return -EIO;
- }
+ if (!xlog_is_buffer_cancelled(log, blkno, len))
+ xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
}
STATIC int
@@ -4081,8 +1930,12 @@
int error = 0;
list_for_each_entry(item, item_list, ri_list) {
- error = xlog_recover_commit_pass2(log, trans,
- buffer_list, item);
+ trace_xfs_log_recover_item_recover(log, trans, item,
+ XLOG_RECOVER_PASS2);
+
+ if (item->ri_ops->commit_pass2)
+ error = item->ri_ops->commit_pass2(log, buffer_list,
+ item, trans->r_lsn);
if (error)
return error;
}
@@ -4119,12 +1972,16 @@
return error;
list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
+ trace_xfs_log_recover_item_recover(log, trans, item, pass);
+
switch (pass) {
case XLOG_RECOVER_PASS1:
- error = xlog_recover_commit_pass1(log, trans, item);
+ if (item->ri_ops->commit_pass1)
+ error = item->ri_ops->commit_pass1(log, item);
break;
case XLOG_RECOVER_PASS2:
- xlog_recover_ra_pass2(log, item);
+ if (item->ri_ops->ra_pass2)
+ item->ri_ops->ra_pass2(log, item);
list_move_tail(&item->ri_list, &ra_list);
items_queued++;
if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
@@ -4161,9 +2018,9 @@
xlog_recover_add_item(
struct list_head *head)
{
- xlog_recover_item_t *item;
+ struct xlog_recover_item *item;
- item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
+ item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
INIT_LIST_HEAD(&item->ri_list);
list_add_tail(&item->ri_list, head);
}
@@ -4175,7 +2032,7 @@
char *dp,
int len)
{
- xlog_recover_item_t *item;
+ struct xlog_recover_item *item;
char *ptr, *old_ptr;
int old_len;
@@ -4187,7 +2044,7 @@
ASSERT(len <= sizeof(struct xfs_trans_header));
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
xlog_recover_add_item(&trans->r_itemq);
@@ -4198,12 +2055,13 @@
}
/* take the tail entry */
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
+ ri_list);
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len + old_len, 0);
+ ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -4232,7 +2090,7 @@
int len)
{
struct xfs_inode_log_format *in_f; /* any will do */
- xlog_recover_item_t *item;
+ struct xlog_recover_item *item;
char *ptr;
if (!len)
@@ -4243,13 +2101,13 @@
xfs_warn(log->l_mp, "%s: bad header magic number",
__func__);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -4268,13 +2126,14 @@
in_f = (struct xfs_inode_log_format *)ptr;
/* take the tail entry */
- item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+ item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
+ ri_list);
if (item->ri_total != 0 &&
item->ri_total == item->ri_cnt) {
/* tail item is in use, get a new one */
xlog_recover_add_item(&trans->r_itemq);
item = list_entry(trans->r_itemq.prev,
- xlog_recover_item_t, ri_list);
+ struct xlog_recover_item, ri_list);
}
if (item->ri_total == 0) { /* first region to be added */
@@ -4285,7 +2144,7 @@
in_f->ilf_size);
ASSERT(0);
kmem_free(ptr);
- return -EIO;
+ return -EFSCORRUPTED;
}
item->ri_total = in_f->ilf_size;
@@ -4293,7 +2152,16 @@
kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
0);
}
- ASSERT(item->ri_total > item->ri_cnt);
+
+ if (item->ri_total <= item->ri_cnt) {
+ xfs_warn(log->l_mp,
+ "log item region count (%d) overflowed size (%d)",
+ item->ri_cnt, item->ri_total);
+ ASSERT(0);
+ kmem_free(ptr);
+ return -EFSCORRUPTED;
+ }
+
/* Description region is ri_buf[0] */
item->ri_buf[item->ri_cnt].i_addr = ptr;
item->ri_buf[item->ri_cnt].i_len = len;
@@ -4311,7 +2179,7 @@
xlog_recover_free_trans(
struct xlog_recover *trans)
{
- xlog_recover_item_t *item, *n;
+ struct xlog_recover_item *item, *n;
int i;
hlist_del_init(&trans->r_list);
@@ -4380,7 +2248,7 @@
default:
xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
break;
}
if (error || freeit)
@@ -4460,7 +2328,7 @@
xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
__func__, ohead->oh_clientid);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -4470,7 +2338,7 @@
if (dp + len > end) {
xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
WARN_ON(1);
- return -EIO;
+ return -EFSCORRUPTED;
}
trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
@@ -4563,214 +2431,69 @@
return 0;
}
-/* Recover the EFI if necessary. */
-STATIC int
-xlog_recover_process_efi(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_efi_log_item *efip;
- int error;
-
- /*
- * Skip EFIs that we've already processed.
- */
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_efi_recover(mp, efip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the EFI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_efi(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_efi_log_item *efip;
-
- efip = container_of(lip, struct xfs_efi_log_item, efi_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_efi_release(efip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Recover the RUI if necessary. */
-STATIC int
-xlog_recover_process_rui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_rui_log_item *ruip;
- int error;
-
- /*
- * Skip RUIs that we've already processed.
- */
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
- if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_rui_recover(mp, ruip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the RUI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_rui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_rui_log_item *ruip;
-
- ruip = container_of(lip, struct xfs_rui_log_item, rui_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_rui_release(ruip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Recover the CUI if necessary. */
-STATIC int
-xlog_recover_process_cui(
- struct xfs_trans *parent_tp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_cui_log_item *cuip;
- int error;
-
- /*
- * Skip CUIs that we've already processed.
- */
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
- if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_cui_recover(parent_tp, cuip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the CUI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_cui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_cui_log_item *cuip;
-
- cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_cui_release(cuip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Recover the BUI if necessary. */
-STATIC int
-xlog_recover_process_bui(
- struct xfs_trans *parent_tp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_bui_log_item *buip;
- int error;
-
- /*
- * Skip BUIs that we've already processed.
- */
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
- if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
- return 0;
-
- spin_unlock(&ailp->ail_lock);
- error = xfs_bui_recover(parent_tp, buip);
- spin_lock(&ailp->ail_lock);
-
- return error;
-}
-
-/* Release the BUI since we're cancelling everything. */
-STATIC void
-xlog_recover_cancel_bui(
- struct xfs_mount *mp,
- struct xfs_ail *ailp,
- struct xfs_log_item *lip)
-{
- struct xfs_bui_log_item *buip;
-
- buip = container_of(lip, struct xfs_bui_log_item, bui_item);
-
- spin_unlock(&ailp->ail_lock);
- xfs_bui_release(buip);
- spin_lock(&ailp->ail_lock);
-}
-
-/* Is this log item a deferred action intent? */
-static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
-{
- switch (lip->li_type) {
- case XFS_LI_EFI:
- case XFS_LI_RUI:
- case XFS_LI_CUI:
- case XFS_LI_BUI:
- return true;
- default:
- return false;
- }
-}
-
/* Take all the collected deferred ops and finish them in order. */
static int
xlog_finish_defer_ops(
- struct xfs_trans *parent_tp)
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- int64_t freeblks;
- uint resblks;
- int error;
+ struct xfs_inode *ip;
+ int error = 0;
- /*
- * We're finishing the defer_ops that accumulated as a result of
- * recovering unfinished intent items during log recovery. We
- * reserve an itruncate transaction because it is the largest
- * permanent transaction type. Since we're the only user of the fs
- * right now, take 93% (15/16) of the available free blocks. Use
- * weird math to avoid a 64-bit division.
- */
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
- if (freeblks <= 0)
- return -ENOSPC;
- resblks = min_t(int64_t, UINT_MAX, freeblks);
- resblks = (resblks * 15) >> 4;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
- /* transfer all collected dfops to this transaction */
- xfs_defer_move(tp, parent_tp);
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ struct xfs_trans_res resv;
- return xfs_trans_commit(tp);
+ /*
+ * Create a new transaction reservation from the captured
+ * information. Set logcount to 1 to force the new transaction
+ * to regrant every roll so that we can make forward progress
+ * in recovery no matter how full the log might be.
+ */
+ resv.tr_logres = dfc->dfc_logres;
+ resv.tr_logcount = 1;
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ /*
+ * Transfer to this new transaction all the dfops we captured
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_continue(dfc, tp, &ip);
+
+ error = xfs_trans_commit(tp);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ }
+ if (error)
+ return error;
+ }
+
+ ASSERT(list_empty(capture_list));
+ return 0;
}
+/* Release all the captured defer ops and capture structures in this list. */
+static void
+xlog_abort_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
+{
+ struct xfs_defer_capture *dfc;
+ struct xfs_defer_capture *next;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_release(mp, dfc);
+ }
+}
/*
* When this is called, all of the log intent items which did not have
* corresponding log done items should be in the AIL. What we do now
@@ -4791,35 +2514,23 @@
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_trans *parent_tp;
+ LIST_HEAD(capture_list);
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
struct xfs_ail *ailp;
- int error;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
- /*
- * The intent recovery handlers commit transactions to complete recovery
- * for individual intents, but any new deferred operations that are
- * queued during that process are held off until the very end. The
- * purpose of this transaction is to serve as a container for deferred
- * operations. Each intent recovery handler must transfer dfops here
- * before its local transaction commits, and we'll finish the entire
- * list below.
- */
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
- if (error)
- return error;
-
ailp = log->l_ailp;
spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- while (lip != NULL) {
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
/*
* We're done when we see something other than an intent.
* There should be no intents left in the AIL now.
@@ -4841,35 +2552,29 @@
/*
* NOTE: If your intent processing routine can create more
- * deferred ops, you /must/ attach them to the dfops in this
- * routine or else those subsequent intents will get
+ * deferred ops, you /must/ attach them to the capture list in
+ * the recover routine or else those subsequent intents will be
* replayed in the wrong order!
*/
- switch (lip->li_type) {
- case XFS_LI_EFI:
- error = xlog_recover_process_efi(log->l_mp, ailp, lip);
- break;
- case XFS_LI_RUI:
- error = xlog_recover_process_rui(log->l_mp, ailp, lip);
- break;
- case XFS_LI_CUI:
- error = xlog_recover_process_cui(parent_tp, ailp, lip);
- break;
- case XFS_LI_BUI:
- error = xlog_recover_process_bui(parent_tp, ailp, lip);
- break;
- }
+ spin_unlock(&ailp->ail_lock);
+ error = lip->li_ops->iop_recover(lip, &capture_list);
+ spin_lock(&ailp->ail_lock);
if (error)
- goto out;
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ break;
}
-out:
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- if (!error)
- error = xlog_finish_defer_ops(parent_tp);
- xfs_trans_cancel(parent_tp);
+ if (error)
+ goto err;
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
+ if (error)
+ goto err;
+
+ return 0;
+err:
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
return error;
}
@@ -4901,21 +2606,9 @@
break;
}
- switch (lip->li_type) {
- case XFS_LI_EFI:
- xlog_recover_cancel_efi(log->l_mp, ailp, lip);
- break;
- case XFS_LI_RUI:
- xlog_recover_cancel_rui(log->l_mp, ailp, lip);
- break;
- case XFS_LI_CUI:
- xlog_recover_cancel_cui(log->l_mp, ailp, lip);
- break;
- case XFS_LI_BUI:
- xlog_recover_cancel_bui(log->l_mp, ailp, lip);
- break;
- }
-
+ spin_unlock(&ailp->ail_lock);
+ lip->li_ops->iop_release(lip);
+ spin_lock(&ailp->ail_lock);
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
@@ -4947,7 +2640,7 @@
if (error)
goto out_abort;
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket);
@@ -4987,7 +2680,7 @@
/*
* Get the on disk inode to find the next inode in the bucket.
*/
- error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
if (error)
goto fail_iput;
@@ -5083,7 +2776,7 @@
* buffer reference though, so that it stays pinned in memory
* while we need the buffer.
*/
- agi = XFS_BUF_TO_AGI(agibp);
+ agi = agibp->b_addr;
xfs_buf_unlock(agibp);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
@@ -5172,8 +2865,10 @@
* If the filesystem is CRC enabled, this mismatch becomes a
* fatal log corruption failure.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
}
xlog_unpack_data(rhead, dp, log);
@@ -5186,35 +2881,34 @@
xlog_valid_rec_header(
struct xlog *log,
struct xlog_rec_header *rhead,
- xfs_daddr_t blkno)
+ xfs_daddr_t blkno,
+ int bufsize)
{
int hlen;
- if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
- XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
return -EFSCORRUPTED;
- }
- if (unlikely(
- (!rhead->h_version ||
- (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
+ if (XFS_IS_CORRUPT(log->l_mp,
+ (!rhead->h_version ||
+ (be32_to_cpu(rhead->h_version) &
+ (~XLOG_VERSION_OKBITS))))) {
xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
__func__, be32_to_cpu(rhead->h_version));
- return -EIO;
+ return -EFSCORRUPTED;
}
- /* LR body must have data or it wouldn't have been written */
+ /*
+ * LR body must have data (or it wouldn't have been written)
+ * and h_len must not be greater than LR buffer size.
+ */
hlen = be32_to_cpu(rhead->h_len);
- if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
- XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
return -EFSCORRUPTED;
- }
- if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
- XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+
+ if (XFS_IS_CORRUPT(log->l_mp,
+ blkno > log->l_logBBsize || blkno > INT_MAX))
return -EFSCORRUPTED;
- }
return 0;
}
@@ -5272,9 +2966,6 @@
goto bread_err1;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, tail_blk);
- if (error)
- goto bread_err1;
/*
* xfsprogs has a bug where record length is based on lsunit but
@@ -5289,26 +2980,22 @@
*/
h_size = be32_to_cpu(rhead->h_size);
h_len = be32_to_cpu(rhead->h_len);
- if (h_len > h_size) {
- if (h_len <= log->l_mp->m_logbsize &&
- be32_to_cpu(rhead->h_num_logops) == 1) {
- xfs_warn(log->l_mp,
+ if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
+ rhead->h_num_logops == cpu_to_be32(1)) {
+ xfs_warn(log->l_mp,
"invalid iclog size (%d bytes), using lsunit (%d bytes)",
- h_size, log->l_mp->m_logbsize);
- h_size = log->l_mp->m_logbsize;
- } else
- return -EFSCORRUPTED;
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
}
- if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
- (h_size > XLOG_HEADER_CYCLE_SIZE)) {
- hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
- if (h_size % XLOG_HEADER_CYCLE_SIZE)
- hblks++;
+ error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
+ if (error)
+ goto bread_err1;
+
+ hblks = xlog_logrec_hblks(log, rhead);
+ if (hblks != 1) {
kmem_free(hbp);
hbp = xlog_alloc_buffer(log, hblks);
- } else {
- hblks = 1;
}
} else {
ASSERT(log->l_sectBBsize == 1);
@@ -5380,7 +3067,7 @@
}
rhead = (xlog_rec_header_t *)offset;
error = xlog_valid_rec_header(log, rhead,
- split_hblks ? blk_no : 0);
+ split_hblks ? blk_no : 0, h_size);
if (error)
goto bread_err2;
@@ -5461,7 +3148,7 @@
goto bread_err2;
rhead = (xlog_rec_header_t *)offset;
- error = xlog_valid_rec_header(log, rhead, blk_no);
+ error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
if (error)
goto bread_err2;
@@ -5578,14 +3265,14 @@
*/
STATIC int
xlog_do_recover(
- struct xlog *log,
- xfs_daddr_t head_blk,
- xfs_daddr_t tail_blk)
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
{
- struct xfs_mount *mp = log->l_mp;
- int error;
- xfs_buf_t *bp;
- xfs_sb_t *sbp;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_buf *bp = mp->m_sb_bp;
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
trace_xfs_log_recover(log, head_blk, tail_blk);
@@ -5599,9 +3286,8 @@
/*
* If IO errors happened during recovery, bail out.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
+ if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- }
/*
* We now update the tail_lsn since much of the recovery has completed
@@ -5615,19 +3301,15 @@
xlog_assign_tail_lsn(mp);
/*
- * Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock and reverify it.
+ * Now that we've finished replaying all buffer and inode updates,
+ * re-read the superblock and reverify it.
*/
- bp = xfs_getsb(mp);
- bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
- ASSERT(!(bp->b_flags & XBF_WRITE));
- bp->b_flags |= XBF_READ;
- bp->b_ops = &xfs_sb_buf_ops;
-
- error = xfs_buf_submit(bp);
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ error = _xfs_buf_read(bp, XBF_READ);
if (error) {
if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_ioerror_alert(bp, __this_address);
ASSERT(0);
}
xfs_buf_relse(bp);
@@ -5635,8 +3317,7 @@
}
/* Convert superblock from on-disk format */
- sbp = &mp->m_sb;
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, bp->b_addr);
xfs_buf_relse(bp);
/* re-initialise in-core superblock and geometry structures */
@@ -5765,6 +3446,14 @@
int error;
error = xlog_recover_process_intents(log);
if (error) {
+ /*
+ * Cancel all the unprocessed intent items now so that
+ * we don't leave them pinned in the AIL. This can
+ * cause the AIL to livelock on the pinned item if
+ * anyone tries to push the AIL (inode reclaim does
+ * this) before we get around to xfs_log_mount_cancel.
+ */
+ xlog_recover_cancel_intents(log);
xfs_alert(log->l_mp, "Failed to recover intents");
return error;
}
@@ -5809,7 +3498,6 @@
struct xlog *log)
{
xfs_mount_t *mp;
- xfs_agf_t *agfp;
xfs_buf_t *agfbp;
xfs_buf_t *agibp;
xfs_agnumber_t agno;
@@ -5829,7 +3517,8 @@
xfs_alert(mp, "%s agf read failed agno %d error %d",
__func__, agno, error);
} else {
- agfp = XFS_BUF_TO_AGF(agfbp);
+ struct xfs_agf *agfp = agfbp->b_addr;
+
freeblks += be32_to_cpu(agfp->agf_freeblks) +
be32_to_cpu(agfp->agf_flcount);
xfs_buf_relse(agfbp);
@@ -5840,7 +3529,7 @@
xfs_alert(mp, "%s agi read failed agno %d error %d",
__func__, agno, error);
} else {
- struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ struct xfs_agi *agi = agibp->b_addr;
itotal += be32_to_cpu(agi->agi_count);
ifree += be32_to_cpu(agi->agi_freecount);
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 9804efe..bc66d95 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -20,8 +20,8 @@
const struct xfs_mount *mp,
struct va_format *vaf)
{
- if (mp && mp->m_fsname) {
- printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+ if (mp && mp->m_super) {
+ printk("%sXFS (%s): %pV\n", level, mp->m_super->s_id, vaf);
return;
}
printk("%sXFS: %pV\n", level, vaf);
@@ -86,17 +86,25 @@
}
void
-asswarn(char *expr, char *file, int line)
+asswarn(
+ struct xfs_mount *mp,
+ char *expr,
+ char *file,
+ int line)
{
- xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+ xfs_warn(mp, "Assertion failed: %s, file: %s, line: %d",
expr, file, line);
WARN_ON(1);
}
void
-assfail(char *expr, char *file, int line)
+assfail(
+ struct xfs_mount *mp,
+ char *expr,
+ char *file,
+ int line)
{
- xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+ xfs_emerg(mp, "Assertion failed: %s, file: %s, line: %d",
expr, file, line);
if (xfs_globals.bug_on_assert)
BUG();
@@ -105,7 +113,29 @@
}
void
-xfs_hex_dump(void *p, int length)
+xfs_hex_dump(const void *p, int length)
{
print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}
+
+void
+xfs_buf_alert_ratelimited(
+ struct xfs_buf *bp,
+ const char *rlmsg,
+ const char *fmt,
+ ...)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct va_format vaf;
+ va_list args;
+
+ /* use the more aggressive per-target rate limit for buffers */
+ if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ __xfs_printk(KERN_ALERT, mp, &vaf);
+ va_end(args);
+}
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 34447dc..3c392b1 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -31,15 +31,27 @@
}
#endif
-#define xfs_printk_ratelimited(func, dev, fmt, ...) \
+#define xfs_printk_ratelimited(func, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
if (__ratelimit(&_rs)) \
- func(dev, fmt, ##__VA_ARGS__); \
+ func(dev, fmt, ##__VA_ARGS__); \
} while (0)
+#define xfs_printk_once(func, dev, fmt, ...) \
+({ \
+ static bool __section(".data.once") __print_once; \
+ bool __ret_print_once = !__print_once; \
+ \
+ if (!__print_once) { \
+ __print_once = true; \
+ func(dev, fmt, ##__VA_ARGS__); \
+ } \
+ unlikely(__ret_print_once); \
+})
+
#define xfs_emerg_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
#define xfs_alert_ratelimited(dev, fmt, ...) \
@@ -57,9 +69,17 @@
#define xfs_debug_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
-extern void assfail(char *expr, char *f, int l);
-extern void asswarn(char *expr, char *f, int l);
+#define xfs_warn_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_once(dev, fmt, ...) \
+ xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__)
-extern void xfs_hex_dump(void *p, int length);
+void assfail(struct xfs_mount *mp, char *expr, char *f, int l);
+void asswarn(struct xfs_mount *mp, char *expr, char *f, int l);
+
+extern void xfs_hex_dump(const void *p, int length);
+
+void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
+ const char *fmt, ...);
#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5a0ce0c..7110507 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -31,7 +31,7 @@
#include "xfs_reflink.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
-
+#include "xfs_trace.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -80,9 +80,9 @@
}
if (hole < 0) {
- xfs_uuid_table = kmem_realloc(xfs_uuid_table,
+ xfs_uuid_table = krealloc(xfs_uuid_table,
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
- 0);
+ GFP_KERNEL | __GFP_NOFAIL);
hole = xfs_uuid_table_size++;
}
xfs_uuid_table[hole] = *uuid;
@@ -148,7 +148,6 @@
ASSERT(atomic_read(&pag->pag_ref) == 0);
xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
- mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
}
}
@@ -202,7 +201,6 @@
pag->pag_agno = index;
pag->pag_mount = mp;
spin_lock_init(&pag->pag_ici_lock);
- mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
error = xfs_buf_hash_init(pag);
@@ -247,7 +245,6 @@
out_hash_destroy:
xfs_buf_hash_destroy(pag);
out_free_pag:
- mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
@@ -257,7 +254,6 @@
break;
xfs_buf_hash_destroy(pag);
xfs_iunlink_destroy(pag);
- mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
return error;
@@ -315,7 +311,7 @@
/*
* Initialize the mount structure from the superblock.
*/
- xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, bp->b_addr);
/*
* If we haven't validated the superblock, do so now before we try
@@ -365,108 +361,122 @@
}
/*
- * Update alignment values based on mount options and sb values
+ * If the sunit/swidth change would move the precomputed root inode value, we
+ * must reject the ondisk change because repair will stumble over that.
+ * However, we allow the mount to proceed because we never rejected this
+ * combination before. Returns true to update the sb, false otherwise.
+ */
+static inline int
+xfs_check_new_dalign(
+ struct xfs_mount *mp,
+ int new_dalign,
+ bool *update_sb)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ xfs_ino_t calc_ino;
+
+ calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
+ trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
+
+ if (sbp->sb_rootino == calc_ino) {
+ *update_sb = true;
+ return 0;
+ }
+
+ xfs_warn(mp,
+"Cannot change stripe alignment; would require moving root inode.");
+
+ /*
+ * XXX: Next time we add a new incompat feature, this should start
+ * returning -EINVAL to fail the mount. Until then, spit out a warning
+ * that we're ignoring the administrator's instructions.
+ */
+ xfs_warn(mp, "Skipping superblock stripe alignment update.");
+ *update_sb = false;
+ return 0;
+}
+
+/*
+ * If we were provided with new sunit/swidth values as mount options, make sure
+ * that they pass basic alignment and superblock feature checks, and convert
+ * them into the same units (FSB) that everything else expects. This step
+ * /must/ be done before computing the inode geometry.
*/
STATIC int
-xfs_update_alignment(xfs_mount_t *mp)
+xfs_validate_new_dalign(
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
+ if (mp->m_dalign == 0)
+ return 0;
- if (mp->m_dalign) {
+ /*
+ * If stripe unit and stripe width are not multiples
+ * of the fs blocksize turn off alignment.
+ */
+ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+ (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ } else {
/*
- * If stripe unit and stripe width are not multiples
- * of the fs blocksize turn off alignment.
+ * Convert the stripe unit and width to FSBs.
*/
- if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
- (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. blocksize(%d)",
- sbp->sb_blocksize);
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
return -EINVAL;
- } else {
- /*
- * Convert the stripe unit and width to FSBs.
- */
- mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
- if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. agsize(%d)",
- sbp->sb_agblocks);
- return -EINVAL;
- } else if (mp->m_dalign) {
- mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
- } else {
- xfs_warn(mp,
- "alignment check failed: sunit(%d) less than bsize(%d)",
- mp->m_dalign, sbp->sb_blocksize);
- return -EINVAL;
- }
- }
-
- /*
- * Update superblock with new values
- * and log changes
- */
- if (xfs_sb_version_hasdalign(sbp)) {
- if (sbp->sb_unit != mp->m_dalign) {
- sbp->sb_unit = mp->m_dalign;
- mp->m_update_sb = true;
- }
- if (sbp->sb_width != mp->m_swidth) {
- sbp->sb_width = mp->m_swidth;
- mp->m_update_sb = true;
- }
+ } else if (mp->m_dalign) {
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
xfs_warn(mp,
- "cannot change alignment: superblock does not support data alignment");
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
return -EINVAL;
}
- } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
- xfs_sb_version_hasdalign(&mp->m_sb)) {
- mp->m_dalign = sbp->sb_unit;
- mp->m_swidth = sbp->sb_width;
+ }
+
+ if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+ xfs_warn(mp,
+"cannot change alignment: superblock does not support data alignment");
+ return -EINVAL;
}
return 0;
}
-/*
- * Set the default minimum read and write sizes unless
- * already specified in a mount option.
- * We use smaller I/O sizes when the file system
- * is being used for NFS service (wsync mount option).
- */
-STATIC void
-xfs_set_rw_sizes(xfs_mount_t *mp)
+/* Update alignment values based on mount options and sb values. */
+STATIC int
+xfs_update_alignment(
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
- int readio_log, writeio_log;
+ struct xfs_sb *sbp = &mp->m_sb;
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
- readio_log = XFS_WSYNC_READIO_LOG;
- writeio_log = XFS_WSYNC_WRITEIO_LOG;
- } else {
- readio_log = XFS_READIO_LOG_LARGE;
- writeio_log = XFS_WRITEIO_LOG_LARGE;
- }
- } else {
- readio_log = mp->m_readio_log;
- writeio_log = mp->m_writeio_log;
+ if (mp->m_dalign) {
+ bool update_sb;
+ int error;
+
+ if (sbp->sb_unit == mp->m_dalign &&
+ sbp->sb_width == mp->m_swidth)
+ return 0;
+
+ error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
+ if (error || !update_sb)
+ return error;
+
+ sbp->sb_unit = mp->m_dalign;
+ sbp->sb_width = mp->m_swidth;
+ mp->m_update_sb = true;
+ } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+ xfs_sb_version_hasdalign(&mp->m_sb)) {
+ mp->m_dalign = sbp->sb_unit;
+ mp->m_swidth = sbp->sb_width;
}
- if (sbp->sb_blocklog > readio_log) {
- mp->m_readio_log = sbp->sb_blocklog;
- } else {
- mp->m_readio_log = readio_log;
- }
- mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
- if (sbp->sb_blocklog > writeio_log) {
- mp->m_writeio_log = sbp->sb_blocklog;
- } else {
- mp->m_writeio_log = writeio_log;
- }
- mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+ return 0;
}
/*
@@ -692,12 +702,12 @@
}
/*
- * Check if sb_agblocks is aligned at stripe boundary
- * If sb_agblocks is NOT aligned turn off m_dalign since
- * allocator alignment is within an ag, therefore ag has
- * to be aligned at stripe boundary.
+ * If we were given new sunit/swidth options, do some basic validation
+ * checks and convert the incore dalign and swidth values to the
+ * same units (FSB) that everything else uses. This /must/ happen
+ * before computing the inode geometry.
*/
- error = xfs_update_alignment(mp);
+ error = xfs_validate_new_dalign(mp);
if (error)
goto out;
@@ -708,10 +718,22 @@
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ /*
+ * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
+ * is NOT aligned turn off m_dalign since allocator alignment is within
+ * an ag, therefore ag has to be aligned at stripe boundary. Note that
+ * we must compute the free space and rmap btree geometry before doing
+ * this.
+ */
+ error = xfs_update_alignment(mp);
+ if (error)
+ goto out;
+
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
- error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
+ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+ NULL, mp->m_super->s_id);
if (error)
goto out;
@@ -733,9 +755,12 @@
goto out_remove_errortag;
/*
- * Set the minimum read and write sizes
+ * Update the preferred write size based on the information from the
+ * on-disk superblock.
*/
- xfs_set_rw_sizes(mp);
+ mp->m_allocsize_log =
+ max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
+ mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
/* set the low space thresholds for dynamic preallocation */
xfs_set_low_space_thresholds(mp);
@@ -801,9 +826,8 @@
goto out_free_dir;
}
- if (!sbp->sb_logblocks) {
+ if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
xfs_warn(mp, "no log defined");
- XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto out_free_perag;
}
@@ -841,12 +865,10 @@
ASSERT(rip != NULL);
- if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
+ if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
xfs_warn(mp, "corrupted root inode %llu: not a directory",
(unsigned long long)rip->i_ino);
xfs_iunlock(rip, XFS_ILOCK_EXCL);
- XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
- mp);
error = -EFSCORRUPTED;
goto out_rele_rip;
}
@@ -994,7 +1016,7 @@
* quota inodes.
*/
cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_reclaim_inodes(mp);
xfs_health_unmount(mp);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
@@ -1042,11 +1064,12 @@
* We can potentially deadlock here if we have an inode cluster
* that has been freed has its buffer still pinned in memory because
* the transaction is still sitting in a iclog. The stale inodes
- * on that buffer will have their flush locks held until the
- * transaction hits the disk and the callbacks run. the inode
- * flush takes the flush lock unconditionally and with nothing to
- * push out the iclog we will never get that unlocked. hence we
- * need to force the log first.
+ * on that buffer will be pinned to the buffer until the
+ * transaction hits the disk and the callbacks run. Pushing the AIL will
+ * skip the stale inodes and may never see the pinned buffer, so
+ * nothing will push out the iclog and unpin the buffer. Hence we
+ * need to force the log here to ensure all items are flushed into the
+ * AIL before we go any further.
*/
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -1071,13 +1094,12 @@
xfs_ail_push_all_sync(mp->m_ail);
/*
- * And reclaim all inodes. At this point there should be no dirty
- * inodes and none should be pinned or locked, but use synchronous
- * reclaim just to be sure. We can stop background inode reclaim
- * here as well if it is still running.
+ * Reclaim all inodes. At this point there should be no dirty inodes and
+ * none should be pinned or locked. Stop background inode reclaim here
+ * if it is still running.
*/
cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_reclaim_inodes(mp);
xfs_health_unmount(mp);
xfs_qm_unmount(mp);
@@ -1169,39 +1191,6 @@
}
/*
- * Deltas for the inode count are +/-64, hence we use a large batch size
- * of 128 so we don't need to take the counter lock on every update.
- */
-#define XFS_ICOUNT_BATCH 128
-int
-xfs_mod_icount(
- struct xfs_mount *mp,
- int64_t delta)
-{
- percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
- if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
- ASSERT(0);
- percpu_counter_add(&mp->m_icount, -delta);
- return -EINVAL;
- }
- return 0;
-}
-
-int
-xfs_mod_ifree(
- struct xfs_mount *mp,
- int64_t delta)
-{
- percpu_counter_add(&mp->m_ifree, delta);
- if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
- ASSERT(0);
- percpu_counter_add(&mp->m_ifree, -delta);
- return -EINVAL;
- }
- return 0;
-}
-
-/*
* Deltas for the block count can vary from 1 to very large, but lock contention
* only occurs on frequent small block count updates such as in the delayed
* allocation path for buffered writes (page a time updates). Hence we set
@@ -1279,10 +1268,9 @@
spin_unlock(&mp->m_sb_lock);
return 0;
}
- printk_once(KERN_WARNING
- "Filesystem \"%s\": reserve blocks depleted! "
- "Consider increasing reserve pool size.",
- mp->m_fsname);
+ xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+
fdblocks_enospc:
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
@@ -1307,23 +1295,6 @@
}
/*
- * xfs_getsb() is called to obtain the buffer for the superblock.
- * The buffer is returned locked and read in from disk.
- * The buffer should be released with a call to xfs_brelse().
- */
-struct xfs_buf *
-xfs_getsb(
- struct xfs_mount *mp)
-{
- struct xfs_buf *bp = mp->m_sb_bp;
-
- xfs_buf_lock(bp);
- xfs_buf_hold(bp);
- ASSERT(bp->b_flags & XBF_DONE);
- return bp;
-}
-
-/*
* Used to free the superblock along various error paths.
*/
void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fdb60e0..dfa429b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -9,10 +9,8 @@
struct xlog;
struct xfs_inode;
struct xfs_mru_cache;
-struct xfs_nameops;
struct xfs_ail;
struct xfs_quotainfo;
-struct xfs_dir_ops;
struct xfs_da_geometry;
/* dynamic preallocation free space thresholds, 5% down to 1% */
@@ -57,66 +55,25 @@
long retry_timeout; /* in jiffies, -1 = infinite */
};
+/*
+ * The struct xfs_mount layout is optimised to separate read-mostly variables
+ * from variables that are frequently modified. We put the read-mostly variables
+ * first, then place all the other variables at the end.
+ *
+ * Typically, read-mostly variables are those that are set at mount time and
+ * never changed again, or only change rarely as a result of things like sysfs
+ * knobs being tweaked.
+ */
typedef struct xfs_mount {
- struct super_block *m_super;
- xfs_tid_t m_tid; /* next unused tid for fs */
-
- /*
- * Bitsets of per-fs metadata that have been checked and/or are sick.
- * Callers must hold m_sb_lock to access these two fields.
- */
- uint8_t m_fs_checked;
- uint8_t m_fs_sick;
- /*
- * Bitsets of rt metadata that have been checked and/or are sick.
- * Callers must hold m_sb_lock to access this field.
- */
- uint8_t m_rt_checked;
- uint8_t m_rt_sick;
-
- struct xfs_ail *m_ail; /* fs active log item list */
-
struct xfs_sb m_sb; /* copy of fs superblock */
- spinlock_t m_sb_lock; /* sb counter lock */
- struct percpu_counter m_icount; /* allocated inodes counter */
- struct percpu_counter m_ifree; /* free inodes counter */
- struct percpu_counter m_fdblocks; /* free block counter */
- /*
- * Count of data device blocks reserved for delayed allocations,
- * including indlen blocks. Does not include allocated CoW staging
- * extents or anything related to the rt device.
- */
- struct percpu_counter m_delalloc_blks;
-
+ struct super_block *m_super;
+ struct xfs_ail *m_ail; /* fs active log item list */
struct xfs_buf *m_sb_bp; /* buffer for superblock */
- char *m_fsname; /* filesystem name */
- int m_fsname_len; /* strlen of fs name */
char *m_rtname; /* realtime device name */
char *m_logname; /* external log device name */
- int m_bsize; /* fs logical block size */
- xfs_agnumber_t m_agfrotor; /* last ag where space found */
- xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
- spinlock_t m_agirotor_lock;/* .. and lock protecting it */
- xfs_agnumber_t m_maxagi; /* highest inode alloc group */
- uint m_readio_log; /* min read size log bytes */
- uint m_readio_blocks; /* min read size blocks */
- uint m_writeio_log; /* min write size log bytes */
- uint m_writeio_blocks; /* min write size blocks */
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
- struct xfs_ino_geometry m_ino_geo; /* inode geometry */
- int m_logbufs; /* number of log buffers */
- int m_logbsize; /* size of each log buffer */
- uint m_rsumlevels; /* rt summary levels */
- uint m_rsumsize; /* size of rt summary, bytes */
- /*
- * Optional cache of rt summary level per bitmap block with the
- * invariant that m_rsum_cache[bbno] <= the minimum i for which
- * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
- * inode lock.
- */
- uint8_t *m_rsum_cache;
struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
struct xfs_inode *m_rsumip; /* pointer to summary inode */
struct xfs_inode *m_rootip; /* pointer to root directory */
@@ -124,9 +81,26 @@
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
+ /*
+ * Optional cache of rt summary level per bitmap block with the
+ * invariant that m_rsum_cache[bbno] <= the minimum i for which
+ * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
+ * inode lock.
+ */
+ uint8_t *m_rsum_cache;
+ struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
+ struct workqueue_struct *m_buf_workqueue;
+ struct workqueue_struct *m_unwritten_workqueue;
+ struct workqueue_struct *m_cil_workqueue;
+ struct workqueue_struct *m_reclaim_workqueue;
+ struct workqueue_struct *m_eofblocks_workqueue;
+ struct workqueue_struct *m_sync_workqueue;
+
+ int m_bsize; /* fs logical block size */
uint8_t m_blkbit_log; /* blocklog + NBBY */
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
+ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -145,46 +119,82 @@
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
uint m_alloc_set_aside; /* space we can't use */
uint m_ag_max_usable; /* max space per AG */
+ int m_dalign; /* stripe unit */
+ int m_swidth; /* stripe width */
+ xfs_agnumber_t m_maxagi; /* highest inode alloc group */
+ uint m_allocsize_log;/* min write size log bytes */
+ uint m_allocsize_blocks; /* min write size blocks */
+ int m_logbufs; /* number of log buffers */
+ int m_logbsize; /* size of each log buffer */
+ uint m_rsumlevels; /* rt summary levels */
+ uint m_rsumsize; /* size of rt summary, bytes */
+ int m_fixedfsid[2]; /* unchanged for life of FS */
+ uint m_qflags; /* quota status flags */
+ uint64_t m_flags; /* global mount flags */
+ int64_t m_low_space[XFS_LOWSP_MAX];
+ struct xfs_ino_geometry m_ino_geo; /* inode geometry */
+ struct xfs_trans_resv m_resv; /* precomputed res values */
+ /* low free space thresholds */
+ bool m_always_cow;
+ bool m_fail_unmount;
+ bool m_finobt_nores; /* no per-AG finobt resv. */
+ bool m_update_sb; /* sb needs update in mount */
+
+ /*
+ * Bitsets of per-fs metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_fs_checked;
+ uint8_t m_fs_sick;
+ /*
+ * Bitsets of rt metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_rt_checked;
+ uint8_t m_rt_sick;
+
+ /*
+ * End of read-mostly variables. Frequently written variables and locks
+ * should be placed below this comment from now on. The first variable
+ * here is marked as cacheline aligned so that it is separated from
+ * the read-mostly variables.
+ */
+
+ spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
+ struct percpu_counter m_icount; /* allocated inodes counter */
+ struct percpu_counter m_ifree; /* free inodes counter */
+ struct percpu_counter m_fdblocks; /* free block counter */
+ /*
+ * Count of data device blocks reserved for delayed allocations,
+ * including indlen blocks. Does not include allocated CoW staging
+ * extents or anything related to the rt device.
+ */
+ struct percpu_counter m_delalloc_blks;
+
struct radix_tree_root m_perag_tree; /* per-ag accounting info */
spinlock_t m_perag_lock; /* lock for m_perag_tree */
- struct mutex m_growlock; /* growfs mutex */
- int m_fixedfsid[2]; /* unchanged for life of FS */
- uint64_t m_flags; /* global mount flags */
- bool m_finobt_nores; /* no per-AG finobt resv. */
- uint m_qflags; /* quota status flags */
- struct xfs_trans_resv m_resv; /* precomputed res values */
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
- int m_dalign; /* stripe unit */
- int m_swidth; /* stripe width */
- uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
- const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
- const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
- const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
- uint m_chsize; /* size of next field */
- atomic_t m_active_trans; /* number trans frozen */
- struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct delayed_work m_eofblocks_work; /* background eof blocks
trimming */
struct delayed_work m_cowblocks_work; /* background cow blocks
trimming */
- bool m_update_sb; /* sb needs update in mount */
- int64_t m_low_space[XFS_LOWSP_MAX];
- /* low free space thresholds */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+ xfs_agnumber_t m_agfrotor; /* last ag where space found */
+ xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
+ spinlock_t m_agirotor_lock;/* .. and lock protecting it */
- struct workqueue_struct *m_buf_workqueue;
- struct workqueue_struct *m_unwritten_workqueue;
- struct workqueue_struct *m_cil_workqueue;
- struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_eofblocks_workqueue;
- struct workqueue_struct *m_sync_workqueue;
+ /*
+ * Workqueue item so that we can coalesce multiple inode flush attempts
+ * into a single flush.
+ */
+ struct work_struct m_flush_inodes_work;
/*
* Generation of the filesysyem layout. This is incremented by each
@@ -196,9 +206,8 @@
* to various other kinds of pain inflicted on the pNFS server.
*/
uint32_t m_generation;
+ struct mutex m_growlock; /* growfs mutex */
- bool m_always_cow;
- bool m_fail_unmount;
#ifdef DEBUG
/*
* Frequency with which errors are injected. Replaces xfs_etest; the
@@ -229,7 +238,7 @@
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
-#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
+#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */
#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */
#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */
#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
@@ -238,20 +247,13 @@
* allocation */
#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */
#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
-#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
+#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred
* I/O size in stat() */
#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
allocator */
#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
-
-#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
-
-
-/*
- * Default minimum read and write sizes.
- */
-#define XFS_READIO_LOG_LARGE 16
-#define XFS_WRITEIO_LOG_LARGE 16
+#define XFS_MOUNT_DAX_ALWAYS (1ULL << 26)
+#define XFS_MOUNT_DAX_NEVER (1ULL << 27)
/*
* Max and min values for mount-option defined I/O
@@ -260,37 +262,6 @@
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-/*
- * Synchronous read and write sizes. This should be
- * better for NFSv2 wsync filesystems.
- */
-#define XFS_WSYNC_READIO_LOG 15 /* 32k */
-#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
-
-/*
- * Allow large block sizes to be reported to userspace programs if the
- * "largeio" mount option is used.
- *
- * If compatibility mode is specified, simply return the basic unit of caching
- * so that we don't get inefficient read/modify/write I/O from user apps.
- * Otherwise....
- *
- * If the underlying volume is a stripe, then return the stripe width in bytes
- * as the recommended I/O size. It is not a stripe and we've set a default
- * buffered I/O size, return that, otherwise return the compat default.
- */
-static inline unsigned long
-xfs_preferred_iosize(xfs_mount_t *mp)
-{
- if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
- return PAGE_SIZE;
- return (mp->m_swidth ?
- (mp->m_swidth << mp->m_sb.sb_blocklog) :
- ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
- (1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) :
- PAGE_SIZE));
-}
-
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
@@ -303,8 +274,6 @@
#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
-#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
-#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
/*
* Flags for xfs_mountfs
@@ -385,7 +354,6 @@
spinlock_t pag_ici_lock; /* incore inode cache lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */
- struct mutex pag_ici_reclaim_lock; /* serialisation point */
unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
/* buffer cache index */
@@ -438,13 +406,10 @@
xfs_agnumber_t *maxagi);
extern void xfs_unmountfs(xfs_mount_t *);
-extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
-extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index b6701b4..0aa87c2 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -15,6 +15,10 @@
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
+#define XFS_CHECK_VALUE(value, expected) \
+ BUILD_BUG_ON_MSG((value) != (expected), \
+ "XFS: value of " #value " is wrong, expected " #expected)
+
static inline void __init
xfs_check_ondisk_structs(void)
{
@@ -23,7 +27,7 @@
XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
- XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 344);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
@@ -41,7 +45,8 @@
XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24);
- XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_timestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_timestamp, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
@@ -84,12 +89,12 @@
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
- XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
+ XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
@@ -111,6 +116,7 @@
XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
/* log structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88);
XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
@@ -120,7 +126,8 @@
XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
- XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
@@ -151,6 +158,20 @@
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
+
+ /*
+ * Make sure the incore inode timestamp range corresponds to hand
+ * converted values based on the ondisk format specification.
+ */
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MIN - XFS_BIGTIME_EPOCH_OFFSET,
+ XFS_LEGACY_TIME_MIN);
+ XFS_CHECK_VALUE(XFS_BIGTIME_TIME_MAX - XFS_BIGTIME_EPOCH_OFFSET,
+ 16299260424LL);
+
+ /* Do the same with the incore quota expiration range. */
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MIN << XFS_DQ_BIGTIME_SHIFT, 4);
+ XFS_CHECK_VALUE(XFS_DQ_BIGTIME_EXPIRY_MAX << XFS_DQ_BIGTIME_SHIFT,
+ 16299260424LL);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f63fe8d..f3082a9 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -12,6 +12,7 @@
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_iomap.h"
+#include "xfs_pnfs.h"
/*
* Ensure that we do not have any outstanding pNFS layouts that can be used by
@@ -57,9 +58,8 @@
{
struct xfs_mount *mp = XFS_M(sb);
- printk_once(KERN_NOTICE
-"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
- mp->m_fsname);
+ xfs_notice_once(mp,
+"Using experimental pNFS feature, use at your own risk!");
if (*len < sizeof(uuid_t))
return -EINVAL;
@@ -142,43 +142,38 @@
lock_flags = xfs_ilock_data_map_shared(ip);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, bmapi_flags);
- xfs_iunlock(ip, lock_flags);
- if (error)
- goto out_unlock;
+ ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
- if (write) {
- enum xfs_prealloc_flags flags = 0;
+ if (!error && write &&
+ (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
+ if (offset + length > XFS_ISIZE(ip))
+ end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+ else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ end_fsb = min(end_fsb, imap.br_startoff +
+ imap.br_blockcount);
+ xfs_iunlock(ip, lock_flags);
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-
- if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * xfs_iomap_write_direct() expects to take ownership of
- * the shared ilock.
- */
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_iomap_write_direct(ip, offset, length,
- &imap, nimaps);
- if (error)
- goto out_unlock;
-
- /*
- * Ensure the next transaction is committed
- * synchronously so that the blocks allocated and
- * handed out to the client are guaranteed to be
- * present even after a server crash.
- */
- flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
- }
-
- error = xfs_update_prealloc_flags(ip, flags);
+ error = xfs_iomap_write_direct(ip, offset_fsb,
+ end_fsb - offset_fsb, &imap);
if (error)
goto out_unlock;
+
+ /*
+ * Ensure the next transaction is committed synchronously so
+ * that the blocks allocated and handed out to the client are
+ * guaranteed to be present even after a server crash.
+ */
+ error = xfs_update_prealloc_flags(ip,
+ XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+ if (error)
+ goto out_unlock;
+ } else {
+ xfs_iunlock(ip, lock_flags);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
index 4bcc3e6..b03333f 100644
--- a/fs/xfs/xfs_pwork.c
+++ b/fs/xfs/xfs_pwork.c
@@ -132,5 +132,5 @@
* For now we'll go with the most conservative setting possible,
* which is two threads for an SSD and 1 thread everywhere else.
*/
- return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
+ return blk_queue_nonrot(btp->bt_bdev->bd_disk->queue) ? 2 : 1;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ecd8ce1..b2a9abe 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -22,6 +22,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_error.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -29,10 +30,10 @@
* quota functionality, including maintaining the freelist and hash
* tables of dquots.
*/
-STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
-STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
+STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp);
+STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp);
-STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi);
+STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi);
STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
@@ -46,7 +47,7 @@
STATIC int
xfs_qm_dquot_walk(
struct xfs_mount *mp,
- int type,
+ xfs_dqtype_t type,
int (*execute)(struct xfs_dquot *dqp, void *data),
void *data)
{
@@ -78,7 +79,7 @@
for (i = 0; i < nr_found; i++) {
struct xfs_dquot *dqp = batch[i];
- next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
+ next_index = dqp->q_id + 1;
error = execute(batch[i], data);
if (error == -EAGAIN) {
@@ -120,14 +121,13 @@
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error = -EAGAIN;
xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- return -EAGAIN;
- }
+ if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
- dqp->dq_flags |= XFS_DQ_FREEING;
+ dqp->q_flags |= XFS_DQFLAG_FREEING;
xfs_dqflock(dqp);
@@ -138,7 +138,6 @@
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
- int error;
/*
* We don't care about getting disk errors here. We need
@@ -148,6 +147,9 @@
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ dqp->q_flags &= ~XFS_DQFLAG_FREEING;
+ goto out_unlock;
}
xfs_dqflock(dqp);
}
@@ -159,8 +161,7 @@
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
- radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
- be32_to_cpu(dqp->q_core.d_id));
+ radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
qi->qi_dquots--;
/*
@@ -173,6 +174,10 @@
xfs_qm_dqdestroy(dqp);
return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
}
/*
@@ -184,11 +189,11 @@
uint flags)
{
if (flags & XFS_QMOPT_UQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL);
if (flags & XFS_QMOPT_GQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL);
if (flags & XFS_QMOPT_PQUOTA)
- xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
+ xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL);
}
/*
@@ -243,14 +248,13 @@
STATIC int
xfs_qm_dqattach_one(
- xfs_inode_t *ip,
- xfs_dqid_t id,
- uint type,
- bool doalloc,
- xfs_dquot_t **IO_idqpp)
+ struct xfs_inode *ip,
+ xfs_dqtype_t type,
+ bool doalloc,
+ struct xfs_dquot **IO_idqpp)
{
- xfs_dquot_t *dqp;
- int error;
+ struct xfs_dquot *dqp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
error = 0;
@@ -325,7 +329,7 @@
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
doalloc, &ip->i_udquot);
if (error)
goto done;
@@ -333,7 +337,7 @@
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP,
doalloc, &ip->i_gdquot);
if (error)
goto done;
@@ -341,7 +345,7 @@
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -468,7 +472,7 @@
/*
* Prevent lookups now that we are past the point of no return.
*/
- dqp->dq_flags |= XFS_DQ_FREEING;
+ dqp->q_flags |= XFS_DQFLAG_FREEING;
xfs_dqunlock(dqp);
ASSERT(dqp->q_nrefs == 0);
@@ -539,32 +543,30 @@
STATIC void
xfs_qm_set_defquota(
- xfs_mount_t *mp,
- uint type,
- xfs_quotainfo_t *qinf)
+ struct xfs_mount *mp,
+ xfs_dqtype_t type,
+ struct xfs_quotainfo *qinf)
{
- xfs_dquot_t *dqp;
- struct xfs_def_quota *defq;
- struct xfs_disk_dquot *ddqp;
+ struct xfs_dquot *dqp;
+ struct xfs_def_quota *defq;
int error;
error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
if (error)
return;
- ddqp = &dqp->q_core;
- defq = xfs_get_defquota(dqp, qinf);
+ defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp));
/*
* Timers and warnings have been already set, let's just set the
* default limits for this quota type
*/
- defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+ defq->blk.hard = dqp->q_blk.hardlimit;
+ defq->blk.soft = dqp->q_blk.softlimit;
+ defq->ino.hard = dqp->q_ino.hardlimit;
+ defq->ino.soft = dqp->q_ino.softlimit;
+ defq->rtb.hard = dqp->q_rtb.hardlimit;
+ defq->rtb.soft = dqp->q_rtb.softlimit;
xfs_qm_dqdestroy(dqp);
}
@@ -572,19 +574,21 @@
static void
xfs_qm_init_timelimits(
struct xfs_mount *mp,
- struct xfs_quotainfo *qinf)
+ xfs_dqtype_t type)
{
- struct xfs_disk_dquot *ddqp;
+ struct xfs_quotainfo *qinf = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
struct xfs_dquot *dqp;
- uint type;
int error;
- qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+ defq = xfs_get_defquota(qinf, type);
+
+ defq->blk.time = XFS_QM_BTIMELIMIT;
+ defq->ino.time = XFS_QM_ITIMELIMIT;
+ defq->rtb.time = XFS_QM_RTBTIMELIMIT;
+ defq->blk.warn = XFS_QM_BWARNLIMIT;
+ defq->ino.warn = XFS_QM_IWARNLIMIT;
+ defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
/*
* We try to get the limits from the superuser's limits fields.
@@ -592,39 +596,28 @@
*
* Since we may not have done a quotacheck by this point, just read
* the dquot without attaching it to any hashtables or lists.
- *
- * Timers and warnings are globally set by the first timer found in
- * user/group/proj quota types, otherwise a default value is used.
- * This should be split into different fields per quota type.
*/
- if (XFS_IS_UQUOTA_RUNNING(mp))
- type = XFS_DQ_USER;
- else if (XFS_IS_GQUOTA_RUNNING(mp))
- type = XFS_DQ_GROUP;
- else
- type = XFS_DQ_PROJ;
error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
if (error)
return;
- ddqp = &dqp->q_core;
/*
* The warnings and timers set the grace period given to
* a user or group before he or she can not perform any
* more writing. If it is zero, a default is used.
*/
- if (ddqp->d_btimer)
- qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer);
- if (ddqp->d_itimer)
- qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer);
- if (ddqp->d_rtbtimer)
- qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
- if (ddqp->d_bwarns)
- qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
- if (ddqp->d_iwarns)
- qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
- if (ddqp->d_rtbwarns)
- qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+ if (dqp->q_blk.timer)
+ defq->blk.time = dqp->q_blk.timer;
+ if (dqp->q_ino.timer)
+ defq->ino.time = dqp->q_ino.timer;
+ if (dqp->q_rtb.timer)
+ defq->rtb.time = dqp->q_rtb.timer;
+ if (dqp->q_blk.warnings)
+ defq->blk.warn = dqp->q_blk.warnings;
+ if (dqp->q_ino.warnings)
+ defq->ino.warn = dqp->q_ino.warnings;
+ if (dqp->q_rtb.warnings)
+ defq->rtb.warn = dqp->q_rtb.warnings;
xfs_qm_dqdestroy(dqp);
}
@@ -642,7 +635,7 @@
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
+ qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -667,17 +660,30 @@
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
+ if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ qinf->qi_expiry_min =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN);
+ qinf->qi_expiry_max =
+ xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MAX);
+ } else {
+ qinf->qi_expiry_min = XFS_DQ_LEGACY_EXPIRY_MIN;
+ qinf->qi_expiry_max = XFS_DQ_LEGACY_EXPIRY_MAX;
+ }
+ trace_xfs_quota_expiry_range(mp, qinf->qi_expiry_min,
+ qinf->qi_expiry_max);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
- xfs_qm_init_timelimits(mp, qinf);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_USER);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP);
+ xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ);
if (XFS_IS_UQUOTA_RUNNING(mp))
- xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf);
if (XFS_IS_GQUOTA_RUNNING(mp))
- xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf);
if (XFS_IS_PQUOTA_RUNNING(mp))
- xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
+ xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf);
qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
@@ -709,9 +715,9 @@
*/
void
xfs_qm_destroy_quotainfo(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_quotainfo_t *qi;
+ struct xfs_quotainfo *qi;
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
@@ -754,11 +760,15 @@
if ((flags & XFS_QMOPT_PQUOTA) &&
(mp->m_sb.sb_gquotino != NULLFSINO)) {
ino = mp->m_sb.sb_gquotino;
- ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+ if (XFS_IS_CORRUPT(mp,
+ mp->m_sb.sb_pquotino != NULLFSINO))
+ return -EFSCORRUPTED;
} else if ((flags & XFS_QMOPT_GQUOTA) &&
(mp->m_sb.sb_pquotino != NULLFSINO)) {
ino = mp->m_sb.sb_pquotino;
- ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+ if (XFS_IS_CORRUPT(mp,
+ mp->m_sb.sb_gquotino != NULLFSINO))
+ return -EFSCORRUPTED;
}
if (ino != NULLFSINO) {
error = xfs_iget(mp, NULL, ino, 0, 0, ip);
@@ -771,7 +781,8 @@
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create,
- XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp);
+ need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0,
+ 0, 0, &tp);
if (error)
return error;
@@ -822,14 +833,13 @@
STATIC void
xfs_qm_reset_dqcounts(
- xfs_mount_t *mp,
- xfs_buf_t *bp,
- xfs_dqid_t id,
- uint type)
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
{
struct xfs_dqblk *dqb;
int j;
- xfs_failaddr_t fa;
trace_xfs_reset_dqcounts(bp, _RET_IP_);
@@ -854,24 +864,34 @@
* find uninitialised dquot blks. See comment in
* xfs_dquot_verify.
*/
- fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
- if (fa)
+ if (xfs_dqblk_verify(mp, &dqb[j], id + j) ||
+ (dqb[j].dd_diskdq.d_type & XFS_DQTYPE_REC_MASK) != type)
xfs_dqblk_repair(mp, &dqb[j], id + j, type);
/*
* Reset type in case we are reusing group quota file for
* project quotas or vice versa
*/
- ddq->d_flags = type;
+ ddq->d_type = type;
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
- ddq->d_btimer = 0;
- ddq->d_itimer = 0;
- ddq->d_rtbtimer = 0;
- ddq->d_bwarns = 0;
- ddq->d_iwarns = 0;
- ddq->d_rtbwarns = 0;
+
+ /*
+ * dquot id 0 stores the default grace period and the maximum
+ * warning limit that were set by the administrator, so we
+ * should not reset them.
+ */
+ if (ddq->d_id != 0) {
+ ddq->d_btimer = 0;
+ ddq->d_itimer = 0;
+ ddq->d_rtbtimer = 0;
+ ddq->d_bwarns = 0;
+ ddq->d_iwarns = 0;
+ ddq->d_rtbwarns = 0;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
+ }
if (xfs_sb_version_hascrc(&mp->m_sb)) {
xfs_update_cksum((char *)&dqb[j],
@@ -887,17 +907,13 @@
xfs_dqid_t firstid,
xfs_fsblock_t bno,
xfs_filblks_t blkcnt,
- uint flags,
+ xfs_dqtype_t type,
struct list_head *buffer_list)
{
struct xfs_buf *bp;
- int error;
- int type;
+ int error = 0;
ASSERT(blkcnt > 0);
- type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
- (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
- error = 0;
/*
* Blkcnt arg can be a very big number, and might even be
@@ -957,7 +973,7 @@
xfs_qm_reset_dqcounts_buf(
struct xfs_mount *mp,
struct xfs_inode *qip,
- uint flags,
+ xfs_dqtype_t type,
struct list_head *buffer_list)
{
struct xfs_bmbt_irec *map;
@@ -1033,7 +1049,7 @@
error = xfs_qm_reset_dqcounts_all(mp, firstid,
map[i].br_startblock,
map[i].br_blockcount,
- flags, buffer_list);
+ type, buffer_list);
if (error)
goto out;
}
@@ -1055,7 +1071,7 @@
STATIC int
xfs_qm_quotacheck_dqadjust(
struct xfs_inode *ip,
- uint type,
+ xfs_dqtype_t type,
xfs_qcnt_t nblks,
xfs_qcnt_t rtblks)
{
@@ -1081,15 +1097,15 @@
* Adjust the inode count and the block count to reflect this inode's
* resource usage.
*/
- be64_add_cpu(&dqp->q_core.d_icount, 1);
- dqp->q_res_icount++;
+ dqp->q_ino.count++;
+ dqp->q_ino.reserved++;
if (nblks) {
- be64_add_cpu(&dqp->q_core.d_bcount, nblks);
- dqp->q_res_bcount += nblks;
+ dqp->q_blk.count += nblks;
+ dqp->q_blk.reserved += nblks;
}
if (rtblks) {
- be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
- dqp->q_res_rtbcount += rtblks;
+ dqp->q_rtb.count += rtblks;
+ dqp->q_rtb.reserved += rtblks;
}
/*
@@ -1097,12 +1113,12 @@
*
* There are no timers for the default values set in the root dquot.
*/
- if (dqp->q_core.d_id) {
- xfs_qm_adjust_dqlimits(mp, dqp);
- xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
+ if (dqp->q_id) {
+ xfs_qm_adjust_dqlimits(dqp);
+ xfs_qm_adjust_dqtimers(dqp);
}
- dqp->dq_flags |= XFS_DQ_DIRTY;
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
xfs_qm_dqput(dqp);
return 0;
}
@@ -1172,21 +1188,21 @@
* and quotaoffs don't race. (Quotachecks happen at mount time only).
*/
if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_USER, nblks,
rtblks);
if (error)
goto error0;
}
if (XFS_IS_GQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_GROUP, nblks,
rtblks);
if (error)
goto error0;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQTYPE_PROJ, nblks,
rtblks);
if (error)
goto error0;
@@ -1208,7 +1224,7 @@
int error = 0;
xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING)
+ if (dqp->q_flags & XFS_DQFLAG_FREEING)
goto out_unlock;
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
@@ -1277,7 +1293,7 @@
* We don't log our changes till later.
*/
if (uip) {
- error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_DQTYPE_USER,
&buffer_list);
if (error)
goto error_return;
@@ -1285,7 +1301,7 @@
}
if (gip) {
- error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_DQTYPE_GROUP,
&buffer_list);
if (error)
goto error_return;
@@ -1293,7 +1309,7 @@
}
if (pip) {
- error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_DQTYPE_PROJ,
&buffer_list);
if (error)
goto error_return;
@@ -1310,17 +1326,17 @@
* down to disk buffers if everything was updated successfully.
*/
if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
+ error = xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_flush_one,
&buffer_list);
}
if (XFS_IS_GQUOTA_ON(mp)) {
- error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_flush_one,
&buffer_list);
if (!error)
error = error2;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
+ error2 = xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_flush_one,
&buffer_list);
if (!error)
error = error2;
@@ -1559,7 +1575,7 @@
STATIC void
xfs_qm_destroy_quotainos(
- xfs_quotainfo_t *qi)
+ struct xfs_quotainfo *qi)
{
if (qi->qi_uquotaip) {
xfs_irele(qi->qi_uquotaip);
@@ -1583,8 +1599,7 @@
struct xfs_quotainfo *qi = mp->m_quotainfo;
mutex_lock(&qi->qi_tree_lock);
- radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
- be32_to_cpu(dqp->q_core.d_id));
+ radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
qi->qi_dquots--;
mutex_unlock(&qi->qi_tree_lock);
@@ -1608,8 +1623,8 @@
int
xfs_qm_vop_dqalloc(
struct xfs_inode *ip,
- xfs_dqid_t uid,
- xfs_dqid_t gid,
+ kuid_t uid,
+ kgid_t gid,
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
@@ -1617,6 +1632,8 @@
struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ struct user_namespace *user_ns = inode->i_sb->s_user_ns;
struct xfs_dquot *uq = NULL;
struct xfs_dquot *gq = NULL;
struct xfs_dquot *pq = NULL;
@@ -1630,7 +1647,7 @@
xfs_ilock(ip, lockflags);
if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
- gid = ip->i_d.di_gid;
+ gid = inode->i_gid;
/*
* Attach the dquot(s) to this inode, doing a dquot allocation
@@ -1645,7 +1662,8 @@
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
- if (ip->i_d.di_uid != uid) {
+ ASSERT(O_udqpp);
+ if (!uid_eq(inode->i_uid, uid)) {
/*
* What we need is the dquot that has this uid, and
* if we send the inode to dqget, the uid of the inode
@@ -1656,7 +1674,8 @@
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
+ error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
+ XFS_DQTYPE_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
@@ -1677,9 +1696,11 @@
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
- if (ip->i_d.di_gid != gid) {
+ ASSERT(O_gdqpp);
+ if (!gid_eq(inode->i_gid, gid)) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
+ error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
+ XFS_DQTYPE_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1693,10 +1714,11 @@
}
}
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
- if (xfs_get_projid(ip) != prid) {
+ ASSERT(O_pdqpp);
+ if (ip->i_d.di_projid != prid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
- true, &pq);
+ error = xfs_qm_dqget(mp, prid,
+ XFS_DQTYPE_PROJ, true, &pq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1709,8 +1731,7 @@
pq = xfs_qm_dqhold(ip->i_pdquot);
}
}
- if (uq)
- trace_xfs_dquot_dqalloc(ip);
+ trace_xfs_dquot_dqalloc(ip);
xfs_iunlock(ip, lockflags);
if (O_udqpp)
@@ -1737,14 +1758,14 @@
* Actually transfer ownership, and do dquot modifications.
* These were already reserved.
*/
-xfs_dquot_t *
+struct xfs_dquot *
xfs_qm_vop_chown(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t **IO_olddq,
- xfs_dquot_t *newdq)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot **IO_olddq,
+ struct xfs_dquot *newdq)
{
- xfs_dquot_t *prevdq;
+ struct xfs_dquot *prevdq;
uint bfield = XFS_IS_REALTIME_INODE(ip) ?
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
@@ -1787,7 +1808,7 @@
{
struct xfs_mount *mp = ip->i_mount;
uint64_t delblks;
- unsigned int blkflags, prjflags = 0;
+ unsigned int blkflags;
struct xfs_dquot *udq_unres = NULL;
struct xfs_dquot *gdq_unres = NULL;
struct xfs_dquot *pdq_unres = NULL;
@@ -1805,7 +1826,7 @@
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
+ i_uid_read(VFS_I(ip)) != udqp->q_id) {
udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
@@ -1818,7 +1839,7 @@
}
}
if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+ i_gid_read(VFS_I(ip)) != gdqp->q_id) {
gdq_delblks = gdqp;
if (delblks) {
ASSERT(ip->i_gdquot);
@@ -1827,8 +1848,7 @@
}
if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
- prjflags = XFS_QMOPT_ENOSPC;
+ ip->i_d.di_projid != pdqp->q_id) {
pdq_delblks = pdqp;
if (delblks) {
ASSERT(ip->i_pdquot);
@@ -1838,8 +1858,7 @@
error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
udq_delblks, gdq_delblks, pdq_delblks,
- ip->i_d.di_nblocks, 1,
- flags | blkflags | prjflags);
+ ip->i_d.di_nblocks, 1, flags | blkflags);
if (error)
return error;
@@ -1857,8 +1876,7 @@
ASSERT(udq_unres || gdq_unres || pdq_unres);
error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
udq_delblks, gdq_delblks, pdq_delblks,
- (xfs_qcnt_t)delblks, 0,
- flags | blkflags | prjflags);
+ (xfs_qcnt_t)delblks, 0, flags | blkflags);
if (error)
return error;
xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
@@ -1911,24 +1929,24 @@
return;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+ ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
+ ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
+
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
- ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+ ASSERT(ip->i_d.di_projid == pdqp->q_id);
ip->i_pdquot = xfs_qm_dqhold(pdqp);
xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index b41b750..e3dabab 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,76 +20,68 @@
#define XFS_DQITER_MAP_SIZE 10
#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
- !dqp->q_core.d_blk_hardlimit && \
- !dqp->q_core.d_blk_softlimit && \
- !dqp->q_core.d_rtb_hardlimit && \
- !dqp->q_core.d_rtb_softlimit && \
- !dqp->q_core.d_ino_hardlimit && \
- !dqp->q_core.d_ino_softlimit && \
- !dqp->q_core.d_bcount && \
- !dqp->q_core.d_rtbcount && \
- !dqp->q_core.d_icount)
+ !dqp->q_blk.hardlimit && \
+ !dqp->q_blk.softlimit && \
+ !dqp->q_rtb.hardlimit && \
+ !dqp->q_rtb.softlimit && \
+ !dqp->q_ino.hardlimit && \
+ !dqp->q_ino.softlimit && \
+ !dqp->q_blk.count && \
+ !dqp->q_rtb.count && \
+ !dqp->q_ino.count)
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+struct xfs_quota_limits {
+ xfs_qcnt_t hard; /* default hard limit */
+ xfs_qcnt_t soft; /* default soft limit */
+ time64_t time; /* limit for timers */
+ xfs_qwarncnt_t warn; /* limit for warnings */
+};
+/* Defaults for each quota type: time limits, warn limits, usage limits */
struct xfs_def_quota {
- xfs_qcnt_t bhardlimit; /* default data blk hard limit */
- xfs_qcnt_t bsoftlimit; /* default data blk soft limit */
- xfs_qcnt_t ihardlimit; /* default inode count hard limit */
- xfs_qcnt_t isoftlimit; /* default inode count soft limit */
- xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */
- xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */
+ struct xfs_quota_limits blk;
+ struct xfs_quota_limits ino;
+ struct xfs_quota_limits rtb;
};
/*
* Various quota information for individual filesystems.
* The mount structure keeps a pointer to this.
*/
-typedef struct xfs_quotainfo {
- struct radix_tree_root qi_uquota_tree;
- struct radix_tree_root qi_gquota_tree;
- struct radix_tree_root qi_pquota_tree;
- struct mutex qi_tree_lock;
+struct xfs_quotainfo {
+ struct radix_tree_root qi_uquota_tree;
+ struct radix_tree_root qi_gquota_tree;
+ struct radix_tree_root qi_pquota_tree;
+ struct mutex qi_tree_lock;
struct xfs_inode *qi_uquotaip; /* user quota inode */
struct xfs_inode *qi_gquotaip; /* group quota inode */
struct xfs_inode *qi_pquotaip; /* project quota inode */
- struct list_lru qi_lru;
- int qi_dquots;
- time_t qi_btimelimit; /* limit for blks timer */
- time_t qi_itimelimit; /* limit for inodes timer */
- time_t qi_rtbtimelimit;/* limit for rt blks timer */
- xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */
- xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */
- xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */
- struct mutex qi_quotaofflock;/* to serialize quotaoff */
- xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
- uint qi_dqperchunk; /* # ondisk dqs in above chunk */
+ struct list_lru qi_lru;
+ int qi_dquots;
+ struct mutex qi_quotaofflock;/* to serialize quotaoff */
+ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
+ uint qi_dqperchunk; /* # ondisk dq in above chunk */
struct xfs_def_quota qi_usr_default;
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
- struct shrinker qi_shrinker;
-} xfs_quotainfo_t;
+ struct shrinker qi_shrinker;
+
+ /* Minimum and maximum quota expiration timestamp values. */
+ time64_t qi_expiry_min;
+ time64_t qi_expiry_max;
+};
static inline struct radix_tree_root *
xfs_dquot_tree(
struct xfs_quotainfo *qi,
- int type)
+ xfs_dqtype_t type)
{
switch (type) {
- case XFS_DQ_USER:
+ case XFS_DQTYPE_USER:
return &qi->qi_uquota_tree;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return &qi->qi_gquota_tree;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return &qi->qi_pquota_tree;
default:
ASSERT(0);
@@ -98,14 +90,14 @@
}
static inline struct xfs_inode *
-xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
+xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
{
- switch (dq_flags & XFS_DQ_ALLTYPES) {
- case XFS_DQ_USER:
+ switch (type) {
+ case XFS_DQTYPE_USER:
return mp->m_quotainfo->qi_uquotaip;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return mp->m_quotainfo->qi_gquotaip;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return mp->m_quotainfo->qi_pquotaip;
default:
ASSERT(0);
@@ -154,29 +146,35 @@
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
- uint, struct qc_dqblk *);
-extern int xfs_qm_scall_getquota_next(struct xfs_mount *,
- xfs_dqid_t *, uint, struct qc_dqblk *);
-extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
- struct qc_dqblk *);
+extern int xfs_qm_scall_getquota(struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst);
+extern int xfs_qm_scall_getquota_next(struct xfs_mount *mp,
+ xfs_dqid_t *id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *dst);
+extern int xfs_qm_scall_setqlim(struct xfs_mount *mp,
+ xfs_dqid_t id,
+ xfs_dqtype_t type,
+ struct qc_dqblk *newlim);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
static inline struct xfs_def_quota *
-xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
+xfs_get_defquota(struct xfs_quotainfo *qi, xfs_dqtype_t type)
{
- struct xfs_def_quota *defq;
-
- if (XFS_QM_ISUDQ(dqp))
- defq = &qi->qi_usr_default;
- else if (XFS_QM_ISGDQ(dqp))
- defq = &qi->qi_grp_default;
- else {
- ASSERT(XFS_QM_ISPDQ(dqp));
- defq = &qi->qi_prj_default;
+ switch (type) {
+ case XFS_DQTYPE_USER:
+ return &qi->qi_usr_default;
+ case XFS_DQTYPE_GROUP:
+ return &qi->qi_grp_default;
+ case XFS_DQTYPE_PROJ:
+ return &qi->qi_prj_default;
+ default:
+ ASSERT(0);
+ return NULL;
}
- return defq;
}
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 5d72e88..6393980 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -23,24 +23,24 @@
{
uint64_t limit;
- limit = dqp->q_core.d_blk_softlimit ?
- be64_to_cpu(dqp->q_core.d_blk_softlimit) :
- be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+ limit = dqp->q_blk.softlimit ?
+ dqp->q_blk.softlimit :
+ dqp->q_blk.hardlimit;
if (limit && statp->f_blocks > limit) {
statp->f_blocks = limit;
statp->f_bfree = statp->f_bavail =
- (statp->f_blocks > dqp->q_res_bcount) ?
- (statp->f_blocks - dqp->q_res_bcount) : 0;
+ (statp->f_blocks > dqp->q_blk.reserved) ?
+ (statp->f_blocks - dqp->q_blk.reserved) : 0;
}
- limit = dqp->q_core.d_ino_softlimit ?
- be64_to_cpu(dqp->q_core.d_ino_softlimit) :
- be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+ limit = dqp->q_ino.softlimit ?
+ dqp->q_ino.softlimit :
+ dqp->q_ino.hardlimit;
if (limit && statp->f_files > limit) {
statp->f_files = limit;
statp->f_ffree =
- (statp->f_files > dqp->q_res_icount) ?
- (statp->f_files - dqp->q_res_icount) : 0;
+ (statp->f_files > dqp->q_ino.reserved) ?
+ (statp->f_files - dqp->q_ino.reserved) : 0;
}
}
@@ -54,13 +54,13 @@
*/
void
xfs_qm_statvfs(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
struct kstatfs *statp)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_dquot_t *dqp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *dqp;
- if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) {
+ if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index da7ad03..ca1b57d 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -19,9 +19,71 @@
#include "xfs_qm.h"
#include "xfs_icache.h"
-STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
-STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
- uint);
+STATIC int
+xfs_qm_log_quotaoff(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem **qoffstartp,
+ uint flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_qoff_logitem *qoffi;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_log_sb(tp);
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+
+ *qoffstartp = qoffi;
+out:
+ return error;
+}
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem **startqoff,
+ uint flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_qoff_logitem *qoffi;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ qoffi = xfs_trans_get_qoff_item(tp, *startqoff,
+ flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+ *startqoff = NULL;
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -40,7 +102,7 @@
uint dqtype;
int error;
uint inactivate_flags;
- xfs_qoff_logitem_t *qoffstart;
+ struct xfs_qoff_logitem *qoffstart = NULL;
/*
* No file system can have quotas enabled on disk but not in core.
@@ -165,7 +227,7 @@
* So, we have QUOTAOFF start and end logitems; the start
* logitem won't get overwritten until the end logitem appears...
*/
- error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+ error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags);
if (error) {
/* We're screwed now. Shutdown is the only option. */
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -198,6 +260,8 @@
}
out_unlock:
+ if (error && qoffstart)
+ xfs_qm_qoff_logitem_relse(qoffstart);
mutex_unlock(&q->qi_quotaofflock);
return error;
}
@@ -238,7 +302,7 @@
goto out_unlock;
}
- ASSERT(ip->i_d.di_nextents == 0);
+ ASSERT(ip->i_df.if_nextents == 0);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp);
@@ -258,23 +322,23 @@
int error = -EINVAL;
if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
- (flags & ~XFS_DQ_ALLTYPES)) {
+ (flags & ~XFS_QMOPT_QUOTALL)) {
xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
return -EINVAL;
}
- if (flags & XFS_DQ_USER) {
+ if (flags & XFS_QMOPT_UQUOTA) {
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
if (error)
return error;
}
- if (flags & XFS_DQ_GROUP) {
+ if (flags & XFS_QMOPT_GQUOTA) {
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
if (error)
return error;
}
- if (flags & XFS_DQ_PROJ)
+ if (flags & XFS_QMOPT_PQUOTA)
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
return error;
@@ -293,11 +357,11 @@
int error;
uint qf;
- flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
/*
- * Switching on quota accounting must be done at mount time.
+ * Switching on quota accounting must be done at mount time,
+ * only consider quota enforcement stuff here.
*/
- flags &= ~(XFS_ALL_QUOTA_ACCT);
+ flags &= XFS_ALL_QUOTA_ENFD;
if (flags == 0) {
xfs_debug(mp, "%s: zero flags, m_qflags=%x",
@@ -373,20 +437,79 @@
(QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
/*
+ * Adjust limits of this quota, and the defaults if passed in. Returns true
+ * if the new limits made sense and were applied, false otherwise.
+ */
+static inline bool
+xfs_setqlim_limits(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ xfs_qcnt_t hard,
+ xfs_qcnt_t soft,
+ const char *tag)
+{
+ /* The hard limit can't be less than the soft limit. */
+ if (hard != 0 && hard < soft) {
+ xfs_debug(mp, "%shard %lld < %ssoft %lld", tag, hard, tag,
+ soft);
+ return false;
+ }
+
+ res->hardlimit = hard;
+ res->softlimit = soft;
+ if (qlim) {
+ qlim->hard = hard;
+ qlim->soft = soft;
+ }
+
+ return true;
+}
+
+static inline void
+xfs_setqlim_warns(
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ int warns)
+{
+ res->warnings = warns;
+ if (qlim)
+ qlim->warn = warns;
+}
+
+static inline void
+xfs_setqlim_timer(
+ struct xfs_mount *mp,
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ s64 timer)
+{
+ if (qlim) {
+ /* Set the length of the default grace period. */
+ res->timer = xfs_dquot_set_grace_period(timer);
+ qlim->time = res->timer;
+ } else {
+ /* Set the grace period expiration on a quota. */
+ res->timer = xfs_dquot_set_timeout(mp, timer);
+ }
+}
+
+/*
* Adjust quota limits, and start/stop timers accordingly.
*/
int
xfs_qm_scall_setqlim(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct qc_dqblk *newlim)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_disk_dquot *ddq;
struct xfs_dquot *dqp;
struct xfs_trans *tp;
struct xfs_def_quota *defq;
+ struct xfs_dquot_res *res;
+ struct xfs_quota_limits *qlim;
int error;
xfs_qcnt_t hard, soft;
@@ -415,7 +538,7 @@
goto out_unlock;
}
- defq = xfs_get_defquota(dqp, q);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
xfs_dqunlock(dqp);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
@@ -424,99 +547,74 @@
xfs_dqlock(dqp);
xfs_trans_dqjoin(tp, dqp);
- ddq = &dqp->q_core;
/*
+ * Update quota limits, warnings, and timers, and the defaults
+ * if we're touching id == 0.
+ *
* Make sure that hardlimits are >= soft limits before changing.
+ *
+ * Update warnings counter(s) if requested.
+ *
+ * Timelimits for the super user set the relative time the other users
+ * can be over quota for this file system. If it is zero a default is
+ * used. Ditto for the default soft and hard limit values (already
+ * done, above), and for warnings.
+ *
+ * For other IDs, userspace can bump out the grace period if over
+ * the soft limit.
*/
+
+ /* Blocks on the data device. */
hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
- be64_to_cpu(ddq->d_blk_hardlimit);
+ dqp->q_blk.hardlimit;
soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
- be64_to_cpu(ddq->d_blk_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_blk_hardlimit = cpu_to_be64(hard);
- ddq->d_blk_softlimit = cpu_to_be64(soft);
+ dqp->q_blk.softlimit;
+ res = &dqp->q_blk;
+ qlim = id == 0 ? &defq->blk : NULL;
+
+ if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
xfs_dquot_set_prealloc_limits(dqp);
- if (id == 0) {
- defq->bhardlimit = hard;
- defq->bsoftlimit = soft;
- }
- } else {
- xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
- }
+ if (newlim->d_fieldmask & QC_SPC_WARNS)
+ xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
+ if (newlim->d_fieldmask & QC_SPC_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
+
+ /* Blocks on the realtime device. */
hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
- be64_to_cpu(ddq->d_rtb_hardlimit);
+ dqp->q_rtb.hardlimit;
soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
(xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
- be64_to_cpu(ddq->d_rtb_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_rtb_hardlimit = cpu_to_be64(hard);
- ddq->d_rtb_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- defq->rtbhardlimit = hard;
- defq->rtbsoftlimit = soft;
- }
- } else {
- xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
- }
+ dqp->q_rtb.softlimit;
+ res = &dqp->q_rtb;
+ qlim = id == 0 ? &defq->rtb : NULL;
+ xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
+ if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+ xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
+ if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
+
+ /* Inodes */
hard = (newlim->d_fieldmask & QC_INO_HARD) ?
(xfs_qcnt_t) newlim->d_ino_hardlimit :
- be64_to_cpu(ddq->d_ino_hardlimit);
+ dqp->q_ino.hardlimit;
soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
(xfs_qcnt_t) newlim->d_ino_softlimit :
- be64_to_cpu(ddq->d_ino_softlimit);
- if (hard == 0 || hard >= soft) {
- ddq->d_ino_hardlimit = cpu_to_be64(hard);
- ddq->d_ino_softlimit = cpu_to_be64(soft);
- if (id == 0) {
- defq->ihardlimit = hard;
- defq->isoftlimit = soft;
- }
- } else {
- xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
- }
+ dqp->q_ino.softlimit;
+ res = &dqp->q_ino;
+ qlim = id == 0 ? &defq->ino : NULL;
- /*
- * Update warnings counter(s) if requested
- */
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
+ xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
if (newlim->d_fieldmask & QC_INO_WARNS)
- ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
+ xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
+ if (newlim->d_fieldmask & QC_INO_TIMER)
+ xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
- if (id == 0) {
- /*
- * Timelimits for the super user set the relative time
- * the other users can be over quota for this file system.
- * If it is zero a default is used. Ditto for the default
- * soft and hard limit values (already done, above), and
- * for warnings.
- */
- if (newlim->d_fieldmask & QC_SPC_TIMER) {
- q->qi_btimelimit = newlim->d_spc_timer;
- ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
- }
- if (newlim->d_fieldmask & QC_INO_TIMER) {
- q->qi_itimelimit = newlim->d_ino_timer;
- ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
- }
- if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
- q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
- ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
- }
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- q->qi_bwarnlimit = newlim->d_spc_warns;
- if (newlim->d_fieldmask & QC_INO_WARNS)
- q->qi_iwarnlimit = newlim->d_ino_warns;
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
- } else {
+ if (id != 0) {
/*
* If the user is now over quota, start the timelimit.
* The user will not be 'warned'.
@@ -524,9 +622,9 @@
* is on or off. We don't really want to bother with iterating
* over all ondisk dquots and turning the timers on/off.
*/
- xfs_qm_adjust_dqtimers(mp, ddq);
+ xfs_qm_adjust_dqtimers(dqp);
}
- dqp->dq_flags |= XFS_DQ_DIRTY;
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
xfs_trans_log_dquot(tp, dqp);
error = xfs_trans_commit(tp);
@@ -538,130 +636,50 @@
return error;
}
-STATIC int
-xfs_qm_log_quotaoff_end(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t *startqoff,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
- if (error)
- return error;
-
- qoffi = xfs_trans_get_qoff_item(tp, startqoff,
- flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t **qoffstartp,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- *qoffstartp = NULL;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
- if (error)
- goto out;
-
- qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_log_sb(tp);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp);
- if (error)
- goto out;
-
- *qoffstartp = qoffi;
-out:
- return error;
-}
-
/* Fill out the quota context. */
static void
xfs_qm_scall_getquota_fill_qc(
struct xfs_mount *mp,
- uint type,
+ xfs_dqtype_t type,
const struct xfs_dquot *dqp,
struct qc_dqblk *dst)
{
memset(dst, 0, sizeof(*dst));
- dst->d_spc_hardlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
- dst->d_spc_softlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
- dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
- dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
- dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
- dst->d_ino_count = dqp->q_res_icount;
- dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
- dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
- dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
- dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
- dst->d_rt_spc_hardlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
- dst->d_rt_spc_softlimit =
- XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
- dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
- dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
- dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+ dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_blk.hardlimit);
+ dst->d_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_blk.softlimit);
+ dst->d_ino_hardlimit = dqp->q_ino.hardlimit;
+ dst->d_ino_softlimit = dqp->q_ino.softlimit;
+ dst->d_space = XFS_FSB_TO_B(mp, dqp->q_blk.reserved);
+ dst->d_ino_count = dqp->q_ino.reserved;
+ dst->d_spc_timer = dqp->q_blk.timer;
+ dst->d_ino_timer = dqp->q_ino.timer;
+ dst->d_ino_warns = dqp->q_ino.warnings;
+ dst->d_spc_warns = dqp->q_blk.warnings;
+ dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
+ dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
+ dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
+ dst->d_rt_spc_timer = dqp->q_rtb.timer;
+ dst->d_rt_spc_warns = dqp->q_rtb.warnings;
/*
* Internally, we don't reset all the timers when quota enforcement
* gets turned off. No need to confuse the user level code,
* so return zeroes in that case.
*/
- if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
- dqp->q_core.d_flags == XFS_DQ_USER) ||
- (!XFS_IS_GQUOTA_ENFORCED(mp) &&
- dqp->q_core.d_flags == XFS_DQ_GROUP) ||
- (!XFS_IS_PQUOTA_ENFORCED(mp) &&
- dqp->q_core.d_flags == XFS_DQ_PROJ)) {
+ if (!xfs_dquot_is_enforced(dqp)) {
dst->d_spc_timer = 0;
dst->d_ino_timer = 0;
dst->d_rt_spc_timer = 0;
}
#ifdef DEBUG
- if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
- (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
- (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
- dqp->q_core.d_id != 0) {
+ if (xfs_dquot_is_enforced(dqp) && dqp->q_id != 0) {
if ((dst->d_space > dst->d_spc_softlimit) &&
(dst->d_spc_softlimit > 0)) {
ASSERT(dst->d_spc_timer != 0);
}
- if ((dst->d_ino_count > dst->d_ino_softlimit) &&
- (dst->d_ino_softlimit > 0)) {
+ if ((dst->d_ino_count > dqp->q_ino.softlimit) &&
+ (dqp->q_ino.softlimit > 0)) {
ASSERT(dst->d_ino_timer != 0);
}
}
@@ -673,7 +691,7 @@
xfs_qm_scall_getquota(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
+ xfs_dqtype_t type,
struct qc_dqblk *dst)
{
struct xfs_dquot *dqp;
@@ -711,7 +729,7 @@
xfs_qm_scall_getquota_next(
struct xfs_mount *mp,
xfs_dqid_t *id,
- uint type,
+ xfs_dqtype_t type,
struct qc_dqblk *dst)
{
struct xfs_dquot *dqp;
@@ -722,7 +740,7 @@
return error;
/* Fill in the ID we actually read from disk */
- *id = be32_to_cpu(dqp->q_core.d_id);
+ *id = dqp->q_id;
xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
@@ -733,9 +751,10 @@
STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
- int flags,
void *args)
{
+ uint *flags = args;
+
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
@@ -747,15 +766,15 @@
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+ if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
}
- if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+ if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
- if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
+ if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
xfs_qm_dqrele(ip->i_pdquot);
ip->i_pdquot = NULL;
}
@@ -772,10 +791,10 @@
*/
void
xfs_qm_dqrele_all_inodes(
- struct xfs_mount *mp,
- uint flags)
+ struct xfs_mount *mp,
+ uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
- XFS_AGITER_INEW_WAIT);
+ xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode,
+ &flags, XFS_ICI_NO_TAG);
}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index efe42ae..5a62398 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -13,6 +13,7 @@
*/
struct xfs_trans;
+struct xfs_buf;
/*
* This check is done typically without holding the inode lock;
@@ -38,14 +39,14 @@
static inline uint
xfs_quota_chkd_flag(
- uint dqtype)
+ xfs_dqtype_t type)
{
- switch (dqtype) {
- case XFS_DQ_USER:
+ switch (type) {
+ case XFS_DQTYPE_USER:
return XFS_UQUOTA_CHKD;
- case XFS_DQ_GROUP:
+ case XFS_DQTYPE_GROUP:
return XFS_GQUOTA_CHKD;
- case XFS_DQ_PROJ:
+ case XFS_DQTYPE_PROJ:
return XFS_PQUOTA_CHKD;
default:
return 0;
@@ -86,7 +87,7 @@
struct xfs_mount *, struct xfs_dquot *,
struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
@@ -109,7 +110,7 @@
#else
static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
prid_t prid, uint flags, struct xfs_dquot **udqp,
struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
{
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index c7de17d..d27c0e8 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -21,10 +21,10 @@
struct qc_type_state *tstate,
struct xfs_mount *mp,
struct xfs_inode *ip,
- xfs_ino_t ino)
+ xfs_ino_t ino,
+ struct xfs_def_quota *defq)
{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- bool tempqip = false;
+ bool tempqip = false;
tstate->ino = ino;
if (!ip && ino == NULLFSINO)
@@ -36,13 +36,13 @@
}
tstate->flags |= QCI_SYSFILE;
tstate->blocks = ip->i_d.di_nblocks;
- tstate->nextents = ip->i_d.di_nextents;
- tstate->spc_timelimit = q->qi_btimelimit;
- tstate->ino_timelimit = q->qi_itimelimit;
- tstate->rt_spc_timelimit = q->qi_rtbtimelimit;
- tstate->spc_warnlimit = q->qi_bwarnlimit;
- tstate->ino_warnlimit = q->qi_iwarnlimit;
- tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit;
+ tstate->nextents = ip->i_df.if_nextents;
+ tstate->spc_timelimit = (u32)defq->blk.time;
+ tstate->ino_timelimit = (u32)defq->ino.time;
+ tstate->rt_spc_timelimit = (u32)defq->rtb.time;
+ tstate->spc_warnlimit = defq->blk.warn;
+ tstate->ino_warnlimit = defq->ino.warn;
+ tstate->rt_spc_warnlimit = defq->rtb.warn;
if (tempqip)
xfs_irele(ip);
}
@@ -77,24 +77,24 @@
state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip,
- mp->m_sb.sb_uquotino);
+ mp->m_sb.sb_uquotino, &q->qi_usr_default);
xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip,
- mp->m_sb.sb_gquotino);
+ mp->m_sb.sb_gquotino, &q->qi_grp_default);
xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip,
- mp->m_sb.sb_pquotino);
+ mp->m_sb.sb_pquotino, &q->qi_prj_default);
return 0;
}
-STATIC int
+STATIC xfs_dqtype_t
xfs_quota_type(int type)
{
switch (type) {
case USRQUOTA:
- return XFS_DQ_USER;
+ return XFS_DQTYPE_USER;
case GRPQUOTA:
- return XFS_DQ_GROUP;
+ return XFS_DQTYPE_GROUP;
default:
- return XFS_DQ_PROJ;
+ return XFS_DQTYPE_PROJ;
}
}
@@ -109,8 +109,8 @@
int type,
struct qc_info *info)
{
- struct xfs_mount *mp = XFS_M(sb);
- struct qc_dqblk newlim;
+ struct xfs_mount *mp = XFS_M(sb);
+ struct qc_dqblk newlim;
if (sb_rdonly(sb))
return -EROFS;
@@ -205,11 +205,11 @@
return -EINVAL;
if (uflags & FS_USER_QUOTA)
- flags |= XFS_DQ_USER;
+ flags |= XFS_QMOPT_UQUOTA;
if (uflags & FS_GROUP_QUOTA)
- flags |= XFS_DQ_GROUP;
+ flags |= XFS_QMOPT_GQUOTA;
if (uflags & FS_PROJ_QUOTA)
- flags |= XFS_DQ_PROJ;
+ flags |= XFS_QMOPT_PQUOTA;
return xfs_qm_scall_trunc_qfiles(mp, flags);
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2328268..0dee316 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -17,24 +17,28 @@
#include "xfs_refcount_item.h"
#include "xfs_log.h"
#include "xfs_refcount.h"
-
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
kmem_zone_t *xfs_cui_zone;
kmem_zone_t *xfs_cud_zone;
+static const struct xfs_item_ops xfs_cui_item_ops;
+
static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_cui_log_item, cui_item);
}
-void
+STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
- kmem_zone_free(xfs_cui_zone, cuip);
+ kmem_cache_free(xfs_cui_zone, cuip);
}
/*
@@ -44,13 +48,13 @@
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the CUI.
*/
-void
+STATIC void
xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
ASSERT(atomic_read(&cuip->cui_refcount) > 0);
if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_cui_item_free(cuip);
}
}
@@ -123,17 +127,10 @@
xfs_cui_release(CUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_cui_item_ops = {
- .iop_size = xfs_cui_item_size,
- .iop_format = xfs_cui_item_format,
- .iop_unpin = xfs_cui_item_unpin,
- .iop_release = xfs_cui_item_release,
-};
-
/*
* Allocate and initialize an cui item with the given number of extents.
*/
-struct xfs_cui_log_item *
+STATIC struct xfs_cui_log_item *
xfs_cui_init(
struct xfs_mount *mp,
uint nextents)
@@ -146,7 +143,8 @@
cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
0);
else
- cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+ cuip = kmem_cache_zalloc(xfs_cui_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
cuip->cui_format.cui_nextents = nextents;
@@ -206,7 +204,7 @@
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_zone_free(xfs_cud_zone, cudp);
+ kmem_cache_free(xfs_cud_zone, cudp);
}
static const struct xfs_item_ops xfs_cud_item_ops = {
@@ -223,7 +221,7 @@
{
struct xfs_cud_log_item *cudp;
- cudp = kmem_zone_zalloc(xfs_cud_zone, 0);
+ cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
&xfs_cud_item_ops);
cudp->cud_cuip = cuip;
@@ -271,8 +269,8 @@
static int
xfs_refcount_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_refcount_intent *ra;
@@ -284,27 +282,6 @@
XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
}
-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_cui_log_item *cuip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- cuip = xfs_cui_init(tp->t_mountp, count);
- ASSERT(cuip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &cuip->cui_item);
- return cuip;
-}
-
/* Set the phys extent flags for this reverse mapping. */
static void
xfs_trans_set_refcount_flags(
@@ -328,16 +305,12 @@
STATIC void
xfs_refcount_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_cui_log_item *cuip,
+ struct xfs_refcount_intent *refc)
{
- struct xfs_cui_log_item *cuip = intent;
- struct xfs_refcount_intent *refc;
uint next_extent;
struct xfs_phys_extent *ext;
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
@@ -354,23 +327,44 @@
xfs_trans_set_refcount_flags(ext, refc->ri_type);
}
+static struct xfs_log_item *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+ struct xfs_refcount_intent *refc;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ if (sort)
+ list_sort(mp, items, xfs_refcount_update_diff_items);
+ list_for_each_entry(refc, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, refc);
+ return &cuip->cui_item;
+}
+
/* Get an CUD so we can process all the deferred refcount updates. */
-STATIC void *
+static struct xfs_log_item *
xfs_refcount_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_cud(tp, intent);
+ return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
}
/* Process a deferred refcount update. */
STATIC int
xfs_refcount_update_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_refcount_intent *refc;
xfs_fsblock_t new_fsb;
@@ -378,12 +372,10 @@
int error;
refc = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, done_item,
- refc->ri_type,
- refc->ri_startblock,
- refc->ri_blockcount,
- &new_fsb, &new_aglen,
- (struct xfs_btree_cur **)state);
+ error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done),
+ refc->ri_type, refc->ri_startblock, refc->ri_blockcount,
+ &new_fsb, &new_aglen, state);
+
/* Did we run out of reservation? Requeue what we didn't finish. */
if (!error && new_aglen > 0) {
ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
@@ -396,24 +388,12 @@
return error;
}
-/* Clean up after processing deferred refcounts. */
-STATIC void
-xfs_refcount_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
-}
-
/* Abort all pending CUIs. */
STATIC void
xfs_refcount_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_cui_release(intent);
+ xfs_cui_release(CUI_ITEM(intent));
}
/* Cancel a deferred refcount update. */
@@ -429,13 +409,11 @@
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_refcount_update_diff_items,
.create_intent = xfs_refcount_update_create_intent,
.abort_intent = xfs_refcount_update_abort_intent,
- .log_item = xfs_refcount_update_log_item,
.create_done = xfs_refcount_update_create_done,
.finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_update_finish_cleanup,
+ .finish_cleanup = xfs_refcount_finish_one_cleanup,
.cancel_item = xfs_refcount_update_cancel_item,
};
@@ -443,28 +421,27 @@
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
*/
-int
-xfs_cui_recover(
- struct xfs_trans *parent_tp,
- struct xfs_cui_log_item *cuip)
+STATIC int
+xfs_cui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- int i;
- int error = 0;
- unsigned int refc_type;
+ struct xfs_bmbt_irec irec;
+ struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_phys_extent *refc;
- xfs_fsblock_t startblock_fsb;
- bool op_ok;
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- enum xfs_refcount_intent_type type;
+ struct xfs_mount *mp = lip->li_mountp;
+ xfs_fsblock_t startblock_fsb;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
- struct xfs_bmbt_irec irec;
+ unsigned int refc_type;
+ bool op_ok;
bool requeue_only = false;
- struct xfs_mount *mp = parent_tp->t_mountp;
-
- ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
+ enum xfs_refcount_intent_type type;
+ int i;
+ int error = 0;
/*
* First check the validity of the extents described by the
@@ -490,15 +467,8 @@
refc->pe_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
refc->pe_len >= mp->m_sb.sb_agblocks ||
- (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
- /*
- * This will pull the CUI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
- xfs_cui_release(cuip);
- return -EIO;
- }
+ (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS))
+ return -EFSCORRUPTED;
}
/*
@@ -509,7 +479,7 @@
* transaction. Normally, any work that needs to be deferred
* gets attached to the same defer_ops that scheduled the
* refcount update. However, we're in log recovery here, so we
- * we use the passed in defer_ops and to finish up any work that
+ * use the passed in defer_ops to finish up any work that
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
@@ -517,12 +487,7 @@
mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
@@ -536,6 +501,7 @@
type = refc_type;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -578,14 +544,151 @@
}
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
return error;
}
+
+STATIC bool
+xfs_cui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_cui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_cud_log_item *cudp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_phys_extent *extp;
+ unsigned int count;
+
+ count = CUI_ITEM(intent)->cui_format.cui_nextents;
+ extp = CUI_ITEM(intent)->cui_format.cui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ atomic_set(&cuip->cui_next_extent, count);
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+ return &cuip->cui_item;
+}
+
+static const struct xfs_item_ops xfs_cui_item_ops = {
+ .iop_size = xfs_cui_item_size,
+ .iop_format = xfs_cui_item_format,
+ .iop_unpin = xfs_cui_item_unpin,
+ .iop_release = xfs_cui_item_release,
+ .iop_recover = xfs_cui_item_recover,
+ .iop_match = xfs_cui_item_match,
+ .iop_relog = xfs_cui_item_relog,
+};
+
+/*
+ * Copy a CUI format buffer from the given buf into the destination
+ * CUI format structure. The CUI/CUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_cui_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_cui_log_format *dst_cui_fmt)
+{
+ struct xfs_cui_log_format *src_cui_fmt;
+ uint len;
+
+ src_cui_fmt = buf->i_addr;
+ len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+
+ if (buf->i_len == len) {
+ memcpy(dst_cui_fmt, src_cui_fmt, len);
+ return 0;
+ }
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_cui_log_format *cui_formatp;
+
+ cui_formatp = item->ri_buf[0].i_addr;
+
+ cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+ error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
+ if (error) {
+ xfs_cui_item_free(cuip);
+ return error;
+ }
+ atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn);
+ xfs_cui_release(cuip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_cui_item_ops = {
+ .item_type = XFS_LI_CUI,
+ .commit_pass2 = xlog_recover_cui_commit_pass2,
+};
+
+/*
+ * This routine is called when a CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_cud_log_format *cud_formatp;
+
+ cud_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_CUI, cud_formatp->cud_cui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_cud_item_ops = {
+ .item_type = XFS_LI_CUD,
+ .commit_pass2 = xlog_recover_cud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index e47530f..f4f2e83 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -33,11 +33,6 @@
#define XFS_CUI_MAX_FAST_EXTENTS 16
/*
- * Define CUI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_CUI_RECOVERED 1
-
-/*
* This is the "refcount update intent" log item. It is used to log
* the fact that some reverse mappings need to change. It is used in
* conjunction with the "refcount update done" log item described
@@ -51,7 +46,6 @@
struct xfs_log_item cui_item;
atomic_t cui_refcount;
atomic_t cui_next_extent;
- unsigned long cui_flags; /* misc flags */
struct xfs_cui_log_format cui_format;
};
@@ -77,9 +71,4 @@
extern struct kmem_zone *xfs_cui_zone;
extern struct kmem_zone *xfs_cud_zone;
-struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
-void xfs_cui_item_free(struct xfs_cui_log_item *);
-void xfs_cui_release(struct xfs_cui_log_item *);
-int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
-
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 904d828..6fa05fb 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -143,8 +143,6 @@
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
return error;
- if (!agbp)
- return -ENOMEM;
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
@@ -181,7 +179,7 @@
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -223,8 +221,8 @@
}
}
-bool
-xfs_inode_need_cow(
+int
+xfs_bmap_trim_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared)
@@ -308,13 +306,13 @@
xfs_find_trim_cow_extent(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
bool *shared,
bool *found)
{
xfs_fileoff_t offset_fsb = imap->br_startoff;
xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
*found = false;
@@ -322,23 +320,22 @@
* If we don't find an overlapping extent, trim the range we need to
* allocate to fit the hole we found.
*/
- if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
- got.br_startoff = offset_fsb + count_fsb;
- if (got.br_startoff > offset_fsb) {
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
+ cmap->br_startoff = offset_fsb + count_fsb;
+ if (cmap->br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
- got.br_startoff - imap->br_startoff);
- return xfs_inode_need_cow(ip, imap, shared);
+ cmap->br_startoff - imap->br_startoff);
+ return xfs_bmap_trim_cow(ip, imap, shared);
}
*shared = true;
- if (isnullstartblock(got.br_startblock)) {
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ if (isnullstartblock(cmap->br_startblock)) {
+ xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
return 0;
}
/* real extent found - no need to allocate */
- xfs_trim_extent(&got, offset_fsb, count_fsb);
- *imap = got;
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
*found = true;
return 0;
}
@@ -348,6 +345,7 @@
xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
bool convert_now)
@@ -367,7 +365,7 @@
xfs_ifork_init_cow(ip);
}
- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
return error;
if (found)
@@ -392,7 +390,7 @@
/*
* Check for an overlapping extent again now that we dropped the ilock.
*/
- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
if (found) {
@@ -410,8 +408,8 @@
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
- resblks, imap, &nimaps);
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
+ &nimaps);
if (error)
goto out_unreserve;
@@ -427,15 +425,15 @@
if (nimaps == 0)
return -ENOSPC;
convert:
- xfs_trim_extent(imap, offset_fsb, count_fsb);
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
/*
* COW fork extents are supposed to remain unwritten until we're ready
* to initiate a disk write. For direct I/O we are going to write the
* data and need the conversion, but for buffered writes we're done.
*/
- if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
- trace_xfs_reflink_convert_cow(ip, imap);
+ trace_xfs_reflink_convert_cow(ip, cmap);
return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
@@ -657,7 +655,7 @@
* preallocations can leak into the range we are called upon, and we
* need to skip them.
*/
- if (!xfs_bmap_is_real_extent(&got)) {
+ if (!xfs_bmap_is_written_extent(&got)) {
*end_fsb = del.br_startoff;
goto out_cancel;
}
@@ -723,7 +721,7 @@
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
- * If we're being called by writeback then the the pages will still
+ * If we're being called by writeback then the pages will still
* have PageWriteback set, which prevents races with reflink remapping
* and truncate. Reflink remapping prevents races with writeback by
* taking the iolock and mmaplock before flushing the pages and
@@ -986,41 +984,28 @@
}
/*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file. The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
*/
STATIC int
xfs_reflink_remap_extent(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec,
- xfs_fileoff_t destoff,
+ struct xfs_bmbt_irec *dmap,
xfs_off_t new_isize)
{
+ struct xfs_bmbt_irec smap;
struct xfs_mount *mp = ip->i_mount;
- bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
- unsigned int resblks;
- struct xfs_bmbt_irec uirec;
- xfs_filblks_t rlen;
- xfs_filblks_t unmap_len;
xfs_off_t newlen;
- int64_t qres;
+ int64_t qres, qdelta;
+ unsigned int resblks;
+ bool smap_real;
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int nimaps;
int error;
- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
- /* No reflinking if we're low on space */
- if (real_extent) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- if (error)
- goto out;
- }
-
/* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out;
@@ -1029,92 +1014,146 @@
xfs_trans_ijoin(tp, ip, 0);
/*
- * Reserve quota for this operation. We don't know if the first unmap
- * in the dest file will cause a bmap btree split, so we always reserve
- * at least enough blocks for that split. If the extent being mapped
- * in is written, we need to reserve quota for that too.
+ * Read what's currently mapped in the destination file into smap.
+ * If smap isn't a hole, we will have to remove it before we can add
+ * dmap to the destination file.
*/
- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- if (real_extent)
- qres += irec->br_blockcount;
- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
- XFS_QMOPT_RES_REGBLKS);
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+ &smap, &nimaps, 0);
if (error)
goto out_cancel;
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+ smap_real = xfs_bmap_is_real_extent(&smap);
- trace_xfs_reflink_remap(ip, irec->br_startoff,
- irec->br_blockcount, irec->br_startblock);
+ /*
+ * We can only remap as many blocks as the smaller of the two extent
+ * maps, because we can only remap one extent at a time.
+ */
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
- /* Unmap the old blocks in the data fork. */
- rlen = unmap_len;
- while (rlen) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
- if (error)
- goto out_cancel;
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
- /*
- * Trim the extent to whatever got unmapped.
- * Remember, bunmapi works backwards.
- */
- uirec.br_startblock = irec->br_startblock + rlen;
- uirec.br_startoff = irec->br_startoff + rlen;
- uirec.br_blockcount = unmap_len - rlen;
- uirec.br_state = irec->br_state;
- unmap_len = rlen;
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to the next
+ * extent if they're both holes or both the same physical extent.
+ */
+ if (dmap->br_startblock == smap.br_startblock) {
+ if (dmap->br_state != smap.br_state)
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
- /* If this isn't a real mapping, we're done. */
- if (!real_extent || uirec.br_blockcount == 0)
- goto next_extent;
+ /* If both extents are unwritten, leave them alone. */
+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+ smap.br_state == XFS_EXT_UNWRITTEN)
+ goto out_cancel;
- trace_xfs_reflink_remap(ip, uirec.br_startoff,
- uirec.br_blockcount, uirec.br_startblock);
-
- /* Update the refcount tree */
- xfs_refcount_increase_extent(tp, &uirec);
-
- /* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &uirec);
-
- /* Update quota accounting. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- uirec.br_blockcount);
-
- /* Update dest isize if needed. */
- newlen = XFS_FSB_TO_B(mp,
- uirec.br_startoff + uirec.br_blockcount);
- newlen = min_t(xfs_off_t, newlen, new_isize);
- if (newlen > i_size_read(VFS_I(ip))) {
- trace_xfs_reflink_update_inode_size(ip, newlen);
- i_size_write(VFS_I(ip), newlen);
- ip->i_d.di_size = newlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
-
-next_extent:
- /* Process all the deferred stuff. */
- error = xfs_defer_finish(&tp);
+ /* No reflinking if the AG of the dest mapping is low on space. */
+ if (dmap_written) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
if (error)
goto out_cancel;
}
+ /*
+ * Compute quota reservation if we think the quota block counter for
+ * this file could increase.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the map can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening.
+ *
+ * If we are mapping a written extent into the file, we need to have
+ * enough quota block count reservation to handle the blocks in that
+ * extent. We log only the delta to the quota block counts, so if the
+ * extent we're unmapping also has blocks allocated to it, we don't
+ * need a quota reservation for the extent itself.
+ *
+ * Note that if we're replacing a delalloc reservation with a written
+ * extent, we have to take the full quota reservation because removing
+ * the delalloc reservation gives the block count back to the quota
+ * count. This is suboptimal, but the VFS flushed the dest range
+ * before we started. That should have removed all the delalloc
+ * reservations, but we code defensively.
+ */
+ qres = qdelta = 0;
+ if (smap_real || dmap_written)
+ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ if (!smap_real && dmap_written)
+ qres += dmap->br_blockcount;
+ if (qres > 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_cancel;
+ }
+
+ if (smap_real) {
+ /*
+ * If the extent we're unmapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_refcount_decrease_extent(tp, &smap);
+ qdelta -= smap.br_blockcount;
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+ xfs_filblks_t len = smap.br_blockcount;
+
+ /*
+ * If the extent we're unmapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ if (error)
+ goto out_cancel;
+ ASSERT(len == 0);
+ }
+
+ /*
+ * If the extent we're sharing is backed by written storage, increase
+ * its refcount and map it into the file.
+ */
+ if (dmap_written) {
+ xfs_refcount_increase_extent(tp, dmap);
+ xfs_bmap_map_extent(tp, ip, dmap);
+ qdelta += dmap->br_blockcount;
+ }
+
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
+
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_d.di_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
+
+ /* Commit everything and unlock. */
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
- return 0;
+ goto out_unlock;
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
return error;
}
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
@@ -1125,25 +1164,22 @@
loff_t *remapped)
{
struct xfs_bmbt_irec imap;
- xfs_fileoff_t srcoff;
- xfs_fileoff_t destoff;
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
xfs_filblks_t len;
- xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
- srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
- len = XFS_B_TO_FSB(src->i_mount, remap_len);
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+ XFS_MAX_FILEOFF);
- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
- while (len) {
- uint lock_mode;
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
- dest, destoff);
+ while (len > 0) {
+ unsigned int lock_mode;
/* Read extent from the source file */
nimaps = 1;
@@ -1152,18 +1188,25 @@
xfs_iunlock(src, lock_mode);
if (error)
break;
- ASSERT(nimaps == 1);
+ /*
+ * The caller supposedly flushed all dirty pages in the source
+ * file range, which means that writeback should have allocated
+ * or deleted all delalloc reservations in that range. If we
+ * find one, that's a good sign that something is seriously
+ * wrong here.
+ */
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = -EFSCORRUPTED;
+ break;
+ }
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
- &imap);
+ trace_xfs_reflink_remap_extent_src(src, &imap);
- /* Translate imap into the destination file. */
- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
- imap.br_startoff += destoff - srcoff;
-
- /* Clear dest from destoff to the end of imap and map it in. */
- error = xfs_reflink_remap_extent(dest, &imap, destoff,
- new_isize);
+ /* Remap into the destination file at the given offset. */
+ imap.br_startoff = destoff;
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
if (error)
break;
@@ -1173,10 +1216,10 @@
}
/* Advance drange/srange */
- srcoff += range_len;
- destoff += range_len;
- len -= range_len;
- remapped_len += range_len;
+ srcoff += imap.br_blockcount;
+ destoff += imap.br_blockcount;
+ len -= imap.br_blockcount;
+ remapped_len += imap.br_blockcount;
}
if (error)
@@ -1187,81 +1230,6 @@
}
/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding. The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
- struct inode *src,
- struct inode *dest)
-{
- int error;
-
- if (src > dest)
- swap(src, dest);
-
-retry:
- /* Wait to break both inodes' layouts before we start locking. */
- error = break_layout(src, true);
- if (error)
- return error;
- if (src != dest) {
- error = break_layout(dest, true);
- if (error)
- return error;
- }
-
- /* Lock one inode and make sure nobody got in and leased it. */
- inode_lock(src);
- error = break_layout(src, false);
- if (error) {
- inode_unlock(src);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- if (src == dest)
- return 0;
-
- /* Lock the other inode and make sure nobody got in and leased it. */
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
- error = break_layout(dest, false);
- if (error) {
- inode_unlock(src);
- inode_unlock(dest);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
- struct file *file_in,
- struct file *file_out)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
-
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock(inode_in);
-}
-
-/*
* If we're reflinking to a point past the destination file's EOF, we must
* zero any speculative post-EOF preallocations that sit between the old EOF
* and the destination file offset.
@@ -1278,7 +1246,7 @@
trace_xfs_zero_eof(ip, isize, pos - isize);
return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_iomap_ops);
+ &xfs_buffered_write_iomap_ops);
}
/*
@@ -1323,18 +1291,12 @@
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
- ssize_t ret;
+ int ret;
/* Lock both files against IO */
- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+ ret = xfs_ilock2_io_mmap(src, dest);
if (ret)
return ret;
- if (same_inode)
- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
- XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
@@ -1348,7 +1310,7 @@
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
- if (ret < 0 || *len == 0)
+ if (ret || *len == 0)
goto out_unlock;
/* Attach dquots to dest inode before changing block map */
@@ -1383,91 +1345,12 @@
if (ret)
goto out_unlock;
- return 1;
+ return 0;
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
return ret;
}
-/*
- * The user wants to preemptively CoW all shared blocks in this file,
- * which enables us to turn off the reflink flag. Iterate all
- * extents which are not prealloc/delalloc to see which ranges are
- * mentioned in the refcount tree, then read those blocks into the
- * pagecache, dirty them, fsync them back out, and then we can update
- * the inode flag. What happens if we run out of memory? :)
- */
-STATIC int
-xfs_reflink_dirty_extents(
- struct xfs_inode *ip,
- xfs_fileoff_t fbno,
- xfs_filblks_t end,
- xfs_off_t isize)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
- xfs_off_t fpos;
- xfs_off_t flen;
- struct xfs_bmbt_irec map[2];
- int nmaps;
- int error = 0;
-
- while (end - fbno > 0) {
- nmaps = 1;
- /*
- * Look for extents in the file. Skip holes, delalloc, or
- * unwritten extents; they can't be reflinked.
- */
- error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
- if (error)
- goto out;
- if (nmaps == 0)
- break;
- if (!xfs_bmap_is_real_extent(&map[0]))
- goto next;
-
- map[1] = map[0];
- while (map[1].br_blockcount) {
- agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
- aglen = map[1].br_blockcount;
-
- error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
- aglen, &rbno, &rlen, true);
- if (error)
- goto out;
- if (rbno == NULLAGBLOCK)
- break;
-
- /* Dirty the pages */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
- (rbno - agbno));
- flen = XFS_FSB_TO_B(mp, rlen);
- if (fpos + flen > isize)
- flen = isize - fpos;
- error = iomap_file_dirty(VFS_I(ip), fpos, flen,
- &xfs_iomap_ops);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
-
- map[1].br_blockcount -= (rbno - agbno + rlen);
- map[1].br_startoff += (rbno - agbno + rlen);
- map[1].br_startblock += (rbno - agbno + rlen);
- }
-
-next:
- fbno = map[0].br_startoff + map[0].br_blockcount;
- }
-out:
- return error;
-}
-
/* Does this inode need the reflink flag? */
int
xfs_reflink_inode_has_shared_extents(
@@ -1544,7 +1427,8 @@
* We didn't find any shared blocks so turn off the reflink flag.
* First, get rid of any leftover CoW mappings.
*/
- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
+ true);
if (error)
return error;
@@ -1604,10 +1488,7 @@
xfs_off_t offset,
xfs_off_t len)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t fbno;
- xfs_filblks_t end;
- xfs_off_t isize;
+ struct inode *inode = VFS_I(ip);
int error;
if (!xfs_is_reflink_inode(ip))
@@ -1615,20 +1496,14 @@
trace_xfs_reflink_unshare(ip, offset, len);
- inode_dio_wait(VFS_I(ip));
+ inode_dio_wait(inode);
- /* Try to CoW the selected ranges */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- fbno = XFS_B_TO_FSBT(mp, offset);
- isize = i_size_read(VFS_I(ip));
- end = XFS_B_TO_FSB(mp, offset + len);
- error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
+ error = iomap_file_unshare(inode, offset, len,
+ &xfs_buffered_write_iomap_ops);
if (error)
- goto out_unlock;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
- /* Wait for the IO to finish */
- error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ error = filemap_write_and_wait_range(inode->i_mapping, offset, len);
if (error)
goto out;
@@ -1636,11 +1511,8 @@
error = xfs_reflink_try_clear_inode_flag(ip);
if (error)
goto out;
-
return 0;
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
return error;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 28a43b7..487b004 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -22,11 +22,11 @@
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
-bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool *shared);
-extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
@@ -56,7 +56,5 @@
loff_t *remapped);
extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
xfs_extlen_t cowextsize, unsigned int remap_flags);
-extern void xfs_reflink_remap_unlock(struct file *file_in,
- struct file *file_out);
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 8939e0e..2090595 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -17,24 +17,28 @@
#include "xfs_rmap_item.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
-
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
kmem_zone_t *xfs_rui_zone;
kmem_zone_t *xfs_rud_zone;
+static const struct xfs_item_ops xfs_rui_item_ops;
+
static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_rui_log_item, rui_item);
}
-void
+STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
- kmem_zone_free(xfs_rui_zone, ruip);
+ kmem_cache_free(xfs_rui_zone, ruip);
}
/*
@@ -44,13 +48,13 @@
* committed vs unpin operations in bulk insert operations. Hence the reference
* count to ensure only the last caller frees the RUI.
*/
-void
+STATIC void
xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
ASSERT(atomic_read(&ruip->rui_refcount) > 0);
if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
xfs_rui_item_free(ruip);
}
}
@@ -122,17 +126,10 @@
xfs_rui_release(RUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_rui_item_ops = {
- .iop_size = xfs_rui_item_size,
- .iop_format = xfs_rui_item_format,
- .iop_unpin = xfs_rui_item_unpin,
- .iop_release = xfs_rui_item_release,
-};
-
/*
* Allocate and initialize an rui item with the given number of extents.
*/
-struct xfs_rui_log_item *
+STATIC struct xfs_rui_log_item *
xfs_rui_init(
struct xfs_mount *mp,
uint nextents)
@@ -144,7 +141,8 @@
if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
else
- ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
+ ruip = kmem_cache_zalloc(xfs_rui_zone,
+ GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
ruip->rui_format.rui_nextents = nextents;
@@ -160,7 +158,7 @@
* RUI format structure. The RUI/RUD items were designed not to need any
* special alignment handling.
*/
-int
+STATIC int
xfs_rui_copy_format(
struct xfs_log_iovec *buf,
struct xfs_rui_log_format *dst_rui_fmt)
@@ -171,8 +169,10 @@
src_rui_fmt = buf->i_addr;
len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
- if (buf->i_len != len)
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
+ }
memcpy(dst_rui_fmt, src_rui_fmt, len);
return 0;
@@ -227,7 +227,7 @@
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_zone_free(xfs_rud_zone, rudp);
+ kmem_cache_free(xfs_rud_zone, rudp);
}
static const struct xfs_item_ops xfs_rud_item_ops = {
@@ -244,7 +244,7 @@
{
struct xfs_rud_log_item *rudp;
- rudp = kmem_zone_zalloc(xfs_rud_zone, 0);
+ rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL);
xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
&xfs_rud_item_ops);
rudp->rud_ruip = ruip;
@@ -337,8 +337,8 @@
static int
xfs_rmap_update_diff_items(
void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct xfs_mount *mp = priv;
struct xfs_rmap_intent *ra;
@@ -350,41 +350,16 @@
XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
}
-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_rui_log_item *ruip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- ruip = xfs_rui_init(tp->t_mountp, count);
- ASSERT(ruip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &ruip->rui_item);
- return ruip;
-}
-
/* Log rmap updates in the intent item. */
STATIC void
xfs_rmap_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_rui_log_item *ruip,
+ struct xfs_rmap_intent *rmap)
{
- struct xfs_rui_log_item *ruip = intent;
- struct xfs_rmap_intent *rmap;
uint next_extent;
struct xfs_map_extent *map;
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
@@ -404,58 +379,64 @@
rmap->ri_bmap.br_state);
}
+static struct xfs_log_item *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
+ struct xfs_rmap_intent *rmap;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ if (sort)
+ list_sort(mp, items, xfs_rmap_update_diff_items);
+ list_for_each_entry(rmap, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, rmap);
+ return &ruip->rui_item;
+}
+
/* Get an RUD so we can process all the deferred rmap updates. */
-STATIC void *
+static struct xfs_log_item *
xfs_rmap_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_rud(tp, intent);
+ return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
}
/* Process a deferred rmap update. */
STATIC int
xfs_rmap_update_finish_item(
struct xfs_trans *tp,
+ struct xfs_log_item *done,
struct list_head *item,
- void *done_item,
- void **state)
+ struct xfs_btree_cur **state)
{
struct xfs_rmap_intent *rmap;
int error;
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, done_item,
- rmap->ri_type,
- rmap->ri_owner, rmap->ri_whichfork,
- rmap->ri_bmap.br_startoff,
- rmap->ri_bmap.br_startblock,
- rmap->ri_bmap.br_blockcount,
- rmap->ri_bmap.br_state,
- (struct xfs_btree_cur **)state);
+ error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done),
+ rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork,
+ rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock,
+ rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state,
+ state);
kmem_free(rmap);
return error;
}
-/* Clean up after processing deferred rmaps. */
-STATIC void
-xfs_rmap_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
-}
-
/* Abort all pending RUIs. */
STATIC void
xfs_rmap_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_rui_release(intent);
+ xfs_rui_release(RUI_ITEM(intent));
}
/* Cancel a deferred rmap update. */
@@ -471,13 +452,11 @@
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_rmap_update_diff_items,
.create_intent = xfs_rmap_update_create_intent,
.abort_intent = xfs_rmap_update_abort_intent,
- .log_item = xfs_rmap_update_log_item,
.create_done = xfs_rmap_update_create_done,
.finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_update_finish_cleanup,
+ .finish_cleanup = xfs_rmap_finish_one_cleanup,
.cancel_item = xfs_rmap_update_cancel_item,
};
@@ -485,24 +464,24 @@
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
*/
-int
-xfs_rui_recover(
- struct xfs_mount *mp,
- struct xfs_rui_log_item *ruip)
+STATIC int
+xfs_rui_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
- int i;
- int error = 0;
+ struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_map_extent *rmap;
- xfs_fsblock_t startblock_fsb;
- bool op_ok;
struct xfs_rud_log_item *rudp;
- enum xfs_rmap_intent_type type;
- int whichfork;
- xfs_exntst_t state;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
-
- ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags));
+ struct xfs_mount *mp = lip->li_mountp;
+ xfs_fsblock_t startblock_fsb;
+ enum xfs_rmap_intent_type type;
+ xfs_exntst_t state;
+ bool op_ok;
+ int i;
+ int whichfork;
+ int error = 0;
/*
* First check the validity of the extents described by the
@@ -532,15 +511,8 @@
rmap->me_len == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
rmap->me_len >= mp->m_sb.sb_agblocks ||
- (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) {
- /*
- * This will pull the RUI from the AIL and
- * free the memory associated with it.
- */
- set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
- xfs_rui_release(ruip);
- return -EIO;
- }
+ (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS))
+ return -EFSCORRUPTED;
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
@@ -581,6 +553,7 @@
type = XFS_RMAP_FREE;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -594,12 +567,124 @@
}
xfs_rmap_finish_one_cleanup(tp, rcur, error);
- set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
+
+STATIC bool
+xfs_rui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
+}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_rui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_rud_log_item *rudp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = RUI_ITEM(intent)->rui_format.rui_nextents;
+ extp = RUI_ITEM(intent)->rui_format.rui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ atomic_set(&ruip->rui_next_extent, count);
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+ return &ruip->rui_item;
+}
+
+static const struct xfs_item_ops xfs_rui_item_ops = {
+ .iop_size = xfs_rui_item_size,
+ .iop_format = xfs_rui_item_format,
+ .iop_unpin = xfs_rui_item_unpin,
+ .iop_release = xfs_rui_item_release,
+ .iop_recover = xfs_rui_item_recover,
+ .iop_match = xfs_rui_item_match,
+ .iop_relog = xfs_rui_item_relog,
+};
+
+/*
+ * This routine is called to create an in-core extent rmap update
+ * item from the rui format structure which was logged on disk.
+ * It allocates an in-core rui, copies the extents from the format
+ * structure into it, and adds the rui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_rui_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_rui_log_format *rui_formatp;
+
+ rui_formatp = item->ri_buf[0].i_addr;
+
+ ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
+ error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format);
+ if (error) {
+ xfs_rui_item_free(ruip);
+ return error;
+ }
+ atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn);
+ xfs_rui_release(ruip);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_rui_item_ops = {
+ .item_type = XFS_LI_RUI,
+ .commit_pass2 = xlog_recover_rui_commit_pass2,
+};
+
+/*
+ * This routine is called when an RUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding RUI if it
+ * was still in the log. To do this it searches the AIL for the RUI with an id
+ * equal to that in the RUD format structure. If we find it we drop the RUD
+ * reference, which removes the RUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_rud_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_rud_log_format *rud_formatp;
+
+ rud_formatp = item->ri_buf[0].i_addr;
+ ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format));
+
+ xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id);
+ return 0;
+}
+
+const struct xlog_recover_item_ops xlog_rud_item_ops = {
+ .item_type = XFS_LI_RUD,
+ .commit_pass2 = xlog_recover_rud_commit_pass2,
+};
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 8708e4a..31e6cdf 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -36,11 +36,6 @@
#define XFS_RUI_MAX_FAST_EXTENTS 16
/*
- * Define RUI flag bits. Manipulated by set/clear/test_bit operators.
- */
-#define XFS_RUI_RECOVERED 1
-
-/*
* This is the "rmap update intent" log item. It is used to log the fact that
* some reverse mappings need to change. It is used in conjunction with the
* "rmap update done" log item described below.
@@ -52,7 +47,6 @@
struct xfs_log_item rui_item;
atomic_t rui_refcount;
atomic_t rui_next_extent;
- unsigned long rui_flags; /* misc flags */
struct xfs_rui_log_format rui_format;
};
@@ -77,11 +71,4 @@
extern struct kmem_zone *xfs_rui_zone;
extern struct kmem_zone *xfs_rud_zone;
-struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
-int xfs_rui_copy_format(struct xfs_log_iovec *buf,
- struct xfs_rui_log_format *dst_rui_fmt);
-void xfs_rui_item_free(struct xfs_rui_log_item *);
-void xfs_rui_release(struct xfs_rui_log_item *);
-int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip);
-
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6d5ddc4..ede1baf 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -18,7 +18,7 @@
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_sb.h"
/*
* Read and return the summary information for a given extent size,
@@ -778,8 +778,14 @@
struct xfs_bmbt_irec map; /* block map output */
int nmap; /* number of block maps */
int resblks; /* space reservation */
+ enum xfs_blft buf_type;
struct xfs_trans *tp;
+ if (ip == mp->m_rsumip)
+ buf_type = XFS_BLFT_RTSUMMARY_BUF;
+ else
+ buf_type = XFS_BLFT_RTBITMAP_BUF;
+
/*
* Allocate space to the file, as necessary.
*/
@@ -803,8 +809,7 @@
*/
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, resblks, &map,
- &nmap);
+ XFS_BMAPI_METADATA, 0, &map, &nmap);
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
@@ -838,12 +843,13 @@
* Get a buffer for the block.
*/
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0);
- if (bp == NULL) {
- error = -EIO;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0, &bp);
+ if (error)
goto out_trans_cancel;
- }
+
+ xfs_trans_buf_set_type(tp, bp, buf_type);
+ bp->b_ops = &xfs_rtbuf_ops;
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
@@ -876,7 +882,7 @@
* lower bound on the minimum level with any free extents. We can
* continue without the cache if it couldn't be allocated.
*/
- mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, 0);
+ mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL);
if (!mp->m_rsum_cache)
xfs_warn(mp, "could not allocate realtime summary cache");
}
@@ -1018,7 +1024,7 @@
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
/*
* Update the bitmap inode's size ondisk and incore. We need
@@ -1032,7 +1038,7 @@
/*
* Get the summary inode into the transaction.
*/
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
* Update the summary inode's size. We need to update the
@@ -1096,7 +1102,13 @@
if (error)
break;
}
+ if (error)
+ goto out_free;
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+
+out_free:
/*
* Free the fake mp structure.
*/
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 113883c..20e0534 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -23,6 +23,7 @@
uint64_t xs_xstrat_bytes = 0;
uint64_t xs_write_bytes = 0;
uint64_t xs_read_bytes = 0;
+ uint64_t defer_relog = 0;
static const struct xstats_entry {
char *desc;
@@ -57,24 +58,27 @@
/* Loop over all stats groups */
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- len += snprintf(buf + len, PATH_MAX - len, "%s",
+ len += scnprintf(buf + len, PATH_MAX - len, "%s",
xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- len += snprintf(buf + len, PATH_MAX - len, " %u",
+ len += scnprintf(buf + len, PATH_MAX - len, " %u",
counter_val(stats, j));
- len += snprintf(buf + len, PATH_MAX - len, "\n");
+ len += scnprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
+ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
- len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
+ defer_relog);
+ len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 34d704f..43ffba7 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -137,6 +137,7 @@
uint64_t xs_xstrat_bytes;
uint64_t xs_write_bytes;
uint64_t xs_read_bytes;
+ uint64_t defer_relog;
};
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8d1df9f..e3e229e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -37,21 +37,54 @@
#include "xfs_reflink.h"
#include <linux/magic.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
static const struct super_operations xfs_super_operations;
-struct bio_set xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
+enum xfs_dax_mode {
+ XFS_DAX_INODE = 0,
+ XFS_DAX_ALWAYS = 1,
+ XFS_DAX_NEVER = 2,
+};
+
+static void
+xfs_mount_set_dax_mode(
+ struct xfs_mount *mp,
+ enum xfs_dax_mode mode)
+{
+ switch (mode) {
+ case XFS_DAX_INODE:
+ mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER);
+ break;
+ case XFS_DAX_ALWAYS:
+ mp->m_flags |= XFS_MOUNT_DAX_ALWAYS;
+ mp->m_flags &= ~XFS_MOUNT_DAX_NEVER;
+ break;
+ case XFS_DAX_NEVER:
+ mp->m_flags |= XFS_MOUNT_DAX_NEVER;
+ mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS;
+ break;
+ }
+}
+
+static const struct constant_table dax_param_enums[] = {
+ {"inode", XFS_DAX_INODE },
+ {"always", XFS_DAX_ALWAYS },
+ {"never", XFS_DAX_NEVER },
+ {}
+};
+
/*
* Table driven mount option parser.
*/
enum {
- Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
+ Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
@@ -59,382 +92,63 @@
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
};
-static const match_table_t tokens = {
- {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */
- {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */
- {Opt_logdev, "logdev=%s"}, /* log device */
- {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */
- {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */
- {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */
- {Opt_noalign, "noalign"}, /* turn off stripe alignment */
- {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */
- {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
- {Opt_swidth, "swidth=%u"}, /* data volume stripe width */
- {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
- {Opt_grpid, "grpid"}, /* group-ID from parent directory */
- {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
- {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
- {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
- {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
- {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
- {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
- {Opt_inode32, "inode32"}, /* inode allocation limited to
- * XFS_MAXINUMBER_32 */
- {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */
- {Opt_noikeep, "noikeep"}, /* free empty inode clusters */
- {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */
- {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes
- * in stat(). */
- {Opt_attr2, "attr2"}, /* do use attr2 attribute format */
- {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */
- {Opt_filestreams,"filestreams"},/* use filestreams allocator */
- {Opt_quota, "quota"}, /* disk quotas (user) */
- {Opt_noquota, "noquota"}, /* no quotas */
- {Opt_usrquota, "usrquota"}, /* user quota enabled */
- {Opt_grpquota, "grpquota"}, /* group quota enabled */
- {Opt_prjquota, "prjquota"}, /* project quota enabled */
- {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */
- {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */
- {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */
- {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
- {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
- {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
- {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */
- {Opt_discard, "discard"}, /* Discard unused blocks */
- {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
- {Opt_dax, "dax"}, /* Enable direct access to bdev pages */
- {Opt_err, NULL},
+static const struct fs_parameter_spec xfs_fs_parameters[] = {
+ fsparam_u32("logbufs", Opt_logbufs),
+ fsparam_string("logbsize", Opt_logbsize),
+ fsparam_string("logdev", Opt_logdev),
+ fsparam_string("rtdev", Opt_rtdev),
+ fsparam_flag("wsync", Opt_wsync),
+ fsparam_flag("noalign", Opt_noalign),
+ fsparam_flag("swalloc", Opt_swalloc),
+ fsparam_u32("sunit", Opt_sunit),
+ fsparam_u32("swidth", Opt_swidth),
+ fsparam_flag("nouuid", Opt_nouuid),
+ fsparam_flag("grpid", Opt_grpid),
+ fsparam_flag("nogrpid", Opt_nogrpid),
+ fsparam_flag("bsdgroups", Opt_bsdgroups),
+ fsparam_flag("sysvgroups", Opt_sysvgroups),
+ fsparam_string("allocsize", Opt_allocsize),
+ fsparam_flag("norecovery", Opt_norecovery),
+ fsparam_flag("inode64", Opt_inode64),
+ fsparam_flag("inode32", Opt_inode32),
+ fsparam_flag("ikeep", Opt_ikeep),
+ fsparam_flag("noikeep", Opt_noikeep),
+ fsparam_flag("largeio", Opt_largeio),
+ fsparam_flag("nolargeio", Opt_nolargeio),
+ fsparam_flag("attr2", Opt_attr2),
+ fsparam_flag("noattr2", Opt_noattr2),
+ fsparam_flag("filestreams", Opt_filestreams),
+ fsparam_flag("quota", Opt_quota),
+ fsparam_flag("noquota", Opt_noquota),
+ fsparam_flag("usrquota", Opt_usrquota),
+ fsparam_flag("grpquota", Opt_grpquota),
+ fsparam_flag("prjquota", Opt_prjquota),
+ fsparam_flag("uquota", Opt_uquota),
+ fsparam_flag("gquota", Opt_gquota),
+ fsparam_flag("pquota", Opt_pquota),
+ fsparam_flag("uqnoenforce", Opt_uqnoenforce),
+ fsparam_flag("gqnoenforce", Opt_gqnoenforce),
+ fsparam_flag("pqnoenforce", Opt_pqnoenforce),
+ fsparam_flag("qnoenforce", Opt_qnoenforce),
+ fsparam_flag("discard", Opt_discard),
+ fsparam_flag("nodiscard", Opt_nodiscard),
+ fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+ {}
};
-
-STATIC int
-suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
-{
- int last, shift_left_factor = 0, _res;
- char *value;
- int ret = 0;
-
- value = match_strdup(s);
- if (!value)
- return -ENOMEM;
-
- last = strlen(value) - 1;
- if (value[last] == 'K' || value[last] == 'k') {
- shift_left_factor = 10;
- value[last] = '\0';
- }
- if (value[last] == 'M' || value[last] == 'm') {
- shift_left_factor = 20;
- value[last] = '\0';
- }
- if (value[last] == 'G' || value[last] == 'g') {
- shift_left_factor = 30;
- value[last] = '\0';
- }
-
- if (kstrtoint(value, base, &_res))
- ret = -EINVAL;
- kfree(value);
- *res = _res << shift_left_factor;
- return ret;
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- *
- * Note that this function leaks the various device name allocations on
- * failure. The caller takes care of them.
- *
- * *sb is const because this is also used to test options on the remount
- * path, and we don't want this to have any side effects at remount time.
- * Today this function does not change *sb, but just to future-proof...
- */
-STATIC int
-xfs_parseargs(
- struct xfs_mount *mp,
- char *options)
-{
- const struct super_block *sb = mp->m_super;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int dsunit = 0;
- int dswidth = 0;
- int iosize = 0;
- uint8_t iosizelog = 0;
-
- /*
- * set up the mount name first so all the errors will refer to the
- * correct device.
- */
- mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
- if (!mp->m_fsname)
- return -ENOMEM;
- mp->m_fsname_len = strlen(mp->m_fsname) + 1;
-
- /*
- * Copy binary VFS mount flags we are interested in.
- */
- if (sb_rdonly(sb))
- mp->m_flags |= XFS_MOUNT_RDONLY;
- if (sb->s_flags & SB_DIRSYNC)
- mp->m_flags |= XFS_MOUNT_DIRSYNC;
- if (sb->s_flags & SB_SYNCHRONOUS)
- mp->m_flags |= XFS_MOUNT_WSYNC;
-
- /*
- * Set some default flags that could be cleared by the mount option
- * parsing.
- */
- mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
- /*
- * These can be overridden by the mount option parsing.
- */
- mp->m_logbufs = -1;
- mp->m_logbsize = -1;
-
- if (!options)
- goto done;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_logbufs:
- if (match_int(args, &mp->m_logbufs))
- return -EINVAL;
- break;
- case Opt_logbsize:
- if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
- return -EINVAL;
- break;
- case Opt_logdev:
- kfree(mp->m_logname);
- mp->m_logname = match_strdup(args);
- if (!mp->m_logname)
- return -ENOMEM;
- break;
- case Opt_rtdev:
- kfree(mp->m_rtname);
- mp->m_rtname = match_strdup(args);
- if (!mp->m_rtname)
- return -ENOMEM;
- break;
- case Opt_allocsize:
- case Opt_biosize:
- if (suffix_kstrtoint(args, 10, &iosize))
- return -EINVAL;
- iosizelog = ffs(iosize) - 1;
- break;
- case Opt_grpid:
- case Opt_bsdgroups:
- mp->m_flags |= XFS_MOUNT_GRPID;
- break;
- case Opt_nogrpid:
- case Opt_sysvgroups:
- mp->m_flags &= ~XFS_MOUNT_GRPID;
- break;
- case Opt_wsync:
- mp->m_flags |= XFS_MOUNT_WSYNC;
- break;
- case Opt_norecovery:
- mp->m_flags |= XFS_MOUNT_NORECOVERY;
- break;
- case Opt_noalign:
- mp->m_flags |= XFS_MOUNT_NOALIGN;
- break;
- case Opt_swalloc:
- mp->m_flags |= XFS_MOUNT_SWALLOC;
- break;
- case Opt_sunit:
- if (match_int(args, &dsunit))
- return -EINVAL;
- break;
- case Opt_swidth:
- if (match_int(args, &dswidth))
- return -EINVAL;
- break;
- case Opt_inode32:
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- break;
- case Opt_inode64:
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- break;
- case Opt_nouuid:
- mp->m_flags |= XFS_MOUNT_NOUUID;
- break;
- case Opt_ikeep:
- mp->m_flags |= XFS_MOUNT_IKEEP;
- break;
- case Opt_noikeep:
- mp->m_flags &= ~XFS_MOUNT_IKEEP;
- break;
- case Opt_largeio:
- mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
- break;
- case Opt_nolargeio:
- mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
- break;
- case Opt_attr2:
- mp->m_flags |= XFS_MOUNT_ATTR2;
- break;
- case Opt_noattr2:
- mp->m_flags &= ~XFS_MOUNT_ATTR2;
- mp->m_flags |= XFS_MOUNT_NOATTR2;
- break;
- case Opt_filestreams:
- mp->m_flags |= XFS_MOUNT_FILESTREAMS;
- break;
- case Opt_noquota:
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
- break;
- case Opt_quota:
- case Opt_uquota:
- case Opt_usrquota:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
- XFS_UQUOTA_ENFD);
- break;
- case Opt_qnoenforce:
- case Opt_uqnoenforce:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_UQUOTA_ENFD;
- break;
- case Opt_pquota:
- case Opt_prjquota:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_PQUOTA_ENFD);
- break;
- case Opt_pqnoenforce:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_PQUOTA_ENFD;
- break;
- case Opt_gquota:
- case Opt_grpquota:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_GQUOTA_ENFD);
- break;
- case Opt_gqnoenforce:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_GQUOTA_ENFD;
- break;
- case Opt_discard:
- mp->m_flags |= XFS_MOUNT_DISCARD;
- break;
- case Opt_nodiscard:
- mp->m_flags &= ~XFS_MOUNT_DISCARD;
- break;
-#ifdef CONFIG_FS_DAX
- case Opt_dax:
- mp->m_flags |= XFS_MOUNT_DAX;
- break;
-#endif
- default:
- xfs_warn(mp, "unknown mount option [%s].", p);
- return -EINVAL;
- }
- }
-
- /*
- * no recovery flag requires a read-only mount
- */
- if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
- !(mp->m_flags & XFS_MOUNT_RDONLY)) {
- xfs_warn(mp, "no-recovery mounts must be read-only.");
- return -EINVAL;
- }
-
- if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
- xfs_warn(mp,
- "sunit and swidth options incompatible with the noalign option");
- return -EINVAL;
- }
-
-#ifndef CONFIG_XFS_QUOTA
- if (XFS_IS_QUOTA_RUNNING(mp)) {
- xfs_warn(mp, "quota support not available in this kernel.");
- return -EINVAL;
- }
-#endif
-
- if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
- xfs_warn(mp, "sunit and swidth must be specified together");
- return -EINVAL;
- }
-
- if (dsunit && (dswidth % dsunit != 0)) {
- xfs_warn(mp,
- "stripe width (%d) must be a multiple of the stripe unit (%d)",
- dswidth, dsunit);
- return -EINVAL;
- }
-
-done:
- if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
- /*
- * At this point the superblock has not been read
- * in, therefore we do not know the block size.
- * Before the mount call ends we will convert
- * these to FSBs.
- */
- mp->m_dalign = dsunit;
- mp->m_swidth = dswidth;
- }
-
- if (mp->m_logbufs != -1 &&
- mp->m_logbufs != 0 &&
- (mp->m_logbufs < XLOG_MIN_ICLOGS ||
- mp->m_logbufs > XLOG_MAX_ICLOGS)) {
- xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
- mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
- return -EINVAL;
- }
- if (mp->m_logbsize != -1 &&
- mp->m_logbsize != 0 &&
- (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
- mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
- !is_power_of_2(mp->m_logbsize))) {
- xfs_warn(mp,
- "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
- mp->m_logbsize);
- return -EINVAL;
- }
-
- if (iosizelog) {
- if (iosizelog > XFS_MAX_IO_LOG ||
- iosizelog < XFS_MIN_IO_LOG) {
- xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
- iosizelog, XFS_MIN_IO_LOG,
- XFS_MAX_IO_LOG);
- return -EINVAL;
- }
-
- mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
- mp->m_readio_log = iosizelog;
- mp->m_writeio_log = iosizelog;
- }
-
- return 0;
-}
-
struct proc_xfs_info {
uint64_t flag;
char *str;
};
-STATIC void
-xfs_showargs(
- struct xfs_mount *mp,
- struct seq_file *m)
+static int
+xfs_fs_show_options(
+ struct seq_file *m,
+ struct dentry *root)
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
@@ -448,30 +162,25 @@
{ XFS_MOUNT_FILESTREAMS, ",filestreams" },
{ XFS_MOUNT_GRPID, ",grpid" },
{ XFS_MOUNT_DISCARD, ",discard" },
- { XFS_MOUNT_SMALL_INUMS, ",inode32" },
- { XFS_MOUNT_DAX, ",dax" },
+ { XFS_MOUNT_LARGEIO, ",largeio" },
+ { XFS_MOUNT_DAX_ALWAYS, ",dax=always" },
+ { XFS_MOUNT_DAX_NEVER, ",dax=never" },
{ 0, NULL }
};
- static struct proc_xfs_info xfs_info_unset[] = {
- /* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" },
- { XFS_MOUNT_SMALL_INUMS, ",inode64" },
- { 0, NULL }
- };
+ struct xfs_mount *mp = XFS_M(root->d_sb);
struct proc_xfs_info *xfs_infop;
for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
if (mp->m_flags & xfs_infop->flag)
seq_puts(m, xfs_infop->str);
}
- for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
- if (!(mp->m_flags & xfs_infop->flag))
- seq_puts(m, xfs_infop->str);
- }
- if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+ seq_printf(m, ",inode%d",
+ (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64);
+
+ if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
seq_printf(m, ",allocsize=%dk",
- (int)(1 << mp->m_writeio_log) >> 10);
+ (1 << mp->m_allocsize_log) >> 10);
if (mp->m_logbufs > 0)
seq_printf(m, ",logbufs=%d", mp->m_logbufs);
@@ -510,32 +219,8 @@
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
-}
-static uint64_t
-xfs_max_file_offset(
- unsigned int blockshift)
-{
- unsigned int pagefactor = 1;
- unsigned int bitshift = BITS_PER_LONG - 1;
-
- /* Figure out maximum filesize, on Linux this can depend on
- * the filesystem blocksize (on 32 bit platforms).
- * __block_write_begin does this in an [unsigned] long long...
- * page->index << (PAGE_SHIFT - bbits)
- * So, for page sized blocks (4K on 32 bit platforms),
- * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
- * but for smaller blocksizes it is less (bbits = log2 bsize).
- */
-
-#if BITS_PER_LONG == 32
- ASSERT(sizeof(sector_t) == 8);
- pagefactor = PAGE_SIZE;
- bitshift = BITS_PER_LONG;
-#endif
-
- return (((uint64_t)pagefactor) << bitshift) - 1;
+ return 0;
}
/*
@@ -655,7 +340,7 @@
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL);
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS);
}
STATIC void
@@ -808,33 +493,33 @@
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
- 0, mp->m_fsname);
+ 0, mp->m_super->s_id);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_reclaim;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- mp->m_fsname);
+ mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_eofb;
@@ -866,6 +551,20 @@
destroy_workqueue(mp->m_buf_workqueue);
}
+static void
+xfs_flush_inodes_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work, struct xfs_mount,
+ m_flush_inodes_work);
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
/*
* Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
* or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
@@ -876,12 +575,15 @@
xfs_flush_inodes(
struct xfs_mount *mp)
{
- struct super_block *sb = mp->m_super;
+ /*
+ * If flush_work() returns true then that means we waited for a flush
+ * which was already in progress. Don't bother running another scan.
+ */
+ if (flush_work(&mp->m_flush_inodes_work))
+ return;
- if (down_read_trylock(&sb->s_umount)) {
- sync_inodes_sb(sb);
- up_read(&sb->s_umount);
- }
+ queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
+ flush_work(&mp->m_flush_inodes_work);
}
/* Catch misguided souls that try to use this interface on XFS */
@@ -952,11 +654,11 @@
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
/*
- * We always use background reclaim here because even if the
- * inode is clean, it still may be under IO and hence we have
- * to take the flush lock. The background reclaim path handles
- * this more efficiently than we can here, so simply let background
- * reclaim tear down all inodes.
+ * We always use background reclaim here because even if the inode is
+ * clean, it still may be under IO and hence we have wait for IO
+ * completion to occur before we can reclaim the inode. The background
+ * reclaim path handles this more efficiently than we can here, so
+ * simply let background reclaim tear down all inodes.
*/
xfs_inode_set_reclaim_tag(ip);
}
@@ -1035,16 +737,16 @@
return 0;
}
- return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
+ return generic_drop_inode(inode);
}
-STATIC void
-xfs_free_fsname(
+static void
+xfs_mount_free(
struct xfs_mount *mp)
{
- kfree(mp->m_fsname);
kfree(mp->m_rtname);
kfree(mp->m_logname);
+ kmem_free(mp);
}
STATIC int
@@ -1092,8 +794,7 @@
statp->f_namelen = MAXNAMELEN - 1;
id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
- statp->f_fsid.val[0] = (u32)id;
- statp->f_fsid.val[1] = (u32)(id >> 32);
+ statp->f_fsid = u64_to_fsid(id);
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
@@ -1105,7 +806,8 @@
statp->f_blocks = sbp->sb_dblocks - lsize;
spin_unlock(&mp->m_sb_lock);
- statp->f_bfree = fdblocks - mp->m_alloc_set_aside;
+ /* make sure statp->f_bfree does not underflow */
+ statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0);
statp->f_bavail = statp->f_bfree;
fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
@@ -1171,8 +873,10 @@
* there is no log replay required to write the inodes to disk - this is the
* primary difference between a sync and a quiesce.
*
- * Note: xfs_log_quiesce() stops background log work - the callers must ensure
- * it is started again when appropriate.
+ * We cancel log work early here to ensure all transactions the log worker may
+ * run have finished before we clean up and log the superblock and write an
+ * unmount record. The unfreeze process is responsible for restarting the log
+ * worker correctly.
*/
void
xfs_quiesce_attr(
@@ -1180,206 +884,20 @@
{
int error = 0;
- /* wait for all modifications to complete */
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
+ cancel_delayed_work_sync(&mp->m_log->l_work);
/* force the log to unpin objects from the now complete transactions */
xfs_log_force(mp, XFS_LOG_SYNC);
- /* reclaim inodes to do any IO before the freeze completes */
- xfs_reclaim_inodes(mp, 0);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
/* Push the superblock and write an unmount record */
error = xfs_log_sbcount(mp);
if (error)
xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
"Frozen image may not be consistent.");
- /*
- * Just warn here till VFS can correctly support
- * read-only remount without racing.
- */
- WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
xfs_log_quiesce(mp);
}
-STATIC int
-xfs_test_remount_options(
- struct super_block *sb,
- char *options)
-{
- int error = 0;
- struct xfs_mount *tmp_mp;
-
- tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
- if (!tmp_mp)
- return -ENOMEM;
-
- tmp_mp->m_super = sb;
- error = xfs_parseargs(tmp_mp, options);
- xfs_free_fsname(tmp_mp);
- kmem_free(tmp_mp);
-
- return error;
-}
-
-STATIC int
-xfs_fs_remount(
- struct super_block *sb,
- int *flags,
- char *options)
-{
- struct xfs_mount *mp = XFS_M(sb);
- xfs_sb_t *sbp = &mp->m_sb;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int error;
-
- /* First, check for complete junk; i.e. invalid options */
- error = xfs_test_remount_options(sb, options);
- if (error)
- return error;
-
- sync_filesystem(sb);
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_inode64:
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
- break;
- case Opt_inode32:
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
- break;
- default:
- /*
- * Logically we would return an error here to prevent
- * users from believing they might have changed
- * mount options using remount which can't be changed.
- *
- * But unfortunately mount(8) adds all options from
- * mtab and fstab to the mount arguments in some cases
- * so we can't blindly reject options, but have to
- * check for each specified option if it actually
- * differs from the currently set option and only
- * reject it if that's the case.
- *
- * Until that is implemented we return success for
- * every remount request, and silently ignore all
- * options that we can't actually change.
- */
-#if 0
- xfs_info(mp,
- "mount option \"%s\" not supported for remount", p);
- return -EINVAL;
-#else
- break;
-#endif
- }
- }
-
- /* ro -> rw */
- if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
- xfs_warn(mp,
- "ro->rw transition prohibited on norecovery mount");
- return -EINVAL;
- }
-
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_ro_compat_feature(sbp,
- XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
- xfs_warn(mp,
-"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
- (sbp->sb_features_ro_compat &
- XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
- return -EINVAL;
- }
-
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
-
- /*
- * If this is the first remount to writeable state we
- * might have some superblock changes to update.
- */
- if (mp->m_update_sb) {
- error = xfs_sync_sb(mp, false);
- if (error) {
- xfs_warn(mp, "failed to write sb changes");
- return error;
- }
- mp->m_update_sb = false;
- }
-
- /*
- * Fill out the reserve pool if it is empty. Use the stashed
- * value if it is non-zero, otherwise go with the default.
- */
- xfs_restore_resvblks(mp);
- xfs_log_work_queue(mp);
-
- /* Recover any CoW blocks that never got remapped. */
- error = xfs_reflink_recover_cow(mp);
- if (error) {
- xfs_err(mp,
- "Error %d recovering leftover CoW allocations.", error);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
- xfs_start_block_reaping(mp);
-
- /* Create the per-AG metadata reservation pool .*/
- error = xfs_fs_reserve_ag_blocks(mp);
- if (error && error != -ENOSPC)
- return error;
- }
-
- /* rw -> ro */
- if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
- /*
- * Cancel background eofb scanning so it cannot race with the
- * final log force+buftarg wait and deadlock the remount.
- */
- xfs_stop_block_reaping(mp);
-
- /* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
- if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
-
- /* Free the per-AG metadata reservation pool. */
- error = xfs_fs_unreserve_ag_blocks(mp);
- if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
-
- /*
- * Before we sync the metadata, we need to free up the reserve
- * block pool so that the used block count in the superblock on
- * disk is correct at the end of the remount. Stash the current
- * reserve pool size so that if we get remounted rw, we can
- * return it to the same size.
- */
- xfs_save_resvblks(mp);
-
- xfs_quiesce_attr(mp);
- mp->m_flags |= XFS_MOUNT_RDONLY;
- }
-
- return 0;
-}
-
/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
@@ -1391,11 +909,21 @@
struct super_block *sb)
{
struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags;
+ int ret;
+ /*
+ * The filesystem is now frozen far enough that memory reclaim
+ * cannot safely operate on the filesystem. Hence we need to
+ * set a GFP_NOFS context here to avoid recursion deadlocks.
+ */
+ flags = memalloc_nofs_save();
xfs_stop_block_reaping(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
- return xfs_sync_sb(mp, true);
+ ret = xfs_sync_sb(mp, true);
+ memalloc_nofs_restore(flags);
+ return ret;
}
STATIC int
@@ -1410,15 +938,6 @@
return 0;
}
-STATIC int
-xfs_fs_show_options(
- struct seq_file *m,
- struct dentry *root)
-{
- xfs_showargs(XFS_M(root->d_sb), m);
- return 0;
-}
-
/*
* This function fills in xfs_mount_t fields based on mount args.
* Note: the superblock _has_ now been read in.
@@ -1541,60 +1060,345 @@
percpu_counter_destroy(&mp->m_delalloc_blks);
}
-static struct xfs_mount *
-xfs_mount_alloc(
+static void
+xfs_fs_put_super(
struct super_block *sb)
{
- struct xfs_mount *mp;
+ struct xfs_mount *mp = XFS_M(sb);
- mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
- if (!mp)
- return NULL;
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
- mp->m_super = sb;
- spin_lock_init(&mp->m_sb_lock);
- spin_lock_init(&mp->m_agirotor_lock);
- INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
- spin_lock_init(&mp->m_perag_lock);
- mutex_init(&mp->m_growlock);
- atomic_set(&mp->m_active_trans, 0);
- INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
- mp->m_kobj.kobject.kset = xfs_kset;
- /*
- * We don't create the finobt per-ag space reservation until after log
- * recovery, so we must set this to true so that an ifree transaction
- * started during log recovery will not depend on space reservations
- * for finobt expansion.
- */
- mp->m_finobt_nores = true;
- return mp;
+ xfs_notice(mp, "Unmounting Filesystem");
+ xfs_filestream_unmount(mp);
+ xfs_unmountfs(mp);
+
+ xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
+ xfs_destroy_percpu_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
+ xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
+ xfs_mount_free(mp);
}
-
-STATIC int
-xfs_fs_fill_super(
+static long
+xfs_fs_nr_cached_objects(
struct super_block *sb,
- void *data,
- int silent)
+ struct shrink_control *sc)
{
- struct inode *root;
- struct xfs_mount *mp = NULL;
- int flags = 0, error = -ENOMEM;
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
+ return xfs_reclaim_inodes_count(XFS_M(sb));
+}
+static long
+xfs_fs_free_cached_objects(
+ struct super_block *sb,
+ struct shrink_control *sc)
+{
+ return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+}
+
+static const struct super_operations xfs_super_operations = {
+ .alloc_inode = xfs_fs_alloc_inode,
+ .destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
+ .drop_inode = xfs_fs_drop_inode,
+ .put_super = xfs_fs_put_super,
+ .sync_fs = xfs_fs_sync_fs,
+ .freeze_fs = xfs_fs_freeze,
+ .unfreeze_fs = xfs_fs_unfreeze,
+ .statfs = xfs_fs_statfs,
+ .show_options = xfs_fs_show_options,
+ .nr_cached_objects = xfs_fs_nr_cached_objects,
+ .free_cached_objects = xfs_fs_free_cached_objects,
+};
+
+static int
+suffix_kstrtoint(
+ const char *s,
+ unsigned int base,
+ int *res)
+{
+ int last, shift_left_factor = 0, _res;
+ char *value;
+ int ret = 0;
+
+ value = kstrdup(s, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ last = strlen(value) - 1;
+ if (value[last] == 'K' || value[last] == 'k') {
+ shift_left_factor = 10;
+ value[last] = '\0';
+ }
+ if (value[last] == 'M' || value[last] == 'm') {
+ shift_left_factor = 20;
+ value[last] = '\0';
+ }
+ if (value[last] == 'G' || value[last] == 'g') {
+ shift_left_factor = 30;
+ value[last] = '\0';
+ }
+
+ if (kstrtoint(value, base, &_res))
+ ret = -EINVAL;
+ kfree(value);
+ *res = _res << shift_left_factor;
+ return ret;
+}
+
+/*
+ * Set mount state from a mount option.
+ *
+ * NOTE: mp->m_super is NULL here!
+ */
+static int
+xfs_fc_parse_param(
+ struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct xfs_mount *mp = fc->s_fs_info;
+ struct fs_parse_result result;
+ int size = 0;
+ int opt;
+
+ opt = fs_parse(fc, xfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_logbufs:
+ mp->m_logbufs = result.uint_32;
+ return 0;
+ case Opt_logbsize:
+ if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize))
+ return -EINVAL;
+ return 0;
+ case Opt_logdev:
+ kfree(mp->m_logname);
+ mp->m_logname = kstrdup(param->string, GFP_KERNEL);
+ if (!mp->m_logname)
+ return -ENOMEM;
+ return 0;
+ case Opt_rtdev:
+ kfree(mp->m_rtname);
+ mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
+ if (!mp->m_rtname)
+ return -ENOMEM;
+ return 0;
+ case Opt_allocsize:
+ if (suffix_kstrtoint(param->string, 10, &size))
+ return -EINVAL;
+ mp->m_allocsize_log = ffs(size) - 1;
+ mp->m_flags |= XFS_MOUNT_ALLOCSIZE;
+ return 0;
+ case Opt_grpid:
+ case Opt_bsdgroups:
+ mp->m_flags |= XFS_MOUNT_GRPID;
+ return 0;
+ case Opt_nogrpid:
+ case Opt_sysvgroups:
+ mp->m_flags &= ~XFS_MOUNT_GRPID;
+ return 0;
+ case Opt_wsync:
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+ return 0;
+ case Opt_norecovery:
+ mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ return 0;
+ case Opt_noalign:
+ mp->m_flags |= XFS_MOUNT_NOALIGN;
+ return 0;
+ case Opt_swalloc:
+ mp->m_flags |= XFS_MOUNT_SWALLOC;
+ return 0;
+ case Opt_sunit:
+ mp->m_dalign = result.uint_32;
+ return 0;
+ case Opt_swidth:
+ mp->m_swidth = result.uint_32;
+ return 0;
+ case Opt_inode32:
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ return 0;
+ case Opt_inode64:
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ return 0;
+ case Opt_nouuid:
+ mp->m_flags |= XFS_MOUNT_NOUUID;
+ return 0;
+ case Opt_largeio:
+ mp->m_flags |= XFS_MOUNT_LARGEIO;
+ return 0;
+ case Opt_nolargeio:
+ mp->m_flags &= ~XFS_MOUNT_LARGEIO;
+ return 0;
+ case Opt_filestreams:
+ mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+ return 0;
+ case Opt_noquota:
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
+ return 0;
+ case Opt_quota:
+ case Opt_uquota:
+ case Opt_usrquota:
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ XFS_UQUOTA_ENFD);
+ return 0;
+ case Opt_qnoenforce:
+ case Opt_uqnoenforce:
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ return 0;
+ case Opt_pquota:
+ case Opt_prjquota:
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ XFS_PQUOTA_ENFD);
+ return 0;
+ case Opt_pqnoenforce:
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ return 0;
+ case Opt_gquota:
+ case Opt_grpquota:
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ XFS_GQUOTA_ENFD);
+ return 0;
+ case Opt_gqnoenforce:
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+ return 0;
+ case Opt_discard:
+ mp->m_flags |= XFS_MOUNT_DISCARD;
+ return 0;
+ case Opt_nodiscard:
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ return 0;
+#ifdef CONFIG_FS_DAX
+ case Opt_dax:
+ xfs_mount_set_dax_mode(mp, XFS_DAX_ALWAYS);
+ return 0;
+ case Opt_dax_enum:
+ xfs_mount_set_dax_mode(mp, result.uint_32);
+ return 0;
+#endif
+ /* Following mount options will be removed in September 2025 */
+ case Opt_ikeep:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags |= XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_noikeep:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_attr2:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ return 0;
+ case Opt_noattr2:
+ xfs_warn(mp, "%s mount option is deprecated.", param->key);
+ mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
+ return 0;
+ default:
+ xfs_warn(mp, "unknown mount option [%s].", param->key);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+xfs_fc_validate_params(
+ struct xfs_mount *mp)
+{
/*
- * allocate mp and do all low-level struct initializations before we
- * attach it to the super
+ * no recovery flag requires a read-only mount
*/
- mp = xfs_mount_alloc(sb);
- if (!mp)
- goto out;
- sb->s_fs_info = mp;
+ if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+ !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ xfs_warn(mp, "no-recovery mounts must be read-only.");
+ return -EINVAL;
+ }
- error = xfs_parseargs(mp, (char *)data);
+ if ((mp->m_flags & XFS_MOUNT_NOALIGN) &&
+ (mp->m_dalign || mp->m_swidth)) {
+ xfs_warn(mp,
+ "sunit and swidth options incompatible with the noalign option");
+ return -EINVAL;
+ }
+
+ if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) {
+ xfs_warn(mp, "quota support not available in this kernel.");
+ return -EINVAL;
+ }
+
+ if ((mp->m_dalign && !mp->m_swidth) ||
+ (!mp->m_dalign && mp->m_swidth)) {
+ xfs_warn(mp, "sunit and swidth must be specified together");
+ return -EINVAL;
+ }
+
+ if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) {
+ xfs_warn(mp,
+ "stripe width (%d) must be a multiple of the stripe unit (%d)",
+ mp->m_swidth, mp->m_dalign);
+ return -EINVAL;
+ }
+
+ if (mp->m_logbufs != -1 &&
+ mp->m_logbufs != 0 &&
+ (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+ mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+ xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
+ mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+ return -EINVAL;
+ }
+
+ if (mp->m_logbsize != -1 &&
+ mp->m_logbsize != 0 &&
+ (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+ mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+ !is_power_of_2(mp->m_logbsize))) {
+ xfs_warn(mp,
+ "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+ mp->m_logbsize);
+ return -EINVAL;
+ }
+
+ if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
+ (mp->m_allocsize_log > XFS_MAX_IO_LOG ||
+ mp->m_allocsize_log < XFS_MIN_IO_LOG)) {
+ xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
+ mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+xfs_fc_fill_super(
+ struct super_block *sb,
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp = sb->s_fs_info;
+ struct inode *root;
+ int flags = 0, error;
+
+ mp->m_super = sb;
+
+ error = xfs_fc_validate_params(mp);
if (error)
- goto out_free_fsname;
+ goto out_free_names;
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
@@ -1616,12 +1420,12 @@
msleep(xfs_globals.mount_delay * 1000);
}
- if (silent)
+ if (fc->sb_flags & SB_SILENT)
flags |= XFS_MFSI_QUIET;
error = xfs_open_devices(mp);
if (error)
- goto out_free_fsname;
+ goto out_free_names;
error = xfs_init_mount_workqueues(mp);
if (error)
@@ -1650,6 +1454,39 @@
if (error)
goto out_free_sb;
+ /* V4 support is undergoing deprecation. */
+ if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+#ifdef CONFIG_XFS_SUPPORT_V4
+ xfs_warn_once(mp,
+ "Deprecated V4 format (crc=0) will not be supported after September 2030.");
+#else
+ xfs_warn(mp,
+ "Deprecated V4 format (crc=0) not supported by kernel.");
+ error = -EINVAL;
+ goto out_free_sb;
+#endif
+ }
+
+ /*
+ * XFS block mappings use 54 bits to store the logical block offset.
+ * This should suffice to handle the maximum file size that the VFS
+ * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT
+ * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes
+ * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON
+ * to check this assertion.
+ *
+ * Avoid integer overflow by comparing the maximum bmbt offset to the
+ * maximum pagecache offset in units of fs blocks.
+ */
+ if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) {
+ xfs_warn(mp,
+"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
+ XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
+ XFS_MAX_FILEOFF);
+ error = -EINVAL;
+ goto out_free_sb;
+ }
+
error = xfs_filestream_mount(mp);
if (error)
goto out_free_sb;
@@ -1661,11 +1498,17 @@
sb->s_magic = XFS_SUPER_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
- sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
- sb->s_time_min = S32_MIN;
- sb->s_time_max = S32_MAX;
+ if (xfs_sb_version_hasbigtime(&mp->m_sb)) {
+ sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
+ sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
+ } else {
+ sb->s_time_min = XFS_LEGACY_TIME_MIN;
+ sb->s_time_max = XFS_LEGACY_TIME_MAX;
+ }
+ trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
sb->s_iflags |= SB_I_CGROUPWB;
set_posix_acl_flag(sb);
@@ -1674,7 +1517,11 @@
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= SB_I_VERSION;
- if (mp->m_flags & XFS_MOUNT_DAX) {
+ if (xfs_sb_version_hasbigtime(&mp->m_sb))
+ xfs_warn(mp,
+ "EXPERIMENTAL big timestamp feature in use. Use at your own risk!");
+
+ if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) {
bool rtdev_is_dax = false, datadev_is_dax;
xfs_warn(mp,
@@ -1688,7 +1535,7 @@
if (!rtdev_is_dax && !datadev_is_dax) {
xfs_alert(mp,
"DAX unsupported by block device. Turning off DAX.");
- mp->m_flags &= ~XFS_MOUNT_DAX;
+ xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER);
}
if (xfs_sb_version_hasreflink(&mp->m_sb)) {
xfs_alert(mp,
@@ -1729,6 +1576,10 @@
goto out_filestream_unmount;
}
+ if (xfs_sb_version_hasinobtcounts(&mp->m_sb))
+ xfs_warn(mp,
+ "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1758,11 +1609,9 @@
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
- out_free_fsname:
+ out_free_names:
sb->s_fs_info = NULL;
- xfs_free_fsname(mp);
- kfree(mp);
- out:
+ xfs_mount_free(mp);
return error;
out_unmount:
@@ -1771,80 +1620,256 @@
goto out_free_sb;
}
-STATIC void
-xfs_fs_put_super(
- struct super_block *sb)
+static int
+xfs_fc_get_tree(
+ struct fs_context *fc)
{
- struct xfs_mount *mp = XFS_M(sb);
-
- /* if ->fill_super failed, we have no mount to tear down */
- if (!sb->s_fs_info)
- return;
-
- xfs_notice(mp, "Unmounting Filesystem");
- xfs_filestream_unmount(mp);
- xfs_unmountfs(mp);
-
- xfs_freesb(mp);
- free_percpu(mp->m_stats.xs_stats);
- xfs_destroy_percpu_counters(mp);
- xfs_destroy_mount_workqueues(mp);
- xfs_close_devices(mp);
-
- sb->s_fs_info = NULL;
- xfs_free_fsname(mp);
- kfree(mp);
+ return get_tree_bdev(fc, xfs_fc_fill_super);
}
-STATIC struct dentry *
-xfs_fs_mount(
- struct file_system_type *fs_type,
- int flags,
- const char *dev_name,
- void *data)
+static int
+xfs_remount_rw(
+ struct xfs_mount *mp)
{
- return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
+
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on norecovery mount");
+ return -EINVAL;
+ }
+
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EINVAL;
+ }
+
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+ /*
+ * If this is the first remount to writeable state we might have some
+ * superblock changes to update.
+ */
+ if (mp->m_update_sb) {
+ error = xfs_sync_sb(mp, false);
+ if (error) {
+ xfs_warn(mp, "failed to write sb changes");
+ return error;
+ }
+ mp->m_update_sb = false;
+ }
+
+ /*
+ * Fill out the reserve pool if it is empty. Use the stashed value if
+ * it is non-zero, otherwise go with the default.
+ */
+ xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
+
+ /* Recover any CoW blocks that never got remapped. */
+ error = xfs_reflink_recover_cow(mp);
+ if (error) {
+ xfs_err(mp,
+ "Error %d recovering leftover CoW allocations.", error);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+ xfs_start_block_reaping(mp);
+
+ /* Create the per-AG metadata reservation pool. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ return error;
+
+ return 0;
}
-static long
-xfs_fs_nr_cached_objects(
- struct super_block *sb,
- struct shrink_control *sc)
+static int
+xfs_remount_ro(
+ struct xfs_mount *mp)
{
- /* Paranoia: catch incorrect calls during mount setup or teardown */
- if (WARN_ON_ONCE(!sb->s_fs_info))
- return 0;
- return xfs_reclaim_inodes_count(XFS_M(sb));
+ int error;
+
+ /*
+ * Cancel background eofb scanning so it cannot race with the final
+ * log force+buftarg wait and deadlock the remount.
+ */
+ xfs_stop_block_reaping(mp);
+
+ /* Get rid of any leftover CoW reservations... */
+ error = xfs_icache_free_cowblocks(mp, NULL);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /* Free the per-AG metadata reservation pool. */
+ error = xfs_fs_unreserve_ag_blocks(mp);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /*
+ * Before we sync the metadata, we need to free up the reserve block
+ * pool so that the used block count in the superblock on disk is
+ * correct at the end of the remount. Stash the current reserve pool
+ * size so that if we get remounted rw, we can return it to the same
+ * size.
+ */
+ xfs_save_resvblks(mp);
+
+ xfs_quiesce_attr(mp);
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+
+ return 0;
}
-static long
-xfs_fs_free_cached_objects(
- struct super_block *sb,
- struct shrink_control *sc)
+/*
+ * Logically we would return an error here to prevent users from believing
+ * they might have changed mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from mtab and fstab to the mount
+ * arguments in some cases so we can't blindly reject options, but have to
+ * check for each specified option if it actually differs from the currently
+ * set option and only reject it if that's the case.
+ *
+ * Until that is implemented we return success for every remount request, and
+ * silently ignore all options that we can't actually change.
+ */
+static int
+xfs_fc_reconfigure(
+ struct fs_context *fc)
{
- return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+ struct xfs_mount *mp = XFS_M(fc->root->d_sb);
+ struct xfs_mount *new_mp = fc->s_fs_info;
+ xfs_sb_t *sbp = &mp->m_sb;
+ int flags = fc->sb_flags;
+ int error;
+
+ /* version 5 superblocks always support version counters. */
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ fc->sb_flags |= SB_I_VERSION;
+
+ error = xfs_fc_validate_params(new_mp);
+ if (error)
+ return error;
+
+ sync_filesystem(mp->m_super);
+
+ /* inode32 -> inode64 */
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
+ !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ }
+
+ /* inode64 -> inode32 */
+ if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
+ (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ }
+
+ /* ro -> rw */
+ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) {
+ error = xfs_remount_rw(mp);
+ if (error)
+ return error;
+ }
+
+ /* rw -> ro */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) {
+ error = xfs_remount_ro(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
}
-static const struct super_operations xfs_super_operations = {
- .alloc_inode = xfs_fs_alloc_inode,
- .destroy_inode = xfs_fs_destroy_inode,
- .dirty_inode = xfs_fs_dirty_inode,
- .drop_inode = xfs_fs_drop_inode,
- .put_super = xfs_fs_put_super,
- .sync_fs = xfs_fs_sync_fs,
- .freeze_fs = xfs_fs_freeze,
- .unfreeze_fs = xfs_fs_unfreeze,
- .statfs = xfs_fs_statfs,
- .remount_fs = xfs_fs_remount,
- .show_options = xfs_fs_show_options,
- .nr_cached_objects = xfs_fs_nr_cached_objects,
- .free_cached_objects = xfs_fs_free_cached_objects,
+static void xfs_fc_free(
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp = fc->s_fs_info;
+
+ /*
+ * mp is stored in the fs_context when it is initialized.
+ * mp is transferred to the superblock on a successful mount,
+ * but if an error occurs before the transfer we have to free
+ * it here.
+ */
+ if (mp)
+ xfs_mount_free(mp);
+}
+
+static const struct fs_context_operations xfs_context_ops = {
+ .parse_param = xfs_fc_parse_param,
+ .get_tree = xfs_fc_get_tree,
+ .reconfigure = xfs_fc_reconfigure,
+ .free = xfs_fc_free,
};
+static int xfs_init_fs_context(
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp;
+
+ mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+ if (!mp)
+ return -ENOMEM;
+
+ spin_lock_init(&mp->m_sb_lock);
+ spin_lock_init(&mp->m_agirotor_lock);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+ spin_lock_init(&mp->m_perag_lock);
+ mutex_init(&mp->m_growlock);
+ INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+ INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+ INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
+ mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
+
+ /*
+ * These can be overridden by the mount option parsing.
+ */
+ mp->m_logbufs = -1;
+ mp->m_logbsize = -1;
+ mp->m_allocsize_log = 16; /* 64k */
+
+ /*
+ * Copy binary VFS mount flags we are interested in.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+
+ fc->s_fs_info = mp;
+ fc->ops = &xfs_context_ops;
+
+ return 0;
+}
+
static struct file_system_type xfs_fs_type = {
.owner = THIS_MODULE,
.name = "xfs",
- .mount = xfs_fs_mount,
+ .init_fs_context = xfs_init_fs_context,
+ .parameters = xfs_fs_parameters,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
@@ -1853,37 +1878,39 @@
STATIC int __init
xfs_init_zones(void)
{
- if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct xfs_ioend, io_inline_bio),
- BIOSET_NEED_BVECS))
+ xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+ sizeof(struct xlog_ticket),
+ 0, 0, NULL);
+ if (!xfs_log_ticket_zone)
goto out;
- xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
- "xfs_log_ticket");
- if (!xfs_log_ticket_zone)
- goto out_free_ioend_bioset;
-
- xfs_bmap_free_item_zone = kmem_zone_init(
- sizeof(struct xfs_extent_free_item),
- "xfs_bmap_free_item");
+ xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
if (!xfs_bmap_free_item_zone)
goto out_destroy_log_ticket_zone;
- xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
- "xfs_btree_cur");
+ xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
+ sizeof(struct xfs_btree_cur),
+ 0, 0, NULL);
if (!xfs_btree_cur_zone)
goto out_destroy_bmap_free_item_zone;
- xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
- "xfs_da_state");
+ xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+ sizeof(struct xfs_da_state),
+ 0, 0, NULL);
if (!xfs_da_state_zone)
goto out_destroy_btree_cur_zone;
- xfs_ifork_zone = kmem_zone_init(sizeof(struct xfs_ifork), "xfs_ifork");
+ xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+ sizeof(struct xfs_ifork),
+ 0, 0, NULL);
if (!xfs_ifork_zone)
goto out_destroy_da_state_zone;
- xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+ xfs_trans_zone = kmem_cache_create("xfs_trans",
+ sizeof(struct xfs_trans),
+ 0, 0, NULL);
if (!xfs_trans_zone)
goto out_destroy_ifork_zone;
@@ -1893,111 +1920,122 @@
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
- "xfs_buf_item");
+ xfs_buf_item_zone = kmem_cache_create("xfs_buf_item",
+ sizeof(struct xfs_buf_log_item),
+ 0, 0, NULL);
if (!xfs_buf_item_zone)
goto out_destroy_trans_zone;
- xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
- ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
- sizeof(xfs_extent_t))), "xfs_efd_item");
+ xfs_efd_zone = kmem_cache_create("xfs_efd_item",
+ (sizeof(struct xfs_efd_log_item) +
+ (XFS_EFD_MAX_FAST_EXTENTS - 1) *
+ sizeof(struct xfs_extent)),
+ 0, 0, NULL);
if (!xfs_efd_zone)
goto out_destroy_buf_item_zone;
- xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
- ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
- sizeof(xfs_extent_t))), "xfs_efi_item");
+ xfs_efi_zone = kmem_cache_create("xfs_efi_item",
+ (sizeof(struct xfs_efi_log_item) +
+ (XFS_EFI_MAX_FAST_EXTENTS - 1) *
+ sizeof(struct xfs_extent)),
+ 0, 0, NULL);
if (!xfs_efi_zone)
goto out_destroy_efd_zone;
- xfs_inode_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
- KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
+ xfs_inode_zone = kmem_cache_create("xfs_inode",
+ sizeof(struct xfs_inode), 0,
+ (SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
- xfs_ili_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
- KM_ZONE_SPREAD, NULL);
+ xfs_ili_zone = kmem_cache_create("xfs_ili",
+ sizeof(struct xfs_inode_log_item), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
- xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
- "xfs_icr");
+
+ xfs_icreate_zone = kmem_cache_create("xfs_icr",
+ sizeof(struct xfs_icreate_item),
+ 0, 0, NULL);
if (!xfs_icreate_zone)
goto out_destroy_ili_zone;
- xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item),
- "xfs_rud_item");
+ xfs_rud_zone = kmem_cache_create("xfs_rud_item",
+ sizeof(struct xfs_rud_log_item),
+ 0, 0, NULL);
if (!xfs_rud_zone)
goto out_destroy_icreate_zone;
- xfs_rui_zone = kmem_zone_init(
+ xfs_rui_zone = kmem_cache_create("xfs_rui_item",
xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
- "xfs_rui_item");
+ 0, 0, NULL);
if (!xfs_rui_zone)
goto out_destroy_rud_zone;
- xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item),
- "xfs_cud_item");
+ xfs_cud_zone = kmem_cache_create("xfs_cud_item",
+ sizeof(struct xfs_cud_log_item),
+ 0, 0, NULL);
if (!xfs_cud_zone)
goto out_destroy_rui_zone;
- xfs_cui_zone = kmem_zone_init(
+ xfs_cui_zone = kmem_cache_create("xfs_cui_item",
xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
- "xfs_cui_item");
+ 0, 0, NULL);
if (!xfs_cui_zone)
goto out_destroy_cud_zone;
- xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item),
- "xfs_bud_item");
+ xfs_bud_zone = kmem_cache_create("xfs_bud_item",
+ sizeof(struct xfs_bud_log_item),
+ 0, 0, NULL);
if (!xfs_bud_zone)
goto out_destroy_cui_zone;
- xfs_bui_zone = kmem_zone_init(
+ xfs_bui_zone = kmem_cache_create("xfs_bui_item",
xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
- "xfs_bui_item");
+ 0, 0, NULL);
if (!xfs_bui_zone)
goto out_destroy_bud_zone;
return 0;
out_destroy_bud_zone:
- kmem_zone_destroy(xfs_bud_zone);
+ kmem_cache_destroy(xfs_bud_zone);
out_destroy_cui_zone:
- kmem_zone_destroy(xfs_cui_zone);
+ kmem_cache_destroy(xfs_cui_zone);
out_destroy_cud_zone:
- kmem_zone_destroy(xfs_cud_zone);
+ kmem_cache_destroy(xfs_cud_zone);
out_destroy_rui_zone:
- kmem_zone_destroy(xfs_rui_zone);
+ kmem_cache_destroy(xfs_rui_zone);
out_destroy_rud_zone:
- kmem_zone_destroy(xfs_rud_zone);
+ kmem_cache_destroy(xfs_rud_zone);
out_destroy_icreate_zone:
- kmem_zone_destroy(xfs_icreate_zone);
+ kmem_cache_destroy(xfs_icreate_zone);
out_destroy_ili_zone:
- kmem_zone_destroy(xfs_ili_zone);
+ kmem_cache_destroy(xfs_ili_zone);
out_destroy_inode_zone:
- kmem_zone_destroy(xfs_inode_zone);
+ kmem_cache_destroy(xfs_inode_zone);
out_destroy_efi_zone:
- kmem_zone_destroy(xfs_efi_zone);
+ kmem_cache_destroy(xfs_efi_zone);
out_destroy_efd_zone:
- kmem_zone_destroy(xfs_efd_zone);
+ kmem_cache_destroy(xfs_efd_zone);
out_destroy_buf_item_zone:
- kmem_zone_destroy(xfs_buf_item_zone);
+ kmem_cache_destroy(xfs_buf_item_zone);
out_destroy_trans_zone:
- kmem_zone_destroy(xfs_trans_zone);
+ kmem_cache_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
- kmem_zone_destroy(xfs_ifork_zone);
+ kmem_cache_destroy(xfs_ifork_zone);
out_destroy_da_state_zone:
- kmem_zone_destroy(xfs_da_state_zone);
+ kmem_cache_destroy(xfs_da_state_zone);
out_destroy_btree_cur_zone:
- kmem_zone_destroy(xfs_btree_cur_zone);
+ kmem_cache_destroy(xfs_btree_cur_zone);
out_destroy_bmap_free_item_zone:
- kmem_zone_destroy(xfs_bmap_free_item_zone);
+ kmem_cache_destroy(xfs_bmap_free_item_zone);
out_destroy_log_ticket_zone:
- kmem_zone_destroy(xfs_log_ticket_zone);
- out_free_ioend_bioset:
- bioset_exit(&xfs_ioend_bioset);
+ kmem_cache_destroy(xfs_log_ticket_zone);
out:
return -ENOMEM;
}
@@ -2010,25 +2048,24 @@
* destroy caches.
*/
rcu_barrier();
- kmem_zone_destroy(xfs_bui_zone);
- kmem_zone_destroy(xfs_bud_zone);
- kmem_zone_destroy(xfs_cui_zone);
- kmem_zone_destroy(xfs_cud_zone);
- kmem_zone_destroy(xfs_rui_zone);
- kmem_zone_destroy(xfs_rud_zone);
- kmem_zone_destroy(xfs_icreate_zone);
- kmem_zone_destroy(xfs_ili_zone);
- kmem_zone_destroy(xfs_inode_zone);
- kmem_zone_destroy(xfs_efi_zone);
- kmem_zone_destroy(xfs_efd_zone);
- kmem_zone_destroy(xfs_buf_item_zone);
- kmem_zone_destroy(xfs_trans_zone);
- kmem_zone_destroy(xfs_ifork_zone);
- kmem_zone_destroy(xfs_da_state_zone);
- kmem_zone_destroy(xfs_btree_cur_zone);
- kmem_zone_destroy(xfs_bmap_free_item_zone);
- kmem_zone_destroy(xfs_log_ticket_zone);
- bioset_exit(&xfs_ioend_bioset);
+ kmem_cache_destroy(xfs_bui_zone);
+ kmem_cache_destroy(xfs_bud_zone);
+ kmem_cache_destroy(xfs_cui_zone);
+ kmem_cache_destroy(xfs_cud_zone);
+ kmem_cache_destroy(xfs_rui_zone);
+ kmem_cache_destroy(xfs_rud_zone);
+ kmem_cache_destroy(xfs_icreate_zone);
+ kmem_cache_destroy(xfs_ili_zone);
+ kmem_cache_destroy(xfs_inode_zone);
+ kmem_cache_destroy(xfs_efi_zone);
+ kmem_cache_destroy(xfs_efd_zone);
+ kmem_cache_destroy(xfs_buf_item_zone);
+ kmem_cache_destroy(xfs_trans_zone);
+ kmem_cache_destroy(xfs_ifork_zone);
+ kmem_cache_destroy(xfs_da_state_zone);
+ kmem_cache_destroy(xfs_btree_cur_zone);
+ kmem_cache_destroy(xfs_bmap_free_item_zone);
+ kmem_cache_destroy(xfs_log_ticket_zone);
}
STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 763e43d..b552cf6 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -11,9 +11,11 @@
#ifdef CONFIG_XFS_QUOTA
extern int xfs_qm_init(void);
extern void xfs_qm_exit(void);
+# define XFS_QUOTA_STRING "quota, "
#else
# define xfs_qm_init() (0)
# define xfs_qm_exit() do { } while (0)
+# define XFS_QUOTA_STRING
#endif
#ifdef CONFIG_XFS_POSIX_ACL
@@ -50,6 +52,12 @@
# define XFS_WARN_STRING
#endif
+#ifdef CONFIG_XFS_ASSERT_FATAL
+# define XFS_ASSERT_FATAL_STRING "fatal assert, "
+#else
+# define XFS_ASSERT_FATAL_STRING
+#endif
+
#ifdef DEBUG
# define XFS_DBG_STRING "debug"
#else
@@ -63,6 +71,8 @@
XFS_SCRUB_STRING \
XFS_REPAIR_STRING \
XFS_WARN_STRING \
+ XFS_QUOTA_STRING \
+ XFS_ASSERT_FATAL_STRING \
XFS_DBG_STRING /* DBG must be last */
struct xfs_inode;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index ed66fd2..8e88a7c 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -17,6 +17,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_quota.h"
+#include "xfs_symlink.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
@@ -52,20 +53,10 @@
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
- &xfs_symlink_buf_ops);
- if (!bp)
- return -ENOMEM;
- error = bp->b_error;
- if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
-
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
- goto out;
- }
+ error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+ &bp, &xfs_symlink_buf_ops);
+ if (error)
+ return error;
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
if (pathlen < byte_cnt)
byte_cnt = pathlen;
@@ -185,15 +176,12 @@
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
- udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp,
- xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -203,7 +191,7 @@
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
*/
- if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+ if (pathlen <= XFS_LITINO(mp))
fs_blocks = 0;
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
@@ -255,8 +243,7 @@
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- if (resblks)
- resblks -= XFS_IALLOC_SPACE_RES(mp);
+ resblks -= XFS_IALLOC_SPACE_RES(mp);
/*
* If the symlink will fit into the inode, write it inline.
*/
@@ -264,7 +251,7 @@
xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
ip->i_d.di_size = pathlen;
- ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
} else {
int offset;
@@ -277,8 +264,7 @@
if (error)
goto out_trans_cancel;
- if (resblks)
- resblks -= fs_blocks;
+ resblks -= fs_blocks;
ip->i_d.di_size = pathlen;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -289,12 +275,10 @@
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- BTOBB(byte_cnt), 0);
- if (!bp) {
- error = -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ BTOBB(byte_cnt), 0, &bp);
+ if (error)
goto out_trans_cancel;
- }
bp->b_ops = &xfs_symlink_buf_ops;
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -400,7 +384,7 @@
* either 1 or 2 extents and that we can
* free them all in one bunmapi call.
*/
- ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+ ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
if (error)
@@ -432,13 +416,12 @@
* Invalidate the block(s). No validation is done.
*/
for (i = 0; i < nmaps; i++) {
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
- XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
- if (!bp) {
- error = -ENOMEM;
+ error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+ XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0,
+ &bp);
+ if (error)
goto error_trans_cancel;
- }
xfs_trans_binval(tp, bp);
}
/*
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index 9743d8c..b1fa091 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -5,7 +5,7 @@
#ifndef __XFS_SYMLINK_H
#define __XFS_SYMLINK_H 1
-/* Kernel only symlink defintions */
+/* Kernel only symlink definitions */
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 31b3bdb..fac9de7 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -13,7 +13,7 @@
xfs_stats_clear_proc_handler(
struct ctl_table *ctl,
int write,
- void __user *buffer,
+ void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -33,7 +33,7 @@
xfs_panic_mask_proc_handler(
struct ctl_table *ctl,
int write,
- void __user *buffer,
+ void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -50,13 +50,45 @@
}
#endif /* CONFIG_PROC_FS */
+STATIC int /* sysctl handler: warn on write, then defer to proc_dointvec_minmax() */
+xfs_deprecate_irix_sgid_inherit_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) { /* reads of the knob never trigger the warning */
+ printk_once(KERN_WARNING /* printk_once: logged only the first time this call site runs */
+ "XFS: " "%s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
+STATIC int /* sysctl handler: warn on write, then defer to proc_dointvec_minmax() */
+xfs_deprecate_irix_symlink_mode_proc_handler(
+ struct ctl_table *ctl,
+ int write,
+ void *buffer,
+ size_t *lenp,
+ loff_t *ppos)
+{
+ if (write) { /* reads of the knob never trigger the warning */
+ printk_once(KERN_WARNING /* separate call site, so its once-state is independent of other handlers */
+ "XFS: " "%s sysctl option is deprecated.\n",
+ ctl->procname);
+ }
+ return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+}
+
static struct ctl_table xfs_table[] = {
{
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler,
.extra1 = &xfs_params.sgid_inherit.min,
.extra2 = &xfs_params.sgid_inherit.max
},
@@ -65,7 +97,7 @@
.data = &xfs_params.symlink_mode.val,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler,
.extra1 = &xfs_params.symlink_mode.min,
.extra2 = &xfs_params.symlink_mode.max
},
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index bc85b89..120398a 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -27,6 +28,7 @@
#include "xfs_log_recover.h"
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
+#include "xfs_btree_staging.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index eaae275..8695165 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -35,6 +35,13 @@
struct xfs_owner_info;
struct xfs_trans_res;
struct xfs_inobt_rec_incore;
+union xfs_btree_ptr;
+struct xfs_dqtrx;
+
+#define XFS_ATTR_FILTER_FLAGS /* { flag, name } pairs passed to __print_flags() for attr_filter */ \
+ { XFS_ATTR_ROOT, "ROOT" }, \
+ { XFS_ATTR_SECURE, "SECURE" }, \
+ { XFS_ATTR_INCOMPLETE, "INCOMPLETE" }
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -45,39 +52,39 @@
__field(u32, hashval)
__field(u32, blkno)
__field(u32, offset)
- __field(void *, alist)
+ __field(void *, buffer)
__field(int, bufsize)
__field(int, count)
__field(int, firstu)
__field(int, dupcnt)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
),
TP_fast_assign(
__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
__entry->ino = ctx->dp->i_ino;
- __entry->hashval = ctx->cursor->hashval;
- __entry->blkno = ctx->cursor->blkno;
- __entry->offset = ctx->cursor->offset;
- __entry->alist = ctx->alist;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
__entry->bufsize = ctx->bufsize;
__entry->count = ctx->count;
__entry->firstu = ctx->firstu;
- __entry->flags = ctx->flags;
+ __entry->attr_filter = ctx->attr_filter;
),
TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
- "alist %p size %u count %u firstu %u flags %d %s",
+ "buffer %p size %u count %u firstu %u filter %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->hashval,
__entry->blkno,
__entry->offset,
__entry->dupcnt,
- __entry->alist,
+ __entry->buffer,
__entry->bufsize,
__entry->count,
__entry->firstu,
- __entry->flags,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS)
)
)
@@ -169,31 +176,31 @@
__field(u32, hashval)
__field(u32, blkno)
__field(u32, offset)
- __field(void *, alist)
+ __field(void *, buffer)
__field(int, bufsize)
__field(int, count)
__field(int, firstu)
__field(int, dupcnt)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
__field(u32, bt_hashval)
__field(u32, bt_before)
),
TP_fast_assign(
__entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
__entry->ino = ctx->dp->i_ino;
- __entry->hashval = ctx->cursor->hashval;
- __entry->blkno = ctx->cursor->blkno;
- __entry->offset = ctx->cursor->offset;
- __entry->alist = ctx->alist;
+ __entry->hashval = ctx->cursor.hashval;
+ __entry->blkno = ctx->cursor.blkno;
+ __entry->offset = ctx->cursor.offset;
+ __entry->buffer = ctx->buffer;
__entry->bufsize = ctx->bufsize;
__entry->count = ctx->count;
__entry->firstu = ctx->firstu;
- __entry->flags = ctx->flags;
+ __entry->attr_filter = ctx->attr_filter;
__entry->bt_hashval = be32_to_cpu(btree->hashval);
__entry->bt_before = be32_to_cpu(btree->before);
),
TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
- "alist %p size %u count %u firstu %u flags %d %s "
+ "buffer %p size %u count %u firstu %u filter %s "
"node hashval %u, node before %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -201,12 +208,12 @@
__entry->blkno,
__entry->offset,
__entry->dupcnt,
- __entry->alist,
+ __entry->buffer,
__entry->bufsize,
__entry->count,
__entry->firstu,
- __entry->flags,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
__entry->bt_hashval,
__entry->bt_before)
);
@@ -218,8 +225,8 @@
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(void *, leaf);
- __field(int, pos);
+ __field(void *, leaf)
+ __field(int, pos)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -331,7 +338,7 @@
DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
@@ -725,7 +732,7 @@
__entry->writeio_blocks = writeio_blocks;
),
TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
- "m_writeio_blocks %u",
+ "m_allocsize_blocks %u",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
__entry->blocks, __entry->shift, __entry->writeio_blocks)
)
@@ -858,44 +865,65 @@
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u32, id)
+ __field(xfs_dqtype_t, type)
__field(unsigned, flags)
__field(unsigned, nrefs)
__field(unsigned long long, res_bcount)
+ __field(unsigned long long, res_rtbcount)
+ __field(unsigned long long, res_icount)
+
__field(unsigned long long, bcount)
+ __field(unsigned long long, rtbcount)
__field(unsigned long long, icount)
+
__field(unsigned long long, blk_hardlimit)
__field(unsigned long long, blk_softlimit)
+ __field(unsigned long long, rtb_hardlimit)
+ __field(unsigned long long, rtb_softlimit)
__field(unsigned long long, ino_hardlimit)
__field(unsigned long long, ino_softlimit)
- ), \
+ ),
TP_fast_assign(
__entry->dev = dqp->q_mount->m_super->s_dev;
- __entry->id = be32_to_cpu(dqp->q_core.d_id);
- __entry->flags = dqp->dq_flags;
+ __entry->id = dqp->q_id;
+ __entry->type = dqp->q_type;
+ __entry->flags = dqp->q_flags;
__entry->nrefs = dqp->q_nrefs;
- __entry->res_bcount = dqp->q_res_bcount;
- __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
- __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
- __entry->blk_hardlimit =
- be64_to_cpu(dqp->q_core.d_blk_hardlimit);
- __entry->blk_softlimit =
- be64_to_cpu(dqp->q_core.d_blk_softlimit);
- __entry->ino_hardlimit =
- be64_to_cpu(dqp->q_core.d_ino_hardlimit);
- __entry->ino_softlimit =
- be64_to_cpu(dqp->q_core.d_ino_softlimit);
+
+ __entry->res_bcount = dqp->q_blk.reserved;
+ __entry->res_rtbcount = dqp->q_rtb.reserved;
+ __entry->res_icount = dqp->q_ino.reserved;
+
+ __entry->bcount = dqp->q_blk.count;
+ __entry->rtbcount = dqp->q_rtb.count;
+ __entry->icount = dqp->q_ino.count;
+
+ __entry->blk_hardlimit = dqp->q_blk.hardlimit;
+ __entry->blk_softlimit = dqp->q_blk.softlimit;
+ __entry->rtb_hardlimit = dqp->q_rtb.hardlimit;
+ __entry->rtb_softlimit = dqp->q_rtb.softlimit;
+ __entry->ino_hardlimit = dqp->q_ino.hardlimit;
+ __entry->ino_softlimit = dqp->q_ino.softlimit;
),
- TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+ TP_printk("dev %d:%d id 0x%x type %s flags %s nrefs %u "
+ "res_bc 0x%llx res_rtbc 0x%llx res_ic 0x%llx "
"bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+ "rtbcnt 0x%llx rtbhardlimit 0x%llx rtbsoftlimit 0x%llx "
"icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->id,
- __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
__entry->nrefs,
__entry->res_bcount,
+ __entry->res_rtbcount,
+ __entry->res_icount,
__entry->bcount,
__entry->blk_hardlimit,
__entry->blk_softlimit,
+ __entry->rtbcount,
+ __entry->rtb_hardlimit,
+ __entry->rtb_softlimit,
__entry->icount,
__entry->ino_hardlimit,
__entry->ino_softlimit)
@@ -926,6 +954,125 @@
DEFINE_DQUOT_EVENT(xfs_dqflush);
DEFINE_DQUOT_EVENT(xfs_dqflush_force);
DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
+DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
+
+#define XFS_QMOPT_FLAGS /* { flag, name } pairs passed to __print_flags() for XFS_QMOPT_* values */ \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
+
+TRACE_EVENT(xfs_trans_mod_dquot, /* records one (field, delta) pair together with the dquot's identity */
+ TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
+ unsigned int field, int64_t delta),
+ TP_ARGS(tp, dqp, field, delta),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned int, flags)
+ __field(unsigned int, dqid)
+ __field(unsigned int, field)
+ __field(int64_t, delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev; /* device is taken from the transaction's mount */
+ __entry->type = dqp->q_type;
+ __entry->flags = dqp->q_flags;
+ __entry->dqid = dqp->q_id;
+ __entry->field = field;
+ __entry->delta = delta;
+ ),
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s field %s delta %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dqid,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+ __print_flags(__entry->field, "|", XFS_QMOPT_FLAGS), /* field is decoded with the XFS_QMOPT_* names */
+ __entry->delta)
+);
+
+DECLARE_EVENT_CLASS(xfs_dqtrx_class, /* snapshot of one struct xfs_dqtrx plus the identity of its dquot */
+ TP_PROTO(struct xfs_dqtrx *qtrx),
+ TP_ARGS(qtrx),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, type)
+ __field(unsigned int, flags)
+ __field(u32, dqid)
+
+ __field(uint64_t, blk_res)
+ __field(int64_t, bcount_delta)
+ __field(int64_t, delbcnt_delta)
+
+ __field(uint64_t, rtblk_res)
+ __field(uint64_t, rtblk_res_used)
+ __field(int64_t, rtbcount_delta)
+ __field(int64_t, delrtb_delta)
+
+ __field(uint64_t, ino_res)
+ __field(uint64_t, ino_res_used)
+ __field(int64_t, icount_delta)
+ ),
+ TP_fast_assign(
+ __entry->dev = qtrx->qt_dquot->q_mount->m_super->s_dev; /* identity fields come from qtrx->qt_dquot */
+ __entry->type = qtrx->qt_dquot->q_type;
+ __entry->flags = qtrx->qt_dquot->q_flags;
+ __entry->dqid = qtrx->qt_dquot->q_id;
+
+ __entry->blk_res = qtrx->qt_blk_res;
+ __entry->bcount_delta = qtrx->qt_bcount_delta;
+ __entry->delbcnt_delta = qtrx->qt_delbcnt_delta;
+
+ __entry->rtblk_res = qtrx->qt_rtblk_res;
+ __entry->rtblk_res_used = qtrx->qt_rtblk_res_used;
+ __entry->rtbcount_delta = qtrx->qt_rtbcount_delta;
+ __entry->delrtb_delta = qtrx->qt_delrtb_delta;
+
+ __entry->ino_res = qtrx->qt_ino_res;
+ __entry->ino_res_used = qtrx->qt_ino_res_used;
+ __entry->icount_delta = qtrx->qt_icount_delta;
+ ),
+ TP_printk("dev %d:%d dquot id 0x%x type %s flags %s " /* trailing space keeps "flags" apart from "blk_res" */
+ "blk_res %llu bcount_delta %lld delbcnt_delta %lld "
+ "rtblk_res %llu rtblk_res_used %llu rtbcount_delta %lld delrtb_delta %lld "
+ "ino_res %llu ino_res_used %llu icount_delta %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->dqid,
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __print_flags(__entry->flags, "|", XFS_DQFLAG_STRINGS),
+
+ __entry->blk_res,
+ __entry->bcount_delta,
+ __entry->delbcnt_delta,
+
+ __entry->rtblk_res,
+ __entry->rtblk_res_used,
+ __entry->rtbcount_delta,
+ __entry->delrtb_delta,
+
+ __entry->ino_res,
+ __entry->ino_res_used,
+ __entry->icount_delta)
+)
+
+#define DEFINE_DQTRX_EVENT(name) /* stamps out one tracepoint of xfs_dqtrx_class */ \
+DEFINE_EVENT(xfs_dqtrx_class, name, \
+ TP_PROTO(struct xfs_dqtrx *qtrx), \
+ TP_ARGS(qtrx))
+DEFINE_DQTRX_EVENT(xfs_trans_apply_dquot_deltas);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_before);
+DEFINE_DQTRX_EVENT(xfs_trans_mod_dquot_after);
DECLARE_EVENT_CLASS(xfs_loggrant_class,
TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
@@ -995,8 +1142,6 @@
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
@@ -1005,12 +1150,13 @@
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1158,71 +1304,6 @@
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
-DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
- unsigned int len),
- TP_ARGS(inode, page, off, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(pgoff_t, pgoff)
- __field(loff_t, size)
- __field(unsigned long, offset)
- __field(unsigned int, length)
- ),
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = XFS_I(inode)->i_ino;
- __entry->pgoff = page_offset(page);
- __entry->size = i_size_read(inode);
- __entry->offset = off;
- __entry->length = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "length %x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->pgoff,
- __entry->size,
- __entry->offset,
- __entry->length)
-)
-
-#define DEFINE_PAGE_EVENT(name) \
-DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
- unsigned int len), \
- TP_ARGS(inode, page, off, len))
-DEFINE_PAGE_EVENT(xfs_writepage);
-DEFINE_PAGE_EVENT(xfs_releasepage);
-DEFINE_PAGE_EVENT(xfs_invalidatepage);
-
-DECLARE_EVENT_CLASS(xfs_readpage_class,
- TP_PROTO(struct inode *inode, int nr_pages),
- TP_ARGS(inode, nr_pages),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(int, nr_pages)
- ),
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->nr_pages = nr_pages;
- ),
- TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->nr_pages)
-)
-
-#define DEFINE_READPAGE_EVENT(name) \
-DEFINE_EVENT(xfs_readpage_class, name, \
- TP_PROTO(struct inode *inode, int nr_pages), \
- TP_ARGS(inode, nr_pages))
-DEFINE_READPAGE_EVENT(xfs_vm_readpage);
-DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
@@ -1642,8 +1723,11 @@
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
@@ -1663,6 +1747,32 @@
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+TRACE_EVENT(xfs_alloc_cur_check, /* records btree type, bno/len/diff and the caller's "new" flag */
+ TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+ xfs_extlen_t len, xfs_extlen_t diff, bool new),
+ TP_ARGS(mp, btnum, bno, len, diff, new),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agblock_t, bno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_extlen_t, diff)
+ __field(bool, new)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->btnum = btnum;
+ __entry->bno = bno;
+ __entry->len = len;
+ __entry->diff = diff;
+ __entry->new = new;
+ ),
+ TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), /* btree type is printed by name */
+ __entry->bno, __entry->len, __entry->diff, __entry->new)
+)
+
DECLARE_EVENT_CLASS(xfs_da_class,
TP_PROTO(struct xfs_da_args *args),
TP_ARGS(args),
@@ -1737,7 +1847,8 @@
__field(int, namelen)
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
- __field(int, flags)
+ __field(unsigned int, attr_filter)
+ __field(unsigned int, attr_flags)
__field(int, op_flags)
),
TP_fast_assign(
@@ -1748,11 +1859,12 @@
__entry->namelen = args->namelen;
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
- __entry->flags = args->flags;
+ __entry->attr_filter = args->attr_filter;
+ __entry->attr_flags = args->attr_flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x flags %s op_flags %s",
+ "hashval 0x%x filter %s flags %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1760,7 +1872,11 @@
__entry->namelen,
__entry->valuelen,
__entry->hashval,
- __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+ __print_flags(__entry->attr_filter, "|",
+ XFS_ATTR_FILTER_FLAGS),
+ __print_flags(__entry->attr_flags, "|",
+ { XATTR_CREATE, "CREATE" },
+ { XATTR_REPLACE, "REPLACE" }),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -1922,8 +2038,8 @@
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->which = which;
__entry->ino = ip->i_ino;
- __entry->format = ip->i_d.di_format;
- __entry->nex = ip->i_d.di_nextents;
+ __entry->format = ip->i_df.if_format;
+ __entry->nex = ip->i_df.if_nextents;
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
@@ -2417,6 +2533,7 @@
DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
@@ -3077,8 +3194,7 @@
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
-TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+TRACE_EVENT(xfs_reflink_remap_blocks,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
xfs_fileoff_t doffset),
@@ -3109,59 +3225,14 @@
__entry->dest_ino,
__entry->dest_lblk)
);
-TRACE_EVENT(xfs_reflink_punch_range,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
- xfs_extlen_t len),
- TP_ARGS(ip, lblk, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->len)
-);
-TRACE_EVENT(xfs_reflink_remap,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
- xfs_extlen_t len, xfs_fsblock_t new_pblk),
- TP_ARGS(ip, lblk, len, new_pblk),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_extlen_t, len)
- __field(xfs_fsblock_t, new_pblk)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->len = len;
- __entry->new_pblk = new_pblk;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->len,
- __entry->new_pblk)
-);
DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
/* dedupe tracepoints */
DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
@@ -3606,8 +3677,198 @@
DEFINE_KMEM_EVENT(kmem_alloc);
DEFINE_KMEM_EVENT(kmem_alloc_io);
DEFINE_KMEM_EVENT(kmem_alloc_large);
-DEFINE_KMEM_EVENT(kmem_realloc);
-DEFINE_KMEM_EVENT(kmem_zone_alloc);
+
+TRACE_EVENT(xfs_check_new_dalign, /* logs new_dalign with the superblock and caller-supplied root inode numbers */
+ TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
+ TP_ARGS(mp, new_dalign, calc_rootino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, new_dalign)
+ __field(xfs_ino_t, sb_rootino)
+ __field(xfs_ino_t, calc_rootino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->new_dalign = new_dalign;
+ __entry->sb_rootino = mp->m_sb.sb_rootino; /* read from the in-core superblock, not passed in */
+ __entry->calc_rootino = calc_rootino;
+ ),
+ TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->new_dalign, __entry->sb_rootino,
+ __entry->calc_rootino)
+)
+
+TRACE_EVENT(xfs_btree_commit_afakeroot, /* dumps the cursor's bc_ag.afake fields (root, levels, blocks) */
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = cur->bc_ag.agno;
+ __entry->agbno = cur->bc_ag.afake->af_root; /* printed last, under the "root" label */
+ __entry->levels = cur->bc_ag.afake->af_levels;
+ __entry->blocks = cur->bc_ag.afake->af_blocks;
+ ),
+ TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->levels,
+ __entry->blocks,
+ __entry->agbno)
+)
+
+TRACE_EVENT(xfs_btree_commit_ifakeroot, /* dumps the cursor's bc_ino.ifake fields and the owning inode */
+ TP_PROTO(struct xfs_btree_cur *cur),
+ TP_ARGS(cur),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(unsigned int, levels)
+ __field(unsigned int, blocks)
+ __field(int, whichfork)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, /* agno and agino are both derived from the inode number */
+ cur->bc_ino.ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
+ cur->bc_ino.ip->i_ino);
+ __entry->levels = cur->bc_ino.ifake->if_levels;
+ __entry->blocks = cur->bc_ino.ifake->if_blocks;
+ __entry->whichfork = cur->bc_ino.whichfork;
+ ),
+ TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->agno,
+ __entry->agino,
+ __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data", /* any fork other than attr prints as "data" */
+ __entry->levels,
+ __entry->blocks)
+)
+
+TRACE_EVENT(xfs_btree_bload_level_geometry, /* logs the caller-supplied geometry numbers for one btree level */
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t nr_this_level, unsigned int nr_per_block,
+ unsigned int desired_npb, uint64_t blocks,
+ uint64_t blocks_with_extra),
+ TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks,
+ blocks_with_extra),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned int, nlevels)
+ __field(uint64_t, nr_this_level)
+ __field(unsigned int, nr_per_block)
+ __field(unsigned int, desired_npb)
+ __field(unsigned long long, blocks)
+ __field(unsigned long long, blocks_with_extra)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->nlevels = cur->bc_nlevels; /* taken from the cursor; printed as "level/nlevels" */
+ __entry->nr_this_level = nr_this_level;
+ __entry->nr_per_block = nr_per_block;
+ __entry->desired_npb = desired_npb;
+ __entry->blocks = blocks;
+ __entry->blocks_with_extra = blocks_with_extra;
+ ),
+ TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->nlevels,
+ __entry->nr_this_level,
+ __entry->nr_per_block,
+ __entry->desired_npb,
+ __entry->blocks,
+ __entry->blocks_with_extra)
+)
+
+TRACE_EVENT(xfs_btree_bload_block, /* logs one block's index, its location as (agno/agbno) and record count */
+ TP_PROTO(struct xfs_btree_cur *cur, unsigned int level,
+ uint64_t block_idx, uint64_t nr_blocks,
+ union xfs_btree_ptr *ptr, unsigned int nr_records),
+ TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(unsigned int, level)
+ __field(unsigned long long, block_idx)
+ __field(unsigned long long, nr_blocks)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, nr_records)
+ ),
+ TP_fast_assign(
+ __entry->dev = cur->bc_mp->m_super->s_dev;
+ __entry->btnum = cur->bc_btnum;
+ __entry->level = level;
+ __entry->block_idx = block_idx;
+ __entry->nr_blocks = nr_blocks;
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { /* 64-bit pointer: split the fsblock into agno + agbno */
+ xfs_fsblock_t fsb = be64_to_cpu(ptr->l);
+
+ __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
+ __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb);
+ } else { /* 32-bit pointer is used as agbno directly; agno comes from the cursor */
+ __entry->agno = cur->bc_ag.agno;
+ __entry->agbno = be32_to_cpu(ptr->s);
+ }
+ __entry->nr_records = nr_records;
+ ),
+ TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->level,
+ __entry->block_idx,
+ __entry->nr_blocks,
+ __entry->agno,
+ __entry->agbno,
+ __entry->nr_records)
+)
+
+DECLARE_EVENT_CLASS(xfs_timestamp_range_class, /* logs a [min, max] pair of time64_t values for a mount */
+ TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max),
+ TP_ARGS(mp, min, max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(long long, min) /* stored as long long, matching the %lld in TP_printk */
+ __field(long long, max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->min = min;
+ __entry->max = max;
+ ),
+ TP_printk("dev %d:%d min %lld max %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->min,
+ __entry->max)
+)
+
+#define DEFINE_TIMESTAMP_RANGE_EVENT(name) /* prototype mirrors xfs_timestamp_range_class */ \
+DEFINE_EVENT(xfs_timestamp_range_class, name, \
+ TP_PROTO(struct xfs_mount *mp, time64_t min, time64_t max), \
+ TP_ARGS(mp, min, max))
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range);
+DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range);
#endif /* _TRACE_XFS_H */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index b32a664..c94e71f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -9,6 +9,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
@@ -67,11 +68,10 @@
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
- atomic_dec(&tp->t_mountp->m_active_trans);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
- kmem_zone_free(xfs_trans_zone, tp);
+ kmem_cache_free(xfs_trans_zone, tp);
}
/*
@@ -90,7 +90,7 @@
trace_xfs_trans_dup(tp, _RET_IP_);
- ntp = kmem_zone_zalloc(xfs_trans_zone, 0);
+ ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
/*
* Initialize the new transaction structure.
@@ -107,7 +107,8 @@
ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
(tp->t_flags & XFS_TRANS_RESERVE) |
- (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
+ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) |
+ (tp->t_flags & XFS_TRANS_RES_FDBLKS);
/* We gave our writer reference to the new transaction */
tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
@@ -124,8 +125,6 @@
xfs_defer_move(ntp, tp);
xfs_trans_dup_dqinfo(tp, ntp);
-
- atomic_inc(&tp->t_mountp->m_active_trans);
return ntp;
}
@@ -150,8 +149,9 @@
uint blocks,
uint rtextents)
{
- int error = 0;
- bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ int error = 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -162,7 +162,7 @@
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+ error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
@@ -191,9 +191,9 @@
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(tp->t_mountp,
+ error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
@@ -213,7 +213,7 @@
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+ error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -229,7 +229,7 @@
*/
undo_log:
if (resp->tr_logres > 0) {
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -237,7 +237,7 @@
undo_blocks:
if (blocks > 0) {
- xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+ xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
@@ -263,7 +263,7 @@
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
- tp = kmem_zone_zalloc(xfs_trans_zone, 0);
+ tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
@@ -273,7 +273,8 @@
*/
WARN_ON(resp->tr_logres > 0 &&
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- atomic_inc(&mp->m_active_trans);
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
+ xfs_sb_version_haslazysbcount(&mp->m_sb));
tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_flags = flags;
@@ -297,20 +298,19 @@
/*
* Create an empty transaction with no reservation. This is a defensive
- * mechanism for routines that query metadata without actually modifying
- * them -- if the metadata being queried is somehow cross-linked (think a
- * btree block pointer that points higher in the tree), we risk deadlock.
- * However, blocks grabbed as part of a transaction can be re-grabbed.
- * The verifiers will notice the corrupt block and the operation will fail
- * back to userspace without deadlocking.
+ * mechanism for routines that query metadata without actually modifying them --
+ * if the metadata being queried is somehow cross-linked (think a btree block
+ * pointer that points higher in the tree), we risk deadlock. However, blocks
+ * grabbed as part of a transaction can be re-grabbed. The verifiers will
+ * notice the corrupt block and the operation will fail back to userspace
+ * without deadlocking.
*
- * Note the zero-length reservation; this transaction MUST be cancelled
- * without any dirty data.
+ * Note the zero-length reservation; this transaction MUST be cancelled without
+ * any dirty data.
*
- * Callers should obtain freeze protection to avoid two conflicts with fs
- * freezing: (1) having active transactions trip the m_active_trans ASSERTs;
- * and (2) grabbing buffers at the same time that freeze is trying to drain
- * the buffer LRU list.
+ * Callers should obtain freeze protection to avoid a conflict with fs freezing
+ * where we can be grabbing buffers at the same time that freeze is trying to
+ * drain the buffer LRU list.
*/
int
xfs_trans_alloc_empty(
@@ -368,6 +368,20 @@
tp->t_blk_res_used += (uint)-delta;
if (tp->t_blk_res_used > tp->t_blk_res)
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) {
+ int64_t blkres_delta;
+
+ /*
+ * Return freed blocks directly to the reservation
+ * instead of the global pool, being careful not to
+ * overflow the trans counter. This is used to preserve
+ * reservation across chains of transaction rolls that
+ * repeatedly free and allocate blocks.
+ */
+ blkres_delta = min_t(int64_t, delta,
+ UINT_MAX - tp->t_blk_res);
+ tp->t_blk_res += blkres_delta;
+ delta -= blkres_delta;
}
tp->t_fdblocks_delta += delta;
if (xfs_sb_version_haslazysbcount(&mp->m_sb))
@@ -454,8 +468,8 @@
xfs_buf_t *bp;
int whole = 0;
- bp = xfs_trans_getsb(tp, tp->t_mountp);
- sbp = XFS_BUF_TO_SBP(bp);
+ bp = xfs_trans_getsb(tp);
+ sbp = bp->b_addr;
/*
* Check that superblock mods match the mods made to AGF counters.
@@ -532,57 +546,9 @@
sizeof(sbp->sb_frextents) - 1);
}
-STATIC int
-xfs_sb_mod8(
- uint8_t *field,
- int8_t delta)
-{
- int8_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
-STATIC int
-xfs_sb_mod32(
- uint32_t *field,
- int32_t delta)
-{
- int32_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
-STATIC int
-xfs_sb_mod64(
- uint64_t *field,
- int64_t delta)
-{
- int64_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
/*
- * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
- * and apply superblock counter changes to the in-core superblock. The
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and
+ * apply superblock counter changes to the in-core superblock. The
* t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
* applied to the in-core superblock. The idea is that that has already been
* done.
@@ -591,7 +557,12 @@
* used block counts are not updated in the on disk superblock. In this case,
* XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
* still need to update the incore superblock with the changes.
+ *
+ * Deltas for the inode count are +/-64, hence we use a large batch size of 128
+ * so we don't need to take the counter lock on every update.
*/
+#define XFS_ICOUNT_BATCH 128
+
void
xfs_trans_unreserve_and_mod_sb(
struct xfs_trans *tp)
@@ -627,20 +598,21 @@
/* apply the per-cpu counters */
if (blkdelta) {
error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
- if (error)
- goto out;
+ ASSERT(!error);
}
if (idelta) {
- error = xfs_mod_icount(mp, idelta);
- if (error)
- goto out_undo_fdblocks;
+ percpu_counter_add_batch(&mp->m_icount, idelta,
+ XFS_ICOUNT_BATCH);
+ if (idelta < 0)
+ ASSERT(__percpu_counter_compare(&mp->m_icount, 0,
+ XFS_ICOUNT_BATCH) >= 0);
}
if (ifreedelta) {
- error = xfs_mod_ifree(mp, ifreedelta);
- if (error)
- goto out_undo_icount;
+ percpu_counter_add(&mp->m_ifree, ifreedelta);
+ if (ifreedelta < 0)
+ ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0);
}
if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
@@ -648,95 +620,23 @@
/* apply remaining deltas */
spin_lock(&mp->m_sb_lock);
- if (rtxdelta) {
- error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
- if (error)
- goto out_undo_ifree;
- }
-
- if (tp->t_dblocks_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
- if (error)
- goto out_undo_frextents;
- }
- if (tp->t_agcount_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
- if (error)
- goto out_undo_dblocks;
- }
- if (tp->t_imaxpct_delta != 0) {
- error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
- if (error)
- goto out_undo_agcount;
- }
- if (tp->t_rextsize_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
- tp->t_rextsize_delta);
- if (error)
- goto out_undo_imaxpct;
- }
- if (tp->t_rbmblocks_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
- tp->t_rbmblocks_delta);
- if (error)
- goto out_undo_rextsize;
- }
- if (tp->t_rblocks_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
- if (error)
- goto out_undo_rbmblocks;
- }
- if (tp->t_rextents_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
- tp->t_rextents_delta);
- if (error)
- goto out_undo_rblocks;
- }
- if (tp->t_rextslog_delta != 0) {
- error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
- tp->t_rextslog_delta);
- if (error)
- goto out_undo_rextents;
- }
+ mp->m_sb.sb_frextents += rtxdelta;
+ mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
+ mp->m_sb.sb_agcount += tp->t_agcount_delta;
+ mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
+ mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
+ mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
+ mp->m_sb.sb_rextents += tp->t_rextents_delta;
+ mp->m_sb.sb_rextslog += tp->t_rextslog_delta;
spin_unlock(&mp->m_sb_lock);
- return;
-out_undo_rextents:
- if (tp->t_rextents_delta)
- xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
-out_undo_rblocks:
- if (tp->t_rblocks_delta)
- xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
-out_undo_rbmblocks:
- if (tp->t_rbmblocks_delta)
- xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
-out_undo_rextsize:
- if (tp->t_rextsize_delta)
- xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
-out_undo_imaxpct:
- if (tp->t_rextsize_delta)
- xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
-out_undo_agcount:
- if (tp->t_agcount_delta)
- xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
-out_undo_dblocks:
- if (tp->t_dblocks_delta)
- xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
-out_undo_frextents:
- if (rtxdelta)
- xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
-out_undo_ifree:
- spin_unlock(&mp->m_sb_lock);
- if (ifreedelta)
- xfs_mod_ifree(mp, -ifreedelta);
-out_undo_icount:
- if (idelta)
- xfs_mod_icount(mp, -idelta);
-out_undo_fdblocks:
- if (blkdelta)
- xfs_mod_fdblocks(mp, -blkdelta, rsvd);
-out:
- ASSERT(error == 0);
+ /*
+ * Debug checks outside of the spinlock so they don't lock up the
+ * machine if they fail.
+ */
+ ASSERT(mp->m_sb.sb_imax_pct >= 0);
+ ASSERT(mp->m_sb.sb_rextslog >= 0);
return;
}
@@ -1004,9 +904,10 @@
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
- if (commit_lsn == -1 && !error)
- error = -EIO;
+ if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
+ xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
+ else
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1058,14 +959,14 @@
struct xfs_log_item *lip;
list_for_each_entry(lip, &tp->t_items, li_trans)
- ASSERT(!(lip->li_type == XFS_LI_EFD));
+ ASSERT(!xlog_item_is_intent_done(lip));
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- xfs_log_done(mp, tp->t_ticket, NULL, false);
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 64d7f17..0846589 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -37,10 +37,6 @@
unsigned long li_flags; /* misc flags */
struct xfs_buf *li_buf; /* real buffer pointer */
struct list_head li_bio_list; /* buffer item list */
- void (*li_cb)(struct xfs_buf *,
- struct xfs_log_item *);
- /* buffer item iodone */
- /* callback func */
const struct xfs_item_ops *li_ops; /* function list */
/* delayed logging */
@@ -76,9 +72,29 @@
void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
- void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
+ int (*iop_recover)(struct xfs_log_item *lip,
+ struct list_head *capture_list);
+ bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
+ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
+ struct xfs_trans *tp);
};
+/* Is this log item a deferred action intent? */
+static inline bool
+xlog_item_is_intent(struct xfs_log_item *lip)
+{
+ return lip->li_ops->iop_recover != NULL &&
+ lip->li_ops->iop_match != NULL;
+}
+
+/* Is this a log intent-done item? */
+static inline bool
+xlog_item_is_intent_done(struct xfs_log_item *lip)
+{
+ return lip->li_ops->iop_unpin == NULL &&
+ lip->li_ops->iop_push == NULL;
+}
+
/*
* Release the log item as soon as committed. This is for items just logging
* intents that never need to be written back in place.
@@ -169,21 +185,21 @@
struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
-struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp,
- struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- uint flags);
+int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
+ struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags,
+ struct xfs_buf **bpp);
-static inline struct xfs_buf *
+static inline int
xfs_trans_get_buf(
struct xfs_trans *tp,
struct xfs_buftarg *target,
xfs_daddr_t blkno,
int numblks,
- uint flags)
+ uint flags,
+ struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
+ return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp);
}
int xfs_trans_read_buf_map(struct xfs_mount *mp,
@@ -210,7 +226,7 @@
flags, bpp, ops);
}
-struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
+struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
@@ -244,4 +260,12 @@
extern kmem_zone_t *xfs_trans_zone;
+static inline struct xfs_log_item *
+xfs_trans_item_relog(
+ struct xfs_log_item *lip,
+ struct xfs_trans *tp)
+{
+ return lip->li_ops->iop_relog(lip, tp);
+}
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 812108f..dbb69b4 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -32,6 +32,7 @@
xfs_ail_check(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
+ __must_hold(&ailp->ail_lock)
{
struct xfs_log_item *prev_lip;
struct xfs_log_item *next_lip;
@@ -108,17 +109,25 @@
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
- lip = xfs_ail_min(ailp);
- if (lip)
- lsn = lip->li_lsn;
+ lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
@@ -336,6 +345,49 @@
xfs_trans_ail_cursor_clear(ailp, lip);
}
+/*
+ * Requeue a failed buffer for writeback.
+ *
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ */
+static inline int
+xfsaild_resubmit_item(
+ struct xfs_log_item *lip,
+ struct list_head *buffer_list)
+{
+ struct xfs_buf *bp = lip->li_buf;
+
+ if (!xfs_buf_trylock(bp))
+ return XFS_ITEM_LOCKED;
+
+ if (!xfs_buf_delwri_queue(bp, buffer_list)) {
+ xfs_buf_unlock(bp);
+ return XFS_ITEM_FLUSHING;
+ }
+
+ /* protected by ail_lock */
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+ if (bp->b_flags & _XBF_INODES)
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
+ else
+ xfs_clear_li_failed(lip);
+ }
+
+ xfs_buf_unlock(bp);
+ return XFS_ITEM_SUCCESS;
+}
+
static inline uint
xfsaild_push_item(
struct xfs_ail *ailp,
@@ -356,6 +408,8 @@
*/
if (!lip->li_ops->iop_push)
return XFS_ITEM_PINNED;
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags))
+ return xfsaild_resubmit_item(lip, &ailp->ail_buf_list);
return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
}
@@ -394,16 +448,10 @@
target = ailp->ail_target;
ailp->ail_target_prev = target;
+ /* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
- if (!lip) {
- /*
- * If the AIL is empty or our push has reached the end we are
- * done now.
- */
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ if (!lip)
goto out_done;
- }
XFS_STATS_INC(mp, xs_push_ail);
@@ -427,15 +475,15 @@
case XFS_ITEM_FLUSHING:
/*
- * The item or its backing buffer is already beeing
+ * The item or its backing buffer is already being
* flushed. The typical reason for that is that an
* inode buffer is locked because we already pushed the
* updates to it as part of inode clustering.
*
- * We do not want to to stop flushing just because lots
- * of items are already beeing flushed, but we need to
+ * We do not want to stop flushing just because lots
+ * of items are already being flushed, but we need to
* re-try the flushing relatively soon if most of the
- * AIL is beeing flushed.
+ * AIL is being flushed.
*/
XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
@@ -467,7 +515,7 @@
/*
* Are there too many items we can't do anything with?
*
- * If we we are skipping too many items because we can't flush
+ * If we are skipping too many items because we can't flush
* them or they are already being flushed, we back off and
* given them time to complete whatever operation is being
* done. i.e. remove pressure from the AIL while we can't make
@@ -485,6 +533,8 @@
break;
lsn = lip->li_lsn;
}
+
+out_done:
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
@@ -492,7 +542,6 @@
ailp->ail_log_flush++;
if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
-out_done:
/*
* We reached the target or the AIL is empty, so wait a bit
* longer for I/O to complete and remove pushed items from the
@@ -584,7 +633,8 @@
*/
smp_rmb();
if (!xfs_ail_min(ailp) &&
- ailp->ail_target == ailp->ail_target_prev) {
+ ailp->ail_target == ailp->ail_target_prev &&
+ list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
freezable_schedule();
tout = 0;
@@ -614,7 +664,7 @@
* The push is run asynchronously in a workqueue, which means the caller needs
* to handle waiting on the async flush for space to become available.
* We don't want to interrupt any push that is in progress, hence we only queue
- * work if we set the pushing bit approriately.
+ * work if we set the pushing bit appropriately.
*
* We do this unlocked - we only need to know whether there is anything in the
* AIL at the time we are called. We don't need to access the contents of
@@ -680,6 +730,28 @@
finish_wait(&ailp->ail_empty, &wait);
}
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xfs_mount *mp = ailp->ail_mount;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(mp);
+}
+
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
@@ -711,7 +783,7 @@
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
- int mlip_changed = 0;
+ xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
@@ -726,9 +798,10 @@
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
xfs_ail_delete(ailp, lip);
- if (mlip == lip)
- mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
@@ -739,66 +812,58 @@
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- spin_unlock(&ailp->ail_lock);
-
- xfs_log_space_wake(ailp->ail_mount);
- } else {
- spin_unlock(&ailp->ail_lock);
- }
+ xfs_ail_update_finish(ailp, tail_lsn);
}
-bool
+/* Insert a log item into the AIL. */
+void
+xfs_trans_ail_insert(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ spin_lock(&ailp->ail_lock);
+ xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+}
+
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
- xfs_clear_li_failed(lip);
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
- return mlip == lip;
+ if (mlip == lip)
+ return lsn;
+ return 0;
}
-/**
- * Remove a log items from the AIL
- *
- * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
- * removed from the AIL. The caller is already holding the AIL lock, and done
- * all the checks necessary to ensure the items passed in via @log_items are
- * ready for deletion. This includes checking that the items are in the AIL.
- *
- * For each log item to be removed, unlink it from the AIL, clear the IN_AIL
- * flag from the item and reset the item's lsn to 0. If we remove the first
- * item in the AIL, update the log tail to match the new minimum LSN in the
- * AIL.
- *
- * This function will not drop the AIL lock until all items are removed from
- * the AIL to minimise the amount of lock traffic on the AIL. This does not
- * greatly increase the AIL hold time, but does significantly reduce the amount
- * of traffic on the lock, especially during IO completion.
- *
- * This function must be called with the AIL lock held. The lock is dropped
- * before returning.
- */
void
xfs_trans_ail_delete(
- struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock)
+ int shutdown_type)
{
+ struct xfs_ail *ailp = lip->li_ailp;
struct xfs_mount *mp = ailp->ail_mount;
- bool mlip_changed;
+ xfs_lsn_t tail_lsn;
+ spin_lock(&ailp->ail_lock);
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
+ if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
@@ -807,17 +872,10 @@
return;
}
- mlip_changed = xfs_ail_delete_one(ailp, lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
-
- spin_unlock(&ailp->ail_lock);
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ /* xfs_ail_update_finish() drops the AIL lock */
+ xfs_clear_li_failed(lip);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
int
@@ -838,7 +896,7 @@
init_waitqueue_head(&ailp->ail_empty);
ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->ail_mount->m_fsname);
+ ailp->ail_mount->m_super->s_id);
if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index b5b3a78..42d63b8 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -112,19 +112,22 @@
* If the transaction pointer is NULL, make this just a normal
* get_buf() call.
*/
-struct xfs_buf *
+int
xfs_trans_get_buf_map(
struct xfs_trans *tp,
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
xfs_buf_t *bp;
struct xfs_buf_log_item *bip;
+ int error;
+ *bpp = NULL;
if (!tp)
- return xfs_buf_get_map(target, map, nmaps, flags);
+ return xfs_buf_get_map(target, map, nmaps, flags, bpp);
/*
* If we find the buffer in the cache with this transaction
@@ -146,66 +149,51 @@
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
trace_xfs_trans_get_buf_recur(bip);
- return bp;
+ *bpp = bp;
+ return 0;
}
- bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (bp == NULL) {
- return NULL;
- }
+ error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
+ if (error)
+ return error;
ASSERT(!bp->b_error);
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_get_buf(bp->b_log_item);
- return bp;
+ *bpp = bp;
+ return 0;
}
/*
- * Get and lock the superblock buffer of this file system for the
- * given transaction.
- *
- * We don't need to use incore_match() here, because the superblock
- * buffer is a private buffer which we keep a pointer to in the
- * mount structure.
+ * Get and lock the superblock buffer for the given transaction.
*/
-xfs_buf_t *
+struct xfs_buf *
xfs_trans_getsb(
- xfs_trans_t *tp,
- struct xfs_mount *mp)
+ struct xfs_trans *tp)
{
- xfs_buf_t *bp;
- struct xfs_buf_log_item *bip;
+ struct xfs_buf *bp = tp->t_mountp->m_sb_bp;
/*
- * Default to just trying to lock the superblock buffer
- * if tp is NULL.
+ * Just increment the lock recursion count if the buffer is already
+ * attached to this transaction.
*/
- if (tp == NULL)
- return xfs_getsb(mp);
-
- /*
- * If the superblock buffer already has this transaction
- * pointer in its b_fsprivate2 field, then we know we already
- * have it locked. In this case we just increment the lock
- * recursion count and return the buffer to the caller.
- */
- bp = mp->m_sb_bp;
if (bp->b_transp == tp) {
- bip = bp->b_log_item;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
ASSERT(bip != NULL);
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_recur++;
+
trace_xfs_trans_getsb_recur(bip);
- return bp;
+ } else {
+ xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ _xfs_trans_bjoin(tp, bp, 1);
+
+ trace_xfs_trans_getsb(bp->b_log_item);
}
- bp = xfs_getsb(mp);
- if (bp == NULL)
- return NULL;
-
- _xfs_trans_bjoin(tp, bp, 1);
- trace_xfs_trans_getsb(bp->b_log_item);
return bp;
}
@@ -276,7 +264,7 @@
ASSERT(bp->b_ops != NULL);
error = xfs_buf_reverify(bp, ops);
if (error) {
- xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_ioerror_alert(bp, __return_address);
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp,
@@ -298,36 +286,17 @@
return 0;
}
- bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
- if (!bp) {
- if (!(flags & XBF_TRYLOCK))
- return -ENOMEM;
- return tp ? 0 : -EAGAIN;
- }
-
- /*
- * If we've had a read error, then the contents of the buffer are
- * invalid and should not be used. To ensure that a followup read tries
- * to pull the buffer from disk again, we clear the XBF_DONE flag and
- * mark the buffer stale. This ensures that anyone who has a current
- * reference to the buffer will interpret it's contents correctly and
- * future cache lookups will also treat it as an empty, uninitialised
- * buffer.
- */
- if (bp->b_error) {
- error = bp->b_error;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_buf_ioerror_alert(bp, __func__);
- bp->b_flags &= ~XBF_DONE;
- xfs_buf_stale(bp);
-
+ error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops,
+ __return_address);
+ switch (error) {
+ case 0:
+ break;
+ default:
if (tp && (tp->t_flags & XFS_TRANS_DIRTY))
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
- xfs_buf_relse(bp);
-
- /* bad CRC means corrupted metadata */
- if (error == -EFSBADCRC)
- error = -EFSCORRUPTED;
+ /* fall through */
+ case -ENOMEM:
+ case -EAGAIN:
return error;
}
@@ -480,24 +449,16 @@
ASSERT(bp->b_transp == tp);
ASSERT(bip != NULL);
- ASSERT(bp->b_iodone == NULL ||
- bp->b_iodone == xfs_buf_iodone_callbacks);
/*
* Mark the buffer as needing to be written out eventually,
* and set its iodone function to remove the buffer's buf log
* item from the AIL and free it when the buffer is flushed
- * to disk. See xfs_buf_attach_iodone() for more details
- * on li_cb and xfs_buf_iodone_callbacks().
- * If we end up aborting this transaction, we trap this buffer
- * inside the b_bdstrat callback so that this won't get written to
- * disk.
+ * to disk.
*/
bp->b_flags |= XBF_DONE;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- bp->b_iodone = xfs_buf_iodone_callbacks;
- bip->bli_item.li_cb = xfs_buf_iodone;
/*
* If we invalidated the buffer within this transaction, then
@@ -641,6 +602,7 @@
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_BUF;
+ bp->b_flags |= _XBF_INODES;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -665,7 +627,7 @@
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_STALE_INODE;
- bip->bli_item.li_cb = xfs_buf_iodone;
+ bp->b_flags |= _XBF_INODES;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -690,6 +652,7 @@
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+ bp->b_flags |= _XBF_INODES;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
}
@@ -800,5 +763,6 @@
break;
}
+ bp->b_flags |= _XBF_DQUOTS;
xfs_trans_buf_set_type(tp, bp, type);
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 904780d..fe45b0c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -15,6 +15,7 @@
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_trace.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -25,8 +26,8 @@
*/
void
xfs_trans_dqjoin(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(dqp->q_logitem.qli_dquot == dqp);
@@ -49,11 +50,17 @@
*/
void
xfs_trans_log_dquot(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ /* Upgrade the dquot to bigtime format if possible. */
+ if (dqp->q_id != 0 &&
+ xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) &&
+ !(dqp->q_type & XFS_DQTYPE_BIGTIME))
+ dqp->q_type |= XFS_DQTYPE_BIGTIME;
+
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
@@ -155,14 +162,19 @@
int i;
struct xfs_dqtrx *qa;
- if (XFS_QM_ISUDQ(dqp))
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_USER:
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
- else if (XFS_QM_ISGDQ(dqp))
+ break;
+ case XFS_DQTYPE_GROUP:
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
- else if (XFS_QM_ISPDQ(dqp))
+ break;
+ case XFS_DQTYPE_PROJ:
qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
- else
+ break;
+ default:
return NULL;
+ }
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
if (qa[i].qt_dquot == NULL ||
@@ -203,37 +215,33 @@
if (qtrx->qt_dquot == NULL)
qtrx->qt_dquot = dqp;
- switch (field) {
+ if (delta) {
+ trace_xfs_trans_mod_dquot_before(qtrx);
+ trace_xfs_trans_mod_dquot(tp, dqp, field, delta);
+ }
- /*
- * regular disk blk reservation
- */
- case XFS_TRANS_DQ_RES_BLKS:
+ switch (field) {
+ /* regular disk blk reservation */
+ case XFS_TRANS_DQ_RES_BLKS:
qtrx->qt_blk_res += delta;
break;
- /*
- * inode reservation
- */
- case XFS_TRANS_DQ_RES_INOS:
+ /* inode reservation */
+ case XFS_TRANS_DQ_RES_INOS:
qtrx->qt_ino_res += delta;
break;
- /*
- * disk blocks used.
- */
- case XFS_TRANS_DQ_BCOUNT:
+ /* disk blocks used. */
+ case XFS_TRANS_DQ_BCOUNT:
qtrx->qt_bcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELBCOUNT:
+ case XFS_TRANS_DQ_DELBCOUNT:
qtrx->qt_delbcnt_delta += delta;
break;
- /*
- * Inode Count
- */
- case XFS_TRANS_DQ_ICOUNT:
+ /* Inode Count */
+ case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
@@ -241,17 +249,13 @@
qtrx->qt_icount_delta += delta;
break;
- /*
- * rtblk reservation
- */
- case XFS_TRANS_DQ_RES_RTBLKS:
+ /* rtblk reservation */
+ case XFS_TRANS_DQ_RES_RTBLKS:
qtrx->qt_rtblk_res += delta;
break;
- /*
- * rtblk count
- */
- case XFS_TRANS_DQ_RTBCOUNT:
+ /* rtblk count */
+ case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
@@ -259,13 +263,17 @@
qtrx->qt_rtbcount_delta += delta;
break;
- case XFS_TRANS_DQ_DELRTBCOUNT:
+ case XFS_TRANS_DQ_DELRTBCOUNT:
qtrx->qt_delrtb_delta += delta;
break;
- default:
+ default:
ASSERT(0);
}
+
+ if (delta)
+ trace_xfs_trans_mod_dquot_after(qtrx);
+
tp->t_flags |= XFS_TRANS_DQ_DIRTY;
}
@@ -293,6 +301,37 @@
}
}
+/* Apply dqtrx changes to the quota reservation counters. */
+static inline void
+xfs_apply_quota_reservation_deltas(
+ struct xfs_dquot_res *res,
+ uint64_t reserved,
+ int64_t res_used,
+ int64_t count_delta)
+{
+ if (reserved != 0) {
+ /*
+ * Subtle math here: If reserved > res_used (the normal case),
+ * we're simply subtracting the unused transaction quota
+ * reservation from the dquot reservation.
+ *
+ * If, however, res_used > reserved, then we have allocated
+ * more quota blocks than were reserved for the transaction.
+ * We must add that excess to the dquot reservation since it
+ * tracks (usage + resv) and by definition we didn't reserve
+ * that excess.
+ */
+ res->reserved -= abs(reserved - res_used);
+ } else if (count_delta != 0) {
+ /*
+ * These blks were never reserved, either inside a transaction
+ * or outside one (in a delayed allocation). Also, this isn't
+ * always a negative number since we sometimes deliberately
+ * skip quota reservations.
+ */
+ res->reserved += count_delta;
+ }
+}
/*
* Called by xfs_trans_commit() and similar in spirit to
@@ -309,7 +348,6 @@
int i, j;
struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
- struct xfs_disk_dquot *d;
int64_t totalbdelta;
int64_t totalrtbdelta;
@@ -328,6 +366,8 @@
xfs_trans_dqlockedjoin(tp, qa);
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ uint64_t blk_res_used;
+
qtrx = &qa[i];
/*
* The array of dquots is filled
@@ -341,7 +381,6 @@
/*
* adjust the actual number of blocks used
*/
- d = &dqp->q_core;
/*
* The issue here is - sometimes we don't make a blkquota
@@ -360,38 +399,46 @@
qtrx->qt_delbcnt_delta;
totalrtbdelta = qtrx->qt_rtbcount_delta +
qtrx->qt_delrtb_delta;
+
+ if (totalbdelta != 0 || totalrtbdelta != 0 ||
+ qtrx->qt_icount_delta != 0) {
+ trace_xfs_trans_apply_dquot_deltas_before(dqp);
+ trace_xfs_trans_apply_dquot_deltas(qtrx);
+ }
+
#ifdef DEBUG
if (totalbdelta < 0)
- ASSERT(be64_to_cpu(d->d_bcount) >=
- -totalbdelta);
+ ASSERT(dqp->q_blk.count >= -totalbdelta);
if (totalrtbdelta < 0)
- ASSERT(be64_to_cpu(d->d_rtbcount) >=
- -totalrtbdelta);
+ ASSERT(dqp->q_rtb.count >= -totalrtbdelta);
if (qtrx->qt_icount_delta < 0)
- ASSERT(be64_to_cpu(d->d_icount) >=
- -qtrx->qt_icount_delta);
+ ASSERT(dqp->q_ino.count >= -qtrx->qt_icount_delta);
#endif
if (totalbdelta)
- be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+ dqp->q_blk.count += totalbdelta;
if (qtrx->qt_icount_delta)
- be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+ dqp->q_ino.count += qtrx->qt_icount_delta;
if (totalrtbdelta)
- be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+ dqp->q_rtb.count += totalrtbdelta;
+
+ if (totalbdelta != 0 || totalrtbdelta != 0 ||
+ qtrx->qt_icount_delta != 0)
+ trace_xfs_trans_apply_dquot_deltas_after(dqp);
/*
* Get any default limits in use.
* Start/reset the timer(s) if needed.
*/
- if (d->d_id) {
- xfs_qm_adjust_dqlimits(tp->t_mountp, dqp);
- xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+ if (dqp->q_id) {
+ xfs_qm_adjust_dqlimits(dqp);
+ xfs_qm_adjust_dqtimers(dqp);
}
- dqp->dq_flags |= XFS_DQ_DIRTY;
+ dqp->q_flags |= XFS_DQFLAG_DIRTY;
/*
* add this to the list of items to get logged
*/
@@ -401,78 +448,31 @@
* In case of delayed allocations, there's no
* reservation that a transaction structure knows of.
*/
- if (qtrx->qt_blk_res != 0) {
- uint64_t blk_res_used = 0;
+ blk_res_used = max_t(int64_t, 0, qtrx->qt_bcount_delta);
+ xfs_apply_quota_reservation_deltas(&dqp->q_blk,
+ qtrx->qt_blk_res, blk_res_used,
+ qtrx->qt_bcount_delta);
- if (qtrx->qt_bcount_delta > 0)
- blk_res_used = qtrx->qt_bcount_delta;
-
- if (qtrx->qt_blk_res != blk_res_used) {
- if (qtrx->qt_blk_res > blk_res_used)
- dqp->q_res_bcount -= (xfs_qcnt_t)
- (qtrx->qt_blk_res -
- blk_res_used);
- else
- dqp->q_res_bcount -= (xfs_qcnt_t)
- (blk_res_used -
- qtrx->qt_blk_res);
- }
- } else {
- /*
- * These blks were never reserved, either inside
- * a transaction or outside one (in a delayed
- * allocation). Also, this isn't always a
- * negative number since we sometimes
- * deliberately skip quota reservations.
- */
- if (qtrx->qt_bcount_delta) {
- dqp->q_res_bcount +=
- (xfs_qcnt_t)qtrx->qt_bcount_delta;
- }
- }
/*
* Adjust the RT reservation.
*/
- if (qtrx->qt_rtblk_res != 0) {
- if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
- if (qtrx->qt_rtblk_res >
- qtrx->qt_rtblk_res_used)
- dqp->q_res_rtbcount -= (xfs_qcnt_t)
- (qtrx->qt_rtblk_res -
- qtrx->qt_rtblk_res_used);
- else
- dqp->q_res_rtbcount -= (xfs_qcnt_t)
- (qtrx->qt_rtblk_res_used -
- qtrx->qt_rtblk_res);
- }
- } else {
- if (qtrx->qt_rtbcount_delta)
- dqp->q_res_rtbcount +=
- (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
- }
+ xfs_apply_quota_reservation_deltas(&dqp->q_rtb,
+ qtrx->qt_rtblk_res,
+ qtrx->qt_rtblk_res_used,
+ qtrx->qt_rtbcount_delta);
/*
* Adjust the inode reservation.
*/
- if (qtrx->qt_ino_res != 0) {
- ASSERT(qtrx->qt_ino_res >=
- qtrx->qt_ino_res_used);
- if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
- dqp->q_res_icount -= (xfs_qcnt_t)
- (qtrx->qt_ino_res -
- qtrx->qt_ino_res_used);
- } else {
- if (qtrx->qt_icount_delta)
- dqp->q_res_icount +=
- (xfs_qcnt_t)qtrx->qt_icount_delta;
- }
+ ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+ xfs_apply_quota_reservation_deltas(&dqp->q_ino,
+ qtrx->qt_ino_res,
+ qtrx->qt_ino_res_used,
+ qtrx->qt_icount_delta);
- ASSERT(dqp->q_res_bcount >=
- be64_to_cpu(dqp->q_core.d_bcount));
- ASSERT(dqp->q_res_icount >=
- be64_to_cpu(dqp->q_core.d_icount));
- ASSERT(dqp->q_res_rtbcount >=
- be64_to_cpu(dqp->q_core.d_rtbcount));
+ ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+ ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
+ ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
}
}
}
@@ -486,12 +486,12 @@
*/
void
xfs_trans_unreserve_and_mod_dquots(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
int i, j;
- xfs_dquot_t *dqp;
+ struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
- bool locked;
+ bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
@@ -516,7 +516,7 @@
if (qtrx->qt_blk_res) {
xfs_dqlock(dqp);
locked = true;
- dqp->q_res_bcount -=
+ dqp->q_blk.reserved -=
(xfs_qcnt_t)qtrx->qt_blk_res;
}
if (qtrx->qt_ino_res) {
@@ -524,7 +524,7 @@
xfs_dqlock(dqp);
locked = true;
}
- dqp->q_res_icount -=
+ dqp->q_ino.reserved -=
(xfs_qcnt_t)qtrx->qt_ino_res;
}
@@ -533,7 +533,7 @@
xfs_dqlock(dqp);
locked = true;
}
- dqp->q_res_rtbcount -=
+ dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
if (locked)
@@ -549,21 +549,80 @@
struct xfs_dquot *dqp,
int type)
{
- enum quota_type qtype;
+ enum quota_type qtype;
- if (dqp->dq_flags & XFS_DQ_PROJ)
+ switch (xfs_dquot_type(dqp)) {
+ case XFS_DQTYPE_PROJ:
qtype = PRJQUOTA;
- else if (dqp->dq_flags & XFS_DQ_USER)
+ break;
+ case XFS_DQTYPE_USER:
qtype = USRQUOTA;
- else
+ break;
+ case XFS_DQTYPE_GROUP:
qtype = GRPQUOTA;
+ break;
+ default:
+ return;
+ }
- quota_send_warning(make_kqid(&init_user_ns, qtype,
- be32_to_cpu(dqp->q_core.d_id)),
+ quota_send_warning(make_kqid(&init_user_ns, qtype, dqp->q_id),
mp->m_super->s_dev, type);
}
/*
+ * Decide if we can make an additional reservation against a quota resource.
+ * Returns an inode QUOTA_NL_ warning code and whether or not it's fatal.
+ *
+ * Note that we assume the inode and block warning codes always differ by
+ * exactly 3 since that is userspace ABI now, and that we never decrease the
+ * quota reservation here, so the *BELOW messages are irrelevant.
+ */
+static inline int
+xfs_dqresv_check(
+ struct xfs_dquot_res *res,
+ struct xfs_quota_limits *qlim,
+ int64_t delta,
+ bool *fatal)
+{
+ xfs_qcnt_t hardlimit = res->hardlimit;
+ xfs_qcnt_t softlimit = res->softlimit;
+ xfs_qcnt_t total_count = res->reserved + delta;
+
+ BUILD_BUG_ON(QUOTA_NL_BHARDWARN != QUOTA_NL_IHARDWARN + 3);
+ BUILD_BUG_ON(QUOTA_NL_BSOFTLONGWARN != QUOTA_NL_ISOFTLONGWARN + 3);
+ BUILD_BUG_ON(QUOTA_NL_BSOFTWARN != QUOTA_NL_ISOFTWARN + 3);
+
+ *fatal = false;
+ if (delta <= 0)
+ return QUOTA_NL_NOWARN;
+
+ if (!hardlimit)
+ hardlimit = qlim->hard;
+ if (!softlimit)
+ softlimit = qlim->soft;
+
+ if (hardlimit && total_count > hardlimit) {
+ *fatal = true;
+ return QUOTA_NL_IHARDWARN;
+ }
+
+ if (softlimit && total_count > softlimit) {
+ time64_t now = ktime_get_real_seconds();
+
+ if ((res->timer != 0 && now > res->timer) ||
+ (res->warnings != 0 && res->warnings >= qlim->warn)) {
+ *fatal = true;
+ return QUOTA_NL_ISOFTLONGWARN;
+ }
+
+ res->warnings++;
+ return QUOTA_NL_ISOFTWARN;
+ }
+
+ return QUOTA_NL_NOWARN;
+}
+
+/*
* This reserves disk blocks and inodes against a dquot.
* Flags indicate if the dquot is to be locked here and also
* if the blk reservation is for RT or regular blocks.
@@ -571,115 +630,65 @@
*/
STATIC int
xfs_trans_dqresv(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *dqp,
- int64_t nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ int64_t nblks,
+ long ninos,
+ uint flags)
{
- xfs_qcnt_t hardlimit;
- xfs_qcnt_t softlimit;
- time_t timer;
- xfs_qwarncnt_t warns;
- xfs_qwarncnt_t warnlimit;
- xfs_qcnt_t total_count;
- xfs_qcnt_t *resbcountp;
- xfs_quotainfo_t *q = mp->m_quotainfo;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_def_quota *defq;
-
+ struct xfs_dquot_res *blkres;
+ struct xfs_quota_limits *qlim;
xfs_dqlock(dqp);
- defq = xfs_get_defquota(dqp, q);
+ defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
if (flags & XFS_TRANS_DQ_RES_BLKS) {
- hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
- if (!hardlimit)
- hardlimit = defq->bhardlimit;
- softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
- if (!softlimit)
- softlimit = defq->bsoftlimit;
- timer = be32_to_cpu(dqp->q_core.d_btimer);
- warns = be16_to_cpu(dqp->q_core.d_bwarns);
- warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
- resbcountp = &dqp->q_res_bcount;
+ blkres = &dqp->q_blk;
+ qlim = &defq->blk;
} else {
- ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
- hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
- if (!hardlimit)
- hardlimit = defq->rtbhardlimit;
- softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
- if (!softlimit)
- softlimit = defq->rtbsoftlimit;
- timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
- warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
- warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
- resbcountp = &dqp->q_res_rtbcount;
+ blkres = &dqp->q_rtb;
+ qlim = &defq->rtb;
}
- if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
- dqp->q_core.d_id &&
- ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
- (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
- (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
- if (nblks > 0) {
+ if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_id &&
+ xfs_dquot_is_enforced(dqp)) {
+ int quota_nl;
+ bool fatal;
+
+ /*
+ * The dquot is locked already. See if we'd go over the hardlimit
+ * or exceed the timelimit if we reserved the resources.
+ */
+ quota_nl = xfs_dqresv_check(blkres, qlim, nblks, &fatal);
+ if (quota_nl != QUOTA_NL_NOWARN) {
/*
- * dquot is locked already. See if we'd go over the
- * hardlimit or exceed the timelimit if we allocate
- * nblks.
+ * Quota block warning codes are 3 more than the inode
+ * codes; xfs_dqresv_check() asserts this at build time.
*/
- total_count = *resbcountp + nblks;
- if (hardlimit && total_count > hardlimit) {
- xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+ xfs_quota_warn(mp, dqp, quota_nl + 3);
+ if (fatal)
goto error_return;
- }
- if (softlimit && total_count > softlimit) {
- if ((timer != 0 && get_seconds() > timer) ||
- (warns != 0 && warns >= warnlimit)) {
- xfs_quota_warn(mp, dqp,
- QUOTA_NL_BSOFTLONGWARN);
- goto error_return;
- }
-
- xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
- }
}
- if (ninos > 0) {
- total_count = dqp->q_res_icount + ninos;
- timer = be32_to_cpu(dqp->q_core.d_itimer);
- warns = be16_to_cpu(dqp->q_core.d_iwarns);
- warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
- hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
- if (!hardlimit)
- hardlimit = defq->ihardlimit;
- softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
- if (!softlimit)
- softlimit = defq->isoftlimit;
- if (hardlimit && total_count > hardlimit) {
- xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+ quota_nl = xfs_dqresv_check(&dqp->q_ino, &defq->ino, ninos,
+ &fatal);
+ if (quota_nl != QUOTA_NL_NOWARN) {
+ xfs_quota_warn(mp, dqp, quota_nl);
+ if (fatal)
goto error_return;
- }
- if (softlimit && total_count > softlimit) {
- if ((timer != 0 && get_seconds() > timer) ||
- (warns != 0 && warns >= warnlimit)) {
- xfs_quota_warn(mp, dqp,
- QUOTA_NL_ISOFTLONGWARN);
- goto error_return;
- }
- xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
- }
}
}
/*
* Change the reservation, but not the actual usage.
- * Note that q_res_bcount = q_core.d_bcount + resv
+ * Note that q_blk.reserved = q_blk.count + resv
*/
- (*resbcountp) += (xfs_qcnt_t)nblks;
- if (ninos != 0)
- dqp->q_res_icount += (xfs_qcnt_t)ninos;
+ blkres->reserved += (xfs_qcnt_t)nblks;
+ dqp->q_ino.reserved += (xfs_qcnt_t)ninos;
/*
* note the reservation amt in the trans struct too,
@@ -700,16 +709,16 @@
XFS_TRANS_DQ_RES_INOS,
ninos);
}
- ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
- ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
- ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+ ASSERT(dqp->q_blk.reserved >= dqp->q_blk.count);
+ ASSERT(dqp->q_rtb.reserved >= dqp->q_rtb.count);
+ ASSERT(dqp->q_ino.reserved >= dqp->q_ino.count);
xfs_dqunlock(dqp);
return 0;
error_return:
xfs_dqunlock(dqp);
- if (flags & XFS_QMOPT_ENOSPC)
+ if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
return -ENOSPC;
return -EDQUOT;
}
@@ -749,8 +758,7 @@
ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
if (udqp) {
- error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
- (flags & ~XFS_QMOPT_ENOSPC));
+ error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags);
if (error)
return error;
}
@@ -801,16 +809,12 @@
if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
return 0;
- if (XFS_IS_PQUOTA_ON(mp))
- flags |= XFS_QMOPT_ENOSPC;
ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
- XFS_TRANS_DQ_RES_RTBLKS ||
- (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
- XFS_TRANS_DQ_RES_BLKS);
+ ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS ||
+ (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS);
/*
* Reserve nblks against these dquots, with trans as the mediator.
@@ -824,13 +828,13 @@
/*
* This routine is called to allocate a quotaoff log item.
*/
-xfs_qoff_logitem_t *
+struct xfs_qoff_logitem *
xfs_trans_get_qoff_item(
- xfs_trans_t *tp,
- xfs_qoff_logitem_t *startqoff,
+ struct xfs_trans *tp,
+ struct xfs_qoff_logitem *startqoff,
uint flags)
{
- xfs_qoff_logitem_t *q;
+ struct xfs_qoff_logitem *q;
ASSERT(tp != NULL);
@@ -852,8 +856,8 @@
*/
void
xfs_trans_log_quotaoff_item(
- xfs_trans_t *tp,
- xfs_qoff_logitem_t *qlp)
+ struct xfs_trans *tp,
+ struct xfs_qoff_logitem *qlp)
{
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
@@ -863,7 +867,8 @@
xfs_trans_alloc_dqinfo(
xfs_trans_t *tp)
{
- tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, 0);
+ tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone,
+ GFP_KERNEL | __GFP_NOFAIL);
}
void
@@ -872,6 +877,6 @@
{
if (!tp->t_dqinfo)
return;
- kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+ kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1..3004aea 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -91,24 +91,13 @@
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
-void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock);
+void xfs_trans_ail_insert(struct xfs_ail *ailp, struct xfs_log_item *lip,
+ xfs_lsn_t lsn);
-static inline void
-xfs_trans_ail_remove(
- struct xfs_log_item *lip,
- int shutdown_type)
-{
- struct xfs_ail *ailp = lip->li_ailp;
-
- spin_lock(&ailp->ail_lock);
- /* xfs_trans_ail_delete() drops the AIL lock */
- if (test_bit(XFS_LI_IN_AIL, &lip->li_flags))
- xfs_trans_ail_delete(ailp, lip, shutdown_type);
- else
- spin_unlock(&ailp->ail_lock);
-}
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
+void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type);
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index cb895b1..bca48b3 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -11,51 +11,30 @@
#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_attr.h"
+#include "xfs_acl.h"
+#include "xfs_da_btree.h"
#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
static int
xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
struct inode *inode, const char *name, void *value, size_t size)
{
- int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(inode);
- int error, asize = size;
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = value,
+ .valuelen = size,
+ };
+ int error;
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (!size) {
- xflags |= ATTR_KERNOVAL;
- value = NULL;
- }
-
- error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags);
+ error = xfs_attr_get(&args);
if (error)
return error;
- return asize;
-}
-
-void
-xfs_forget_acl(
- struct inode *inode,
- const char *name,
- int xflags)
-{
- /*
- * Invalidate any cached ACLs if the user has bypassed the ACL
- * interface. We don't validate the content whatsoever so it is caller
- * responsibility to provide data in valid format and ensure i_mode is
- * consistent.
- */
- if (xflags & ATTR_ROOT) {
-#ifdef CONFIG_XFS_POSIX_ACL
- if (!strcmp(name, SGI_ACL_FILE))
- forget_cached_acl(inode, ACL_TYPE_ACCESS);
- else if (!strcmp(name, SGI_ACL_DEFAULT))
- forget_cached_acl(inode, ACL_TYPE_DEFAULT);
-#endif
- }
+ return args.valuelen;
}
static int
@@ -63,23 +42,20 @@
struct inode *inode, const char *name, const void *value,
size_t size, int flags)
{
- int xflags = handler->flags;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_da_args args = {
+ .dp = XFS_I(inode),
+ .attr_filter = handler->flags,
+ .attr_flags = flags,
+ .name = name,
+ .namelen = strlen(name),
+ .value = (void *)value,
+ .valuelen = size,
+ };
int error;
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (flags & XATTR_CREATE)
- xflags |= ATTR_CREATE;
- if (flags & XATTR_REPLACE)
- xflags |= ATTR_REPLACE;
-
- if (!value)
- return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- error = xfs_attr_set(ip, (unsigned char *)name,
- (void *)value, size, xflags);
- if (!error)
- xfs_forget_acl(inode, name, xflags);
-
+ error = xfs_attr_set(&args);
+ if (!error && (handler->flags & XFS_ATTR_ROOT))
+ xfs_forget_acl(inode, name);
return error;
}
@@ -92,14 +68,14 @@
static const struct xattr_handler xfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .flags = ATTR_ROOT,
+ .flags = XFS_ATTR_ROOT,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
static const struct xattr_handler xfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .flags = ATTR_SECURE,
+ .flags = XFS_ATTR_SECURE,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
@@ -129,7 +105,7 @@
if (context->count < 0 || context->seen_enough)
return;
- if (!context->alist)
+ if (!context->buffer)
goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
@@ -138,7 +114,7 @@
context->seen_enough = 1;
return;
}
- offset = (char *)context->alist + context->count;
+ offset = context->buffer + context->count;
strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
@@ -213,7 +189,6 @@
size_t size)
{
struct xfs_attr_list_context context;
- struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
int error;
@@ -222,14 +197,13 @@
*/
memset(&context, 0, sizeof(context));
context.dp = XFS_I(inode);
- context.cursor = &cursor;
context.resynch = 1;
- context.alist = size ? data : NULL;
+ context.buffer = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
context.put_listent = xfs_xattr_put_listent;
- error = xfs_attr_list_int(&context);
+ error = xfs_attr_list(&context);
if (error)
return error;
if (context.count < 0)