Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 56b5539..4fadafd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -10,6 +10,7 @@
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
+#include <linux/fiemap.h>
#include "overlayfs.h"
@@ -58,6 +59,24 @@
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
+ /*
+ * We might have to translate ovl file into real file object
+ * once use cases emerge. For now, simply don't let underlying
+ * filesystem rely on attr->ia_file
+ */
+ attr->ia_valid &= ~ATTR_FILE;
+
+ /*
+ * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
+ * set. Overlayfs does not pass O_TRUNC flag to underlying
+ * filesystem during open -> do not pass ATTR_OPEN. This
+ * disables optimization in fuse which assumes open(O_TRUNC)
+ * already set file size to 0. But we never passed O_TRUNC to
+ * fuse. So by clearing ATTR_OPEN, fuse will be forced to send
+ * setattr request to server.
+ */
+ attr->ia_valid &= ~ATTR_OPEN;
+
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = notify_change(upperdentry, attr, NULL);
@@ -75,11 +94,11 @@
return err;
}
-static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
- struct ovl_layer *lower_layer)
+static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
- bool samefs = ovl_same_sb(dentry->d_sb);
+ bool samefs = ovl_same_fs(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
+ unsigned int xinoshift = 64 - xinobits;
if (samefs) {
/*
@@ -90,24 +109,22 @@
stat->dev = dentry->d_sb->s_dev;
return 0;
} else if (xinobits) {
- unsigned int shift = 64 - xinobits;
/*
* All inode numbers of underlying fs should not be using the
* high xinobits, so we use high xinobits to partition the
* overlay st_ino address space. The high bits holds the fsid
- * (upper fsid is 0). This way overlay inode numbers are unique
- * and all inodes use overlay st_dev. Inode numbers are also
- * persistent for a given layer configuration.
+ * (upper fsid is 0). The lowest xinobit is reserved for mapping
+ * the non-persistent inode numbers range in case of overflow.
+ * This way all overlay inode numbers are unique and use the
+ * overlay st_dev.
*/
- if (stat->ino >> shift) {
- pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
- dentry, stat->ino, xinobits);
- } else {
- if (lower_layer)
- stat->ino |= ((u64)lower_layer->fsid) << shift;
-
+ if (likely(!(stat->ino >> xinoshift))) {
+ stat->ino |= ((u64)fsid) << (xinoshift + 1);
stat->dev = dentry->d_sb->s_dev;
return 0;
+ } else if (ovl_xino_warn(dentry->d_sb)) {
+ pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
+ dentry, stat->ino, xinobits);
}
}
@@ -124,15 +141,14 @@
*/
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
- } else if (lower_layer && lower_layer->fsid) {
+ } else {
/*
* For non-samefs setup, if we cannot map all layers st_ino
* to a unified address space, we need to make sure that st_dev
- * is unique per lower fs. Upper layer uses real st_dev and
- * lower layers use the unique anonymous bdev assigned to the
- * lower fs.
+ * is unique per underlying fs, so we use the unique anonymous
+ * bdev assigned to the underlying fs.
*/
- stat->dev = lower_layer->fs->pseudo_dev;
+ stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev;
}
return 0;
@@ -146,8 +162,7 @@
struct path realpath;
const struct cred *old_cred;
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
- bool samefs = ovl_same_sb(dentry->d_sb);
- struct ovl_layer *lower_layer = NULL;
+ int fsid = 0;
int err;
bool metacopy_blocks = false;
@@ -168,9 +183,9 @@
* If lower filesystem supports NFS file handles, this also guaranties
* persistent st_ino across mount cycle.
*/
- if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
+ if (!is_dir || ovl_same_dev(dentry->d_sb)) {
if (!OVL_TYPE_UPPER(type)) {
- lower_layer = ovl_layer_lower(dentry);
+ fsid = ovl_layer_lower(dentry)->fsid;
} else if (OVL_TYPE_ORIGIN(type)) {
struct kstat lowerstat;
u32 lowermask = STATX_INO | STATX_BLOCKS |
@@ -200,14 +215,8 @@
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
(!ovl_verify_lower(dentry->d_sb) &&
(is_dir || lowerstat.nlink == 1))) {
- lower_layer = ovl_layer_lower(dentry);
- /*
- * Cannot use origin st_dev;st_ino because
- * origin inode content may differ from overlay
- * inode content.
- */
- if (samefs || lower_layer->fsid)
- stat->ino = lowerstat.ino;
+ fsid = ovl_layer_lower(dentry)->fsid;
+ stat->ino = lowerstat.ino;
}
/*
@@ -241,7 +250,7 @@
}
}
- err = ovl_map_dev_ino(dentry, stat, lower_layer);
+ err = ovl_map_dev_ino(dentry, stat, fsid);
if (err)
goto out;
@@ -318,7 +327,7 @@
return p;
}
-bool ovl_is_private_xattr(const char *name)
+bool ovl_is_private_xattr(struct super_block *sb, const char *name)
{
return strncmp(name, OVL_XATTR_PREFIX,
sizeof(OVL_XATTR_PREFIX) - 1) == 0;
@@ -384,15 +393,18 @@
return res;
}
-static bool ovl_can_list(const char *s)
+static bool ovl_can_list(struct super_block *sb, const char *s)
{
+ /* Never list private (.overlay) */
+ if (ovl_is_private_xattr(sb, s))
+ return false;
+
/* List all non-trusted xatts */
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
return true;
- /* Never list trusted.overlay, list other trusted for superuser only */
- return !ovl_is_private_xattr(s) &&
- ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
+ /* list other trusted for superuser only */
+ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
@@ -418,7 +430,7 @@
return -EIO;
len -= slen;
- if (!ovl_can_list(s)) {
+ if (!ovl_can_list(dentry->d_sb, s)) {
res -= slen;
memmove(s, s + slen, len);
} else {
@@ -450,7 +462,7 @@
if (flags & S_ATIME) {
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
struct path upperpath = {
- .mnt = ofs->upper_mnt,
+ .mnt = ovl_upper_mnt(ofs),
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
};
@@ -473,10 +485,6 @@
return -EOPNOTSUPP;
old_cred = ovl_override_creds(inode->i_sb);
-
- if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
- filemap_write_and_wait(realinode->i_mapping);
-
err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
revert_creds(old_cred);
@@ -517,7 +525,7 @@
/*
* It is possible to stack overlayfs instance on top of another
- * overlayfs instance as lower layer. We need to annonate the
+ * overlayfs instance as lower layer. We need to annotate the
* stackable i_mutex locks according to stack level of the super
* block instance. An overlayfs instance can never be in stack
* depth 0 (there is always a real fs below it). An overlayfs
@@ -529,6 +537,27 @@
* [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
* [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
* [...] &type->i_mutex_dir_key (stack_depth=0)
+ *
+ * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
+ *
+ * This chain is valid:
+ * - inode->i_rwsem (inode_lock[2])
+ * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0])
+ * - OVL_I(inode)->lock (ovl_inode_lock[2])
+ * - OVL_I(lowerinode)->lock (ovl_inode_lock[1])
+ *
+ * And this chain is valid:
+ * - inode->i_rwsem (inode_lock[2])
+ * - OVL_I(inode)->lock (ovl_inode_lock[2])
+ * - lowerinode->i_rwsem (inode_lock[1])
+ * - OVL_I(lowerinode)->lock (ovl_inode_lock[1])
+ *
+ * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is
+ * held, because it is in reverse order of the non-nested case using the same
+ * upper fs:
+ * - inode->i_rwsem (inode_lock[1])
+ * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0])
+ * - OVL_I(inode)->lock (ovl_inode_lock[1])
*/
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
@@ -553,27 +582,73 @@
#endif
}
-static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
- unsigned long ino, int fsid)
+static void ovl_next_ino(struct inode *inode)
+{
+ struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+ if (unlikely(!inode->i_ino))
+ inode->i_ino = atomic_long_inc_return(&ofs->last_ino);
+}
+
+static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid)
{
int xinobits = ovl_xino_bits(inode->i_sb);
+ unsigned int xinoshift = 64 - xinobits;
/*
* When d_ino is consistent with st_ino (samefs or i_ino has enough
* bits to encode layer), set the same value used for st_ino to i_ino,
* so inode number exposed via /proc/locks and a like will be
* consistent with d_ino and st_ino values. An i_ino value inconsistent
- * with d_ino also causes nfsd readdirplus to fail. When called from
- * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
- * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
+ * with d_ino also causes nfsd readdirplus to fail.
*/
- if (ovl_same_sb(inode->i_sb) || xinobits) {
- inode->i_ino = ino;
- if (xinobits && fsid && !(ino >> (64 - xinobits)))
- inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
- } else {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
+ if (ovl_same_fs(inode->i_sb)) {
+ return;
+ } else if (xinobits && likely(!(ino >> xinoshift))) {
+ inode->i_ino |= (unsigned long)fsid << (xinoshift + 1);
+ return;
}
+
+ /*
+ * For directory inodes on non-samefs with xino disabled or xino
+ * overflow, we allocate a non-persistent inode number, to be used for
+ * resolving st_ino collisions in ovl_map_dev_ino().
+ *
+ * To avoid ino collision with legitimate xino values from upper
+ * layer (fsid 0), use the lowest xinobit to map the non
+ * persistent inode numbers to the unified st_ino address space.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ ovl_next_ino(inode);
+ if (xinobits) {
+ inode->i_ino &= ~0UL >> xinobits;
+ inode->i_ino |= 1UL << xinoshift;
+ }
+ }
+}
+
+void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip,
+ unsigned long ino, int fsid)
+{
+ struct inode *realinode;
+
+ if (oip->upperdentry)
+ OVL_I(inode)->__upperdentry = oip->upperdentry;
+ if (oip->lowerpath && oip->lowerpath->dentry)
+ OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry));
+ if (oip->lowerdata)
+ OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata));
+
+ realinode = ovl_inode_real(inode);
+ ovl_copyattr(realinode, inode);
+ ovl_copyflags(realinode, inode);
+ ovl_map_ino(inode, ino, fsid);
+}
+
+static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
@@ -652,8 +727,8 @@
if (WARN_ON(len >= sizeof(buf)))
return -EIO;
- return ovl_do_setxattr(ovl_dentry_upper(dentry),
- OVL_XATTR_NLINK, buf, len, 0);
+ return ovl_do_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
+ OVL_XATTR_NLINK, buf, len);
}
int ovl_set_nlink_upper(struct dentry *dentry)
@@ -666,7 +741,7 @@
return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}
-unsigned int ovl_get_nlink(struct dentry *lowerdentry,
+unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback)
{
@@ -678,7 +753,8 @@
if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
return fallback;
- err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
+ err = ovl_do_getxattr(ofs, upperdentry, OVL_XATTR_NLINK,
+ &buf, sizeof(buf) - 1);
if (err < 0)
goto fail;
@@ -700,7 +776,7 @@
return nlink;
fail:
- pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
+ pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n",
upperdentry, err);
return fallback;
}
@@ -711,7 +787,7 @@
inode = new_inode(sb);
if (inode)
- ovl_fill_inode(inode, mode, rdev, 0, 0);
+ ovl_fill_inode(inode, mode, rdev);
return inode;
}
@@ -835,7 +911,7 @@
* Does overlay inode need to be hashed by lower inode?
*/
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
- struct dentry *lower, struct dentry *index)
+ struct dentry *lower, bool index)
{
struct ovl_fs *ofs = sb->s_fs_info;
@@ -848,7 +924,7 @@
return true;
/* Yes, if won't be copied up */
- if (!ofs->upper_mnt)
+ if (!ovl_upper_mnt(ofs))
return true;
/* No, if lower hardlink is or will be broken on copy up */
@@ -876,6 +952,7 @@
struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip)
{
+ struct ovl_fs *ofs = OVL_FS(sb);
struct dentry *upperdentry = oip->upperdentry;
struct ovl_path *lowerpath = oip->lowerpath;
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
@@ -884,7 +961,7 @@
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
int fsid = bylower ? lowerpath->layer->fsid : 0;
- bool is_dir, metacopy = false;
+ bool is_dir;
unsigned long ino = 0;
int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -923,7 +1000,8 @@
/* Recalculate nlink for non-dir due to indexing */
if (!is_dir)
- nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
+ nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry,
+ nlink);
set_nlink(inode, nlink);
ino = key->i_ino;
} else {
@@ -936,24 +1014,15 @@
ino = realinode->i_ino;
fsid = lowerpath->layer->fsid;
}
- ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
- ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
+ ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
+ ovl_inode_init(inode, oip, ino, fsid);
- if (upperdentry && ovl_is_impuredir(upperdentry))
+ if (upperdentry && ovl_is_impuredir(sb, upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
if (oip->index)
ovl_set_flag(OVL_INDEX, inode);
- if (upperdentry) {
- err = ovl_check_metacopy_xattr(upperdentry);
- if (err < 0)
- goto out_err;
- metacopy = err;
- if (!metacopy)
- ovl_set_flag(OVL_UPPERDATA, inode);
- }
-
OVL_I(inode)->redirect = oip->redirect;
if (bylower)
@@ -962,7 +1031,7 @@
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
- ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
+ ovl_check_origin_xattr(ofs, upperdentry ?: lowerdentry)) {
ovl_set_flag(OVL_WHITEOUTS, inode);
}
}
@@ -973,7 +1042,7 @@
return inode;
out_err:
- pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err);
+ pr_warn_ratelimited("failed to get inode (%i)\n", err);
inode = ERR_PTR(err);
goto out;
}