Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index a453cc8..cbb5ca8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
# Ext3 configs are here for backward compatibility with old configs which may
# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable
# kernels after the removal of ext3 driver.
@@ -96,23 +97,8 @@
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-config EXT4_ENCRYPTION
- bool "Ext4 Encryption"
- depends on EXT4_FS
- select FS_ENCRYPTION
- help
- Enable encryption of ext4 files and directories. This
- feature is similar to ecryptfs, but it is more memory
- efficient since it avoids caching the encrypted and
- decrypted pages in the page cache.
-
-config EXT4_FS_ENCRYPTION
- bool
- default y
- depends on EXT4_ENCRYPTION
-
config EXT4_DEBUG
- bool "EXT4 debugging support"
+ bool "Ext4 debugging support"
depends on EXT4_FS
help
Enables run-time debugging support for the ext4 filesystem.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8fdfcd3..b17ddc2 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,3 +13,4 @@
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index fb50f9a..8c7bbf3 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -248,7 +248,8 @@
error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
goto out_stop;
- update_mode = 1;
+ if (mode != inode->i_mode)
+ update_mode = 1;
}
error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
@@ -284,12 +285,16 @@
error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
default_acl, XATTR_CREATE);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
acl, XATTR_CREATE);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
return error;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee6..0b202e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -603,9 +603,9 @@
}
/**
- * ext4_should_retry_alloc()
+ * ext4_should_retry_alloc() - check if a block allocation should be retried
* @sb: super block
- * @retries number of attemps has been made
+ * @retries: number of attempts that have been made
*
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 913061c..d4d4fdf 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -38,6 +38,7 @@
void ext4_exit_system_zone(void)
{
+ rcu_barrier();
kmem_cache_destroy(ext4_system_zone_cachep);
}
@@ -49,17 +50,26 @@
return 0;
}
+static void release_system_zone(struct ext4_system_blocks *system_blks)
+{
+ struct ext4_system_zone *entry, *n;
+
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &system_blks->root, node)
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+}
+
/*
* Mark a range of blocks as belonging to the "system zone" --- that
* is, filesystem metadata blocks which should never be used by
* inodes.
*/
-static int add_system_zone(struct ext4_sb_info *sbi,
+static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
unsigned int count)
{
struct ext4_system_zone *new_entry = NULL, *entry;
- struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
while (*n) {
@@ -91,7 +101,7 @@
new_node = &new_entry->node;
rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &sbi->system_blks);
+ rb_insert_color(new_node, &system_blks->root);
}
/* Can we merge to the left? */
@@ -101,7 +111,7 @@
if (can_merge(entry, new_entry)) {
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
- rb_erase(node, &sbi->system_blks);
+ rb_erase(node, &system_blks->root);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
@@ -112,7 +122,7 @@
entry = rb_entry(node, struct ext4_system_zone, node);
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
- rb_erase(node, &sbi->system_blks);
+ rb_erase(node, &system_blks->root);
kmem_cache_free(ext4_system_zone_cachep, entry);
}
}
@@ -126,7 +136,7 @@
int first = 1;
printk(KERN_INFO "System zones: ");
- node = rb_first(&sbi->system_blks);
+ node = rb_first(&sbi->system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -137,68 +147,18 @@
printk(KERN_CONT "\n");
}
-int ext4_setup_system_zone(struct super_block *sb)
-{
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_desc *gdp;
- ext4_group_t i;
- int flex_size = ext4_flex_bg_size(sbi);
- int ret;
-
- if (!test_opt(sb, BLOCK_VALIDITY)) {
- if (sbi->system_blks.rb_node)
- ext4_release_system_zone(sb);
- return 0;
- }
- if (sbi->system_blks.rb_node)
- return 0;
-
- for (i=0; i < ngroups; i++) {
- if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(sbi, ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1);
- gdp = ext4_get_group_desc(sb, i, NULL);
- ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
- if (ret)
- return ret;
- ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
- if (ret)
- return ret;
- ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group);
- if (ret)
- return ret;
- }
-
- if (test_opt(sb, DEBUG))
- debug_print_tree(sbi);
- return 0;
-}
-
-/* Called when the filesystem is unmounted */
-void ext4_release_system_zone(struct super_block *sb)
-{
- struct ext4_system_zone *entry, *n;
-
- rbtree_postorder_for_each_entry_safe(entry, n,
- &EXT4_SB(sb)->system_blks, node)
- kmem_cache_free(ext4_system_zone_cachep, entry);
-
- EXT4_SB(sb)->system_blks = RB_ROOT;
-}
-
/*
* Returns 1 if the passed-in block region (start_blk,
* start_blk+count) is valid; 0 if some part of the block region
* overlaps with filesystem metadata blocks.
*/
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
- unsigned int count)
+static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
+ struct ext4_system_blocks *system_blks,
+ ext4_fsblk_t start_blk,
+ unsigned int count)
{
struct ext4_system_zone *entry;
- struct rb_node *n = sbi->system_blks.rb_node;
+ struct rb_node *n;
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
(start_blk + count < start_blk) ||
@@ -206,6 +166,11 @@
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
return 0;
}
+
+ if (system_blks == NULL)
+ return 1;
+
+ n = system_blks->root.rb_node;
while (n) {
entry = rb_entry(n, struct ext4_system_zone, node);
if (start_blk + count - 1 < entry->start_blk)
@@ -220,6 +185,178 @@
return 1;
}
+static int ext4_protect_reserved_inode(struct super_block *sb,
+ struct ext4_system_blocks *system_blks,
+ u32 ino)
+{
+ struct inode *inode;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_map_blocks map;
+ u32 i = 0, num;
+ int err = 0, n;
+
+ if ((ino < EXT4_ROOT_INO) ||
+ (ino > le32_to_cpu(sbi->s_es->s_inodes_count)))
+ return -EINVAL;
+ inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+ while (i < num) {
+ map.m_lblk = i;
+ map.m_len = num - i;
+ n = ext4_map_blocks(NULL, inode, &map, 0);
+ if (n < 0) {
+ err = n;
+ break;
+ }
+ if (n == 0) {
+ i++;
+ } else {
+ if (!ext4_data_block_valid_rcu(sbi, system_blks,
+ map.m_pblk, n)) {
+ ext4_error(sb, "blocks %llu-%llu from inode %u "
+ "overlap system zone", map.m_pblk,
+ map.m_pblk + map.m_len - 1, ino);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ err = add_system_zone(system_blks, map.m_pblk, n);
+ if (err < 0)
+ break;
+ i += n;
+ }
+ }
+ iput(inode);
+ return err;
+}
+
+static void ext4_destroy_system_zone(struct rcu_head *rcu)
+{
+ struct ext4_system_blocks *system_blks;
+
+ system_blks = container_of(rcu, struct ext4_system_blocks, rcu);
+ release_system_zone(system_blks);
+ kfree(system_blks);
+}
+
+/*
+ * Build system zone rbtree which is used for block validity checking.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. That's why we first build the rbtree and then
+ * swap it in place.
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_system_blocks *system_blks;
+ struct ext4_group_desc *gdp;
+ ext4_group_t i;
+ int flex_size = ext4_flex_bg_size(sbi);
+ int ret;
+
+ if (!test_opt(sb, BLOCK_VALIDITY)) {
+ if (sbi->system_blks)
+ ext4_release_system_zone(sb);
+ return 0;
+ }
+ if (sbi->system_blks)
+ return 0;
+
+ system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
+ if (!system_blks)
+ return -ENOMEM;
+
+ for (i=0; i < ngroups; i++) {
+ cond_resched();
+ if (ext4_bg_has_super(sb, i) &&
+ ((i < 5) || ((i % flex_size) == 0)))
+ add_system_zone(system_blks,
+ ext4_group_first_block_no(sb, i),
+ ext4_bg_num_gdb(sb, i) + 1);
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ ret = add_system_zone(system_blks,
+ ext4_block_bitmap(sb, gdp), 1);
+ if (ret)
+ goto err;
+ ret = add_system_zone(system_blks,
+ ext4_inode_bitmap(sb, gdp), 1);
+ if (ret)
+ goto err;
+ ret = add_system_zone(system_blks,
+ ext4_inode_table(sb, gdp),
+ sbi->s_itb_per_group);
+ if (ret)
+ goto err;
+ }
+ if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
+ ret = ext4_protect_reserved_inode(sb, system_blks,
+ le32_to_cpu(sbi->s_es->s_journal_inum));
+ if (ret)
+ goto err;
+ }
+
+ /*
+ * System blks rbtree complete, announce it once to prevent racing
+ * with ext4_data_block_valid() accessing the rbtree at the same
+ * time.
+ */
+ rcu_assign_pointer(sbi->system_blks, system_blks);
+
+ if (test_opt(sb, DEBUG))
+ debug_print_tree(sbi);
+ return 0;
+err:
+ release_system_zone(system_blks);
+ kfree(system_blks);
+ return ret;
+}
+
+/*
+ * Called when the filesystem is unmounted or when remounting it with
+ * noblock_validity specified.
+ *
+ * The update of system_blks pointer in this function is protected by
+ * sb->s_umount semaphore. However we have to be careful as we can be
+ * racing with ext4_data_block_valid() calls reading system_blks rbtree
+ * protected only by RCU. So we first clear the system_blks pointer and
+ * then free the rbtree only after RCU grace period expires.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+ struct ext4_system_blocks *system_blks;
+
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ lockdep_is_held(&sb->s_umount));
+ rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
+
+ if (system_blks)
+ call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
+}
+
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_blocks *system_blks;
+ int ret;
+
+ /*
+ * Lock the system zone to prevent it from being released concurrently
+ * when doing a remount that inverts the current "[no]block_validity"
+ * mount option.
+ */
+ rcu_read_lock();
+ system_blks = rcu_dereference(sbi->system_blks);
+ ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
+ count);
+ rcu_read_unlock();
+ return ret;
+}
+
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
@@ -227,6 +364,11 @@
__le32 *bref = p;
unsigned int blk;
+ if (ext4_has_feature_journal(inode->i_sb) &&
+ (inode->i_ino ==
+ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ return 0;
+
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
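The block_validity.c hunks above replace the bare rb_root in the superblock info with an RCU-published struct ext4_system_blocks: the writer builds a complete new tree, publishes it with rcu_assign_pointer() while serialized on sb->s_umount, and frees the old tree only after a grace period, while ext4_data_block_valid() walks the current tree under rcu_read_lock(). A minimal sketch of that publish/read pattern follows; the demo_* identifiers and struct are hypothetical illustrations, not ext4 code.

/* Sketch only -- the demo_* identifiers are hypothetical, not ext4 code. */
#include <linux/kernel.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_blks {
	struct rb_root root;
	struct rcu_head rcu;
};

static struct demo_blks __rcu *demo_ptr;

static void demo_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct demo_blks, rcu));
}

/* Writer: callers serialize among themselves (ext4 uses sb->s_umount). */
static void demo_publish(struct demo_blks *new_blks)
{
	struct demo_blks *old = rcu_dereference_protected(demo_ptr, 1);

	rcu_assign_pointer(demo_ptr, new_blks);
	if (old)
		call_rcu(&old->rcu, demo_free_rcu);	/* freed after grace period */
}

/* Reader: may run concurrently with demo_publish(). */
static bool demo_lookup(void)
{
	struct demo_blks *blks;
	bool found = false;

	rcu_read_lock();
	blks = rcu_dereference(demo_ptr);
	if (blks)
		found = true;	/* real code would search blks->root here */
	rcu_read_unlock();
	return found;
}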
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f93f988..9fdd2b2 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -26,12 +26,16 @@
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/iversion.h>
+#include <linux/unicode.h>
#include "ext4.h"
#include "xattr.h"
static int ext4_dx_readdir(struct file *, struct dir_context *);
/**
+ * is_dx_dir() - check if a directory is using htree indexing
+ * @inode: directory inode
+ *
* Check if the given dir-inode refers to an htree-indexed directory
* (or a directory which could potentially get converted to use htree
* indexing).
@@ -108,10 +112,9 @@
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct buffer_head *bh = NULL;
- int dir_has_error = 0;
struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
if (err && err != -ENOKEY)
return err;
@@ -138,14 +141,12 @@
return err;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
- offset = ctx->pos & (sb->s_blocksize - 1);
-
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
@@ -154,9 +155,18 @@
goto errout;
}
cond_resched();
+ offset = ctx->pos & (sb->s_blocksize - 1);
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err == 0) {
+ /* m_len should never be zero but let's avoid
+ * an infinite loop if it somehow is */
+ if (map.m_len == 0)
+ map.m_len = 1;
+ ctx->pos += map.m_len * sb->s_blocksize;
+ continue;
+ }
if (err > 0) {
pgoff_t index = map.m_pblk >>
(PAGE_SHIFT - inode->i_blkbits);
@@ -175,13 +185,6 @@
}
if (!bh) {
- if (!dir_has_error) {
- EXT4_ERROR_FILE(file, 0,
- "directory contains a "
- "hole at offset %llu",
- (unsigned long long) ctx->pos);
- dir_has_error = 1;
- }
/* corrupt size? Maybe no more blocks to read */
if (ctx->pos > inode->i_blocks << 9)
break;
@@ -191,8 +194,7 @@
/* Check the checksum */
if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(inode,
- (struct ext4_dir_entry *)bh->b_data)) {
+ !ext4_dirblock_csum_verify(inode, bh)) {
EXT4_ERROR_FILE(file, 0, "directory fails checksum "
"at offset %llu",
(unsigned long long)ctx->pos);
@@ -245,7 +247,7 @@
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- if (!ext4_encrypted_inode(inode)) {
+ if (!IS_ENCRYPTED(inode)) {
if (!dir_emit(ctx, de->name,
de->name_len,
le32_to_cpu(de->inode),
@@ -283,9 +285,7 @@
done:
err = 0;
errout:
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fstr);
-#endif
brelse(bh);
return err;
}
@@ -613,7 +613,7 @@
static int ext4_dir_open(struct inode * inode, struct file * filp)
{
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
return 0;
}
@@ -662,3 +662,51 @@
.open = ext4_dir_open,
.release = ext4_release_dir,
};
+
+#ifdef CONFIG_UNICODE
+static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
+{
+ struct qstr qstr = {.name = str, .len = len };
+ struct inode *inode = dentry->d_parent->d_inode;
+
+ if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
+ if (len != name->len)
+ return -1;
+ return memcmp(str, name->name, len);
+ }
+
+ return ext4_ci_compare(inode, name, &qstr, false);
+}
+
+static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
+{
+ const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
+ const struct unicode_map *um = sbi->s_encoding;
+ unsigned char *norm;
+ int len, ret = 0;
+
+ if (!IS_CASEFOLDED(dentry->d_inode) || !um)
+ return 0;
+
+ norm = kmalloc(PATH_MAX, GFP_ATOMIC);
+ if (!norm)
+ return -ENOMEM;
+
+ len = utf8_casefold(um, str, norm, PATH_MAX);
+ if (len < 0) {
+ if (ext4_has_strict_mode(sbi))
+ ret = -EINVAL;
+ goto out;
+ }
+ str->hash = full_name_hash(dentry, norm, len);
+out:
+ kfree(norm);
+ return ret;
+}
+
+const struct dentry_operations ext4_dentry_ops = {
+ .d_hash = ext4_d_hash,
+ .d_compare = ext4_d_compare,
+};
+#endif
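The ext4_d_hash()/ext4_d_compare() callbacks above only take effect once the VFS is told to route dentry hashing and comparison through them. A hedged sketch of that wiring is below; demo_enable_ci_lookups() is a hypothetical helper, though ext4 performs the equivalent sb->s_d_op assignment from its mount path when a filename encoding is loaded.

/* Sketch only -- demo_enable_ci_lookups() is hypothetical. */
#include <linux/dcache.h>
#include <linux/fs.h>

extern const struct dentry_operations ext4_dentry_ops;

static void demo_enable_ci_lookups(struct super_block *sb)
{
	/* Dentries created under this superblock now hash and compare
	 * names through ext4_d_hash()/ext4_d_compare(). */
	sb->s_d_op = &ext4_dentry_ops;
}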
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5cfb1e2..03db3e7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -40,20 +40,11 @@
#include <linux/compat.h>
#endif
-#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
#include <linux/fscrypt.h>
+#include <linux/fsverity.h>
#include <linux/compiler.h>
-/* Until this gets included into linux/compiler-gcc.h */
-#ifndef __nonstring
-#if defined(GCC_VERSION) && (GCC_VERSION >= 80000)
-#define __nonstring __attribute__((nonstring))
-#else
-#define __nonstring
-#endif
-#endif
-
/*
* The fourth extended filesystem constants/structures
*/
@@ -195,6 +186,14 @@
};
/*
+ * Block validity checking, system zone rbtree.
+ */
+struct ext4_system_blocks {
+ struct rb_root root;
+ struct rcu_head rcu;
+};
+
+/*
* Flags for ext4_io_end->flags
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
@@ -294,6 +293,9 @@
~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Fill in the low bits to get the last block of the cluster */
+#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \
+ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
/* Get the cluster offset */
#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
@@ -405,14 +407,16 @@
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
+#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
@@ -427,14 +431,18 @@
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
+ EXT4_PROJINHERIT_FL))
/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -472,6 +480,7 @@
EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
@@ -517,6 +526,7 @@
CHECK_FLAG_VALUE(TOPDIR);
CHECK_FLAG_VALUE(HUGE_FILE);
CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
@@ -628,6 +638,7 @@
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040
/*
* ioctl commands
@@ -653,6 +664,10 @@
#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
+/* ioctl codes 19--39 are reserved for fscrypt */
+#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
+#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
+#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
@@ -666,6 +681,16 @@
#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
+/*
+ * Flags returned by EXT4_IOC_GETSTATE
+ *
+ * We only expose to userspace a subset of the state flags in
+ * i_state_flags
+ */
+#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001
+#define EXT4_STATE_FLAG_NEW 0x00000002
+#define EXT4_STATE_FLAG_NEWENTRY 0x00000004
+#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -683,6 +708,12 @@
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
#endif
+/*
+ * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag.
+ * It indicates that the entry in extent status cache is for a hole.
+ */
+#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000
+
/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
@@ -812,31 +843,20 @@
static inline void ext4_decode_extra_time(struct timespec64 *time,
__le32 extra)
{
- if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
-
-#if 1
- /* Handle legacy encoding of pre-1970 dates with epoch
- * bits 1,1. (This backwards compatibility may be removed
- * at the discretion of the ext4 developers.)
- */
- u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
- if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
- extra_bits = 0;
- time->tv_sec += extra_bits << 32;
-#else
+ if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
-#endif
- }
time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
do { \
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
+ (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
(raw_inode)->xtime ## _extra = \
ext4_encode_extra_time(&(inode)->xtime); \
} \
+ else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
} while (0)
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
@@ -1030,6 +1050,9 @@
ext4_lblk_t i_da_metadata_calc_last_lblock;
int i_da_metadata_calc_len;
+ /* pending cluster reservations for bigalloc file systems */
+ struct ext4_pending_tree i_pending_tree;
+
/* on-disk additional length */
__u16 i_extra_isize;
@@ -1316,7 +1339,9 @@
__u8 s_first_error_time_hi;
__u8 s_last_error_time_hi;
__u8 s_pad[2];
- __le32 s_reserved[96]; /* Padding to the end of the block */
+ __le16 s_encoding; /* Filename charset encoding */
+ __le16 s_encoding_flags; /* Filename charset encoding flags */
+ __le32 s_reserved[95]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1331,7 +1356,7 @@
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
EXT4_MF_TEST_DUMMY_ENCRYPTION))
#else
@@ -1341,6 +1366,16 @@
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
+#define EXT4_ENC_UTF8_12_1 1
+
+/*
+ * Flags for ext4_sb_info.s_encoding_flags.
+ */
+#define EXT4_ENC_STRICT_MODE_FL (1 << 0)
+
+#define ext4_has_strict_mode(sbi) \
+ (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
+
/*
* fourth extended-fs super-block data in memory
*/
@@ -1390,6 +1425,10 @@
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
+#ifdef CONFIG_UNICODE
+ struct unicode_map *s_encoding;
+ __u16 s_encoding_flags;
+#endif
/* Journaling */
struct journal_s *s_journal;
@@ -1406,7 +1445,7 @@
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct rb_root system_blks;
+ struct ext4_system_blocks __rcu *system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1545,6 +1584,7 @@
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
+ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1595,9 +1635,12 @@
#define EXT4_SB(sb) (sb)
#endif
-/*
- * Returns true if the inode is inode is encrypted
- */
+static inline bool ext4_verity_in_progress(struct inode *inode)
+{
+ return IS_ENABLED(CONFIG_FS_VERITY) &&
+ ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+}
+
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1620,6 +1663,10 @@
#define EXT4_GOOD_OLD_INODE_SIZE 128
+#define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN)
+#define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX
+#define EXT4_TIMESTAMP_MIN S32_MIN
+
/*
* Feature set definitions
*/
@@ -1650,6 +1697,7 @@
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
+#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1666,6 +1714,9 @@
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000
+
+extern void ext4_update_dynamic_rev(struct super_block *sb);
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
@@ -1675,6 +1726,7 @@
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_compat |= \
cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
@@ -1692,6 +1744,7 @@
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
@@ -1709,6 +1762,7 @@
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
+ ext4_update_dynamic_rev(sb); \
EXT4_SB(sb)->s_es->s_feature_incompat |= \
cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
@@ -1738,6 +1792,7 @@
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
@@ -1754,6 +1809,7 @@
EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
+EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1781,6 +1837,7 @@
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CASEFOLD | \
EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
@@ -1793,7 +1850,8 @@
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
- EXT4_FEATURE_RO_COMPAT_PROJECT)
+ EXT4_FEATURE_RO_COMPAT_PROJECT |\
+ EXT4_FEATURE_RO_COMPAT_VERITY)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -2022,7 +2080,6 @@
BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver)!=sizeof(desc.ctx));
desc.shash.tfm = sbi->s_chksum_driver;
- desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
BUG_ON(crypto_shash_update(&desc.shash, address, length));
@@ -2056,9 +2113,12 @@
const struct qstr *usr_fname;
struct fscrypt_str disk_name;
struct dx_hash_info hinfo;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
+#ifdef CONFIG_UNICODE
+ struct fscrypt_str cf_name;
+#endif
};
#define fname_name(p) ((p)->disk_name.name)
@@ -2284,29 +2344,62 @@
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-static inline bool ext4_encrypted_inode(struct inode *inode)
+#ifdef CONFIG_UNICODE
+extern void ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct fscrypt_str *fname);
+#endif
+
+#ifdef CONFIG_FS_ENCRYPTION
+static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
+ const struct fscrypt_name *src)
{
- return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ memset(dst, 0, sizeof(*dst));
+
+ dst->usr_fname = src->usr_fname;
+ dst->disk_name = src->disk_name;
+ dst->hinfo.hash = src->hash;
+ dst->hinfo.minor_hash = src->minor_hash;
+ dst->crypto_buf = src->crypto_buf;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup, struct ext4_filename *fname)
+ const struct qstr *iname,
+ int lookup,
+ struct ext4_filename *fname)
{
struct fscrypt_name name;
int err;
- memset(fname, 0, sizeof(struct ext4_filename));
-
err = fscrypt_setup_filename(dir, iname, lookup, &name);
+ if (err)
+ return err;
- fname->usr_fname = name.usr_fname;
- fname->disk_name = name.disk_name;
- fname->hinfo.hash = name.hash;
- fname->hinfo.minor_hash = name.minor_hash;
- fname->crypto_buf = name.crypto_buf;
- return err;
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+ ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
+ return 0;
+}
+
+static inline int ext4_fname_prepare_lookup(struct inode *dir,
+ struct dentry *dentry,
+ struct ext4_filename *fname)
+{
+ struct fscrypt_name name;
+ int err;
+
+ err = fscrypt_prepare_lookup(dir, dentry, &name);
+ if (err)
+ return err;
+
+ ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+ ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
+#endif
+ return 0;
}
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
@@ -2319,20 +2412,44 @@
fname->crypto_buf.name = NULL;
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
+
+#ifdef CONFIG_UNICODE
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
}
-#else
+#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup, struct ext4_filename *fname)
+ const struct qstr *iname,
+ int lookup,
+ struct ext4_filename *fname)
{
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
+
+#ifdef CONFIG_UNICODE
+ ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
+
return 0;
}
-static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
+static inline int ext4_fname_prepare_lookup(struct inode *dir,
+ struct dentry *dentry,
+ struct ext4_filename *fname)
+{
+ return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname);
+}
+
+static inline void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+#ifdef CONFIG_UNICODE
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
#endif
+}
+#endif /* !CONFIG_FS_ENCRYPTION */
/* dir.c */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -2380,8 +2497,8 @@
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
/* hash.c */
-extern int ext4fs_dirhash(const char *name, int len, struct
- dx_hash_info *hinfo);
+extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
+ struct dx_hash_info *hinfo);
/* ialloc.c */
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
@@ -2459,8 +2576,19 @@
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
-extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
+typedef enum {
+ EXT4_IGET_NORMAL = 0,
+ EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
+ EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+} ext4_iget_flags;
+
+extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+ ext4_iget_flags flags, const char *function,
+ unsigned int line);
+
+#define ext4_iget(sb, ino, flags) \
+ __ext4_iget((sb), (ino), (flags), __func__, __LINE__)
+
extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
@@ -2484,10 +2612,11 @@
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
-extern int ext4_page_mkwrite(struct vm_fault *vmf);
-extern int ext4_filemap_fault(struct vm_fault *vmf);
+extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
+extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
+extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
@@ -2511,8 +2640,8 @@
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
-extern int ext4_dirent_csum_verify(struct inode *inode,
- struct ext4_dir_entry *dirent);
+extern int ext4_dirblock_csum_verify(struct inode *inode,
+ struct buffer_head *bh);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -2542,6 +2671,8 @@
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
+ sector_t block, int op_flags);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
@@ -2663,7 +2794,6 @@
#endif
-extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -2964,6 +3094,10 @@
/* dir.c */
extern const struct file_operations ext4_dir_operations;
+#ifdef CONFIG_UNICODE
+extern const struct dentry_operations ext4_dentry_ops;
+#endif
+
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
@@ -3008,11 +3142,11 @@
extern int ext4_read_inline_dir(struct file *filp,
struct dir_context *ctx,
int *has_inline_data);
-extern int htree_inlinedir_to_tree(struct file *dir_file,
- struct inode *dir, ext4_lblk_t block,
- struct dx_hash_info *hinfo,
- __u32 start_hash, __u32 start_minor_hash,
- int *has_inline_data);
+extern int ext4_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir,
@@ -3051,11 +3185,14 @@
struct ext4_dir_entry_2 *de,
int blocksize, int csum_size,
unsigned int parent_ino, int dotdot_real_len);
-extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
- unsigned int blocksize);
-extern int ext4_handle_dirty_dirent_node(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh);
+extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
+ unsigned int blocksize);
+extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh);
+extern int ext4_ci_compare(const struct inode *parent,
+ const struct qstr *fname,
+ const struct qstr *entry, bool quick);
+
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
@@ -3078,6 +3215,8 @@
extern int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages, bool is_readahead);
+extern int __init ext4_init_post_read_processing(void);
+extern void ext4_exit_post_read_processing(void);
/* symlink.c */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
@@ -3143,13 +3282,12 @@
int flags);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_range(struct inode *inode,
- ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+extern int ext4_get_es_cache(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
@@ -3157,6 +3295,7 @@
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
+extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3187,6 +3326,9 @@
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* verity.c */
+extern const struct fsverity_operations ext4_verityops;
+
/*
* Add new method to test whether block and inode bitmaps are properly
* initialized. With uninit_bg reading the block from disk is not enough
@@ -3238,6 +3380,19 @@
extern const struct iomap_ops ext4_iomap_ops;
+static inline int ext4_buffer_uptodate(struct buffer_head *bh)
+{
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out data in the block. In this case, we don't
+ * want to read the block from disk again: the buffer still
+ * holds the newest copy, and a read would only bring back
+ * the stale on-disk data.
+ */
+ if (!buffer_uptodate(bh) && buffer_write_io_error(bh))
+ set_buffer_uptodate(bh);
+ return buffer_uptodate(bh);
+}
+
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
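One behavioral change worth calling out in the ext4.h hunks above: when an on-disk inode has no room for the xtime_extra field, EXT4_INODE_SET_XTIME now clamps the 64-bit seconds value to the signed 32-bit range instead of letting it truncate. A small sketch of just that clamp, using a hypothetical demo_encode_sec() helper name:

/* Sketch only -- demo_encode_sec() is a hypothetical helper. */
#include <linux/kernel.h>	/* clamp_t(), S32_MIN, S32_MAX */
#include <linux/types.h>	/* __le32, s64 */
#include <asm/byteorder.h>	/* cpu_to_le32() */

static __le32 demo_encode_sec(s64 tv_sec)
{
	/* A far-future or pre-1902 timestamp is pinned to S32_MAX/S32_MIN
	 * rather than wrapping when it lands in a 32-bit on-disk field. */
	return cpu_to_le32(clamp_t(int32_t, tv_sec, S32_MIN, S32_MAX));
}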
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index adf6668..98bd0e9 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -120,6 +120,19 @@
};
/*
+ * Used to record a portion of a cluster found at the beginning or end
+ * of an extent while traversing the extent tree during space removal.
+ * A partial cluster may be removed if it does not contain blocks shared
+ * with extents that aren't being deleted (tofree state). Otherwise,
+ * it cannot be removed (nofree state).
+ */
+struct partial_cluster {
+ ext4_fsblk_t pclu; /* physical cluster number */
+ ext4_lblk_t lblk; /* logical block number within logical cluster */
+ enum {initial, tofree, nofree} state;
+};
+
+/*
* structure for external API
*/
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 15b6dd7..ef8fcf7 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -361,20 +361,20 @@
}
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
- struct inode *inode)
+ struct inode *inode, loff_t start_byte, loff_t length)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_inode_add_write(handle,
- EXT4_I(inode)->jinode);
+ return jbd2_journal_inode_ranged_write(handle,
+ EXT4_I(inode)->jinode, start_byte, length);
return 0;
}
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
- struct inode *inode)
+ struct inode *inode, loff_t start_byte, loff_t length)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_inode_add_wait(handle,
- EXT4_I(inode)->jinode);
+ return jbd2_journal_inode_ranged_wait(handle,
+ EXT4_I(inode)->jinode, start_byte, length);
return 0;
}
@@ -384,7 +384,7 @@
{
struct ext4_inode_info *ei = EXT4_I(inode);
- if (ext4_handle_valid(handle)) {
+ if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
ei->i_datasync_tid = handle->h_transaction->t_tid;
@@ -411,7 +411,7 @@
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
}
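The ext4_jbd2.h hunk above moves the data-journalling helpers from whole-inode tracking to explicit byte ranges. A hedged sketch of how a write path would pass the range it actually dirtied; demo_journal_dirty_range() is a hypothetical wrapper, not an ext4 function:

/* Sketch only -- demo_journal_dirty_range() is hypothetical. */
#include "ext4_jbd2.h"

static int demo_journal_dirty_range(handle_t *handle, struct inode *inode,
				    loff_t pos, loff_t len)
{
	/* Journal only the bytes this write dirtied, not the whole inode. */
	return ext4_jbd2_inode_add_write(handle, inode, pos, len);
}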
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72a361d..fb0f99d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -518,10 +518,14 @@
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
- if (err)
- goto errout;
+ if (!ext4_has_feature_journal(inode->i_sb) ||
+ (inode->i_ino !=
+ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
+ err = __ext4_ext_check(function, line, inode,
+ ext_block_hdr(bh), depth, pblk);
+ if (err)
+ goto errout;
+ }
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
@@ -1035,6 +1039,7 @@
__le32 border;
ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
int err = 0;
+ size_t ext_size = 0;
/* make decision: where to split? */
/* FIXME: now decision is simplest: at current extent */
@@ -1126,6 +1131,10 @@
le16_add_cpu(&neh->eh_entries, m);
}
+ /* zero out unused area in the extent block */
+ ext_size = sizeof(struct ext4_extent_header) +
+ sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
+ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1205,6 +1214,11 @@
sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
+ /* zero out unused area in the extent block */
+ ext_size = sizeof(struct ext4_extent_header) +
+ (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
+ memset(bh->b_data + ext_size, 0,
+ inode->i_sb->s_blocksize - ext_size);
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -1270,6 +1284,7 @@
ext4_fsblk_t newblock, goal = 0;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
int err = 0;
+ size_t ext_size = 0;
/* Try to prepend new index to old one */
if (ext_depth(inode))
@@ -1295,9 +1310,11 @@
goto out;
}
+ ext_size = sizeof(EXT4_I(inode)->i_data);
/* move top-level index/leaf into new block */
- memmove(bh->b_data, EXT4_I(inode)->i_data,
- sizeof(EXT4_I(inode)->i_data));
+ memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
+ /* zero out unused area in the extent block */
+ memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
/* set size of new block */
neh = ext_block_hdr(bh);
@@ -2298,6 +2315,52 @@
return err;
}
+static int ext4_fill_es_cache_info(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
+{
+ ext4_lblk_t next, end = block + num - 1;
+ struct extent_status es;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
+ unsigned int flags;
+ int err;
+
+ while (block <= end) {
+ next = 0;
+ flags = 0;
+ if (!ext4_es_lookup_extent(inode, block, &next, &es))
+ break;
+ if (ext4_es_is_unwritten(&es))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ if (ext4_es_is_delayed(&es))
+ flags |= (FIEMAP_EXTENT_DELALLOC |
+ FIEMAP_EXTENT_UNKNOWN);
+ if (ext4_es_is_hole(&es))
+ flags |= EXT4_FIEMAP_EXTENT_HOLE;
+ if (next == 0)
+ flags |= FIEMAP_EXTENT_LAST;
+ if (flags & (FIEMAP_EXTENT_DELALLOC|
+ EXT4_FIEMAP_EXTENT_HOLE))
+ es.es_pblk = 0;
+ else
+ es.es_pblk = ext4_es_pblock(&es);
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
+ flags);
+ if (next == 0)
+ break;
+ block = next;
+ if (err < 0)
+ return err;
+ if (err == 1)
+ return 0;
+ }
+ return 0;
+}
+
+
/*
* ext4_ext_determine_hole - determine hole around given block
* @inode: inode we lookup in
@@ -2351,8 +2414,8 @@
{
struct extent_status es;
- ext4_es_find_delayed_extent_range(inode, hole_start,
- hole_start + hole_len - 1, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+ hole_start + hole_len - 1, &es);
if (es.es_len) {
/* There's delayed extent containing lblock? */
if (es.es_lblk <= hole_start)
@@ -2490,106 +2553,157 @@
return 0;
}
+/*
+ * ext4_rereserve_cluster - increment the reserved cluster count when
+ * freeing a cluster with a pending reservation
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in cluster to be reserved
+ *
+ * Increments the reserved cluster count and adjusts quota in a bigalloc
+ * file system when freeing a partial cluster containing at least one
+ * delayed and unwritten block. A partial cluster meeting that
+ * requirement will have a pending reservation. If so, the
+ * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
+ * defer reserved and allocated space accounting to a subsequent call
+ * to this function.
+ */
+static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
+
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks++;
+ percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ percpu_counter_add(&sbi->s_freeclusters_counter, 1);
+ ext4_remove_pending(inode, lblk);
+}
+
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
- long long *partial_cluster,
+ struct partial_cluster *partial,
ext4_lblk_t from, ext4_lblk_t to)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
- ext4_fsblk_t pblk;
- int flags = get_default_free_blocks_flags(inode);
+ ext4_fsblk_t last_pblk, pblk;
+ ext4_lblk_t num;
+ int flags;
+
+ /* only extent tail removal is allowed */
+ if (from < le32_to_cpu(ex->ee_block) ||
+ to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
+ ext4_error(sbi->s_sb,
+ "strange request: removal(2) %u-%u from %u:%u",
+ from, to, le32_to_cpu(ex->ee_block), ee_len);
+ return 0;
+ }
+
+#ifdef EXTENTS_STATS
+ spin_lock(&sbi->s_ext_stats_lock);
+ sbi->s_ext_blocks += ee_len;
+ sbi->s_ext_extents++;
+ if (ee_len < sbi->s_ext_min)
+ sbi->s_ext_min = ee_len;
+ if (ee_len > sbi->s_ext_max)
+ sbi->s_ext_max = ee_len;
+ if (ext_depth(inode) > sbi->s_depth_max)
+ sbi->s_depth_max = ext_depth(inode);
+ spin_unlock(&sbi->s_ext_stats_lock);
+#endif
+
+ trace_ext4_remove_blocks(inode, ex, from, to, partial);
+
+ /*
+ * if we have a partial cluster, and it's different from the
+ * cluster of the last block in the extent, we free it
+ */
+ last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
+
+ if (partial->state != initial &&
+ partial->pclu != EXT4_B2C(sbi, last_pblk)) {
+ if (partial->state == tofree) {
+ flags = get_default_free_blocks_flags(inode);
+ if (ext4_is_pending(inode, partial->lblk))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_C2B(sbi, partial->pclu),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, partial->lblk);
+ }
+ partial->state = initial;
+ }
+
+ num = le32_to_cpu(ex->ee_block) + ee_len - from;
+ pblk = ext4_ext_pblock(ex) + ee_len - num;
+
+ /*
+ * We free the partial cluster at the end of the extent (if any),
+ * unless the cluster is used by another extent (partial_cluster
+ * state is nofree). If a partial cluster exists here, it must be
+ * shared with the last block in the extent.
+ */
+ flags = get_default_free_blocks_flags(inode);
+
+ /* partial, left end cluster aligned, right end unaligned */
+ if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
+ (EXT4_LBLK_CMASK(sbi, to) >= from) &&
+ (partial->state != nofree)) {
+ if (ext4_is_pending(inode, to))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
+ ext4_free_blocks(handle, inode, NULL,
+ EXT4_PBLK_CMASK(sbi, last_pblk),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, to);
+ partial->state = initial;
+ flags = get_default_free_blocks_flags(inode);
+ }
+
+ flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
/*
* For bigalloc file systems, we never free a partial cluster
- * at the beginning of the extent. Instead, we make a note
- * that we tried freeing the cluster, and check to see if we
+ * at the beginning of the extent. Instead, we check to see if we
* need to free it on a subsequent call to ext4_remove_blocks,
* or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
*/
flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
+ ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
- trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster);
+ /* reset the partial cluster if we've freed past it */
+ if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
+ partial->state = initial;
+
/*
- * If we have a partial cluster, and it's different from the
- * cluster of the last block, we need to explicitly free the
- * partial cluster here.
+ * If we've freed the entire extent but the beginning is not left
+ * cluster aligned and is not marked as ineligible for freeing we
+ * record the partial cluster at the beginning of the extent. It
+ * wasn't freed by the preceding ext4_free_blocks() call, and we
+ * need to look farther to the left to determine if it's to be freed
+ * (not shared with another extent). Else, reset the partial
+ * cluster - we're either done freeing or the beginning of the
+ * extent is left cluster aligned.
*/
- pblk = ext4_ext_pblock(ex) + ee_len - 1;
- if (*partial_cluster > 0 &&
- *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
- ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, *partial_cluster),
- sbi->s_cluster_ratio, flags);
- *partial_cluster = 0;
- }
-
-#ifdef EXTENTS_STATS
- {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- spin_lock(&sbi->s_ext_stats_lock);
- sbi->s_ext_blocks += ee_len;
- sbi->s_ext_extents++;
- if (ee_len < sbi->s_ext_min)
- sbi->s_ext_min = ee_len;
- if (ee_len > sbi->s_ext_max)
- sbi->s_ext_max = ee_len;
- if (ext_depth(inode) > sbi->s_depth_max)
- sbi->s_depth_max = ext_depth(inode);
- spin_unlock(&sbi->s_ext_stats_lock);
- }
-#endif
- if (from >= le32_to_cpu(ex->ee_block)
- && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
- /* tail removal */
- ext4_lblk_t num;
- long long first_cluster;
-
- num = le32_to_cpu(ex->ee_block) + ee_len - from;
- pblk = ext4_ext_pblock(ex) + ee_len - num;
- /*
- * Usually we want to free partial cluster at the end of the
- * extent, except for the situation when the cluster is still
- * used by any other extent (partial_cluster is negative).
- */
- if (*partial_cluster < 0 &&
- *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1))
- flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
-
- ext_debug("free last %u blocks starting %llu partial %lld\n",
- num, pblk, *partial_cluster);
- ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
- /*
- * If the block range to be freed didn't start at the
- * beginning of a cluster, and we removed the entire
- * extent and the cluster is not used by any other extent,
- * save the partial cluster here, since we might need to
- * delete if we determine that the truncate or punch hole
- * operation has removed all of the blocks in the cluster.
- * If that cluster is used by another extent, preserve its
- * negative value so it isn't freed later on.
- *
- * If the whole extent wasn't freed, we've reached the
- * start of the truncated/punched region and have finished
- * removing blocks. If there's a partial cluster here it's
- * shared with the remainder of the extent and is no longer
- * a candidate for removal.
- */
- if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) {
- first_cluster = (long long) EXT4_B2C(sbi, pblk);
- if (first_cluster != -*partial_cluster)
- *partial_cluster = first_cluster;
- } else {
- *partial_cluster = 0;
+ if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
+ if (partial->state == initial) {
+ partial->pclu = EXT4_B2C(sbi, pblk);
+ partial->lblk = from;
+ partial->state = tofree;
}
- } else
- ext4_error(sbi->s_sb, "strange request: removal(2) "
- "%u-%u from %u:%u",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
+ } else {
+ partial->state = initial;
+ }
+
return 0;
}
-
/*
* ext4_ext_rm_leaf() Removes the extents associated with the
* blocks appearing between "start" and "end". Both "start"
@@ -2608,7 +2722,7 @@
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- long long *partial_cluster,
+ struct partial_cluster *partial,
ext4_lblk_t start, ext4_lblk_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2640,7 +2754,7 @@
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
- trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
+ trace_ext4_ext_rm_leaf(inode, start, ex, partial);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
@@ -2671,8 +2785,8 @@
*/
if (sbi->s_cluster_ratio > 1) {
pblk = ext4_ext_pblock(ex);
- *partial_cluster =
- -(long long) EXT4_B2C(sbi, pblk);
+ partial->pclu = EXT4_B2C(sbi, pblk);
+ partial->state = nofree;
}
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2714,8 +2828,7 @@
if (err)
goto out;
- err = ext4_remove_blocks(handle, inode, ex, partial_cluster,
- a, b);
+ err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
if (err)
goto out;
@@ -2769,18 +2882,23 @@
* If there's a partial cluster and at least one extent remains in
* the leaf, free the partial cluster if it isn't shared with the
* current extent. If it is shared with the current extent
- * we zero partial_cluster because we've reached the start of the
+ * we reset the partial cluster because we've reached the start of the
* truncated/punched region and we're done removing blocks.
*/
- if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) {
+ if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
- if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) {
+ if (partial->pclu != EXT4_B2C(sbi, pblk)) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ if (ext4_is_pending(inode, partial->lblk))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, *partial_cluster),
- sbi->s_cluster_ratio,
- get_default_free_blocks_flags(inode));
+ EXT4_C2B(sbi, partial->pclu),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, partial->lblk);
}
- *partial_cluster = 0;
+ partial->state = initial;
}
/* if this leaf is free, then we should
@@ -2819,10 +2937,14 @@
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int depth = ext_depth(inode);
struct ext4_ext_path *path = NULL;
- long long partial_cluster = 0;
+ struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ partial.pclu = 0;
+ partial.lblk = 0;
+ partial.state = initial;
+
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
@@ -2882,8 +3004,8 @@
*/
if (sbi->s_cluster_ratio > 1) {
pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
- partial_cluster =
- -(long long) EXT4_B2C(sbi, pblk);
+ partial.pclu = EXT4_B2C(sbi, pblk);
+ partial.state = nofree;
}
/*
@@ -2897,23 +3019,27 @@
if (err < 0)
goto out;
- } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) {
+ } else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
+ partial.state == initial) {
/*
- * If there's an extent to the right its first cluster
- * contains the immediate right boundary of the
- * truncated/punched region. Set partial_cluster to
- * its negative value so it won't be freed if shared
- * with the current extent. The end < ee_block case
- * is handled in ext4_ext_rm_leaf().
+ * If we're punching, there's an extent to the right.
+ * If the partial cluster hasn't been set, set it to
+ * that extent's first cluster and its state to nofree
+ * so it won't be freed should it contain blocks to be
+ * removed. If it's already set (tofree/nofree), we're
+ * retrying and keep the original partial cluster info
+ * so a cluster marked tofree as a result of earlier
+ * extent removal is not lost.
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
&ex);
if (err)
goto out;
- if (pblk)
- partial_cluster =
- -(long long) EXT4_B2C(sbi, pblk);
+ if (pblk) {
+ partial.pclu = EXT4_B2C(sbi, pblk);
+ partial.state = nofree;
+ }
}
}
/*
@@ -2948,8 +3074,7 @@
if (i == depth) {
/* this is leaf block */
err = ext4_ext_rm_leaf(handle, inode, path,
- &partial_cluster, start,
- end);
+ &partial, start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -3021,21 +3146,24 @@
}
}
- trace_ext4_ext_remove_space_done(inode, start, end, depth,
- partial_cluster, path->p_hdr->eh_entries);
+ trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
+ path->p_hdr->eh_entries);
/*
- * If we still have something in the partial cluster and we have removed
- * even the first extent, then we should free the blocks in the partial
- * cluster as well. (This code will only run when there are no leaves
- * to the immediate left of the truncated/punched region.)
+ * if there's a partial cluster and we have removed the first extent
+ * in the file, then we also free the partial cluster, if any
*/
- if (partial_cluster > 0 && err == 0) {
- /* don't zero partial_cluster since it's not used afterwards */
+ if (partial.state == tofree && err == 0) {
+ int flags = get_default_free_blocks_flags(inode);
+
+ if (ext4_is_pending(inode, partial.lblk))
+ flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
ext4_free_blocks(handle, inode, NULL,
- EXT4_C2B(sbi, partial_cluster),
- sbi->s_cluster_ratio,
- get_default_free_blocks_flags(inode));
+ EXT4_C2B(sbi, partial.pclu),
+ sbi->s_cluster_ratio, flags);
+ if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
+ ext4_rereserve_cluster(inode, partial.lblk);
+ partial.state = initial;
}
/* TODO: flexible tree reduction should be here */
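Editor's note: the hunks above drop the old signed partial_cluster value (where a negative number meant "known, but must not be freed") in favor of a small tracking structure with an explicit state. Its definition sits earlier in fs/ext4/extents.c and is not part of this excerpt; the sketch below is a reconstruction for readers following the diff, with stand-in typedefs for the kernel block types.

    /* Sketch only -- typedefs stand in for the kernel's block number types. */
    typedef unsigned long long ext4_fsblk_t;    /* physical block number */
    typedef unsigned int ext4_lblk_t;           /* logical block number */

    struct partial_cluster {
        ext4_fsblk_t pclu;      /* physical cluster number being tracked */
        ext4_lblk_t lblk;       /* logical block within that cluster */
        enum {initial, tofree, nofree} state;
    };

    /*
     * initial - nothing is being tracked yet
     * tofree  - the tracked cluster should be freed unless a neighboring
     *           extent turns out to share it
     * nofree  - the tracked cluster is shared and must be kept
     */

The tofree/nofree distinction is what lets ext4_ext_rm_leaf() and ext4_ext_remove_space() above decide whether the cluster at the edge of the removed range is actually released.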
@@ -3569,7 +3697,7 @@
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
max_zeroout = 0;
/*
@@ -3731,8 +3859,8 @@
* illegal.
*/
if (ee_block != map->m_lblk || ee_len > map->m_len) {
-#ifdef EXT4_DEBUG
- ext4_warning("Inode (%ld) finished: extent logical block %llu,"
+#ifdef CONFIG_EXT4_DEBUG
+ ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
" len %u; IO logical block %llu, len %u",
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
@@ -3819,114 +3947,6 @@
return ext4_mark_inode_dirty(handle, inode);
}
-/**
- * ext4_find_delalloc_range: find delayed allocated block in the given range.
- *
- * Return 1 if there is a delalloc block in the range, otherwise 0.
- */
-int ext4_find_delalloc_range(struct inode *inode,
- ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end)
-{
- struct extent_status es;
-
- ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
- if (es.es_len == 0)
- return 0; /* there is no delay extent in this tree */
- else if (es.es_lblk <= lblk_start &&
- lblk_start < es.es_lblk + es.es_len)
- return 1;
- else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
- return 1;
- else
- return 0;
-}
-
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
-{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- ext4_lblk_t lblk_start, lblk_end;
- lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
- lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
-
- return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
-}
-
-/**
- * Determines how many complete clusters (out of those specified by the 'map')
- * are under delalloc and were reserved quota for.
- * This function is called when we are writing out the blocks that were
- * originally written with their allocation delayed, but then the space was
- * allocated using fallocate() before the delayed allocation could be resolved.
- * The cases to look for are:
- * ('=' indicated delayed allocated blocks
- * '-' indicates non-delayed allocated blocks)
- * (a) partial clusters towards beginning and/or end outside of allocated range
- * are not delalloc'ed.
- * Ex:
- * |----c---=|====c====|====c====|===-c----|
- * |++++++ allocated ++++++|
- * ==> 4 complete clusters in above example
- *
- * (b) partial cluster (outside of allocated range) towards either end is
- * marked for delayed allocation. In this case, we will exclude that
- * cluster.
- * Ex:
- * |----====c========|========c========|
- * |++++++ allocated ++++++|
- * ==> 1 complete clusters in above example
- *
- * Ex:
- * |================c================|
- * |++++++ allocated ++++++|
- * ==> 0 complete clusters in above example
- *
- * The ext4_da_update_reserve_space will be called only if we
- * determine here that there were some "entire" clusters that span
- * this 'allocated' range.
- * In the non-bigalloc case, this function will just end up returning num_blks
- * without ever calling ext4_find_delalloc_range.
- */
-static unsigned int
-get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
- unsigned int num_blks)
-{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
- ext4_lblk_t lblk_from, lblk_to, c_offset;
- unsigned int allocated_clusters = 0;
-
- alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
- alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
-
- /* max possible clusters for this allocation */
- allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
-
- trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
-
- /* Check towards left side */
- c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
- if (c_offset) {
- lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
- lblk_to = lblk_from + c_offset - 1;
-
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
- allocated_clusters--;
- }
-
- /* Now check towards right. */
- c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
- if (allocated_clusters && c_offset) {
- lblk_from = lblk_start + num_blks;
- lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
-
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
- allocated_clusters--;
- }
-
- return allocated_clusters;
-}
-
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
@@ -4094,37 +4114,10 @@
} else
allocated = ret;
map->m_flags |= EXT4_MAP_NEW;
- /*
- * if we allocated more blocks than requested
- * we need to make sure we unmap the extra block
- * allocated. The actual needed block will get
- * unmapped later when we find the buffer_head marked
- * new.
- */
- if (allocated > map->m_len) {
- clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len,
- allocated - map->m_len);
+ if (allocated > map->m_len)
allocated = map->m_len;
- }
map->m_len = allocated;
- /*
- * If we have done fallocate with the offset that is already
- * delayed allocated, we would have block reservation
- * and quota reservation done in the delayed write path.
- * But fallocate would have already updated quota and block
- * count for this offset. So cancel these reservation
- */
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- unsigned int reserved_clusters;
- reserved_clusters = get_reserved_cluster_alloc(inode,
- map->m_lblk, map->m_len);
- if (reserved_clusters)
- ext4_da_update_reserve_space(inode,
- reserved_clusters,
- 0);
- }
-
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4513,77 +4506,39 @@
map->m_flags |= EXT4_MAP_NEW;
/*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now.
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
*/
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- unsigned int reserved_clusters;
- /*
- * Check how many clusters we had reserved this allocated range
- */
- reserved_clusters = get_reserved_cluster_alloc(inode,
- map->m_lblk, allocated);
- if (!map_from_cluster) {
- BUG_ON(allocated_clusters < reserved_clusters);
- if (reserved_clusters < allocated_clusters) {
- struct ext4_inode_info *ei = EXT4_I(inode);
- int reservation = allocated_clusters -
- reserved_clusters;
- /*
- * It seems we claimed few clusters outside of
- * the range of this allocation. We should give
- * it back to the reservation pool. This can
- * happen in the following case:
- *
- * * Suppose s_cluster_ratio is 4 (i.e., each
- * cluster has 4 blocks. Thus, the clusters
- * are [0-3],[4-7],[8-11]...
- * * First comes delayed allocation write for
- * logical blocks 10 & 11. Since there were no
- * previous delayed allocated blocks in the
- * range [8-11], we would reserve 1 cluster
- * for this write.
- * * Next comes write for logical blocks 3 to 8.
- * In this case, we will reserve 2 clusters
- * (for [0-3] and [4-7]; and not for [8-11] as
- * that range has a delayed allocated blocks.
- * Thus total reserved clusters now becomes 3.
- * * Now, during the delayed allocation writeout
- * time, we will first write blocks [3-8] and
- * allocate 3 clusters for writing these
- * blocks. Also, we would claim all these
- * three clusters above.
- * * Now when we come here to writeout the
- * blocks [10-11], we would expect to claim
- * the reservation of 1 cluster we had made
- * (and we would claim it since there are no
- * more delayed allocated blocks in the range
- * [8-11]. But our reserved cluster count had
- * already gone to 0.
- *
- * Thus, at the step 4 above when we determine
- * that there are still some unwritten delayed
- * allocated blocks outside of our current
- * block range, we should increment the
- * reserved clusters count so that when the
- * remaining blocks finally gets written, we
- * could claim them.
- */
- dquot_reserve_block(inode,
- EXT4_C2B(sbi, reservation));
- spin_lock(&ei->i_block_reservation_lock);
- ei->i_reserved_data_blocks += reservation;
- spin_unlock(&ei->i_block_reservation_lock);
- }
+ if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
/*
- * We will claim quota for all newly allocated blocks.
- * We're updating the reserved space *after* the
- * correction above so we do not accidentally free
- * all the metadata reservation because we might
- * actually need it later on.
+ * When allocating delayed allocated clusters, simply
+ * reduce the reserved cluster count and claim quota
*/
ext4_da_update_reserve_space(inode, allocated_clusters,
1);
+ } else {
+ ext4_lblk_t lblk, len;
+ unsigned int n;
+
+ /*
+ * When allocating non-delayed allocated clusters
+ * (from fallocate, filemap, DIO, or clusters
+ * allocated when delalloc has been disabled by
+ * ext4_nonda_switch), reduce the reserved cluster
+ * count by the number of allocated clusters that
+ * have previously been delayed allocated. Quota
+ * has been claimed by ext4_mb_new_blocks() above,
+ * so release the quota reservations made for any
+ * previously delayed allocated clusters.
+ */
+ lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+ len = allocated_clusters << sbi->s_cluster_bits;
+ n = ext4_es_delayed_clu(inode, lblk, len);
+ if (n > 0)
+ ext4_da_update_reserve_space(inode, (int) n, 0);
}
}
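Editor's note: a minimal userspace sketch of the cluster arithmetic the new non-delalloc branch performs before calling ext4_es_delayed_clu(). CLUSTER_BITS and LBLK_CMASK() are stand-ins for sbi->s_cluster_bits and EXT4_LBLK_CMASK(), and the numbers are invented for illustration.

    #include <stdio.h>

    #define CLUSTER_BITS    2                   /* bigalloc ratio of 4 blocks */
    #define CLUSTER_RATIO   (1u << CLUSTER_BITS)
    #define LBLK_CMASK(b)   ((b) & ~(CLUSTER_RATIO - 1))

    int main(void)
    {
        unsigned int m_lblk = 10;               /* first mapped logical block */
        unsigned int allocated_clusters = 2;    /* clusters just allocated */

        /* same arithmetic as the branch above */
        unsigned int lblk = LBLK_CMASK(m_lblk);                 /* 8 */
        unsigned int len = allocated_clusters << CLUSTER_BITS;  /* 8 blocks */

        printf("scan blocks %u..%u for delayed-only clusters\n",
               lblk, lblk + len - 1);           /* 8..15, clusters 2 and 3 */
        return 0;
    }

With those values the scan covers exactly the two clusters just allocated, and each cluster found to contain delayed-only blocks releases one reservation via ext4_da_update_reserve_space().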
@@ -4919,7 +4874,7 @@
* leave it disabled for encrypted inodes for now. This is a
* bug we should fix....
*/
- if (ext4_encrypted_inode(inode) &&
+ if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
FALLOC_FL_ZERO_RANGE)))
return -EOPNOTSUPP;
@@ -5075,8 +5030,10 @@
ext4_lblk_t block, next_del;
if (newes->es_pblk == 0) {
- ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
- newes->es_lblk + newes->es_len - 1, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ newes->es_lblk,
+ newes->es_lblk + newes->es_len - 1,
+ &es);
/*
* No extent in extent-tree contains block @newes->es_pblk,
@@ -5097,7 +5054,8 @@
}
block = newes->es_lblk + newes->es_len;
- ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
+ EXT_MAX_BLOCKS, &es);
if (es.es_len == 0)
next_del = EXT_MAX_BLOCKS;
else
@@ -5105,8 +5063,6 @@
return next_del;
}
-/* fiemap flags we can handle specified here */
-#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
static int ext4_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
@@ -5143,10 +5099,16 @@
return (error < 0 ? error : 0);
}
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
+static int _ext4_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len,
+ int (*fill)(struct inode *, ext4_lblk_t,
+ ext4_lblk_t,
+ struct fiemap_extent_info *))
{
ext4_lblk_t start_blk;
+ u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+
int error = 0;
if (ext4_has_inline_data(inode)) {
@@ -5163,14 +5125,18 @@
error = ext4_ext_precache(inode);
if (error)
return error;
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
/* fallback to generic here if not in extents fmt */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+ fill == ext4_fill_fiemap_extents)
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);
- if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+ if (fill == ext4_fill_es_cache_info)
+ ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
+ if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
return -EBADR;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
@@ -5189,12 +5155,36 @@
* Walk the extent tree gathering extent information
* and pushing extents back to the user.
*/
- error = ext4_fill_fiemap_extents(inode, start_blk,
- len_blks, fieinfo);
+ error = fill(inode, start_blk, len_blks, fieinfo);
}
return error;
}
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ return _ext4_fiemap(inode, fieinfo, start, len,
+ ext4_fill_fiemap_extents);
+}
+
+int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ if (ext4_has_inline_data(inode)) {
+ int has_inline;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ has_inline = ext4_has_inline_data(inode);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (has_inline)
+ return 0;
+ }
+
+ return _ext4_fiemap(inode, fieinfo, start, len,
+ ext4_fill_es_cache_info);
+}
+
+
/*
* ext4_access_path:
* Function to access the path buffer for marking it dirty.
@@ -5764,8 +5754,8 @@
}
/**
- * ext4_swap_extents - Swap extents between two inodes
- *
+ * ext4_swap_extents() - Swap extents between two inodes
+ * @handle: handle for this transaction
* @inode1: First inode
* @inode2: Second inode
* @lblk1: Start block for first inode
@@ -5958,3 +5948,82 @@
}
return replaced_count;
}
+
+/*
+ * ext4_clu_mapped - determine whether any block in a logical cluster has
+ * been mapped to a physical cluster
+ *
+ * @inode - file containing the logical cluster
+ * @lclu - logical cluster of interest
+ *
+ * Returns 1 if any block in the logical cluster is mapped, signifying
+ * that a physical cluster has been allocated for it. Otherwise,
+ * returns 0. Can also return negative error codes. Derived from
+ * ext4_ext_map_blocks().
+ */
+int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_ext_path *path;
+ int depth, mapped = 0, err = 0;
+ struct ext4_extent *extent;
+ ext4_lblk_t first_lblk, first_lclu, last_lclu;
+
+ /* search for the extent closest to the first block in the cluster */
+ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out;
+ }
+
+ depth = ext_depth(inode);
+
+ /*
+ * A consistent leaf must not be empty. This situation is possible,
+ * though, _during_ tree modification, and it's why an assert can't
+ * be put in ext4_find_extent().
+ */
+ if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
+ EXT4_ERROR_INODE(inode,
+ "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
+ (unsigned long) EXT4_C2B(sbi, lclu),
+ depth, path[depth].p_block);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
+ extent = path[depth].p_ext;
+
+ /* can't be mapped if the extent tree is empty */
+ if (extent == NULL)
+ goto out;
+
+ first_lblk = le32_to_cpu(extent->ee_block);
+ first_lclu = EXT4_B2C(sbi, first_lblk);
+
+ /*
+ * Three possible outcomes at this point - found extent spanning
+ * the target cluster, to the left of the target cluster, or to the
+ * right of the target cluster. The first two cases are handled here.
+ * The last case indicates the target cluster is not mapped.
+ */
+ if (lclu >= first_lclu) {
+ last_lclu = EXT4_B2C(sbi, first_lblk +
+ ext4_ext_get_actual_len(extent) - 1);
+ if (lclu <= last_lclu) {
+ mapped = 1;
+ } else {
+ first_lblk = ext4_ext_next_allocated_block(path);
+ first_lclu = EXT4_B2C(sbi, first_lblk);
+ if (lclu == first_lclu)
+ mapped = 1;
+ }
+ }
+
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ return err ? err : mapped;
+}
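Editor's note: ext4_clu_mapped() folds errors and the mapped/unmapped answer into a single return value. The fragment below is a hypothetical caller, not part of the patch, showing how that convention is meant to be consumed.

    /* Hypothetical kernel-style helper; not part of the patch. */
    static int clu_needs_new_reservation(struct inode *inode, ext4_lblk_t lclu)
    {
        int ret = ext4_clu_mapped(inode, lclu);

        if (ret < 0)            /* e.g. -EFSCORRUPTED from a bad tree */
            return ret;         /* propagate the error */
        return ret == 0;        /* only an unmapped cluster needs one */
    }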
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c4e6fb1..d996b44 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -142,13 +142,16 @@
*/
static struct kmem_cache *ext4_es_cachep;
+static struct kmem_cache *ext4_pending_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end);
+ ext4_lblk_t end, int *reserved);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
int __init ext4_init_es(void)
{
@@ -233,30 +236,38 @@
}
/*
- * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
- * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
+ * ext4_es_find_extent_range - find extent with specified status within block
+ * range or next extent following block range in
+ * extents status tree
*
- * @inode: the inode which owns delayed extents
- * @lblk: the offset where we start to search
- * @end: the offset where we stop to search
- * @es: delayed extent that we found
+ * @inode - file containing the range
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block defining start of range
+ * @end - logical block defining end of range
+ * @es - extent found, if any
+ *
+ * Find the first extent within the block range specified by @lblk and @end
+ * in the extents status tree that satisfies @matching_fn. If a match
+ * is found, it's returned in @es. If not, and a matching extent is found
+ * beyond the block range, it's returned in @es. If no match is found, an
+ * extent is returned in @es whose es_lblk, es_len, and es_pblk components
+ * are 0.
*/
-void ext4_es_find_delayed_extent_range(struct inode *inode,
- ext4_lblk_t lblk, ext4_lblk_t end,
- struct extent_status *es)
+static void __es_find_extent_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
{
struct ext4_es_tree *tree = NULL;
struct extent_status *es1 = NULL;
struct rb_node *node;
- BUG_ON(es == NULL);
- BUG_ON(end < lblk);
- trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
+ WARN_ON(es == NULL);
+ WARN_ON(end < lblk);
- read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
- /* find extent in cache firstly */
+ /* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
if (tree->cache_es) {
es1 = tree->cache_es;
@@ -271,28 +282,133 @@
es1 = __es_tree_search(&tree->root, lblk);
out:
- if (es1 && !ext4_es_is_delayed(es1)) {
+ if (es1 && !matching_fn(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node);
if (es1->es_lblk > end) {
es1 = NULL;
break;
}
- if (ext4_es_is_delayed(es1))
+ if (matching_fn(es1))
break;
}
}
- if (es1 && ext4_es_is_delayed(es1)) {
+ if (es1 && matching_fn(es1)) {
tree->cache_es = es1;
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
}
+}
+
+/*
+ * Locking for __es_find_extent_range() for external use
+ */
+void ext4_es_find_extent_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es)
+{
+ trace_ext4_es_find_extent_range_enter(inode, lblk);
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ __es_find_extent_range(inode, matching_fn, lblk, end, es);
read_unlock(&EXT4_I(inode)->i_es_lock);
- trace_ext4_es_find_delayed_extent_range_exit(inode, es);
+ trace_ext4_es_find_extent_range_exit(inode, es);
+}
+
+/*
+ * __es_scan_range - search block range for block with specified status
+ * in extents status tree
+ *
+ * @inode - file containing the range
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns true if at least one block in the specified block range satisfies
+ * the criterion specified by @matching_fn, and false if not. If at least
+ * one extent has the specified status, then there is at least one block
+ * in the cluster with that status. Should only be called by code that has
+ * taken i_es_lock.
+ */
+static bool __es_scan_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t start, ext4_lblk_t end)
+{
+ struct extent_status es;
+
+ __es_find_extent_range(inode, matching_fn, start, end, &es);
+ if (es.es_len == 0)
+ return false; /* no matching extent in the tree */
+ else if (es.es_lblk <= start &&
+ start < es.es_lblk + es.es_len)
+ return true;
+ else if (start <= es.es_lblk && es.es_lblk <= end)
+ return true;
+ else
+ return false;
+}
+/*
+ * Locking for __es_scan_range() for external use
+ */
+bool ext4_es_scan_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end)
+{
+ bool ret;
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ ret = __es_scan_range(inode, matching_fn, lblk, end);
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ return ret;
+}
+
+/*
+ * __es_scan_clu - search cluster for block with specified status in
+ * extents status tree
+ *
+ * @inode - file containing the cluster
+ * @matching_fn - pointer to function that matches extents with desired status
+ * @lblk - logical block in cluster to be searched
+ *
+ * Returns true if at least one block in the cluster containing @lblk
+ * satisfies the criterion specified by @matching_fn, and false if not. If at
+ * least one extent has the specified status, then there is at least one block
+ * in the cluster with that status. Should only be called by code that has
+ * taken i_es_lock.
+ */
+static bool __es_scan_clu(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t lblk_start, lblk_end;
+
+ lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
+ lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+ return __es_scan_range(inode, matching_fn, lblk_start, lblk_end);
+}
+
+/*
+ * Locking for __es_scan_clu() for external use
+ */
+bool ext4_es_scan_clu(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk)
+{
+ bool ret;
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ ret = __es_scan_clu(inode, matching_fn, lblk);
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ return ret;
}
static void ext4_es_list_add(struct inode *inode)
@@ -595,7 +711,7 @@
* We don't need to check unwritten extent because
* indirect-based file doesn't have it.
*/
- BUG_ON(1);
+ BUG();
}
} else if (retval == 0) {
if (ext4_es_is_written(es)) {
@@ -664,7 +780,7 @@
}
p = &(*p)->rb_right;
} else {
- BUG_ON(1);
+ BUG();
return -EINVAL;
}
}
@@ -694,6 +810,7 @@
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -719,7 +836,7 @@
ext4_es_insert_extent_check(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end);
+ err = __es_remove_extent(inode, lblk, end, NULL);
if (err != 0)
goto error;
retry:
@@ -730,6 +847,11 @@
if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
err = 0;
+ if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
+ (status & EXTENT_STATUS_WRITTEN ||
+ status & EXTENT_STATUS_UNWRITTEN))
+ __revise_pending(inode, lblk, len);
+
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
@@ -777,6 +899,7 @@
* Return: 1 on found, 0 on not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t *next_lblk,
struct extent_status *es)
{
struct ext4_es_tree *tree;
@@ -825,9 +948,18 @@
es->es_pblk = es1->es_pblk;
if (!ext4_es_is_referenced(es1))
ext4_es_set_referenced(es1);
- stats->es_stats_cache_hits++;
+ percpu_counter_inc(&stats->es_stats_cache_hits);
+ if (next_lblk) {
+ node = rb_next(&es1->rb_node);
+ if (node) {
+ es1 = rb_entry(node, struct extent_status,
+ rb_node);
+ *next_lblk = es1->es_lblk;
+ } else
+ *next_lblk = 0;
+ }
} else {
- stats->es_stats_cache_misses++;
+ percpu_counter_inc(&stats->es_stats_cache_misses);
}
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -836,8 +968,322 @@
return found;
}
+struct rsvd_count {
+ int ndelonly;
+ bool first_do_lblk_found;
+ ext4_lblk_t first_do_lblk;
+ ext4_lblk_t last_do_lblk;
+ struct extent_status *left_es;
+ bool partial;
+ ext4_lblk_t lclu;
+};
+
+/*
+ * init_rsvd - initialize reserved count data before removing block range
+ * in file from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @es - pointer to first extent in range
+ * @rc - pointer to reserved count data
+ *
+ * Assumes es is not NULL
+ */
+static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
+ struct extent_status *es, struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct rb_node *node;
+
+ rc->ndelonly = 0;
+
+ /*
+ * for bigalloc, note the first delonly block in the range has not
+ * been found, record the extent containing the block to the left of
+ * the region to be removed, if any, and note that there's no partial
+ * cluster to track
+ */
+ if (sbi->s_cluster_ratio > 1) {
+ rc->first_do_lblk_found = false;
+ if (lblk > es->es_lblk) {
+ rc->left_es = es;
+ } else {
+ node = rb_prev(&es->rb_node);
+ rc->left_es = node ? rb_entry(node,
+ struct extent_status,
+ rb_node) : NULL;
+ }
+ rc->partial = false;
+ }
+}
+
+/*
+ * count_rsvd - count the clusters containing delayed and not unwritten
+ * (delonly) blocks in a range within an extent and add to
+ * the running tally in rsvd_count
+ *
+ * @inode - file containing extent
+ * @lblk - first block in range
+ * @len - length of range in blocks
+ * @es - pointer to extent containing clusters to be counted
+ * @rc - pointer to reserved count data
+ *
+ * Tracks partial clusters found at the beginning and end of extents so
+ * they aren't overcounted when they span adjacent extents
+ */
+static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
+ struct extent_status *es, struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t i, end, nclu;
+
+ if (!ext4_es_is_delonly(es))
+ return;
+
+ WARN_ON(len <= 0);
+
+ if (sbi->s_cluster_ratio == 1) {
+ rc->ndelonly += (int) len;
+ return;
+ }
+
+ /* bigalloc */
+
+ i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
+ end = lblk + (ext4_lblk_t) len - 1;
+ end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
+
+ /* record the first block of the first delonly extent seen */
+ if (rc->first_do_lblk_found == false) {
+ rc->first_do_lblk = i;
+ rc->first_do_lblk_found = true;
+ }
+
+ /* update the last lblk in the region seen so far */
+ rc->last_do_lblk = end;
+
+ /*
+ * if we're tracking a partial cluster and the current extent
+ * doesn't start with it, count it and stop tracking
+ */
+ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
+ rc->ndelonly++;
+ rc->partial = false;
+ }
+
+ /*
+ * if the first cluster doesn't start on a cluster boundary but
+ * ends on one, count it
+ */
+ if (EXT4_LBLK_COFF(sbi, i) != 0) {
+ if (end >= EXT4_LBLK_CFILL(sbi, i)) {
+ rc->ndelonly++;
+ rc->partial = false;
+ i = EXT4_LBLK_CFILL(sbi, i) + 1;
+ }
+ }
+
+ /*
+ * if the current cluster starts on a cluster boundary, count the
+ * number of whole delonly clusters in the extent
+ */
+ if ((i + sbi->s_cluster_ratio - 1) <= end) {
+ nclu = (end - i + 1) >> sbi->s_cluster_bits;
+ rc->ndelonly += nclu;
+ i += nclu << sbi->s_cluster_bits;
+ }
+
+ /*
+ * start tracking a partial cluster if there's a partial at the end
+ * of the current extent and we're not already tracking one
+ */
+ if (!rc->partial && i <= end) {
+ rc->partial = true;
+ rc->lclu = EXT4_B2C(sbi, i);
+ }
+}
+
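Editor's note: a small userspace walk-through of the three counting rules count_rsvd() applies under bigalloc (leading partial cluster, whole clusters, trailing partial cluster), using invented numbers and stand-in macros for EXT4_LBLK_COFF()/EXT4_LBLK_CFILL(). The delayed-only extent covers blocks 2..9 with a cluster ratio of 4.

    #include <stdio.h>
    #include <stdbool.h>

    #define CLUSTER_BITS    2
    #define RATIO           (1u << CLUSTER_BITS)
    #define COFF(b)         ((b) & (RATIO - 1))     /* offset within cluster */
    #define CFILL(b)        ((b) | (RATIO - 1))     /* last block of cluster */

    int main(void)
    {
        unsigned int i = 2, end = 9;    /* delayed-only blocks 2..9 */
        int ndelonly = 0;
        bool partial = false;

        /* leading partial cluster: counted once it reaches the boundary */
        if (COFF(i) != 0 && end >= CFILL(i)) {
            ndelonly++;                 /* blocks 2..3 finish cluster 0 */
            i = CFILL(i) + 1;           /* i becomes 4 */
        }
        /* whole clusters contained in the extent */
        if (i + RATIO - 1 <= end) {
            unsigned int nclu = (end - i + 1) >> CLUSTER_BITS;
            ndelonly += nclu;           /* blocks 4..7 -> one cluster */
            i += nclu << CLUSTER_BITS;  /* i becomes 8 */
        }
        /* trailing partial cluster: tracked, counted later if not continued */
        if (i <= end)
            partial = true;             /* blocks 8..9 */

        printf("%d cluster(s) counted, trailing partial: %s\n",
               ndelonly, partial ? "yes" : "no");   /* 2, yes */
        return 0;
    }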
+/*
+ * __pr_tree_search - search for a pending cluster reservation
+ *
+ * @root - root of pending reservation tree
+ * @lclu - logical cluster to search for
+ *
+ * Returns the pending reservation for the cluster identified by @lclu
+ * if found. If not, returns a reservation for the next cluster if any,
+ * and if not, returns NULL.
+ */
+static struct pending_reservation *__pr_tree_search(struct rb_root *root,
+ ext4_lblk_t lclu)
+{
+ struct rb_node *node = root->rb_node;
+ struct pending_reservation *pr = NULL;
+
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ if (lclu < pr->lclu)
+ node = node->rb_left;
+ else if (lclu > pr->lclu)
+ node = node->rb_right;
+ else
+ return pr;
+ }
+ if (pr && lclu < pr->lclu)
+ return pr;
+ if (pr && lclu > pr->lclu) {
+ node = rb_next(&pr->rb_node);
+ return node ? rb_entry(node, struct pending_reservation,
+ rb_node) : NULL;
+ }
+ return NULL;
+}
+
+/*
+ * get_rsvd - calculates and returns the number of cluster reservations to be
+ * released when removing a block range from the extent status tree
+ * and releases any pending reservations within the range
+ *
+ * @inode - file containing block range
+ * @end - last block in range
+ * @right_es - pointer to extent containing next block beyond end or NULL
+ * @rc - pointer to reserved count data
+ *
+ * The number of reservations to be released is equal to the number of
+ * clusters containing delayed and not unwritten (delonly) blocks within
+ * the range, minus the number of clusters still containing delonly blocks
+ * at the ends of the range, and minus the number of pending reservations
+ * within the range.
+ */
+static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
+ struct extent_status *right_es,
+ struct rsvd_count *rc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+ struct rb_node *node;
+ ext4_lblk_t first_lclu, last_lclu;
+ bool left_delonly, right_delonly, count_pending;
+ struct extent_status *es;
+
+ if (sbi->s_cluster_ratio > 1) {
+ /* count any remaining partial cluster */
+ if (rc->partial)
+ rc->ndelonly++;
+
+ if (rc->ndelonly == 0)
+ return 0;
+
+ first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
+ last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
+
+ /*
+ * decrease the delonly count by the number of clusters at the
+ * ends of the range that still contain delonly blocks -
+ * these clusters still need to be reserved
+ */
+ left_delonly = right_delonly = false;
+
+ es = rc->left_es;
+ while (es && ext4_es_end(es) >=
+ EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
+ if (ext4_es_is_delonly(es)) {
+ rc->ndelonly--;
+ left_delonly = true;
+ break;
+ }
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+ if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (end < ext4_es_end(right_es)) {
+ es = right_es;
+ } else {
+ node = rb_next(&right_es->rb_node);
+ es = node ? rb_entry(node, struct extent_status,
+ rb_node) : NULL;
+ }
+ while (es && es->es_lblk <=
+ EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
+ if (ext4_es_is_delonly(es)) {
+ rc->ndelonly--;
+ right_delonly = true;
+ break;
+ }
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status,
+ rb_node);
+ }
+ }
+
+ /*
+ * Determine the block range that should be searched for
+ * pending reservations, if any. Clusters on the ends of the
+ * original removed range containing delonly blocks are
+ * excluded. They've already been accounted for and it's not
+ * possible to determine if an associated pending reservation
+ * should be released with the information available in the
+ * extents status tree.
+ */
+ if (first_lclu == last_lclu) {
+ if (left_delonly | right_delonly)
+ count_pending = false;
+ else
+ count_pending = true;
+ } else {
+ if (left_delonly)
+ first_lclu++;
+ if (right_delonly)
+ last_lclu--;
+ if (first_lclu <= last_lclu)
+ count_pending = true;
+ else
+ count_pending = false;
+ }
+
+ /*
+ * a pending reservation found between first_lclu and last_lclu
+ * represents an allocated cluster that contained at least one
+ * delonly block, so the delonly total must be reduced by one
+ * for each pending reservation found and released
+ */
+ if (count_pending) {
+ pr = __pr_tree_search(&tree->root, first_lclu);
+ while (pr && pr->lclu <= last_lclu) {
+ rc->ndelonly--;
+ node = rb_next(&pr->rb_node);
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ if (!node)
+ break;
+ pr = rb_entry(node, struct pending_reservation,
+ rb_node);
+ }
+ }
+ }
+ return rc->ndelonly;
+}
+
+
+/*
+ * __es_remove_extent - removes block range from extent status tree
+ *
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @end - last block in range
+ * @reserved - number of cluster reservations released
+ *
+ * If @reserved is not NULL and delayed allocation is enabled, counts
+ * block/cluster reservations freed by removing range and if bigalloc
+ * enabled cancels pending reservations as needed. Returns 0 on success,
+ * error code on failure.
+ */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end)
+ ext4_lblk_t end, int *reserved)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -846,9 +1292,14 @@
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
int err;
+ bool count_reserved = true;
+ struct rsvd_count rc;
+ if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
+ count_reserved = false;
retry:
err = 0;
+
es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
@@ -857,6 +1308,8 @@
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
+ if (count_reserved)
+ init_rsvd(inode, lblk, es, &rc);
orig_es.es_lblk = es->es_lblk;
orig_es.es_len = es->es_len;
@@ -898,10 +1351,16 @@
ext4_es_store_pblock(es, block);
}
}
+ if (count_reserved)
+ count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
+ &orig_es, &rc);
goto out;
}
if (len1 > 0) {
+ if (count_reserved)
+ count_rsvd(inode, lblk, orig_es.es_len - len1,
+ &orig_es, &rc);
node = rb_next(&es->rb_node);
if (node)
es = rb_entry(node, struct extent_status, rb_node);
@@ -910,6 +1369,8 @@
}
while (es && ext4_es_end(es) <= end) {
+ if (count_reserved)
+ count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
@@ -924,6 +1385,9 @@
ext4_lblk_t orig_len = es->es_len;
len1 = ext4_es_end(es) - end;
+ if (count_reserved)
+ count_rsvd(inode, es->es_lblk, orig_len - len1,
+ es, &rc);
es->es_lblk = end + 1;
es->es_len = len1;
if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
@@ -932,20 +1396,28 @@
}
}
+ if (count_reserved)
+ *reserved = get_rsvd(inode, end, es, &rc);
out:
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a extent status tree.
+ * ext4_es_remove_extent - removes block range from extent status tree
*
- * Return 0 on success, error code on failure.
+ * @inode - file containing range
+ * @lblk - first block in range
+ * @len - number of blocks to remove
+ *
+ * Reduces block/cluster reservation count and for bigalloc cancels pending
+ * reservations as needed. Returns 0 on success, error code on failure.
*/
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
ext4_lblk_t end;
int err = 0;
+ int reserved = 0;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -963,9 +1435,10 @@
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end);
+ err = __es_remove_extent(inode, lblk, end, &reserved);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
+ ext4_da_release_space(inode, reserved);
return err;
}
@@ -1113,9 +1586,9 @@
seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
- seq_printf(seq, " %lu/%lu cache hits/misses\n",
- es_stats->es_stats_cache_hits,
- es_stats->es_stats_cache_misses);
+ seq_printf(seq, " %lld/%lld cache hits/misses\n",
+ percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
+ percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
if (inode_cnt)
seq_printf(seq, " %d inodes on list\n", inode_cnt);
@@ -1142,35 +1615,46 @@
sbi->s_es_nr_inode = 0;
spin_lock_init(&sbi->s_es_lock);
sbi->s_es_stats.es_stats_shrunk = 0;
- sbi->s_es_stats.es_stats_cache_hits = 0;
- sbi->s_es_stats.es_stats_cache_misses = 0;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
+ GFP_KERNEL);
+ if (err)
+ return err;
+ err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
+ GFP_KERNEL);
+ if (err)
+ goto err1;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
if (err)
- return err;
+ goto err2;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
if (err)
- goto err1;
+ goto err3;
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
err = register_shrinker(&sbi->s_es_shrinker);
if (err)
- goto err2;
+ goto err4;
return 0;
-
-err2:
+err4:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
-err1:
+err3:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
+err2:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
+err1:
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
return err;
}
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
+ percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
@@ -1195,7 +1679,7 @@
es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
if (!es)
goto out_wrap;
- node = &es->rb_node;
+
while (*nr_to_scan > 0) {
if (es->es_lblk > end) {
ei->i_es_shrink_lblk = end + 1;
@@ -1252,3 +1736,437 @@
ei->i_es_tree.cache_es = NULL;
return nr_shrunk;
}
+
+/*
+ * Called to support EXT4_IOC_CLEAR_ES_CACHE. We can only remove
+ * discretionary entries from the extent status cache. (Some entries
+ * must be present for proper operations.)
+ */
+void ext4_clear_inode_es(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct extent_status *es;
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ write_lock(&ei->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ tree->cache_es = NULL;
+ node = rb_first(&tree->root);
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(node);
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ }
+ }
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
+ write_unlock(&ei->i_es_lock);
+}
+
+#ifdef ES_DEBUG__
+static void ext4_print_pending_tree(struct inode *inode)
+{
+ struct ext4_pending_tree *tree;
+ struct rb_node *node;
+ struct pending_reservation *pr;
+
+ printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_pending_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ printk(KERN_DEBUG " %u", pr->lclu);
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_print_pending_tree(inode)
+#endif
+
+int __init ext4_init_pending(void)
+{
+ ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
+ sizeof(struct pending_reservation),
+ 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ if (ext4_pending_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_pending(void)
+{
+ kmem_cache_destroy(ext4_pending_cachep);
+}
+
+void ext4_init_pending_tree(struct ext4_pending_tree *tree)
+{
+ tree->root = RB_ROOT;
+}
+
+/*
+ * __get_pending - retrieve a pointer to a pending reservation
+ *
+ * @inode - file containing the pending cluster reservation
+ * @lclu - logical cluster of interest
+ *
+ * Returns a pointer to a pending reservation if it's a member of
+ * the set, and NULL if not. Must be called holding i_es_lock.
+ */
+static struct pending_reservation *__get_pending(struct inode *inode,
+ ext4_lblk_t lclu)
+{
+ struct ext4_pending_tree *tree;
+ struct rb_node *node;
+ struct pending_reservation *pr = NULL;
+
+ tree = &EXT4_I(inode)->i_pending_tree;
+ node = (&tree->root)->rb_node;
+
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ if (lclu < pr->lclu)
+ node = node->rb_left;
+ else if (lclu > pr->lclu)
+ node = node->rb_right;
+ else if (lclu == pr->lclu)
+ return pr;
+ }
+ return NULL;
+}
+
+/*
+ * __insert_pending - adds a pending cluster reservation to the set of
+ * pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the cluster to be added
+ *
+ * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * pending reservation is already in the set, returns successfully.
+ */
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct pending_reservation *pr;
+ ext4_lblk_t lclu;
+ int ret = 0;
+
+ lclu = EXT4_B2C(sbi, lblk);
+ /* search to find parent for insertion */
+ while (*p) {
+ parent = *p;
+ pr = rb_entry(parent, struct pending_reservation, rb_node);
+
+ if (lclu < pr->lclu) {
+ p = &(*p)->rb_left;
+ } else if (lclu > pr->lclu) {
+ p = &(*p)->rb_right;
+ } else {
+ /* pending reservation already inserted */
+ goto out;
+ }
+ }
+
+ pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+ if (pr == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pr->lclu = lclu;
+
+ rb_link_node(&pr->rb_node, parent, p);
+ rb_insert_color(&pr->rb_node, &tree->root);
+
+out:
+ return ret;
+}
+
+/*
+ * __remove_pending - removes a pending cluster reservation from the set
+ * of pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the pending cluster reservation to be removed
+ *
+ * Returns successfully if pending reservation is not a member of the set.
+ */
+static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree;
+
+ pr = __get_pending(inode, EXT4_B2C(sbi, lblk));
+ if (pr != NULL) {
+ tree = &EXT4_I(inode)->i_pending_tree;
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ }
+}
+
+/*
+ * ext4_remove_pending - removes a pending cluster reservation from the set
+ * of pending reservations
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the pending cluster reservation to be removed
+ *
+ * Locking for external use of __remove_pending.
+ */
+void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ write_lock(&ei->i_es_lock);
+ __remove_pending(inode, lblk);
+ write_unlock(&ei->i_es_lock);
+}
+
+/*
+ * ext4_is_pending - determine whether a cluster has a pending reservation
+ * on it
+ *
+ * @inode - file containing the cluster
+ * @lblk - logical block in the cluster
+ *
+ * Returns true if there's a pending reservation for the cluster in the
+ * set of pending reservations, and false if not.
+ */
+bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ bool ret;
+
+ read_lock(&ei->i_es_lock);
+ ret = (bool)(__get_pending(inode, EXT4_B2C(sbi, lblk)) != NULL);
+ read_unlock(&ei->i_es_lock);
+
+ return ret;
+}
+
+/*
+ * ext4_es_insert_delayed_block - adds a delayed block to the extents status
+ * tree, adding a pending reservation where
+ * needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ * @allocated - indicates whether a physical cluster has been allocated for
+ * the logical cluster that contains the block
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated)
+{
+ struct extent_status newes;
+ int err = 0;
+
+ es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
+ lblk, inode->i_ino);
+
+ newes.es_lblk = lblk;
+ newes.es_len = 1;
+ ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
+ trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
+
+ ext4_es_insert_extent_check(inode, &newes);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+
+ err = __es_remove_extent(inode, lblk, lblk, NULL);
+ if (err != 0)
+ goto error;
+retry:
+ err = __es_insert_extent(inode, &newes);
+ if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
+ 128, EXT4_I(inode)))
+ goto retry;
+ if (err != 0)
+ goto error;
+
+ if (allocated)
+ __insert_pending(inode, lblk);
+
+error:
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_print_tree(inode);
+ ext4_print_pending_tree(inode);
+
+ return err;
+}
+
+/*
+ * __es_delayed_clu - count number of clusters containing blocks that
+ * are delayed only
+ *
+ * @inode - file containing block range
+ * @start - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns the number of clusters containing only delayed (not delayed
+ * and unwritten) blocks in the range specified by @start and @end. Any
+ * cluster or part of a cluster within the range and containing a delayed
+ * and not unwritten block within the range is counted as a whole cluster.
+ */
+static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct extent_status *es;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct rb_node *node;
+ ext4_lblk_t first_lclu, last_lclu;
+ unsigned long long last_counted_lclu;
+ unsigned int n = 0;
+
+ /* guaranteed to be unequal to any ext4_lblk_t value */
+ last_counted_lclu = ~0ULL;
+
+ es = __es_tree_search(&tree->root, start);
+
+ while (es && (es->es_lblk <= end)) {
+ if (ext4_es_is_delonly(es)) {
+ if (es->es_lblk <= start)
+ first_lclu = EXT4_B2C(sbi, start);
+ else
+ first_lclu = EXT4_B2C(sbi, es->es_lblk);
+
+ if (ext4_es_end(es) >= end)
+ last_lclu = EXT4_B2C(sbi, end);
+ else
+ last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
+
+ if (first_lclu == last_counted_lclu)
+ n += last_lclu - first_lclu;
+ else
+ n += last_lclu - first_lclu + 1;
+ last_counted_lclu = last_lclu;
+ }
+ node = rb_next(&es->rb_node);
+ if (!node)
+ break;
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ return n;
+}
+
+/*
+ * ext4_es_delayed_clu - count number of clusters containing blocks that
+ * are delayed and not unwritten (delonly)
+ *
+ * @inode - file containing block range
+ * @lblk - logical block defining start of range
+ * @len - number of blocks in range
+ *
+ * Locking for external use of __es_delayed_clu().
+ */
+unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t end;
+ unsigned int n;
+
+ if (len == 0)
+ return 0;
+
+ end = lblk + len - 1;
+ WARN_ON(end < lblk);
+
+ read_lock(&ei->i_es_lock);
+
+ n = __es_delayed_clu(inode, lblk, end);
+
+ read_unlock(&ei->i_es_lock);
+
+ return n;
+}
+
+/*
+ * __revise_pending - makes, cancels, or leaves unchanged pending cluster
+ * reservations for a specified block range depending
+ * upon the presence or absence of delayed blocks
+ * outside the range within clusters at the ends of the
+ * range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len - length of range in blocks
+ *
+ * Used after a newly allocated extent is added to the extents status tree.
+ * Requires that the extents in the range have either written or unwritten
+ * status. Must be called while holding i_es_lock.
+ */
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t end = lblk + len - 1;
+ ext4_lblk_t first, last;
+ bool f_del = false, l_del = false;
+
+ if (len == 0)
+ return;
+
+ /*
+ * Two cases - block range within single cluster and block range
+ * spanning two or more clusters. Note that a cluster belonging
+ * to a range starting and/or ending on a cluster boundary is treated
+ * as if it does not contain a delayed extent. The new range may
+ * have allocated space for previously delayed blocks out to the
+ * cluster boundary, requiring that any pre-existing pending
+ * reservation be canceled. Because this code only looks at blocks
+ * outside the range, it should revise pending reservations
+ * correctly even if the extent represented by the range can't be
+ * inserted in the extents status tree due to ENOSPC.
+ */
+
+ if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
+ first = EXT4_LBLK_CMASK(sbi, lblk);
+ if (first != lblk)
+ f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ first, lblk - 1);
+ if (f_del) {
+ __insert_pending(inode, first);
+ } else {
+ last = EXT4_LBLK_CMASK(sbi, end) +
+ sbi->s_cluster_ratio - 1;
+ if (last != end)
+ l_del = __es_scan_range(inode,
+ &ext4_es_is_delonly,
+ end + 1, last);
+ if (l_del)
+ __insert_pending(inode, last);
+ else
+ __remove_pending(inode, last);
+ }
+ } else {
+ first = EXT4_LBLK_CMASK(sbi, lblk);
+ if (first != lblk)
+ f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ first, lblk - 1);
+ if (f_del)
+ __insert_pending(inode, first);
+ else
+ __remove_pending(inode, first);
+
+ last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
+ if (last != end)
+ l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ end + 1, last);
+ if (l_del)
+ __insert_pending(inode, last);
+ else
+ __remove_pending(inode, last);
+ }
+}
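Editor's note: __revise_pending() only ever inspects blocks outside the newly mapped range but inside the clusters at its ends. A toy sketch of that arithmetic for the single-cluster case, with assumed values and stand-in macros for EXT4_B2C()/EXT4_LBLK_CMASK():

    #include <stdio.h>

    #define CLUSTER_BITS    2
    #define RATIO           (1u << CLUSTER_BITS)
    #define B2C(b)          ((b) >> CLUSTER_BITS)
    #define CMASK(b)        ((b) & ~(RATIO - 1))

    int main(void)
    {
        unsigned int lblk = 5, len = 2;         /* newly written blocks 5..6 */
        unsigned int end = lblk + len - 1;

        if (B2C(lblk) == B2C(end)) {            /* single-cluster case */
            unsigned int first = CMASK(lblk);
            unsigned int last = CMASK(end) + RATIO - 1;

            printf("left scan:  blocks %u..%u\n", first, lblk - 1); /* 4..4 */
            printf("right scan: blocks %u..%u\n", end + 1, last);   /* 7..7 */
        }
        return 0;
    }

A delayed-only block found to the left (or, failing that, to the right) keeps or creates the pending reservation for the cluster; otherwise it is removed, matching the branch above.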
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 8efdeb9..825313c 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -70,14 +70,59 @@
struct ext4_es_stats {
unsigned long es_stats_shrunk;
- unsigned long es_stats_cache_hits;
- unsigned long es_stats_cache_misses;
+ struct percpu_counter es_stats_cache_hits;
+ struct percpu_counter es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
struct percpu_counter es_stats_shk_cnt;
};
+/*
+ * Pending cluster reservations for bigalloc file systems
+ *
+ * A cluster with a pending reservation is a logical cluster shared by at
+ * least one extent in the extents status tree with delayed and unwritten
+ * status and at least one other written or unwritten extent. The
+ * reservation is said to be pending because a cluster reservation would
+ * have to be taken in the event all blocks in the cluster shared with
+ * written or unwritten extents were deleted while the delayed and
+ * unwritten blocks remained.
+ *
+ * The set of pending cluster reservations is an auxiliary data structure
+ * used with the extents status tree to implement reserved cluster/block
+ * accounting for bigalloc file systems. The set is kept in memory and
+ * records all pending cluster reservations.
+ *
+ * Its primary function is to avoid the need to read extents from the
+ * disk when invalidating pages as a result of a truncate, punch hole, or
+ * collapse range operation. Page invalidation requires a decrease in the
+ * reserved cluster count if it results in the removal of all delayed
+ * and unwritten extents (blocks) from a cluster that is not shared with a
+ * written or unwritten extent, and no decrease otherwise. Determining
+ * whether the cluster is shared can be done by searching for a pending
+ * reservation on it.
+ *
+ * Secondarily, it provides a potentially faster method for determining
+ * whether the reserved cluster count should be increased when a physical
+ * cluster is deallocated as a result of a truncate, punch hole, or
+ * collapse range operation. The necessary information is also present
+ * in the extents status tree, but might be more rapidly accessed in
+ * the pending reservation set in many cases due to smaller size.
+ *
+ * The pending cluster reservation set is implemented as a red-black tree
+ * with the goal of minimizing per page search time overhead.
+ */
+
+struct pending_reservation {
+ struct rb_node rb_node;
+ ext4_lblk_t lclu;
+};
+
+struct ext4_pending_tree {
+ struct rb_root root;
+};
+
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -90,11 +135,19 @@
unsigned int status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
-extern void ext4_es_find_delayed_extent_range(struct inode *inode,
- ext4_lblk_t lblk, ext4_lblk_t end,
- struct extent_status *es);
+extern void ext4_es_find_extent_range(struct inode *inode,
+ int (*match_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end,
+ struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t *next_lblk,
struct extent_status *es);
+extern bool ext4_es_scan_range(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk, ext4_lblk_t end);
+extern bool ext4_es_scan_clu(struct inode *inode,
+ int (*matching_fn)(struct extent_status *es),
+ ext4_lblk_t lblk);
static inline unsigned int ext4_es_status(struct extent_status *es)
{
@@ -126,6 +179,16 @@
return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0;
}
+static inline int ext4_es_is_mapped(struct extent_status *es)
+{
+ return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
+}
+
+static inline int ext4_es_is_delonly(struct extent_status *es)
+{
+ return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es));
+}
+
static inline void ext4_es_set_referenced(struct extent_status *es)
{
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@@ -175,4 +238,15 @@
extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+extern int __init ext4_init_pending(void);
+extern void ext4_exit_pending(void);
+extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
+extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
+extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
+extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated);
+extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
+extern void ext4_clear_inode_es(struct inode *inode);
+
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 69d65d4..8d2bbcc 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -125,7 +125,7 @@
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- if (pos >= i_size_read(inode))
+ if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
return 0;
if ((pos | iov_iter_alignment(from)) & blockmask)
@@ -165,6 +165,10 @@
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
+
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -226,8 +230,6 @@
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
- if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -264,6 +266,13 @@
}
ret = __generic_file_write_iter(iocb, from);
+ /*
+ * Unaligned direct AIO must be the only IO in flight. Otherwise
+ * overlapping aligned IO after unaligned might result in data
+ * corruption.
+ */
+ if (ret == -EIOCBQUEUED && unaligned_aio)
+ ext4_unwritten_wait(inode);
inode_unlock(inode);
if (ret > 0)
@@ -360,15 +369,17 @@
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct dax_device *dax_dev = sbi->s_daxdev;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
/*
- * We don't support synchronous mappings for non-DAX files. At least
- * until someone comes with a sensible use case.
+ * We don't support synchronous mappings for non-DAX files and
+ * for DAX files if underneath dax_device is not synchronous.
*/
- if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+ if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
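The new check hands the MAP_SYNC decision to the DAX layer. As a rough, assumed sketch of what daxdev_mapping_supported() verifies in this kernel generation (illustration only, not code taken from this patch): a VM_SYNC mapping is refused unless the file is DAX and the backing dax_device reports synchronous, flush-capable semantics.

/* Illustrative approximation; the real helper lives in include/linux/dax.h. */
static bool dax_mmap_check_sketch(struct vm_area_struct *vma,
				  struct dax_device *dax_dev)
{
	if (!(vma->vm_flags & VM_SYNC))
		return true;			/* nothing special requested */
	if (!IS_DAX(file_inode(vma->vm_file)))
		return false;			/* MAP_SYNC needs a DAX file */
	return dax_synchronous(dax_dev);	/* and a synchronous device */
}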
@@ -444,6 +455,10 @@
if (ret)
return ret;
+ ret = fsverity_file_open(inode, filp);
+ if (ret)
+ return ret;
+
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 4b99e2d..dbccf46 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -626,7 +626,7 @@
{
struct ext4_fsmap dkeys[2]; /* per-dev keys */
struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS];
- struct ext4_getfsmap_info info = {0};
+ struct ext4_getfsmap_info info = { NULL };
int i;
int error = 0;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 26a7fe5..5508baa 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -159,6 +159,9 @@
ret = err;
}
out:
+ err = file_check_and_advance_wb_err(file);
+ if (ret == 0)
+ ret = err;
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e22dcfa..3e13379 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -6,6 +6,7 @@
*/
#include <linux/fs.h>
+#include <linux/unicode.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
#include "ext4.h"
@@ -196,7 +197,8 @@
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
*/
-int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+static int __ext4fs_dirhash(const char *name, int len,
+ struct dx_hash_info *hinfo)
{
__u32 hash;
__u32 minor_hash = 0;
@@ -231,6 +233,7 @@
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -244,6 +247,7 @@
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
+ /* fall through */
case DX_HASH_TEA:
p = name;
while (len > 0) {
@@ -266,3 +270,33 @@
hinfo->minor_hash = minor_hash;
return 0;
}
+
+int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
+ struct dx_hash_info *hinfo)
+{
+#ifdef CONFIG_UNICODE
+ const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
+ int r, dlen;
+ unsigned char *buff;
+ struct qstr qstr = {.name = name, .len = len };
+
+ if (len && IS_CASEFOLDED(dir) && um) {
+ buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
+ if (!buff)
+ return -ENOMEM;
+
+ dlen = utf8_casefold(um, &qstr, buff, PATH_MAX);
+ if (dlen < 0) {
+ kfree(buff);
+ goto opaque_seq;
+ }
+
+ r = __ext4fs_dirhash(buff, dlen, hinfo);
+
+ kfree(buff);
+ return r;
+ }
+opaque_seq:
+#endif
+ return __ext4fs_dirhash(name, len, hinfo);
+}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2addcb8..764ff4c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -455,7 +455,7 @@
if (qstr) {
hinfo.hash_version = DX_HASH_HALF_MD4;
hinfo.seed = sbi->s_hash_seed;
- ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+ ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
grp = hinfo.hash;
} else
grp = prandom_u32();
@@ -771,7 +771,7 @@
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
+ if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
!(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_get_encryption_info(dir);
@@ -1216,7 +1216,7 @@
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (IS_ERR(bitmap_bh))
- return (struct inode *) bitmap_bh;
+ return ERR_CAST(bitmap_bh);
/* Having the inode bit set should be a 100% indicator that this
* is a valid orphan (no e2fsck run on fs). Orphans also include
@@ -1225,7 +1225,7 @@
if (!ext4_test_bit(bit, bitmap_bh->b_data))
goto bad_orphan;
- inode = ext4_iget(sb, ino);
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bf7fa15..36699a1 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -294,14 +294,12 @@
}
/**
- * ext4_alloc_branch - allocate and set up a chain of blocks.
- * @handle: handle for this transaction
- * @inode: owner
- * @indirect_blks: number of allocated indirect blocks
- * @blks: number of allocated direct blocks
- * @goal: preferred place for allocation
- * @offsets: offsets (in the blocks) to store the pointers to next.
- * @branch: place to store the chain in.
+ * ext4_alloc_branch() - allocate and set up a chain of blocks
+ * @handle: handle for this transaction
+ * @ar: structure describing the allocation request
+ * @indirect_blks: number of allocated indirect blocks
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
*
* This function allocates blocks, zeroes out all but the last one,
* links them into chain and (if we are synchronous) writes them to disk.
@@ -396,15 +394,11 @@
}
/**
- * ext4_splice_branch - splice the allocated branch onto inode.
+ * ext4_splice_branch() - splice the allocated branch onto inode.
* @handle: handle for this transaction
- * @inode: owner
- * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- * ext4_alloc_branch)
+ * @ar: structure describing the allocation request
* @where: location of missing link
* @num: number of indirect blocks we are adding
- * @blks: number of direct blocks we are adding
*
* This function fills the missing link and does all housekeeping needed in
* inode (->i_blocks, etc.). In case of success we end up with the full
@@ -1183,18 +1177,21 @@
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
@@ -1219,6 +1216,7 @@
ext4_lblk_t offsets[4], offsets2[4];
Indirect chain[4], chain2[4];
Indirect *partial, *partial2;
+ Indirect *p = NULL, *p2 = NULL;
ext4_lblk_t max_block;
__le32 nr = 0, nr2 = 0;
int n = 0, n2 = 0;
@@ -1260,7 +1258,7 @@
}
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
if (nr) {
if (partial == chain) {
/* Shared branch grows from the inode */
@@ -1285,13 +1283,11 @@
partial->p + 1,
(__le32 *)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
partial--;
}
end_range:
- partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+ partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
if (nr2) {
if (partial2 == chain2) {
/*
@@ -1321,16 +1317,14 @@
(__le32 *)partial2->bh->b_data,
partial2->p,
(chain2+n2-1) - partial2);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
partial2--;
}
goto do_indirects;
}
/* Punch happened within the same level (n == n2) */
- partial = ext4_find_shared(inode, n, offsets, chain, &nr);
- partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+ partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
+ partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
/* Free top, but only if partial2 isn't its subtree. */
if (nr) {
@@ -1387,11 +1381,7 @@
partial->p + 1,
partial2->p,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
- return 0;
+ goto cleanup;
}
/*
@@ -1406,8 +1396,6 @@
partial->p + 1,
(__le32 *)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
- BUFFER_TRACE(partial->bh, "call brelse");
- brelse(partial->bh);
partial--;
}
if (partial2 > chain2 && depth2 <= depth) {
@@ -1415,11 +1403,21 @@
(__le32 *)partial2->bh->b_data,
partial2->p,
(chain2+n2-1) - partial2);
- BUFFER_TRACE(partial2->bh, "call brelse");
- brelse(partial2->bh);
partial2--;
}
}
+
+cleanup:
+ while (p && p > chain) {
+ BUFFER_TRACE(p->bh, "call brelse");
+ brelse(p->bh);
+ p--;
+ }
+ while (p2 && p2 > chain2) {
+ BUFFER_TRACE(p2->bh, "call brelse");
+ brelse(p2->bh);
+ p2--;
+ }
return 0;
do_indirects:
@@ -1427,30 +1425,33 @@
switch (offsets[0]) {
default:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_IND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_IND_BLOCK:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_DIND_BLOCK:
if (++n >= n2)
- return 0;
+ break;
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
+ /* fall through */
case EXT4_TIND_BLOCK:
;
}
- return 0;
+ goto cleanup;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c4bac1..2fec62d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -705,8 +705,11 @@
if (!PageUptodate(page)) {
ret = ext4_read_inline_page(inode, page);
- if (ret < 0)
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
goto out_up_read;
+ }
}
ret = 1;
@@ -1129,7 +1132,6 @@
{
int err, csum_size = 0, header_size = 0;
struct ext4_dir_entry_2 *de;
- struct ext4_dir_entry_tail *t;
void *target = dir_block->b_data;
/*
@@ -1155,13 +1157,11 @@
inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
inode->i_sb->s_blocksize - csum_size);
- if (csum_size) {
- t = EXT4_DIRENT_TAIL(dir_block->b_data,
- inode->i_sb->s_blocksize);
- initialize_dirent_tail(t, inode->i_sb->s_blocksize);
- }
+ if (csum_size)
+ ext4_initialize_dirent_tail(dir_block,
+ inode->i_sb->s_blocksize);
set_buffer_uptodate(dir_block);
- err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
if (err)
return err;
set_buffer_verified(dir_block);
@@ -1324,11 +1324,11 @@
 * inlined dir. It returns the number of directory entries loaded
* into the tree. If there is an error it is returned in err.
*/
-int htree_inlinedir_to_tree(struct file *dir_file,
- struct inode *dir, ext4_lblk_t block,
- struct dx_hash_info *hinfo,
- __u32 start_hash, __u32 start_minor_hash,
- int *has_inline_data)
+int ext4_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data)
{
int err = 0, count = 0;
unsigned int parent_ino;
@@ -1404,7 +1404,7 @@
}
}
- ext4fs_dirhash(de->name, de->name_len, hinfo);
+ ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1416,7 +1416,7 @@
err = ext4_htree_store_dirent(dir_file, hinfo->hash,
hinfo->minor_hash, de, &tmp_str);
if (err) {
- count = err;
+ ret = err;
goto out;
}
count++;
@@ -1887,12 +1887,12 @@
physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
physical += offsetof(struct ext4_inode, i_block);
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, start, physical,
- inline_len, flags);
brelse(iloc.bh);
out:
up_read(&EXT4_I(inode)->xattr_sem);
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, start, physical,
+ inline_len, flags);
return (error < 0 ? error : 0);
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 244531d..d691d17 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -391,7 +391,7 @@
* inode's preallocations.
*/
if ((ei->i_reserved_data_blocks == 0) &&
- (atomic_read(&inode->i_writecount) == 0))
+ !inode_is_open_for_write(inode))
ext4_discard_preallocations(inode);
}
@@ -399,6 +399,10 @@
unsigned int line,
struct ext4_map_blocks *map)
{
+ if (ext4_has_feature_journal(inode->i_sb) &&
+ (inode->i_ino ==
+ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ return 0;
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
@@ -415,7 +419,7 @@
{
int ret;
- if (ext4_encrypted_inode(inode))
+ if (IS_ENCRYPTED(inode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -523,7 +527,7 @@
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -541,7 +545,7 @@
map->m_len = retval;
retval = 0;
} else {
- BUG_ON(1);
+ BUG();
}
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(handle, inode, map,
@@ -577,8 +581,8 @@
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
- ext4_find_delalloc_range(inode, map->m_lblk,
- map->m_lblk + map->m_len - 1))
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk,
map->m_len, map->m_pblk, status);
@@ -678,8 +682,6 @@
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
@@ -693,7 +695,7 @@
* extent status tree.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
- ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es))
goto out_sem;
}
@@ -701,8 +703,8 @@
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
- ext4_find_delalloc_range(inode, map->m_lblk,
- map->m_lblk + map->m_len - 1))
+ ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
+ map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
@@ -729,10 +731,16 @@
!(flags & EXT4_GET_BLOCKS_ZERO) &&
!ext4_is_quota_file(inode) &&
ext4_should_order_data(inode)) {
+ loff_t start_byte =
+ (loff_t)map->m_lblk << inode->i_blkbits;
+ loff_t length = (loff_t)map->m_len << inode->i_blkbits;
+
if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
- ret = ext4_jbd2_inode_add_wait(handle, inode);
+ ret = ext4_jbd2_inode_add_wait(handle, inode,
+ start_byte, length);
else
- ret = ext4_jbd2_inode_add_write(handle, inode);
+ ret = ext4_jbd2_inode_add_write(handle, inode,
+ start_byte, length);
if (ret)
return ret;
}
@@ -1016,7 +1024,7 @@
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
- if (!bh || buffer_uptodate(bh))
+ if (!bh || ext4_buffer_uptodate(bh))
return bh;
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
wait_on_buffer(bh);
@@ -1043,7 +1051,7 @@
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
- if (bhs[i] && !buffer_uptodate(bhs[i]))
+ if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
&bhs[i]);
@@ -1150,7 +1158,7 @@
return ret;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
@@ -1162,8 +1170,9 @@
int err = 0;
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned bbits;
- struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
- bool decrypt = false;
+ struct buffer_head *bh, *head, *wait[2];
+ int nr_wait = 0;
+ int i;
BUG_ON(!PageLocked(page));
BUG_ON(from > PAGE_SIZE);
@@ -1194,7 +1203,6 @@
if (err)
break;
if (buffer_new(bh)) {
- clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
@@ -1216,24 +1224,32 @@
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
- *wait_bh++ = bh;
- decrypt = ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode);
+ wait[nr_wait++] = bh;
}
}
/*
* If we issued read requests, let them complete.
*/
- while (wait_bh > wait) {
- wait_on_buffer(*--wait_bh);
- if (!buffer_uptodate(*wait_bh))
+ for (i = 0; i < nr_wait; i++) {
+ wait_on_buffer(wait[i]);
+ if (!buffer_uptodate(wait[i]))
err = -EIO;
}
- if (unlikely(err))
+ if (unlikely(err)) {
page_zero_new_buffers(page, from, to);
- else if (decrypt)
- err = fscrypt_decrypt_page(page->mapping->host, page,
- PAGE_SIZE, 0, page->index);
+ } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ for (i = 0; i < nr_wait; i++) {
+ int err2;
+
+ err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+ bh_offset(wait[i]));
+ if (err2) {
+ clear_buffer_uptodate(wait[i]);
+ err = err2;
+ }
+ }
+ }
+
return err;
}
#endif
@@ -1303,7 +1319,7 @@
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block_unwritten);
@@ -1324,6 +1340,9 @@
}
if (ret) {
+ bool extended = (pos + len > inode->i_size) &&
+ !ext4_verity_in_progress(inode);
+
unlock_page(page);
/*
* __block_write_begin may have instantiated a few blocks
@@ -1333,11 +1352,11 @@
* Add inode to orphan list in case we crash before
* truncate finishes
*/
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (extended && ext4_can_truncate(inode))
ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
- if (pos + len > inode->i_size) {
+ if (extended) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
@@ -1390,6 +1409,7 @@
int ret = 0, ret2;
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
if (inline_data) {
@@ -1407,12 +1427,16 @@
/*
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
+ *
+ * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
+ * blocks are being written past EOF, so skip the i_size update.
*/
- i_size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -1423,7 +1447,7 @@
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1434,7 +1458,7 @@
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1495,6 +1519,7 @@
unsigned from, to;
int size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
+ bool verity = ext4_verity_in_progress(inode);
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_SIZE - 1);
@@ -1524,13 +1549,14 @@
if (!partial)
SetPageUptodate(page);
}
- size_changed = ext4_update_inode_size(inode, pos + copied);
+ if (!verity)
+ size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
unlock_page(page);
put_page(page);
- if (old_size < pos)
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
if (size_changed || inline_data) {
@@ -1539,7 +1565,7 @@
ret = ret2;
}
- if (pos + len > inode->i_size && ext4_can_truncate(inode))
+ if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
@@ -1550,7 +1576,7 @@
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- if (pos + len > inode->i_size) {
+ if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
@@ -1595,7 +1621,7 @@
return 0; /* success */
}
-static void ext4_da_release_space(struct inode *inode, int to_free)
+void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1630,64 +1656,6 @@
dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}
-static void ext4_da_page_release_reservation(struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- int to_release = 0, contiguous_blks = 0;
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
- struct inode *inode = page->mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int stop = offset + length;
- int num_clusters;
- ext4_fsblk_t lblk;
-
- BUG_ON(stop > PAGE_SIZE || stop < length);
-
- head = page_buffers(page);
- bh = head;
- do {
- unsigned int next_off = curr_off + bh->b_size;
-
- if (next_off > stop)
- break;
-
- if ((offset <= curr_off) && (buffer_delay(bh))) {
- to_release++;
- contiguous_blks++;
- clear_buffer_delay(bh);
- } else if (contiguous_blks) {
- lblk = page->index <<
- (PAGE_SHIFT - inode->i_blkbits);
- lblk += (curr_off >> inode->i_blkbits) -
- contiguous_blks;
- ext4_es_remove_extent(inode, lblk, contiguous_blks);
- contiguous_blks = 0;
- }
- curr_off = next_off;
- } while ((bh = bh->b_this_page) != head);
-
- if (contiguous_blks) {
- lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
- lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
- ext4_es_remove_extent(inode, lblk, contiguous_blks);
- }
-
- /* If we have released all the blocks belonging to a cluster, then we
- * need to release the reserved space for that cluster. */
- num_clusters = EXT4_NUM_B2C(sbi, to_release);
- while (num_clusters > 0) {
- lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
- ((num_clusters - 1) << sbi->s_cluster_bits);
- if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk))
- ext4_da_release_space(inode, 1);
-
- num_clusters--;
- }
-}
-
/*
* Delayed allocation stuff
*/
@@ -1781,6 +1749,65 @@
}
/*
+ * ext4_insert_delayed_block - adds a delayed block to the extents status
+ * tree, incrementing the reserved cluster/block
+ * count or making a pending reservation
+ * where needed
+ *
+ * @inode - file containing the newly added block
+ * @lblk - logical block to be added
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int ret;
+ bool allocated = false;
+
+ /*
+ * If the cluster containing lblk is shared with a delayed,
+ * written, or unwritten extent in a bigalloc file system, it's
+ * already been accounted for and does not need to be reserved.
+ * A pending reservation must be made for the cluster if it's
+ * shared with a written or unwritten extent and doesn't already
+ * have one. Written and unwritten extents can be purged from the
+ * extents status tree if the system is under memory pressure, so
+ * it's necessary to examine the extent tree if a search of the
+ * extents status tree doesn't get a match.
+ */
+ if (sbi->s_cluster_ratio == 1) {
+ ret = ext4_da_reserve_space(inode);
+ if (ret != 0) /* ENOSPC */
+ goto errout;
+ } else { /* bigalloc */
+ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
+ if (!ext4_es_scan_clu(inode,
+ &ext4_es_is_mapped, lblk)) {
+ ret = ext4_clu_mapped(inode,
+ EXT4_B2C(sbi, lblk));
+ if (ret < 0)
+ goto errout;
+ if (ret == 0) {
+ ret = ext4_da_reserve_space(inode);
+ if (ret != 0) /* ENOSPC */
+ goto errout;
+ } else {
+ allocated = true;
+ }
+ } else {
+ allocated = true;
+ }
+ }
+ }
+
+ ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+
+errout:
+ return ret;
+}
+
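To make the branch structure above concrete: on a bigalloc file system with a cluster ratio of 16, delaying block 35 maps to cluster 2 (EXT4_B2C). If the extents status tree already records a delayed-only extent in that cluster, nothing further is reserved. If it instead records a written or unwritten extent there, or ext4_clu_mapped() finds one in the on-disk extent tree, allocated is set so that ext4_es_insert_delayed_block() records a pending reservation rather than charging a new cluster. Only when the cluster is entirely unused does ext4_da_reserve_space() take a fresh reservation.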
+/*
 * This function grabs code from the very beginning of
* ext4_map_blocks, but assumes that the caller is from delayed write
* time. This function looks up the requested blocks and sets the
@@ -1808,7 +1835,7 @@
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, iblock, &es)) {
+ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
if (ext4_es_is_hole(&es)) {
retval = 0;
down_read(&EXT4_I(inode)->i_data_sem);
@@ -1836,7 +1863,7 @@
else if (ext4_es_is_unwritten(&es))
map->m_flags |= EXT4_MAP_UNWRITTEN;
else
- BUG_ON(1);
+ BUG();
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
@@ -1859,28 +1886,14 @@
add_delayed:
if (retval == 0) {
int ret;
+
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
- /*
- * If the block was allocated from previously allocated cluster,
- * then we don't need to reserve it again. However we still need
- * to reserve metadata for every block we're going to write.
- */
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
- ret = ext4_da_reserve_space(inode);
- if (ret) {
- /* not enough space to reserve */
- retval = ret;
- goto out_unlock;
- }
- }
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- ~0, EXTENT_STATUS_DELAYED);
- if (ret) {
+ ret = ext4_insert_delayed_block(inode, map->m_lblk);
+ if (ret != 0) {
retval = ret;
goto out_unlock;
}
@@ -2116,7 +2129,8 @@
trace_ext4_writepage(page);
size = i_size_read(inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2200,7 +2214,8 @@
* after page tables are updated.
*/
size = i_size_read(mpd->inode);
- if (page->index == size >> PAGE_SHIFT)
+ if (page->index == size >> PAGE_SHIFT &&
+ !ext4_verity_in_progress(mpd->inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
@@ -2299,6 +2314,9 @@
ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
>> inode->i_blkbits;
+ if (ext4_verity_in_progress(inode))
+ blocks = EXT_MAX_BLOCKS;
+
do {
BUG_ON(buffer_locked(bh));
@@ -2460,10 +2478,6 @@
}
BUG_ON(map->m_len == 0);
- if (map->m_flags & EXT4_MAP_NEW) {
- clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk,
- map->m_len);
- }
return 0;
}
@@ -2613,7 +2627,7 @@
long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
- int tag;
+ xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
@@ -2743,14 +2757,6 @@
goto out_writepages;
}
- if (ext4_should_dioread_nolock(inode)) {
- /*
- * We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
- */
- rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
- }
-
/*
* If we have inline data and arrive here, it means that
* we will soon create the block for the 1st page, so
@@ -2769,6 +2775,15 @@
ext4_journal_stop(handle);
}
+ if (ext4_should_dioread_nolock(inode)) {
+ /*
+ * We may need to convert up to one extent per block in
+ * the page and we may dirty the inode.
+ */
+ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+ PAGE_SIZE >> inode->i_blkbits);
+ }
+
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2805,12 +2820,12 @@
goto unplug;
}
ret = mpage_prepare_extent_to_map(&mpd);
+ /* Unlock pages we didn't use */
+ mpage_release_unused_pages(&mpd, false);
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
ext4_put_io_end_defer(mpd.io_submit.io_end);
mpd.io_submit.io_end = NULL;
- /* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
if (ret < 0)
goto unplug;
@@ -2878,10 +2893,11 @@
handle = NULL;
mpd.do_map = 0;
}
- /* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
/* Unlock pages we didn't use */
mpage_release_unused_pages(&mpd, give_up_on_write);
+ /* Submit prepared bio */
+ ext4_io_submit(&mpd.io_submit);
+
/*
* Drop our io_end reference we got from init. We have
* to be careful and use deferred io_end finishing if
@@ -3017,8 +3033,8 @@
index = pos >> PAGE_SHIFT;
- if (ext4_nonda_switch(inode->i_sb) ||
- S_ISLNK(inode->i_mode)) {
+ if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) ||
+ ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
return ext4_write_begin(file, mapping, pos,
len, flags, pagep, fsdata);
@@ -3074,7 +3090,7 @@
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ret = ext4_block_write_begin(page, pos, len,
ext4_da_get_block_prep);
#else
@@ -3183,24 +3199,6 @@
return ret ? ret : copied;
}
-static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * Drop reserved blocks
- */
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
- goto out;
-
- ext4_da_page_release_reservation(page, offset, length);
-
-out:
- ext4_invalidatepage(page, offset, length);
-
- return;
-}
-
/*
* Force all delayed allocation blocks to be allocated for a given inode.
*/
@@ -3450,7 +3448,8 @@
ext4_lblk_t end = map.m_lblk + map.m_len - 1;
struct extent_status es;
- ext4_es_find_delayed_extent_range(inode, map.m_lblk, end, &es);
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map.m_lblk, end, &es);
if (!es.es_len || es.es_lblk > end) {
/* entire range is a hole */
@@ -3848,10 +3847,12 @@
loff_t offset = iocb->ki_pos;
ssize_t ret;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+#ifdef CONFIG_FS_ENCRYPTION
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return 0;
#endif
+ if (fsverity_active(inode))
+ return 0;
/*
* If we are doing data journalling we don't support O_DIRECT
@@ -3940,7 +3941,7 @@
.write_end = ext4_da_write_end,
.set_page_dirty = ext4_set_page_dirty,
.bmap = ext4_bmap,
- .invalidatepage = ext4_da_invalidatepage,
+ .invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
@@ -4033,13 +4034,11 @@
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
- if (S_ISREG(inode->i_mode) &&
- ext4_encrypted_inode(inode)) {
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
- BUG_ON(blocksize != PAGE_SIZE);
- WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host,
- page, PAGE_SIZE, 0, page->index));
+ WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
+ page, blocksize, bh_offset(bh)));
}
}
if (ext4_should_journal_data(inode)) {
@@ -4057,7 +4056,8 @@
err = 0;
mark_buffer_dirty(bh);
if (ext4_should_order_data(inode))
- err = ext4_jbd2_inode_add_write(handle, inode);
+ err = ext4_jbd2_inode_add_write(handle, inode, from,
+ length);
}
unlock:
@@ -4110,7 +4110,7 @@
struct inode *inode = mapping->host;
/* If we are processing an encrypted inode during orphan list handling */
- if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
blocksize = inode->i_sb->s_blocksize;
@@ -4253,6 +4253,15 @@
trace_ext4_punch_hole(inode, offset, length, 0);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ if (ext4_has_inline_data(inode)) {
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_convert_inline_data(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ if (ret)
+ return ret;
+ }
+
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4542,6 +4551,7 @@
struct buffer_head *bh;
struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
+ struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
@@ -4630,6 +4640,7 @@
* If we need to do any I/O, try to pre-readahead extra
* blocks from the inode table.
*/
+ blk_start_plug(&plug);
if (EXT4_SB(sb)->s_inode_readahead_blks) {
ext4_fsblk_t b, end, table;
unsigned num;
@@ -4660,6 +4671,7 @@
get_bh(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
EXT4_ERROR_INODE_BLOCK(inode, block,
@@ -4690,7 +4702,9 @@
return false;
if (ext4_has_inline_data(inode))
return false;
- if (ext4_encrypted_inode(inode))
+ if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
+ return false;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
return true;
}
@@ -4714,9 +4728,13 @@
new_fl |= S_DAX;
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
+ if (flags & EXT4_CASEFOLD_FL)
+ new_fl |= S_CASEFOLD;
+ if (flags & EXT4_VERITY_FL)
+ new_fl |= S_VERITY;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
- S_ENCRYPTED);
+ S_ENCRYPTED|S_CASEFOLD|S_VERITY);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -4786,7 +4804,9 @@
return inode_peek_iversion(inode);
}
-struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+ ext4_iget_flags flags, const char *function,
+ unsigned int line)
{
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
@@ -4800,6 +4820,18 @@
gid_t i_gid;
projid_t i_projid;
+ if ((!(flags & EXT4_IGET_SPECIAL) &&
+ (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+ (ino < EXT4_ROOT_INO) ||
+ (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+ if (flags & EXT4_IGET_HANDLE)
+ return ERR_PTR(-ESTALE);
+ __ext4_error(sb, function, line,
+ "inode #%lu: comm %s: iget: illegal inode #",
+ ino, current->comm);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -4815,18 +4847,26 @@
raw_inode = ext4_raw_inode(&iloc);
if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
- EXT4_ERROR_INODE(inode, "root inode unallocated");
+ ext4_error_inode(inode, function, line, 0,
+ "iget: root inode unallocated");
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ if ((flags & EXT4_IGET_HANDLE) &&
+ (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
+ ret = -ESTALE;
+ goto bad_inode;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb) ||
(ei->i_extra_isize & 3)) {
- EXT4_ERROR_INODE(inode,
- "bad extra_isize %u (inode size %u)",
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bad extra_isize %u "
+ "(inode size %u)",
ei->i_extra_isize,
EXT4_INODE_SIZE(inode->i_sb));
ret = -EFSCORRUPTED;
@@ -4848,7 +4888,8 @@
}
if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
- EXT4_ERROR_INODE(inode, "checksum invalid");
+ ext4_error_inode(inode, function, line, 0,
+ "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4905,7 +4946,8 @@
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) {
- EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
goto bad_inode;
}
@@ -4981,7 +5023,8 @@
ret = 0;
if (ei->i_file_acl &&
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
- EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bad extended attribute block %llu",
ei->i_file_acl);
ret = -EFSCORRUPTED;
goto bad_inode;
@@ -5009,12 +5052,13 @@
} else if (S_ISLNK(inode->i_mode)) {
/* VFS does not allow setting these so must be corruption */
if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- EXT4_ERROR_INODE(inode,
- "immutable or append flags not allowed on symlinks");
+ ext4_error_inode(inode, function, line, 0,
+ "iget: immutable or append flags "
+ "not allowed on symlinks");
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if (ext4_encrypted_inode(inode)) {
+ if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
ext4_set_aops(inode);
} else if (ext4_inode_is_fast_symlink(inode)) {
@@ -5040,9 +5084,13 @@
make_bad_inode(inode);
} else {
ret = -EFSCORRUPTED;
- EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+ ext4_error_inode(inode, function, line, 0,
+ "iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ ext4_error_inode(inode, function, line, 0,
+ "casefold flag without casefold feature");
brelse(iloc.bh);
unlock_new_inode(inode);
@@ -5054,13 +5102,6 @@
return ERR_PTR(ret);
}
-struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
-{
- if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-EFSCORRUPTED);
- return ext4_iget(sb, ino);
-}
-
static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
@@ -5299,7 +5340,6 @@
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err)
goto out_brelse;
- ext4_update_dynamic_rev(sb);
ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
@@ -5349,9 +5389,13 @@
{
int err;
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
+ sb_rdonly(inode->i_sb))
return 0;
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
@@ -5367,7 +5411,8 @@
if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
- err = ext4_force_commit(inode->i_sb);
+ err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
} else {
struct ext4_iloc iloc;
@@ -5466,6 +5511,14 @@
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
+ if (unlikely(IS_APPEND(inode) &&
+ (ia_valid & (ATTR_MODE | ATTR_UID |
+ ATTR_GID | ATTR_TIMES_SET))))
+ return -EPERM;
+
error = setattr_prepare(dentry, attr);
if (error)
return error;
@@ -5474,6 +5527,10 @@
if (error)
return error;
+ error = fsverity_prepare_setattr(dentry, attr);
+ if (error)
+ return error;
+
if (is_quota_modification(inode, attr)) {
error = dquot_initialize(inode);
if (error)
@@ -5517,7 +5574,7 @@
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
- int shrink = (attr->ia_size <= inode->i_size);
+ int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -5531,18 +5588,33 @@
if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
inode_inc_iversion(inode);
- if (ext4_should_order_data(inode) &&
- (attr->ia_size < inode->i_size)) {
- error = ext4_begin_ordered_truncate(inode,
+ if (shrink) {
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error)
- goto err_out;
+ if (error)
+ goto err_out;
+ }
+ /*
+ * Blocks are going to be removed from the inode. Wait
+ * for dio in flight.
+ */
+ inode_dio_wait(inode);
}
+
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+
+ rc = ext4_break_layouts(inode);
+ if (rc) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ return rc;
+ }
+
if (attr->ia_size != inode->i_size) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
- goto err_out;
+ goto out_mmap_sem;
}
if (ext4_handle_valid(handle) && shrink) {
error = ext4_orphan_add(handle, inode);
@@ -5570,33 +5642,14 @@
i_size_write(inode, attr->ia_size);
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
- if (error) {
- if (orphan)
- ext4_orphan_del(NULL, inode);
- goto err_out;
- }
- }
- if (!shrink)
- pagecache_isize_extended(inode, oldsize, inode->i_size);
-
- /*
- * Blocks are going to be removed from the inode. Wait
- * for dio in flight. Temporarily disable
- * dioread_nolock to prevent livelock.
- */
- if (orphan) {
- if (!ext4_should_journal_data(inode)) {
- inode_dio_wait(inode);
- } else
+ if (error)
+ goto out_mmap_sem;
+ if (!shrink) {
+ pagecache_isize_extended(inode, oldsize,
+ inode->i_size);
+ } else if (ext4_should_journal_data(inode)) {
ext4_wait_for_tail_page_commit(inode);
- }
- down_write(&EXT4_I(inode)->i_mmap_sem);
-
- rc = ext4_break_layouts(inode);
- if (rc) {
- up_write(&EXT4_I(inode)->i_mmap_sem);
- error = rc;
- goto err_out;
+ }
}
/*
@@ -5604,11 +5657,16 @@
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
- if (shrink) {
+ /*
+ * Call ext4_truncate() even if i_size didn't change to
+ * truncate possible preallocated blocks.
+ */
+ if (attr->ia_size <= oldsize) {
rc = ext4_truncate(inode);
if (rc)
error = rc;
}
+out_mmap_sem:
up_write(&EXT4_I(inode)->i_mmap_sem);
}
@@ -5854,8 +5912,23 @@
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
+ unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
int error;
+ /* this was checked at iget time, but double check for good measure */
+ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ return -EFSCORRUPTED;
+ }
+ if ((new_extra_isize < ei->i_extra_isize) ||
+ (new_extra_isize < 4) ||
+ (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+ return -EINVAL; /* Should never happen */
+
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
@@ -5945,7 +6018,7 @@
ext4_write_lock_xattr(inode, &no_expand);
- BUFFER_TRACE(iloc.bh, "get_write_access");
+ BUFFER_TRACE(iloc->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
@@ -6032,36 +6105,6 @@
return;
}
-#if 0
-/*
- * Bind an inode's backing buffer_head into this transaction, to prevent
- * it from being flushed to disk early. Unlike
- * ext4_reserve_inode_write, this leaves behind no bh reference and
- * returns no iloc structure, so the caller needs to repeat the iloc
- * lookup to mark the inode dirty later.
- */
-static int ext4_pin_inode(handle_t *handle, struct inode *inode)
-{
- struct ext4_iloc iloc;
-
- int err = 0;
- if (handle) {
- err = ext4_get_inode_loc(inode, &iloc);
- if (!err) {
- BUFFER_TRACE(iloc.bh, "get_write_access");
- err = jbd2_journal_get_write_access(handle, iloc.bh);
- if (!err)
- err = ext4_handle_dirty_metadata(handle,
- NULL,
- iloc.bh);
- brelse(iloc.bh);
- }
- }
- ext4_std_error(inode->i_sb, err);
- return err;
-}
-#endif
-
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
journal_t *journal;
@@ -6154,13 +6197,14 @@
return !buffer_mapped(bh);
}
-int ext4_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
loff_t size;
unsigned long len;
- int ret;
+ int err;
+ vm_fault_t ret;
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
@@ -6168,13 +6212,16 @@
get_block_t *get_block;
int retries = 0;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return VM_FAULT_SIGBUS;
+
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
- ret = ext4_convert_inline_data(inode);
- if (ret)
+ err = ext4_convert_inline_data(inode);
+ if (err)
goto out_ret;
/* Delalloc case is easy... */
@@ -6182,9 +6229,9 @@
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = block_page_mkwrite(vma, vmf,
+ err = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
- } while (ret == -ENOSPC &&
+ } while (err == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
goto out_ret;
}
@@ -6229,8 +6276,8 @@
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = block_page_mkwrite(vma, vmf, get_block);
- if (!ret && ext4_should_journal_data(inode)) {
+ err = block_page_mkwrite(vma, vmf, get_block);
+ if (!err && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
@@ -6241,24 +6288,24 @@
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
}
ext4_journal_stop(handle);
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
out_ret:
- ret = block_page_mkwrite_return(ret);
+ ret = block_page_mkwrite_return(err);
out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
-int ext4_filemap_fault(struct vm_fault *vmf)
+vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
- int err;
+ vm_fault_t ret;
down_read(&EXT4_I(inode)->i_mmap_sem);
- err = filemap_fault(vmf);
+ ret = filemap_fault(vmf);
up_read(&EXT4_I(inode)->i_mmap_sem);
- return err;
+ return ret;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0edee31..0b7f316 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -63,18 +63,20 @@
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
+ unsigned long tmp;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
swap(inode1->i_version, inode2->i_version);
- swap(inode1->i_blocks, inode2->i_blocks);
- swap(inode1->i_bytes, inode2->i_bytes);
swap(inode1->i_atime, inode2->i_atime);
swap(inode1->i_mtime, inode2->i_mtime);
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
- swap(ei1->i_flags, ei2->i_flags);
+ tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+ ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+ (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+ ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
swap(ei1->i_disksize, ei2->i_disksize);
ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
@@ -115,28 +117,42 @@
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei_bl;
+ qsize_t size, size_bl, diff;
+ blkcnt_t blocks;
+ unsigned short bytes;
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
- IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
- ext4_has_inline_data(inode))
- return -EINVAL;
-
- if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
- !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
- filemap_flush(inode->i_mapping);
- filemap_flush(inode_bl->i_mapping);
-
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
lock_two_nondirectories(inode, inode_bl);
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+ IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+ (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ||
+ ext4_has_inline_data(inode)) {
+ err = -EINVAL;
+ goto journal_err_out;
+ }
+
+ if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+ !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto journal_err_out;
+ }
+
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto err_out;
+
+ err = filemap_write_and_wait(inode_bl->i_mapping);
+ if (err)
+ goto err_out;
+
/* Wait for all existing dio workers */
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
@@ -147,7 +163,7 @@
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
- goto journal_err_out;
+ goto err_out;
}
/* Protect extent tree against block allocations via delalloc */
@@ -170,6 +186,13 @@
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out1;
+
+ size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+ size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+ diff = size - size_bl;
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = current_time(inode);
@@ -183,34 +206,58 @@
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
+ /* No need to update quota information. */
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
- } else {
- err = ext4_mark_inode_dirty(handle, inode_bl);
- if (err < 0) {
- ext4_warning(inode_bl->i_sb,
- "couldn't mark inode #%lu dirty (err %d)",
- inode_bl->i_ino, err);
- /* Revert all changes: */
- swap_inode_data(inode, inode_bl);
- ext4_mark_inode_dirty(handle, inode);
- ext4_mark_inode_dirty(handle, inode_bl);
- }
+ goto err_out1;
}
+
+ blocks = inode_bl->i_blocks;
+ bytes = inode_bl->i_bytes;
+ inode_bl->i_blocks = inode->i_blocks;
+ inode_bl->i_bytes = inode->i_bytes;
+ err = ext4_mark_inode_dirty(handle, inode_bl);
+ if (err < 0) {
+ /* No need to update quota information. */
+ ext4_warning(inode_bl->i_sb,
+ "couldn't mark inode #%lu dirty (err %d)",
+ inode_bl->i_ino, err);
+ goto revert;
+ }
+
+ /* Bootloader inode should not be counted into quota information. */
+ if (diff > 0)
+ dquot_free_space(inode, diff);
+ else
+ err = dquot_alloc_space(inode, -1 * diff);
+
+ if (err < 0) {
+revert:
+ /* Revert all changes: */
+ inode_bl->i_blocks = blocks;
+ inode_bl->i_bytes = bytes;
+ swap_inode_data(inode, inode_bl);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_mark_inode_dirty(handle, inode_bl);
+ }
+
+err_out1:
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
+err_out:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
journal_err_out:
unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
return err;
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int uuid_is_zero(__u8 u[16])
{
int i;
@@ -222,6 +269,29 @@
}
#endif
+/*
+ * If immutable is set and we are not clearing it, we're not allowed to change
+ * anything else in the inode. Don't error out if we're only trying to set
+ * immutable on an immutable file.
+ */
+static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid,
+ unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int oldflags = ei->i_flags;
+
+ if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL))
+ return 0;
+
+ if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL))
+ return -EPERM;
+ if (ext4_has_feature_project(inode->i_sb) &&
+ __kprojid_val(ei->i_projid) != new_projid)
+ return -EPERM;
+
+ return 0;
+}
+
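In effect, with EXT4_IMMUTABLE_FL already set, a SETFLAGS request that keeps the immutable bit but also tries to flip any other flag is rejected with -EPERM, as is one that would change the project ID on a project-quota file system; a request that merely re-asserts the current flags, or that clears the immutable bit, returns 0 and proceeds to ext4_ioctl_setflags().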
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -231,6 +301,7 @@
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
unsigned int jflag;
+ struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
if (ext4_is_quota_file(inode))
@@ -241,16 +312,9 @@
/* The JOURNAL_DATA flag is modifiable only by root */
jflag = flags & EXT4_JOURNAL_DATA_FL;
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- *
- * This test looks nicer. Thanks to Pauline Middelink
- */
- if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- goto flags_out;
- }
+ err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+ if (err)
+ goto flags_out;
/*
* The JOURNAL_DATA flag can only be changed by
@@ -275,6 +339,37 @@
goto flags_out;
}
+ if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
+ if (!ext4_has_feature_casefold(sb)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
+ if (!S_ISDIR(inode->i_mode)) {
+ err = -ENOTDIR;
+ goto flags_out;
+ }
+
+ if (!ext4_empty_dir(inode)) {
+ err = -ENOTEMPTY;
+ goto flags_out;
+ }
+ }
+
+ /*
+ * Wait for all pending directio and then flush all the dirty pages
+ * for this file. The flush marks all the pages readonly, so any
+ * subsequent attempt to write to the file (particularly mmap pages)
+ * will come through the filesystem and fail.
+ */
+ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) &&
+ (flags & EXT4_IMMUTABLE_FL)) {
+ inode_dio_wait(inode);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err)
+ goto flags_out;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -543,7 +638,7 @@
static int ext4_ioc_getfsmap(struct super_block *sb,
struct fsmap_head __user *arg)
{
- struct getfsmap_info info = {0};
+ struct getfsmap_info info = { NULL };
struct ext4_fsmap_head xhead = {0};
struct fsmap_head head;
bool aborted = false;
@@ -639,30 +734,85 @@
return err;
}
-static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
+static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
{
- /*
- * Project Quota ID state is only allowed to change from within the init
- * namespace. Enforce that restriction only if we are trying to change
- * the quota ID state. Everything else is allowed in user namespaces.
- */
- if (current_user_ns() == &init_user_ns)
- return 0;
+ struct ext4_inode_info *ei = EXT4_I(inode);
- if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
+ simple_fill_fsxattr(fa, ext4_iflags_to_xflags(ei->i_flags &
+ EXT4_FL_USER_VISIBLE));
+
+ if (ext4_has_feature_project(inode->i_sb))
+ fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
+}
+
+/* copied from fs/ioctl.c */
+static int fiemap_check_ranges(struct super_block *sb,
+ u64 start, u64 len, u64 *new_len)
+{
+ u64 maxbytes = (u64) sb->s_maxbytes;
+
+ *new_len = len;
+
+ if (len == 0)
return -EINVAL;
- if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
- if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
- return -EINVAL;
- } else {
- if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
- return -EINVAL;
- }
+ if (start > maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (len > maxbytes || (maxbytes - len) < start)
+ *new_len = maxbytes - start;
return 0;
}
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
+
+static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg)
+{
+ struct fiemap fiemap;
+ struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
+ struct fiemap_extent_info fieinfo = { 0, };
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ u64 len;
+ int error;
+
+ if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
+ return -EFAULT;
+
+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+ return -EINVAL;
+
+ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+ &len);
+ if (error)
+ return error;
+
+ fieinfo.fi_flags = fiemap.fm_flags;
+ fieinfo.fi_extents_max = fiemap.fm_extent_count;
+ fieinfo.fi_extents_start = ufiemap->fm_extents;
+
+ if (fiemap.fm_extent_count != 0 &&
+ !access_ok(fieinfo.fi_extents_start,
+ fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+ return -EFAULT;
+
+ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+ filemap_write_and_wait(inode->i_mapping);
+
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+ fiemap.fm_flags = fieinfo.fi_flags;
+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+ if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
+ error = -EFAULT;
+
+ return error;
+}
+
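Note for reviewers: ext4_ioctl_get_es_cache() above reuses the fiemap calling convention, where userspace passes a struct fiemap followed by fm_extent_count extent slots and the kernel fills in fm_mapped_extents. For reference only (not part of this patch), a minimal userspace caller of the standard FS_IOC_FIEMAP ioctl is sketched below; the new EXT4_IOC_GET_ES_CACHE takes the same struct fiemap argument, so the buffer layout is identical. The helper name dump_extents is made up for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

/* Illustrative helper (hypothetical name): print the file's mapped extents. */
static int dump_extents(int fd, unsigned int max_extents)
{
	size_t sz = sizeof(struct fiemap) +
		    max_extents * sizeof(struct fiemap_extent);
	struct fiemap *fm = calloc(1, sz);
	unsigned int i;

	if (!fm)
		return -1;

	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty pages first */
	fm->fm_extent_count = max_extents;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		free(fm);
		return -1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	return 0;
}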
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -677,6 +827,8 @@
return ext4_ioc_getfsmap(sb, (void __user *)arg);
case EXT4_IOC_GETFLAGS:
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+ if (S_ISREG(inode->i_mode))
+ flags &= ~EXT4_PROJINHERIT_FL;
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
int err;
@@ -704,7 +856,11 @@
return err;
inode_lock(inode);
- err = ext4_ioctl_setflags(inode, flags);
+ err = ext4_ioctl_check_immutable(inode,
+ from_kprojid(&init_user_ns, ei->i_projid),
+ flags);
+ if (!err)
+ err = ext4_ioctl_setflags(inode, flags);
inode_unlock(inode);
mnt_drop_write_file(filp);
return err;
@@ -931,7 +1087,7 @@
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
- if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+ if (!err && (o_group < EXT4_SB(sb)->s_groups_count) &&
ext4_has_group_desc_csum(sb) &&
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, o_group);
@@ -953,6 +1109,13 @@
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+ /*
+ * We haven't replayed the journal, so we cannot use our
+ * block-bitmap-guided storage zapping commands.
+ */
+ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb))
+ return -EROFS;
+
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
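The new check above makes FITRIM fail while the journal still needs replay (a noload mount of a filesystem that has a journal). For context, a trim request like the one fstrim(8) issues can be sketched as follows; everything used here is standard <linux/fs.h> UAPI, and the function name is purely illustrative.

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Illustrative only: trim the filesystem that "mountpoint" lives on. */
static int trim_fs(const char *mountpoint)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = ULLONG_MAX,	/* whole filesystem */
		.minlen = 0,		/* let the fs round this up */
	};
	int fd = open(mountpoint, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	/* Fails with EROFS now if the journal has not been replayed. */
	ret = ioctl(fd, FITRIM, &range);
	if (ret == 0)
		printf("trimmed %llu bytes\n",
		       (unsigned long long)range.len);
	close(fd);
	return ret;
}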
@@ -978,7 +1141,7 @@
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
handle_t *handle;
@@ -1018,19 +1181,67 @@
#endif
}
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_add_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key(filp, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_remove_key_all_users(filp,
+ (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ {
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+ ext4_clear_inode_es(inode);
+ return 0;
+ }
+
+ case EXT4_IOC_GETSTATE:
+ {
+ __u32 state = 0;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED))
+ state |= EXT4_STATE_FLAG_EXT_PRECACHED;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ state |= EXT4_STATE_FLAG_NEW;
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+ state |= EXT4_STATE_FLAG_NEWENTRY;
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
+ state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE;
+
+ return put_user(state, (__u32 __user *) arg);
+ }
+
+ case EXT4_IOC_GET_ES_CACHE:
+ return ext4_ioctl_get_es_cache(filp, arg);
+
case EXT4_IOC_FSGETXATTR:
{
struct fsxattr fa;
- memset(&fa, 0, sizeof(struct fsxattr));
- fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
-
- if (ext4_has_feature_project(inode->i_sb)) {
- fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
- EXT4_I(inode)->i_projid);
- }
+ ext4_fill_fsxattr(inode, &fa);
if (copy_to_user((struct fsxattr __user *)arg,
&fa, sizeof(fa)))
@@ -1039,7 +1250,7 @@
}
case EXT4_IOC_FSSETXATTR:
{
- struct fsxattr fa;
+ struct fsxattr fa, old_fa;
int err;
if (copy_from_user(&fa, (struct fsxattr __user *)arg,
@@ -1062,11 +1273,15 @@
return err;
inode_lock(inode);
- err = ext4_ioctl_check_project(inode, &fa);
+ ext4_fill_fsxattr(inode, &old_fa);
+ err = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
if (err)
goto out;
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
(flags & EXT4_FL_XFLAG_VISIBLE);
+ err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags);
+ if (err)
+ goto out;
err = ext4_ioctl_setflags(inode, flags);
if (err)
goto out;
@@ -1078,6 +1293,17 @@
}
case EXT4_IOC_SHUTDOWN:
return ext4_shutdown(sb, arg);
+
+ case FS_IOC_ENABLE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_enable(filp, (const void __user *)arg);
+
+ case FS_IOC_MEASURE_VERITY:
+ if (!ext4_has_feature_verity(sb))
+ return -EOPNOTSUPP;
+ return fsverity_ioctl_measure(filp, (void __user *)arg);
+
default:
return -ENOTTY;
}
@@ -1138,8 +1364,18 @@
case EXT4_IOC_SET_ENCRYPTION_POLICY:
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
+ case FS_IOC_ENABLE_VERITY:
+ case FS_IOC_MEASURE_VERITY:
+ case EXT4_IOC_CLEAR_ES_CACHE:
+ case EXT4_IOC_GETSTATE:
+ case EXT4_IOC_GET_ES_CACHE:
break;
default:
return -ENOIOCTLCMD;
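A quick way to exercise the SETFLAGS changes above (vfs_ioc_setflags_prepare() plus the new flush-before-immutable step) from userspace is the generic flag ioctl pair. The sketch below is illustrative only; the helper name is made up, and CAP_LINUX_IMMUTABLE is still required to flip the flag.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Illustrative: mark a file immutable.  With this patch, pending direct I/O
 * and dirty pages on a regular file are flushed before the flag sticks. */
static int make_immutable(const char *path)
{
	int fd, flags, ret = -1;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0) {
		flags |= FS_IMMUTABLE_FL;
		/* Needs CAP_LINUX_IMMUTABLE; fails with EPERM otherwise. */
		ret = ioctl(fd, FS_IOC_SETFLAGS, &flags);
	}

	close(fd);
	return ret;
}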
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2..a3e2767 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1539,7 +1539,7 @@
ex->fe_len += 1 << order;
}
- if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
+ if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
/* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
@@ -2490,6 +2490,7 @@
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
+ cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
@@ -2705,6 +2706,7 @@
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
+ cond_resched();
grinfo = ext4_get_group_info(sb, i);
#ifdef DOUBLE_CHECK
kfree(grinfo->bb_bitmap);
@@ -4176,9 +4178,8 @@
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- if ((size == isize) &&
- !ext4_fs_is_busy(sbi) &&
- (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
+ if ((size == isize) && !ext4_fs_is_busy(sbi) &&
+ !inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
@@ -4258,7 +4259,7 @@
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
- atomic_read(&ar->inode->i_writecount) ? "" : "non-");
+ inode_is_open_for_write(ar->inode) ? "" : "non-");
return 0;
}
@@ -4695,8 +4696,9 @@
* ext4_free_blocks() -- Free given blocks and update quota
* @handle: handle for this transaction
* @inode: inode
- * @block: start physical block to free
- * @count: number of blocks to count
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
void ext4_free_blocks(handle_t *handle, struct inode *inode,
@@ -4915,9 +4917,17 @@
&sbi->s_flex_groups[flex_group].free_clusters);
}
- if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
- dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
- percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
+ /*
+ * on a bigalloc file system, defer the s_freeclusters_counter
+ * update to the caller (ext4_remove_space and friends) so they
+ * can determine if a cluster freed here should be rereserved
+ */
+ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ count_clusters);
+ }
ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 61a9d19..b1e4d35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -116,9 +116,9 @@
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, pblock);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -145,9 +145,9 @@
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, pblock);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -175,9 +175,9 @@
int i, retval = 0;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, pblock);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
i_data = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -224,9 +224,9 @@
struct buffer_head *bh;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -254,9 +254,9 @@
struct buffer_head *bh;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
- bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
@@ -382,9 +382,9 @@
struct ext4_extent_header *eh;
block = ext4_idx_pblock(ix);
- bh = sb_bread(inode->i_sb, block);
- if (!bh)
- return -EIO;
+ bh = ext4_sb_bread(inode->i_sb, block, 0);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
eh = (struct ext4_extent_header *)bh->b_data;
if (eh->eh_depth != 0) {
@@ -535,22 +535,22 @@
if (i_data[EXT4_IND_BLOCK]) {
retval = update_ind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
- if (retval)
- goto err_out;
+ if (retval)
+ goto err_out;
} else
lb.curr_block += max_entries;
if (i_data[EXT4_DIND_BLOCK]) {
retval = update_dind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
- if (retval)
- goto err_out;
+ if (retval)
+ goto err_out;
} else
lb.curr_block += max_entries * max_entries;
if (i_data[EXT4_TIND_BLOCK]) {
retval = update_tind_extent_range(handle, tmp_inode,
le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
- if (retval)
- goto err_out;
+ if (retval)
+ goto err_out;
}
/*
* Build the last extent
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2f5be02..30ce3dc 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -13,11 +13,10 @@
#include "ext4_extents.h"
/**
- * get_ext_path - Find an extent path for designated logical block number.
- *
- * @inode: an inode which is searched
+ * get_ext_path() - Find an extent path for designated logical block number.
+ * @inode: inode to be searched
* @lblock: logical block number to find an extent path
- * @path: pointer to an extent path pointer (for output)
+ * @ppath: pointer to an extent path pointer (for output)
*
* ext4_find_extent wrapper. Return 0 on success, or a negative error value
* on failure.
@@ -42,8 +41,9 @@
}
/**
- * ext4_double_down_write_data_sem - Acquire two inodes' write lock
- * of i_data_sem
+ * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem
+ * @first: inode to be locked
+ * @second: inode to be locked
*
* Acquire write lock of i_data_sem of the two inodes
*/
@@ -390,7 +390,8 @@
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
- *err = ext4_jbd2_inode_add_write(handle, orig_inode);
+ *err = ext4_jbd2_inode_add_write(handle, orig_inode,
+ (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
unlock_pages:
unlock_page(pagep[0]);
@@ -592,8 +593,7 @@
return -EOPNOTSUPP;
}
- if (ext4_encrypted_inode(orig_inode) ||
- ext4_encrypted_inode(donor_inode)) {
+ if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) {
ext4_msg(orig_inode->i_sb, KERN_ERR,
"Online defrag not supported for encrypted files");
return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ffa2575..a427d20 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -35,6 +35,7 @@
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/iversion.h>
+#include <linux/unicode.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -81,8 +82,18 @@
static int ext4_dx_csum_verify(struct inode *inode,
struct ext4_dir_entry *dirent);
+/*
+ * Hints to ext4_read_dirblock regarding whether we expect a directory
+ * block being read to be an index block, or a block containing
+ * directory entries (and if the latter, whether it was found via a
+ * logical block in an htree index block). This is used to control
+ * what sort of sanity checking ext4_read_dirblock() will do on the
+ * directory block read from the storage device. EITHER means
+ * the caller doesn't know what kind of directory block will be read,
+ * so no specific verification will be done.
+ */
typedef enum {
- EITHER, INDEX, DIRENT
+ EITHER, INDEX, DIRENT, DIRENT_HTREE
} dirblock_type_t;
#define ext4_read_dirblock(inode, block, type) \
@@ -108,11 +119,14 @@
return bh;
}
- if (!bh) {
+ if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
ext4_error_inode(inode, func, line, block,
- "Directory hole found");
+ "Directory hole found for htree %s block",
+ (type == INDEX) ? "index" : "leaf");
return ERR_PTR(-EFSCORRUPTED);
}
+ if (!bh)
+ return NULL;
dirent = (struct ext4_dir_entry *) bh->b_data;
/* Determine whether or not we have an index block */
if (is_dx(inode)) {
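With the hunk above, ext4_read_dirblock() has a three-way result for callers that pass EITHER or DIRENT: an ERR_PTR() on read or verification failure, NULL for a directory hole that is now tolerated, or a valid buffer_head. A hypothetical caller (illustrative only, mirroring the ext4_add_entry() hunk later in this patch) would handle it like this:

/* Hypothetical caller, not part of the patch. */
static int ext4_use_dirblock(handle_t *handle, struct inode *dir,
			     ext4_lblk_t block)
{
	struct buffer_head *bh;

	bh = ext4_read_dirblock(dir, block, DIRENT);
	if (IS_ERR(bh))
		return PTR_ERR(bh);	/* unreadable block or failed checks */
	if (bh == NULL) {
		/* hole: allocate a fresh block, as ext4_add_entry() now does */
		bh = ext4_bread(handle, dir, block, EXT4_GET_BLOCKS_CREATE);
		if (IS_ERR(bh))
			return PTR_ERR(bh);
	}
	/* ... work on bh->b_data ... */
	brelse(bh);
	return 0;
}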
@@ -149,7 +163,7 @@
}
}
if (!is_dx_block) {
- if (ext4_dirent_csum_verify(inode, dirent))
+ if (ext4_dirblock_csum_verify(inode, bh))
set_buffer_verified(bh);
else {
ext4_error_inode(inode, func, line, block,
@@ -279,9 +293,11 @@
struct inode *dir, struct inode *inode);
/* checksumming functions */
-void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
- unsigned int blocksize)
+void ext4_initialize_dirent_tail(struct buffer_head *bh,
+ unsigned int blocksize)
{
+ struct ext4_dir_entry_tail *t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+
memset(t, 0, sizeof(struct ext4_dir_entry_tail));
t->det_rec_len = ext4_rec_len_to_disk(
sizeof(struct ext4_dir_entry_tail), blocksize);
@@ -290,17 +306,17 @@
/* Walk through a dirent block to find a checksum "dirent" at the tail */
static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
- struct ext4_dir_entry *de)
+ struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
#ifdef PARANOID
struct ext4_dir_entry *d, *top;
- d = de;
- top = (struct ext4_dir_entry *)(((void *)de) +
+ d = (struct ext4_dir_entry *)bh->b_data;
+ top = (struct ext4_dir_entry *)(bh->b_data +
(EXT4_BLOCK_SIZE(inode->i_sb) -
- sizeof(struct ext4_dir_entry_tail)));
+ sizeof(struct ext4_dir_entry_tail)));
while (d < top && d->rec_len)
d = (struct ext4_dir_entry *)(((void *)d) +
le16_to_cpu(d->rec_len));
@@ -310,7 +326,7 @@
t = (struct ext4_dir_entry_tail *)d;
#else
- t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+ t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb));
#endif
if (t->det_reserved_zero1 ||
@@ -322,8 +338,7 @@
return t;
}
-static __le32 ext4_dirent_csum(struct inode *inode,
- struct ext4_dir_entry *dirent, int size)
+static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -343,49 +358,49 @@
"No space for directory leaf checksum. Please run e2fsck -D.");
}
-int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
- t = get_dirent_tail(inode, dirent);
+ t = get_dirent_tail(inode, bh);
if (!t) {
warn_no_space_for_csum(inode);
return 0;
}
- if (t->det_checksum != ext4_dirent_csum(inode, dirent,
- (void *)t - (void *)dirent))
+ if (t->det_checksum != ext4_dirblock_csum(inode, bh->b_data,
+ (char *)t - bh->b_data))
return 0;
return 1;
}
-static void ext4_dirent_csum_set(struct inode *inode,
- struct ext4_dir_entry *dirent)
+static void ext4_dirblock_csum_set(struct inode *inode,
+ struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
if (!ext4_has_metadata_csum(inode->i_sb))
return;
- t = get_dirent_tail(inode, dirent);
+ t = get_dirent_tail(inode, bh);
if (!t) {
warn_no_space_for_csum(inode);
return;
}
- t->det_checksum = ext4_dirent_csum(inode, dirent,
- (void *)t - (void *)dirent);
+ t->det_checksum = ext4_dirblock_csum(inode, bh->b_data,
+ (char *)t - bh->b_data);
}
-int ext4_handle_dirty_dirent_node(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh)
+int ext4_handle_dirty_dirblock(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
{
- ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ ext4_dirblock_csum_set(inode, bh);
return ext4_handle_dirty_metadata(handle, inode, bh);
}
@@ -612,7 +627,7 @@
{
if (show_names)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
int len;
char *name;
struct fscrypt_str fname_crypto_str =
@@ -621,7 +636,7 @@
name = de->name;
len = de->name_len;
- if (ext4_encrypted_inode(dir))
+ if (IS_ENCRYPTED(dir))
res = fscrypt_get_encryption_info(dir);
if (res) {
printk(KERN_WARNING "Error setting up"
@@ -629,7 +644,7 @@
}
if (!fscrypt_has_encryption_key(dir)) {
/* Directory is not encrypted */
- ext4fs_dirhash(de->name,
+ ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(U)%x.%u ", len,
name, h.hash,
@@ -662,8 +677,8 @@
name = fname_crypto_str.name;
len = fname_crypto_str.len;
}
- ext4fs_dirhash(de->name, de->name_len,
- &h);
+ ext4fs_dirhash(dir, de->name,
+ de->name_len, &h);
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
- base));
@@ -673,7 +688,7 @@
#else
int len = de->name_len;
char *name = de->name;
- ext4fs_dirhash(de->name, de->name_len, &h);
+ ext4fs_dirhash(dir, de->name, de->name_len, &h);
printk("%*.s:%x.%u ", len, name, h.hash,
(unsigned) ((char *) de - base));
#endif
@@ -762,7 +777,7 @@
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (fname && fname_name(fname))
- ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo);
+ ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
@@ -871,12 +886,15 @@
{
struct dx_root_info *info;
int i;
+ unsigned int indirect_levels;
if (frames[0].bh == NULL)
return;
info = &((struct dx_root *)frames[0].bh->b_data)->info;
- for (i = 0; i <= info->indirect_levels; i++) {
+ /* save local copy, "info" may be freed after brelse() */
+ indirect_levels = info->indirect_levels;
+ for (i = 0; i <= indirect_levels; i++) {
if (frames[i].bh == NULL)
break;
brelse(frames[i].bh);
@@ -976,7 +994,7 @@
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
- bh = ext4_read_dirblock(dir, block, DIRENT);
+ bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -984,9 +1002,9 @@
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
- if (ext4_encrypted_inode(dir)) {
+ if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
if (err < 0) {
brelse(bh);
@@ -1008,14 +1026,14 @@
/* silently ignore the rest of the block */
break;
}
- ext4fs_dirhash(de->name, de->name_len, hinfo);
+ ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
continue;
if (de->inode == 0)
continue;
- if (!ext4_encrypted_inode(dir)) {
+ if (!IS_ENCRYPTED(dir)) {
tmp_str.name = de->name;
tmp_str.len = de->name_len;
err = ext4_htree_store_dirent(dir_file,
@@ -1047,7 +1065,7 @@
}
errout:
brelse(bh);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
#endif
return count;
@@ -1086,10 +1104,10 @@
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
- count = htree_inlinedir_to_tree(dir_file, dir, 0,
- &hinfo, start_hash,
- start_minor_hash,
- &has_inline_data);
+ count = ext4_inlinedir_to_tree(dir_file, dir, 0,
+ &hinfo, start_hash,
+ start_minor_hash,
+ &has_inline_data);
if (has_inline_data) {
*next_hash = ~0;
return count;
@@ -1197,7 +1215,7 @@
while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
- ext4fs_dirhash(de->name, de->name_len, &h);
+ ext4fs_dirhash(dir, de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
@@ -1252,24 +1270,105 @@
dx_set_count(entries, count + 1);
}
+#ifdef CONFIG_UNICODE
+/*
+ * Test whether a case-insensitive directory entry matches the filename
+ * being searched for. If quick is set, assume the name being looked up
+ * is already in the casefolded form.
+ *
+ * Returns: 0 if the directory entry matches, a positive value if it
+ * doesn't match, or a negative value on error.
+ */
+int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
+ const struct qstr *entry, bool quick)
+{
+ const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
+ const struct unicode_map *um = sbi->s_encoding;
+ int ret;
+
+ if (quick)
+ ret = utf8_strncasecmp_folded(um, name, entry);
+ else
+ ret = utf8_strncasecmp(um, name, entry);
+
+ if (ret < 0) {
+ /* Handle invalid character sequence as either an error
+ * or as an opaque byte sequence.
+ */
+ if (ext4_has_strict_mode(sbi))
+ return -EINVAL;
+
+ if (name->len != entry->len)
+ return 1;
+
+ return !!memcmp(name->name, entry->name, name->len);
+ }
+
+ return ret;
+}
+
+void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+ struct fscrypt_str *cf_name)
+{
+ int len;
+
+ if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+ cf_name->name = NULL;
+ return;
+ }
+
+ cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+ if (!cf_name->name)
+ return;
+
+ len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+ iname, cf_name->name,
+ EXT4_NAME_LEN);
+ if (len <= 0) {
+ kfree(cf_name->name);
+ cf_name->name = NULL;
+ return;
+ }
+ cf_name->len = (unsigned) len;
+
+}
+#endif
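ext4_ci_compare() above has two paths: a fast path that compares against a name that was already casefolded once at lookup setup time (ext4_fname_setup_ci_filename()), and a slower path that folds on every comparison. The standalone sketch below mimics that control flow with plain ASCII tolower() folding; the real code uses the kernel's utf8_strncasecmp*() helpers, so treat this strictly as an illustration of the structure, not of the Unicode handling.

#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>

/* Stand-in for utf8_strncasecmp(): ASCII-only fold-and-compare. */
static int fold_cmp(const char *a, size_t alen, const char *b, size_t blen)
{
	size_t i;

	if (alen != blen)
		return 1;
	for (i = 0; i < alen; i++)
		if (tolower((unsigned char)a[i]) != tolower((unsigned char)b[i]))
			return 1;
	return 0;
}

/*
 * Mirrors the ext4_match()/ext4_ci_compare() split: if a pre-folded copy of
 * the search name exists, compare the on-disk entry against it directly (the
 * "quick" path); otherwise fold both sides on the fly.
 */
static bool ci_name_matches(const char *entry, size_t entry_len,
			    const char *name, size_t name_len,
			    const char *prefolded, size_t prefolded_len)
{
	if (prefolded)	/* quick path: search name was folded once up front */
		return fold_cmp(prefolded, prefolded_len,
				entry, entry_len) == 0;
	return fold_cmp(name, name_len, entry, entry_len) == 0;
}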
+
/*
* Test whether a directory entry matches the filename being searched for.
*
* Return: %true if the directory entry matches, otherwise %false.
*/
-static inline bool ext4_match(const struct ext4_filename *fname,
+static inline bool ext4_match(const struct inode *parent,
+ const struct ext4_filename *fname,
const struct ext4_dir_entry_2 *de)
{
struct fscrypt_name f;
+#ifdef CONFIG_UNICODE
+ const struct qstr entry = {.name = de->name, .len = de->name_len};
+#endif
if (!de->inode)
return false;
f.usr_fname = fname->usr_fname;
f.disk_name = fname->disk_name;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
f.crypto_buf = fname->crypto_buf;
#endif
+
+#ifdef CONFIG_UNICODE
+ if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+ if (fname->cf_name.name) {
+ struct qstr cf = {.name = fname->cf_name.name,
+ .len = fname->cf_name.len};
+ return !ext4_ci_compare(parent, &cf, &entry, true);
+ }
+ return !ext4_ci_compare(parent, fname->usr_fname, &entry,
+ false);
+ }
+#endif
+
return fscrypt_match_name(&f, de->name, de->name_len);
}
@@ -1290,7 +1389,7 @@
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
if ((char *) de + de->name_len <= dlimit &&
- ext4_match(fname, de)) {
+ ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
@@ -1327,7 +1426,7 @@
}
/*
- * ext4_find_entry()
+ * __ext4_find_entry()
*
* finds an entry in the specified directory with the wanted name. It
* returns the cache buffer in which the entry was found, and the entry
@@ -1337,39 +1436,32 @@
* The returned buffer_head has ->b_count elevated. The caller is expected
* to brelse() it when appropriate.
*/
-static struct buffer_head * ext4_find_entry (struct inode *dir,
- const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir,
- int *inlined)
+static struct buffer_head *__ext4_find_entry(struct inode *dir,
+ struct ext4_filename *fname,
+ struct ext4_dir_entry_2 **res_dir,
+ int *inlined)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
struct buffer_head *bh, *ret = NULL;
ext4_lblk_t start, block;
- const u8 *name = d_name->name;
+ const u8 *name = fname->usr_fname->name;
size_t ra_max = 0; /* Number of bh's in the readahead
buffer, bh_use[] */
size_t ra_ptr = 0; /* Current index into readahead
buffer */
ext4_lblk_t nblocks;
int i, namelen, retval;
- struct ext4_filename fname;
*res_dir = NULL;
sb = dir->i_sb;
- namelen = d_name->len;
+ namelen = fname->usr_fname->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
- retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
- if (retval == -ENOENT)
- return NULL;
- if (retval)
- return ERR_PTR(retval);
-
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
- ret = ext4_find_inline_entry(dir, &fname, res_dir,
+ ret = ext4_find_inline_entry(dir, fname, res_dir,
&has_inline_data);
if (has_inline_data) {
if (inlined)
@@ -1389,7 +1481,7 @@
goto restart;
}
if (is_dx(dir)) {
- ret = ext4_dx_find_entry(dir, &fname, res_dir);
+ ret = ext4_dx_find_entry(dir, fname, res_dir);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
@@ -1444,8 +1536,7 @@
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
- !ext4_dirent_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data)) {
+ !ext4_dirblock_csum_verify(dir, bh)) {
EXT4_ERROR_INODE(dir, "checksumming directory "
"block %lu", (unsigned long)block);
brelse(bh);
@@ -1453,7 +1544,7 @@
goto cleanup_and_exit;
}
set_buffer_verified(bh);
- i = search_dirblock(bh, dir, &fname,
+ i = search_dirblock(bh, dir, fname,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
@@ -1484,10 +1575,50 @@
/* Clean up the read-ahead blocks */
for (; ra_ptr < ra_max; ra_ptr++)
brelse(bh_use[ra_ptr]);
- ext4_fname_free_filename(&fname);
return ret;
}
+static struct buffer_head *ext4_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *inlined)
+{
+ int err;
+ struct ext4_filename fname;
+ struct buffer_head *bh;
+
+ err = ext4_fname_setup_filename(dir, d_name, 1, &fname);
+ if (err == -ENOENT)
+ return NULL;
+ if (err)
+ return ERR_PTR(err);
+
+ bh = __ext4_find_entry(dir, &fname, res_dir, inlined);
+
+ ext4_fname_free_filename(&fname);
+ return bh;
+}
+
+static struct buffer_head *ext4_lookup_entry(struct inode *dir,
+ struct dentry *dentry,
+ struct ext4_dir_entry_2 **res_dir)
+{
+ int err;
+ struct ext4_filename fname;
+ struct buffer_head *bh;
+
+ err = ext4_fname_prepare_lookup(dir, dentry, &fname);
+ if (err == -ENOENT)
+ return NULL;
+ if (err)
+ return ERR_PTR(err);
+
+ bh = __ext4_find_entry(dir, &fname, res_dir, NULL);
+
+ ext4_fname_free_filename(&fname);
+ return bh;
+}
+
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir)
@@ -1498,7 +1629,7 @@
ext4_lblk_t block;
int retval;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
*res_dir = NULL;
#endif
frame = dx_probe(fname, dir, NULL, frames);
@@ -1506,7 +1637,7 @@
return (struct buffer_head *) frame;
do {
block = dx_get_block(frame->at);
- bh = ext4_read_dirblock(dir, block, DIRENT);
+ bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
if (IS_ERR(bh))
goto errout;
@@ -1546,18 +1677,13 @@
struct inode *inode;
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
- int err;
-
- err = fscrypt_prepare_lookup(dir, dentry, flags);
- if (err)
- return ERR_PTR(err);
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ bh = ext4_lookup_entry(dir, dentry, &de);
if (IS_ERR(bh))
- return (struct dentry *) bh;
+ return ERR_CAST(bh);
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1571,14 +1697,14 @@
dentry);
return ERR_PTR(-EFSCORRUPTED);
}
- inode = ext4_iget_normal(dir->i_sb, ino);
+ inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
ino);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+ if (!IS_ERR(inode) && IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
ext4_warning(inode->i_sb,
@@ -1588,6 +1714,17 @@
return ERR_PTR(-EPERM);
}
}
+
+#ifdef CONFIG_UNICODE
+ if (!inode && IS_CASEFOLDED(dir)) {
+ /* Eventually we want to call d_add_ci(dentry, NULL)
+ * for negative dentries in the encoding case as
+ * well. For now, prevent the negative dentry
+ * from being cached.
+ */
+ return NULL;
+ }
+#endif
return d_splice_alias(inode, dentry);
}
@@ -1601,7 +1738,7 @@
bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
if (IS_ERR(bh))
- return (struct dentry *) bh;
+ return ERR_CAST(bh);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -1613,7 +1750,7 @@
return ERR_PTR(-EFSCORRUPTED);
}
- return d_obtain_alias(ext4_iget_normal(child->d_sb, ino));
+ return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL));
}
/*
@@ -1683,7 +1820,6 @@
char *data1 = (*bh)->b_data, *data2;
unsigned split, move, size;
struct ext4_dir_entry_2 *de = NULL, *de2;
- struct ext4_dir_entry_tail *t;
int csum_size = 0;
int err = 0, i;
@@ -1744,11 +1880,8 @@
(char *) de2,
blocksize);
if (csum_size) {
- t = EXT4_DIRENT_TAIL(data2, blocksize);
- initialize_dirent_tail(t, blocksize);
-
- t = EXT4_DIRENT_TAIL(data1, blocksize);
- initialize_dirent_tail(t, blocksize);
+ ext4_initialize_dirent_tail(*bh, blocksize);
+ ext4_initialize_dirent_tail(bh2, blocksize);
}
dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
@@ -1762,7 +1895,7 @@
de = de2;
}
dx_insert_block(frame, hash2 + continued, newblock);
- err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+ err = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (err)
goto journal_error;
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
@@ -1798,7 +1931,7 @@
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
return -EFSCORRUPTED;
- if (ext4_match(fname, de))
+ if (ext4_match(dir, fname, de))
return -EEXIST;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
@@ -1890,7 +2023,7 @@
inode_inc_iversion(dir);
ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ err = ext4_handle_dirty_dirblock(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
return 0;
@@ -1909,8 +2042,7 @@
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2;
- struct ext4_dir_entry_tail *t;
- char *data1, *top;
+ char *data2, *top;
unsigned len;
int retval;
unsigned blocksize;
@@ -1950,21 +2082,18 @@
return PTR_ERR(bh2);
}
ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
- data1 = bh2->b_data;
+ data2 = bh2->b_data;
- memcpy (data1, de, len);
- de = (struct ext4_dir_entry_2 *) data1;
- top = data1 + len;
+ memcpy(data2, de, len);
+ de = (struct ext4_dir_entry_2 *) data2;
+ top = data2 + len;
while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
- de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
- (char *) de,
- blocksize);
+ de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+ (char *) de, blocksize);
- if (csum_size) {
- t = EXT4_DIRENT_TAIL(data1, blocksize);
- initialize_dirent_tail(t, blocksize);
- }
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh2, blocksize);
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
@@ -1983,7 +2112,7 @@
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo);
+ ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);
memset(frames, 0, sizeof(frames));
frame = frames;
@@ -1994,7 +2123,7 @@
retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (retval)
goto out_frames;
- retval = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+ retval = ext4_handle_dirty_dirblock(handle, dir, bh2);
if (retval)
goto out_frames;
@@ -2034,8 +2163,8 @@
struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
- struct ext4_dir_entry_tail *t;
struct super_block *sb;
+ struct ext4_sb_info *sbi;
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2047,10 +2176,17 @@
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
+ sbi = EXT4_SB(sb);
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
+#ifdef CONFIG_UNICODE
+ if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
+ sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
+ return -EINVAL;
+#endif
+
retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
if (retval)
return retval;
@@ -2076,6 +2212,11 @@
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_read_dirblock(dir, block, DIRENT);
+ if (bh == NULL) {
+ bh = ext4_bread(handle, dir, block,
+ EXT4_GET_BLOCKS_CREATE);
+ goto add_to_new_block;
+ }
if (IS_ERR(bh)) {
retval = PTR_ERR(bh);
bh = NULL;
@@ -2096,6 +2237,7 @@
brelse(bh);
}
bh = ext4_append(handle, dir, &block);
+add_to_new_block:
if (IS_ERR(bh)) {
retval = PTR_ERR(bh);
bh = NULL;
@@ -2105,10 +2247,8 @@
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
- if (csum_size) {
- t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
- initialize_dirent_tail(t, blocksize);
- }
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh, blocksize);
retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
out:
@@ -2140,7 +2280,7 @@
return PTR_ERR(frame);
entries = frame->entries;
at = frame->at;
- bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
+ bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE);
if (IS_ERR(bh)) {
err = PTR_ERR(bh);
bh = NULL;
@@ -2262,7 +2402,7 @@
dxroot->info.indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
- info->indirect_levels));
+ dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
if (err)
goto journal_error;
@@ -2366,7 +2506,7 @@
goto out;
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ err = ext4_handle_dirty_dirblock(handle, dir, bh);
if (unlikely(err))
goto out;
@@ -2568,7 +2708,6 @@
{
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
- struct ext4_dir_entry_tail *t;
ext4_lblk_t block = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
@@ -2592,13 +2731,11 @@
de = (struct ext4_dir_entry_2 *)dir_block->b_data;
ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
set_nlink(inode, 2);
- if (csum_size) {
- t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
- initialize_dirent_tail(t, blocksize);
- }
+ if (csum_size)
+ ext4_initialize_dirent_tail(dir_block, blocksize);
BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
if (err)
goto out;
set_buffer_verified(dir_block);
@@ -2688,7 +2825,10 @@
EXT4_ERROR_INODE(inode, "invalid size");
return true;
}
- bh = ext4_read_dirblock(inode, 0, EITHER);
+ /* The first directory block must not be a hole,
+ * so treat it as DIRENT_HTREE
+ */
+ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
return true;
@@ -2710,6 +2850,10 @@
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
bh = ext4_read_dirblock(inode, lblock, EITHER);
+ if (bh == NULL) {
+ offset += sb->s_blocksize;
+ continue;
+ }
if (IS_ERR(bh))
return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
@@ -2975,6 +3119,17 @@
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
+#ifdef CONFIG_UNICODE
+ /* VFS negative dentries are incompatible with Encoding and
+ * Case-insensitiveness. Eventually we'll want to avoid
+ * invalidating the dentries here, along with returning the
+ * negative dentries at ext4_lookup(), when it is better
+ * supported by the VFS for the CI case.
+ */
+ if (IS_CASEFOLDED(dir))
+ d_invalidate(dentry);
+#endif
+
end_rmdir:
brelse(bh);
if (handle)
@@ -3044,6 +3199,17 @@
inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
+#ifdef CONFIG_UNICODE
+ /* VFS negative dentries are incompatible with Encoding and
+ * Case-insensitiveness. Eventually we'll want to avoid
+ * invalidating the dentries here, along with returning the
+ * negative dentries at ext4_lookup(), when it is better
+ * supported by the VFS for the CI case.
+ */
+ if (IS_CASEFOLDED(dir))
+ d_invalidate(dentry);
+#endif
+
end_unlink:
brelse(bh);
if (handle)
@@ -3253,7 +3419,10 @@
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
- bh = ext4_read_dirblock(inode, 0, EITHER);
+ /* The first directory block must not be a hole, so
+ * treat it as DIRENT_HTREE
+ */
+ bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh)) {
*retval = PTR_ERR(bh);
return NULL;
@@ -3314,9 +3483,8 @@
ent->inode,
ent->dir_bh);
} else {
- retval = ext4_handle_dirty_dirent_node(handle,
- ent->inode,
- ent->dir_bh);
+ retval = ext4_handle_dirty_dirblock(handle, ent->inode,
+ ent->dir_bh);
}
} else {
retval = ext4_mark_inode_dirty(handle, ent->inode);
@@ -3346,8 +3514,7 @@
ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
- retval = ext4_handle_dirty_dirent_node(handle,
- ent->dir, ent->bh);
+ retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
if (unlikely(retval)) {
ext4_std_error(ent->dir->i_sb, retval);
return retval;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db75901..12ceade 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -61,14 +61,12 @@
static void ext4_finish_bio(struct bio *bio)
{
- int i;
struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- struct page *data_page = NULL;
-#endif
+ struct page *bounce_page = NULL;
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
unsigned bio_end = bio_start + bvec->bv_len;
@@ -78,13 +76,10 @@
if (!page)
continue;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (!page->mapping) {
- /* The bounce data pages are unmapped. */
- data_page = page;
- fscrypt_pullback_bio_page(&page, false);
+ if (fscrypt_is_bounce_page(page)) {
+ bounce_page = page;
+ page = fscrypt_pagecache_page(bounce_page);
}
-#endif
if (bio->bi_status) {
SetPageError(page);
@@ -111,10 +106,7 @@
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
if (!under_io) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- if (data_page)
- fscrypt_restore_control_page(data_page);
-#endif
+ fscrypt_free_bounce_page(bounce_page);
end_page_writeback(page);
}
}
@@ -374,13 +366,13 @@
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
if (!bio)
return -ENOMEM;
- wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
+ wbc_init_bio(io->io_wbc, bio);
return 0;
}
@@ -404,7 +396,7 @@
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
- wbc_account_io(io->io_wbc, page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
io->io_next_block++;
return 0;
}
@@ -415,7 +407,7 @@
struct writeback_control *wbc,
bool keep_towrite)
{
- struct page *data_page = NULL;
+ struct page *bounce_page = NULL;
struct inode *inode = page->mapping->host;
unsigned block_start;
struct buffer_head *bh, *head;
@@ -467,25 +459,30 @@
ext4_io_submit(io);
continue;
}
- if (buffer_new(bh)) {
+ if (buffer_new(bh))
clear_buffer_new(bh);
- clean_bdev_bh_alias(bh);
- }
set_buffer_async_write(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
- nr_to_submit) {
+ /*
+ * If any blocks are being written to an encrypted file, encrypt them
+ * into a bounce page. For simplicity, just encrypt until the last
+ * block which might be needed. This may cause some unneeded blocks
+ * (e.g. holes) to be unnecessarily encrypted, but this is rare and
+ * can't happen in the common case of blocksize == PAGE_SIZE.
+ */
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
+ unsigned int enc_bytes = round_up(len, i_blocksize(inode));
retry_encrypt:
- data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
- page->index, gfp_flags);
- if (IS_ERR(data_page)) {
- ret = PTR_ERR(data_page);
+ bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
+ 0, gfp_flags);
+ if (IS_ERR(bounce_page)) {
+ ret = PTR_ERR(bounce_page);
if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
if (io->io_bio) {
ext4_io_submit(io);
@@ -494,7 +491,7 @@
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
- data_page = NULL;
+ bounce_page = NULL;
goto out;
}
}
@@ -503,8 +500,7 @@
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode,
- data_page ? data_page : page, bh);
+ ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
@@ -520,8 +516,7 @@
/* Error stopped previous loop? Clean up buffers... */
if (ret) {
out:
- if (data_page)
- fscrypt_restore_control_page(data_page);
+ fscrypt_free_bounce_page(bounce_page);
printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
redirty_page_for_writepage(wbc, page);
do {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index f461d75..a30b203 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -47,13 +47,103 @@
#include "ext4.h"
-static inline bool ext4_bio_encrypted(struct bio *bio)
+#define NUM_PREALLOC_POST_READ_CTXS 128
+
+static struct kmem_cache *bio_post_read_ctx_cache;
+static mempool_t *bio_post_read_ctx_pool;
+
+/* postprocessing steps for read bios */
+enum bio_post_read_step {
+ STEP_INITIAL = 0,
+ STEP_DECRYPT,
+ STEP_VERITY,
+};
+
+struct bio_post_read_ctx {
+ struct bio *bio;
+ struct work_struct work;
+ unsigned int cur_step;
+ unsigned int enabled_steps;
+};
+
+static void __read_end_io(struct bio *bio)
{
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
- return unlikely(bio->bi_private != NULL);
-#else
- return false;
-#endif
+ struct page *page;
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ page = bv->bv_page;
+
+ /* PG_error was set if any post_read step failed */
+ if (bio->bi_status || PageError(page)) {
+ ClearPageUptodate(page);
+ /* will re-read again later */
+ ClearPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ bio_put(bio);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
+
+static void decrypt_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fscrypt_decrypt_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void verity_work(struct work_struct *work)
+{
+ struct bio_post_read_ctx *ctx =
+ container_of(work, struct bio_post_read_ctx, work);
+
+ fsverity_verify_bio(ctx->bio);
+
+ bio_post_read_processing(ctx);
+}
+
+static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
+{
+ /*
+ * We use different work queues for decryption and for verity because
+ * verity may require reading metadata pages that need decryption, and
+ * we shouldn't recurse to the same workqueue.
+ */
+ switch (++ctx->cur_step) {
+ case STEP_DECRYPT:
+ if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
+ INIT_WORK(&ctx->work, decrypt_work);
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ case STEP_VERITY:
+ if (ctx->enabled_steps & (1 << STEP_VERITY)) {
+ INIT_WORK(&ctx->work, verity_work);
+ fsverity_enqueue_verify_work(&ctx->work);
+ return;
+ }
+ ctx->cur_step++;
+ /* fall-through */
+ default:
+ __read_end_io(ctx->bio);
+ }
+}
+
+static bool bio_post_read_required(struct bio *bio)
+{
+ return bio->bi_private && !bio->bi_status;
}
/*
@@ -70,30 +160,53 @@
*/
static void mpage_end_io(struct bio *bio)
{
- struct bio_vec *bv;
- int i;
+ if (bio_post_read_required(bio)) {
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- if (ext4_bio_encrypted(bio)) {
- if (bio->bi_status) {
- fscrypt_release_ctx(bio->bi_private);
- } else {
- fscrypt_enqueue_decrypt_bio(bio->bi_private, bio);
- return;
- }
+ ctx->cur_step = STEP_INITIAL;
+ bio_post_read_processing(ctx);
+ return;
}
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
+ __read_end_io(bio);
+}
- if (!bio->bi_status) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
+static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
+{
+ return fsverity_active(inode) &&
+ idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
+static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
+ struct bio *bio,
+ pgoff_t first_idx)
+{
+ unsigned int post_read_steps = 0;
+ struct bio_post_read_ctx *ctx = NULL;
+
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ post_read_steps |= 1 << STEP_DECRYPT;
+
+ if (ext4_need_verity(inode, first_idx))
+ post_read_steps |= 1 << STEP_VERITY;
+
+ if (post_read_steps) {
+ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->bio = bio;
+ ctx->enabled_steps = post_read_steps;
+ bio->bi_private = ctx;
}
+ return ctx;
+}
- bio_put(bio);
+static inline loff_t ext4_readpage_limit(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_VERITY) &&
+ (IS_VERITY(inode) || ext4_verity_in_progress(inode)))
+ return inode->i_sb->s_maxbytes;
+
+ return i_size_read(inode);
}
int ext4_mpage_readpages(struct address_space *mapping,
@@ -126,9 +239,10 @@
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
- prefetchw(&page->flags);
if (pages) {
- page = list_entry(pages->prev, struct page, lru);
+ page = lru_to_page(pages);
+
+ prefetchw(&page->flags);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
readahead_gfp_mask(mapping)))
@@ -140,7 +254,8 @@
block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (ext4_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -217,6 +332,9 @@
zero_user_segment(page, first_hole << blkbits,
PAGE_SIZE);
if (first_hole == 0) {
+ if (ext4_need_verity(inode, page->index) &&
+ !fsverity_verify_page(page))
+ goto set_error_page;
SetPageUptodate(page);
unlock_page(page);
goto next_page;
@@ -240,19 +358,16 @@
bio = NULL;
}
if (bio == NULL) {
- struct fscrypt_ctx *ctx = NULL;
+ struct bio_post_read_ctx *ctx;
- if (ext4_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
- ctx = fscrypt_get_ctx(inode, GFP_NOFS);
- if (IS_ERR(ctx))
- goto set_error_page;
- }
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
+ if (!bio)
+ goto set_error_page;
+ ctx = get_bio_post_read_ctx(inode, bio, page->index);
+ if (IS_ERR(ctx)) {
+ bio_put(bio);
+ bio = NULL;
goto set_error_page;
}
bio_set_dev(bio, bdev);
@@ -293,3 +408,29 @@
submit_bio(bio);
return 0;
}
+
+int __init ext4_init_post_read_processing(void)
+{
+ bio_post_read_ctx_cache =
+ kmem_cache_create("ext4_bio_post_read_ctx",
+ sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ if (!bio_post_read_ctx_cache)
+ goto fail;
+ bio_post_read_ctx_pool =
+ mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
+ bio_post_read_ctx_cache);
+ if (!bio_post_read_ctx_pool)
+ goto fail_free_cache;
+ return 0;
+
+fail_free_cache:
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+fail:
+ return -ENOMEM;
+}
+
+void ext4_exit_post_read_processing(void)
+{
+ mempool_destroy(bio_post_read_ctx_pool);
+ kmem_cache_destroy(bio_post_read_ctx_cache);
+}
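The read-completion path added above chains optional post-processing steps (decryption, then fs-verity verification) through a small state machine: cur_step advances, each enabled step re-queues the work, and the chain falls through to __read_end_io() once nothing is left. A standalone sketch of just that chaining logic, synchronous and with illustrative names instead of the real workqueues, is:

#include <stdio.h>

/* Mirrors enum bio_post_read_step: step 0 means "nothing done yet". */
enum step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY };

struct ctx {
	unsigned int cur_step;
	unsigned int enabled_steps;	/* bitmask of (1 << STEP_*) */
};

static void do_decrypt(void) { puts("decrypt payload"); }
static void do_verify(void)  { puts("verify against Merkle tree"); }
static void finish(void)     { puts("pages up to date, unlock"); }

/* Same shape as bio_post_read_processing(): advance, run enabled steps. */
static void process(struct ctx *ctx)
{
	switch (++ctx->cur_step) {
	case STEP_DECRYPT:
		if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
			do_decrypt();
			/* the kernel re-enters via a workqueue; recurse here */
			process(ctx);
			return;
		}
		ctx->cur_step++;
		/* fall through */
	case STEP_VERITY:
		if (ctx->enabled_steps & (1 << STEP_VERITY)) {
			do_verify();
			process(ctx);
			return;
		}
		ctx->cur_step++;
		/* fall through */
	default:
		finish();
	}
}

int main(void)
{
	struct ctx c = { STEP_INITIAL,
			 (1 << STEP_DECRYPT) | (1 << STEP_VERITY) };

	process(&c);	/* prints: decrypt, verify, finish */
	return 0;
}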
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a5efee3..c0e9aef 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -127,10 +127,12 @@
else if (free_blocks_count < 0)
ext4_warning(sb, "Bad blocks count %u",
input->blocks_count);
- else if (!(bh = sb_bread(sb, end - 1)))
+ else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) {
+ err = PTR_ERR(bh);
+ bh = NULL;
ext4_warning(sb, "Cannot read last block (%llu)",
end - 1);
- else if (outside(input->block_bitmap, start, end))
+ } else if (outside(input->block_bitmap, start, end))
ext4_warning(sb, "Block bitmap not in group (block %llu)",
(unsigned long long)input->block_bitmap);
else if (outside(input->inode_bitmap, start, end))
@@ -781,11 +783,11 @@
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
- struct buffer_head **o_group_desc, **n_group_desc;
- struct buffer_head *dind;
- struct buffer_head *gdb_bh;
+ struct buffer_head **o_group_desc, **n_group_desc = NULL;
+ struct buffer_head *dind = NULL;
+ struct buffer_head *gdb_bh = NULL;
int gdbackups;
- struct ext4_iloc iloc;
+ struct ext4_iloc iloc = { .bh = NULL };
__le32 *data;
int err;
@@ -794,21 +796,22 @@
"EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
gdb_num);
- gdb_bh = sb_bread(sb, gdblock);
- if (!gdb_bh)
- return -EIO;
+ gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+ if (IS_ERR(gdb_bh))
+ return PTR_ERR(gdb_bh);
gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
if (gdbackups < 0) {
err = gdbackups;
- goto exit_bh;
+ goto errout;
}
data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
- dind = sb_bread(sb, le32_to_cpu(*data));
- if (!dind) {
- err = -EIO;
- goto exit_bh;
+ dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+ if (IS_ERR(dind)) {
+ err = PTR_ERR(dind);
+ dind = NULL;
+ goto errout;
}
data = (__le32 *)dind->b_data;
@@ -816,18 +819,18 @@
ext4_warning(sb, "new group %u GDT block %llu not reserved",
group, gdblock);
err = -EINVAL;
- goto exit_dind;
+ goto errout;
}
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (unlikely(err))
- goto exit_dind;
+ goto errout;
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
- goto exit_dind;
+ goto errout;
BUFFER_TRACE(dind, "get_write_access");
err = ext4_journal_get_write_access(handle, dind);
@@ -837,7 +840,7 @@
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (unlikely(err))
- goto exit_dind;
+ goto errout;
n_group_desc = ext4_kvmalloc((gdb_num + 1) *
sizeof(struct buffer_head *),
@@ -846,7 +849,7 @@
err = -ENOMEM;
ext4_warning(sb, "not enough memory for %lu groups",
gdb_num + 1);
- goto exit_inode;
+ goto errout;
}
/*
@@ -862,7 +865,7 @@
err = ext4_handle_dirty_metadata(handle, NULL, dind);
if (unlikely(err)) {
ext4_std_error(sb, err);
- goto exit_inode;
+ goto errout;
}
inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >>
(9 - EXT4_SB(sb)->s_cluster_bits);
@@ -872,7 +875,7 @@
if (unlikely(err)) {
ext4_std_error(sb, err);
iloc.bh = NULL;
- goto exit_inode;
+ goto errout;
}
brelse(dind);
@@ -888,15 +891,11 @@
err = ext4_handle_dirty_super(handle, sb);
if (err)
ext4_std_error(sb, err);
-
return err;
-
-exit_inode:
+errout:
kvfree(n_group_desc);
brelse(iloc.bh);
-exit_dind:
brelse(dind);
-exit_bh:
brelse(gdb_bh);
ext4_debug("leaving with error %d\n", err);
@@ -916,9 +915,9 @@
gdblock = ext4_meta_bg_first_block_no(sb, group) +
ext4_bg_has_super(sb, group);
- gdb_bh = sb_bread(sb, gdblock);
- if (!gdb_bh)
- return -EIO;
+ gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+ if (IS_ERR(gdb_bh))
+ return PTR_ERR(gdb_bh);
n_group_desc = ext4_kvmalloc((gdb_num + 1) *
sizeof(struct buffer_head *),
GFP_NOFS);
@@ -934,11 +933,18 @@
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
n_group_desc[gdb_num] = gdb_bh;
+
+ BUFFER_TRACE(gdb_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gdb_bh);
+ if (err) {
+ kvfree(n_group_desc);
+ brelse(gdb_bh);
+ return err;
+ }
+
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
kvfree(o_group_desc);
- BUFFER_TRACE(gdb_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gdb_bh);
return err;
}
@@ -975,9 +981,10 @@
return -ENOMEM;
data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
- dind = sb_bread(sb, le32_to_cpu(*data));
- if (!dind) {
- err = -EIO;
+ dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+ if (IS_ERR(dind)) {
+ err = PTR_ERR(dind);
+ dind = NULL;
goto exit_free;
}
@@ -996,9 +1003,10 @@
err = -EINVAL;
goto exit_bh;
}
- primary[res] = sb_bread(sb, blk);
- if (!primary[res]) {
- err = -EIO;
+ primary[res] = ext4_sb_bread(sb, blk, 0);
+ if (IS_ERR(primary[res])) {
+ err = PTR_ERR(primary[res]);
+ primary[res] = NULL;
goto exit_bh;
}
gdbackups = verify_reserved_gdb(sb, group, primary[res]);
@@ -1631,13 +1639,13 @@
}
if (reserved_gdb || gdb_off == 0) {
- if (ext4_has_feature_resize_inode(sb) ||
+ if (!ext4_has_feature_resize_inode(sb) ||
!le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext4_warning(sb,
"No reserved GDT blocks, can't resize");
return -EPERM;
}
- inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(inode)) {
ext4_warning(sb, "Error opening resize inode");
return PTR_ERR(inode);
@@ -1753,8 +1761,6 @@
ext4_msg(sb, KERN_ERR,
"filesystem too large to resize to %llu blocks safely",
n_blocks_count);
- if (sizeof(sector_t) < 8)
- ext4_warning(sb, "CONFIG_LBDAF not enabled");
return -EINVAL;
}
@@ -1960,12 +1966,14 @@
le16_to_cpu(es->s_reserved_gdt_blocks);
n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
n_blocks_count = (ext4_fsblk_t)n_group *
- EXT4_BLOCKS_PER_GROUP(sb);
+ EXT4_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(es->s_first_data_block);
n_group--; /* set to last group number */
}
if (!resize_inode)
- resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+ resize_inode = ext4_iget(sb, EXT4_RESIZE_INO,
+ EXT4_IGET_SPECIAL);
if (IS_ERR(resize_inode)) {
ext4_warning(sb, "Error opening resize inode");
return PTR_ERR(resize_inode);
@@ -2071,6 +2079,10 @@
free_flex_gd(flex_gd);
if (resize_inode != NULL)
iput(resize_inode);
- ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
+ if (err)
+ ext4_warning(sb, "error (%d) occurred during "
+ "file system resize", err);
+ ext4_msg(sb, KERN_INFO, "resized filesystem to %llu",
+ ext4_blocks_count(es));
return err;
}
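The resize.c hunks above switch block reads from sb_bread(), which returns NULL for both -ENOMEM and -EIO, to ext4_sb_bread(), which returns an ERR_PTR and lets callers propagate the real error. A minimal sketch of the caller-side pattern these conversions follow (illustrative fragment only, not part of the patch; "blk" and "out" are placeholder names):

	struct buffer_head *bh;
	int err = 0;

	bh = ext4_sb_bread(sb, blk, 0);
	if (IS_ERR(bh)) {
		err = PTR_ERR(bh);	/* -ENOMEM or -EIO, now distinguishable */
		bh = NULL;		/* keep a later brelse(bh) harmless */
		goto out;
	}
	/* ... use bh ... */
out:
	brelse(bh);			/* brelse() ignores NULL */
	return err;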
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a149df..7357835 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -42,6 +42,7 @@
#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
+#include <linux/unicode.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -140,6 +141,29 @@
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+/*
+ * This works like sb_bread() except it uses ERR_PTR for error
+ * returns. Currently with sb_bread it's impossible to distinguish
+ * between ENOMEM and EIO situations (since both result in a NULL
+ * return).
+ */
+struct buffer_head *
+ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+{
+ struct buffer_head *bh = sb_getblk(sb, block);
+
+ if (bh == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (buffer_uptodate(bh))
+ return bh;
+ ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return bh;
+ put_bh(bh);
+ return ERR_PTR(-EIO);
+}
+
static int ext4_verify_csum_type(struct super_block *sb,
struct ext4_super_block *es)
{
@@ -407,6 +431,12 @@
spin_unlock(&sbi->s_md_lock);
}
+static bool system_going_down(void)
+{
+ return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
+ || system_state == SYSTEM_RESTART;
+}
+
/* Deal with the reporting of failure conditions on a filesystem such as
* inconsistencies detected or read IO failures.
*
@@ -437,7 +467,12 @@
if (journal)
jbd2_journal_abort(journal, -EIO);
}
- if (test_opt(sb, ERRORS_RO)) {
+ /*
+ * We force ERRORS_RO behavior when system is rebooting. Otherwise we
+ * could panic during 'reboot -f' as the underlying device has already
+ * been disabled.
+ */
+ if (test_opt(sb, ERRORS_RO) || system_going_down()) {
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
* Make sure updated value of ->s_mount_flags will be visible
@@ -445,8 +480,7 @@
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- }
- if (test_opt(sb, ERRORS_PANIC)) {
+ } else if (test_opt(sb, ERRORS_PANIC)) {
if (EXT4_SB(sb)->s_journal &&
!(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
return;
@@ -665,7 +699,7 @@
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
}
- if (test_opt(sb, ERRORS_PANIC)) {
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
if (EXT4_SB(sb)->s_journal &&
!(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
return;
@@ -1000,14 +1034,13 @@
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
- if (sbi->s_ea_inode_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
- sbi->s_ea_inode_cache = NULL;
- }
- if (sbi->s_ea_block_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
- sbi->s_ea_block_cache = NULL;
- }
+
+ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+ sbi->s_ea_inode_cache = NULL;
+
+ ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
+
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
brelse(sbi->s_sbh);
@@ -1022,6 +1055,9 @@
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
+#ifdef CONFIG_UNICODE
+ utf8_unload(sbi->s_encoding);
+#endif
kfree(sbi);
}
@@ -1052,6 +1088,7 @@
ei->i_da_metadata_calc_len = 0;
ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+ ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
@@ -1070,13 +1107,16 @@
{
int drop = generic_drop_inode(inode);
+ if (!drop)
+ drop = fscrypt_drop_inode(inode);
+
trace_ext4_drop_inode(inode, drop);
return drop;
}
-static void ext4_i_callback(struct rcu_head *head)
+static void ext4_free_in_core_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
+ fscrypt_free_inode(inode);
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
@@ -1091,7 +1131,6 @@
true);
dump_stack();
}
- call_rcu(&inode->i_rcu, ext4_i_callback);
}
static void init_once(void *foo)
@@ -1143,6 +1182,7 @@
EXT4_I(inode)->jinode = NULL;
}
fscrypt_put_encryption_info(inode);
+ fsverity_cleanup_inode(inode);
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1150,20 +1190,11 @@
{
struct inode *inode;
- if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-ESTALE);
- if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
- return ERR_PTR(-ESTALE);
-
- /* iget isn't really right if the inode is currently unallocated!!
- *
- * ext4_read_inode will return a bad_inode if the inode had been
- * deleted, so we should be safe.
- *
+ /*
* Currently we don't know the generation for parent directory, so
* a generation of 0 means "accept any"
*/
- inode = ext4_iget_normal(sb, ino);
+ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (generation && inode->i_generation != generation) {
@@ -1188,6 +1219,16 @@
ext4_nfs_get_inode);
}
+static int ext4_nfs_commit_metadata(struct inode *inode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL
+ };
+
+ trace_ext4_nfs_commit_metadata(inode);
+ return ext4_write_inode(inode, &wbc);
+}
+
/*
* Try to release metadata pages (indirect blocks, directories) which are
* mapped via the block device. Since these pages could have journal heads
@@ -1208,7 +1249,7 @@
return try_to_free_buffers(page);
}
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
{
return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
@@ -1368,6 +1409,7 @@
static const struct super_operations ext4_sops = {
.alloc_inode = ext4_alloc_inode,
+ .free_inode = ext4_free_in_core_inode,
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
@@ -1392,6 +1434,7 @@
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
.get_parent = ext4_get_parent,
+ .commit_metadata = ext4_nfs_commit_metadata,
};
enum {
@@ -1715,6 +1758,36 @@
{Opt_err, 0, 0}
};
+#ifdef CONFIG_UNICODE
+static const struct ext4_sb_encodings {
+ __u16 magic;
+ char *name;
+ char *version;
+} ext4_sb_encoding_map[] = {
+ {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
+};
+
+static int ext4_sb_read_encoding(const struct ext4_super_block *es,
+ const struct ext4_sb_encodings **encoding,
+ __u16 *flags)
+{
+ __u16 magic = le16_to_cpu(es->s_encoding);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
+ if (magic == ext4_sb_encoding_map[i].magic)
+ break;
+
+ if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
+ return -EINVAL;
+
+ *encoding = &ext4_sb_encoding_map[i];
+ *flags = le16_to_cpu(es->s_encoding_flags);
+
+ return 0;
+}
+#endif
+
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
substring_t *args, unsigned long *journal_devnum,
unsigned int *journal_ioprio, int is_remount)
@@ -1805,6 +1878,13 @@
} else if (token == Opt_commit) {
if (arg == 0)
arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+ else if (arg > INT_MAX / HZ) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid commit interval %d, "
+ "must be smaller than %d",
+ arg, INT_MAX / HZ);
+ return -1;
+ }
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_debug_want_extra_isize) {
sbi->s_want_extra_isize = arg;
@@ -1897,7 +1977,7 @@
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
ext4_msg(sb, KERN_WARNING,
"Test dummy encryption mode enabled");
@@ -1938,7 +2018,7 @@
#ifdef CONFIG_FS_DAX
ext4_msg(sb, KERN_WARNING,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
+ sbi->s_mount_opt |= m->mount_opt;
#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
@@ -2224,7 +2304,6 @@
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
- ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
@@ -2672,13 +2751,9 @@
loff_t res;
loff_t upper_limit = MAX_LFS_FILESIZE;
- /* small i_blocks in vfs inode? */
- if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
- /*
- * CONFIG_LBDAF is not enabled implies the inode
- * i_block represent total blocks in 512 bytes
- * 32 == size of vfs inode i_blocks * 8
- */
+ BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
+
+ if (!has_huge_files) {
upper_limit = (1LL << 32) - 1;
/* total blocks in file system block size */
@@ -2719,11 +2794,11 @@
* number of 512-byte sectors of the file.
*/
- if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files) {
/*
- * !has_huge_files or CONFIG_LBDAF not enabled implies that
- * the inode i_block field represents total file blocks in
- * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
+ * !has_huge_files implies that the inode i_block field
+ * represents total file blocks in 2^32 512-byte sectors ==
+ * size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
@@ -2846,6 +2921,15 @@
return 0;
}
+#ifndef CONFIG_UNICODE
+ if (ext4_has_feature_casefold(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Filesystem with casefold feature cannot be "
+ "mounted without CONFIG_UNICODE");
+ return 0;
+ }
+#endif
+
if (readonly)
return 1;
@@ -2863,18 +2947,6 @@
~EXT4_FEATURE_RO_COMPAT_SUPP));
return 0;
}
- /*
- * Large file size enabled file system can only be mounted
- * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
- */
- if (ext4_has_feature_huge_file(sb)) {
- if (sizeof(blkcnt_t) < sizeof(u64)) {
- ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
- "cannot be mounted RDWR without "
- "CONFIG_LBDAF");
- return 0;
- }
- }
if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
ext4_msg(sb, KERN_ERR,
"Can't support bigalloc feature without "
@@ -3479,6 +3551,40 @@
return 0;
}
+static void ext4_clamp_want_extra_isize(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ unsigned def_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+
+ if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = 0;
+ return;
+ }
+ if (sbi->s_want_extra_isize < 4) {
+ sbi->s_want_extra_isize = def_extra_isize;
+ if (ext4_has_feature_extra_isize(sb)) {
+ if (sbi->s_want_extra_isize <
+ le16_to_cpu(es->s_want_extra_isize))
+ sbi->s_want_extra_isize =
+ le16_to_cpu(es->s_want_extra_isize);
+ if (sbi->s_want_extra_isize <
+ le16_to_cpu(es->s_min_extra_isize))
+ sbi->s_want_extra_isize =
+ le16_to_cpu(es->s_min_extra_isize);
+ }
+ }
+ /* Check if enough inode space is available */
+ if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+ (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+ sbi->s_inode_size)) {
+ sbi->s_want_extra_isize = def_extra_isize;
+ ext4_msg(sb, KERN_INFO,
+ "required extra inode space not available");
+ }
+}
+
static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
@@ -3705,6 +3811,43 @@
&journal_ioprio, 0))
goto failed_mount;
+#ifdef CONFIG_UNICODE
+ if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
+ const struct ext4_sb_encodings *encoding_info;
+ struct unicode_map *encoding;
+ __u16 encoding_flags;
+
+ if (ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_ERR,
+ "Can't mount with encoding and encryption");
+ goto failed_mount;
+ }
+
+ if (ext4_sb_read_encoding(es, &encoding_info,
+ &encoding_flags)) {
+ ext4_msg(sb, KERN_ERR,
+ "Encoding requested by superblock is unknown");
+ goto failed_mount;
+ }
+
+ encoding = utf8_load(encoding_info->version);
+ if (IS_ERR(encoding)) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with superblock charset: %s-%s "
+ "not supported by the kernel. flags: 0x%x.",
+ encoding_info->name, encoding_info->version,
+ encoding_flags);
+ goto failed_mount;
+ }
+ ext4_msg(sb, KERN_INFO, "Using encoding defined by superblock: "
+ "%s-%s with flags 0x%hx", encoding_info->name,
+ encoding_info->version?:"\b", encoding_flags);
+
+ sbi->s_encoding = encoding;
+ sbi->s_encoding_flags = encoding_flags;
+ }
+#endif
+
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
"with data=journal disables delayed "
@@ -3841,12 +3984,12 @@
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
+ goto failed_mount;
}
if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
ext4_msg(sb, KERN_ERR,
- "DAX unsupported by block device. Turning off DAX.");
- sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
+ "DAX unsupported by block device.");
+ goto failed_mount;
}
}
@@ -3906,8 +4049,21 @@
sbi->s_inode_size);
goto failed_mount;
}
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
- sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
+ /*
+ * i_atime_extra is the last extra field available for [acm]times in
+ * struct ext4_inode. Checking for that field should suffice to ensure
+ * we have extra space for all three.
+ */
+ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
+ sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
+ sb->s_time_gran = 1;
+ sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
+ } else {
+ sb->s_time_gran = NSEC_PER_SEC;
+ sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
+ }
+
+ sb->s_time_min = EXT4_TIMESTAMP_MIN;
}
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
@@ -4023,8 +4179,6 @@
if (err) {
ext4_msg(sb, KERN_ERR, "filesystem"
" too large to mount safely on this system");
- if (sizeof(sector_t) < 8)
- ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
goto failed_mount;
}
@@ -4142,9 +4296,12 @@
sb->s_op = &ext4_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
sb->s_cop = &ext4_cryptops;
#endif
+#ifdef CONFIG_FS_VERITY
+ sb->s_vop = &ext4_verityops;
+#endif
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (ext4_has_feature_quota(sb))
@@ -4204,7 +4361,7 @@
"data=, fs mounted w/o journal");
goto failed_mount_wq;
}
- sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
+ sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
sbi->s_journal = NULL;
@@ -4292,6 +4449,11 @@
goto failed_mount_wq;
}
+ if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
+ goto failed_mount_wq;
+ }
+
if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
!ext4_has_feature_encrypt(sb)) {
ext4_set_feature_encrypt(sb);
@@ -4327,7 +4489,7 @@
* so we can safely mount the rest of the filesystem now.
*/
- root = ext4_iget(sb, EXT4_ROOT_INO);
+ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
if (IS_ERR(root)) {
ext4_msg(sb, KERN_ERR, "get root inode failed");
ret = PTR_ERR(root);
@@ -4339,6 +4501,12 @@
iput(root);
goto failed_mount4;
}
+
+#ifdef CONFIG_UNICODE
+ if (sbi->s_encoding)
+ sb->s_d_op = &ext4_dentry_ops;
+#endif
+
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -4353,30 +4521,7 @@
} else if (ret)
goto failed_mount4a;
- /* determine the minimum size of new large inodes, if present */
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
- sbi->s_want_extra_isize == 0) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
- if (ext4_has_feature_extra_isize(sb)) {
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_want_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_want_extra_isize);
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_min_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_min_extra_isize);
- }
- }
- /* Check if enough inode space is available */
- if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
- ext4_msg(sb, KERN_INFO, "required extra inode space not"
- "available");
- }
+ ext4_clamp_want_extra_isize(sb);
ext4_set_resv_clusters(sb);
@@ -4521,14 +4666,12 @@
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
- if (sbi->s_ea_inode_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
- sbi->s_ea_inode_cache = NULL;
- }
- if (sbi->s_ea_block_cache) {
- ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
- sbi->s_ea_block_cache = NULL;
- }
+ ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
+ sbi->s_ea_inode_cache = NULL;
+
+ ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
+ sbi->s_ea_block_cache = NULL;
+
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -4546,9 +4689,14 @@
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
+
+#ifdef CONFIG_UNICODE
+ utf8_unload(sbi->s_encoding);
+#endif
+
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
- kfree(sbi->s_qf_names[i]);
+ kfree(get_qf_name(sb, sbi, i));
#endif
ext4_blkdev_remove(sbi);
brelse(bh);
@@ -4597,7 +4745,7 @@
* happen if we iget() an unused inode, as the subsequent iput()
* will try to delete it.
*/
- journal_inode = ext4_iget(sb, journal_inum);
+ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
if (IS_ERR(journal_inode)) {
ext4_msg(sb, KERN_ERR, "no journal found");
return NULL;
@@ -4879,7 +5027,7 @@
ext4_superblock_csum_set(sb);
if (sync)
lock_buffer(sbh);
- if (buffer_write_io_error(sbh)) {
+ if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
* superblock failed. This could happen because the
@@ -5162,6 +5310,8 @@
goto restore_opts;
}
+ ext4_clamp_want_extra_isize(sb);
+
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
@@ -5679,7 +5829,7 @@
if (!qf_inums[type])
return -EPERM;
- qf_inode = ext4_iget(sb, qf_inums[type]);
+ qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
return PTR_ERR(qf_inode);
@@ -5689,9 +5839,9 @@
qf_inode->i_flags |= S_NOQUOTA;
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
err = dquot_enable(qf_inode, type, format_id, flags);
- iput(qf_inode);
if (err)
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
+ iput(qf_inode);
return err;
}
@@ -5978,6 +6128,14 @@
if (err)
return err;
+ err = ext4_init_pending();
+ if (err)
+ goto out7;
+
+ err = ext4_init_post_read_processing();
+ if (err)
+ goto out6;
+
err = ext4_init_pageio();
if (err)
goto out5;
@@ -6016,6 +6174,10 @@
out4:
ext4_exit_pageio();
out5:
+ ext4_exit_post_read_processing();
+out6:
+ ext4_exit_pending();
+out7:
ext4_exit_es();
return err;
@@ -6032,7 +6194,9 @@
ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
+ ext4_exit_post_read_processing();
ext4_exit_es();
+ ext4_exit_pending();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
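One numeric note on the Opt_commit change above: the new guard rejects intervals larger than INT_MAX / HZ so that the subsequent HZ * arg multiplication cannot overflow a 32-bit int; with HZ == 1000 (one common configuration) the cut-off works out to 2147483 seconds, roughly 24.8 days. A hedged sketch of the same check in isolation (illustrative helper, not code from the patch):

	/* Sketch: would this commit interval (in seconds) overflow when
	 * converted to jiffies? */
	static bool commit_interval_overflows(unsigned int arg)
	{
		return arg > INT_MAX / HZ;
	}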
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9212a02..eb1efad 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -30,6 +30,7 @@
attr_feature,
attr_pointer_ui,
attr_pointer_atomic,
+ attr_journal_task,
} attr_id_t;
typedef enum {
@@ -125,6 +126,14 @@
return count;
}
+static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
+{
+ if (!sbi->s_journal)
+ return snprintf(buf, PAGE_SIZE, "<none>\n");
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ task_pid_vnr(sbi->s_journal->j_task));
+}
+
#define EXT4_ATTR(_name,_mode,_id) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -188,6 +197,7 @@
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
+EXT4_ATTR(journal_task, 0444, journal_task);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -217,28 +227,43 @@
ATTR_LIST(errors_count),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
+ ATTR_LIST(journal_task),
NULL,
};
+ATTRIBUTE_GROUPS(ext4);
/* Features this copy of ext4 supports */
EXT4_ATTR_FEATURE(lazy_itable_init);
EXT4_ATTR_FEATURE(batched_discard);
EXT4_ATTR_FEATURE(meta_bg_resize);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
#endif
+#ifdef CONFIG_UNICODE
+EXT4_ATTR_FEATURE(casefold);
+#endif
+#ifdef CONFIG_FS_VERITY
+EXT4_ATTR_FEATURE(verity);
+#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
+#ifdef CONFIG_UNICODE
+ ATTR_LIST(casefold),
+#endif
+#ifdef CONFIG_FS_VERITY
+ ATTR_LIST(verity),
+#endif
ATTR_LIST(metadata_csum_seed),
NULL,
};
+ATTRIBUTE_GROUPS(ext4_feat);
static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
{
@@ -304,6 +329,8 @@
return print_tstamp(buf, sbi->s_es, s_first_error_time);
case attr_last_error_time:
return print_tstamp(buf, sbi->s_es, s_last_error_time);
+ case attr_journal_task:
+ return journal_task_show(sbi, buf);
}
return 0;
@@ -355,13 +382,13 @@
};
static struct kobj_type ext4_sb_ktype = {
- .default_attrs = ext4_attrs,
+ .default_groups = ext4_groups,
.sysfs_ops = &ext4_attr_ops,
.release = ext4_sb_release,
};
static struct kobj_type ext4_feat_ktype = {
- .default_attrs = ext4_feat_attrs,
+ .default_groups = ext4_feat_groups,
.sysfs_ops = &ext4_attr_ops,
.release = (void (*)(struct kobject *))kfree,
};
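For readers unfamiliar with the sysfs conversion above: ATTRIBUTE_GROUPS(ext4) is the <linux/sysfs.h> helper that wraps an attribute array in an attribute_group plus a NULL-terminated group list, which is what the new .default_groups field of kobj_type expects in place of the old .default_attrs array. Roughly (simplified sketch of the macro's effect, not a verbatim expansion):

	static const struct attribute_group ext4_group = {
		.attrs = ext4_attrs,
	};
	static const struct attribute_group *ext4_groups[] = {
		&ext4_group,
		NULL,
	};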
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
new file mode 100644
index 0000000..d0d8a97
--- /dev/null
+++ b/fs/ext4/verity.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fs/ext4/verity.c: fs-verity support for ext4
+ *
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Implementation of fsverity_operations for ext4.
+ *
+ * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
+ * the end of the file, starting at the first 64K boundary beyond i_size. This
+ * approach works because (a) verity files are readonly, and (b) pages fully
+ * beyond i_size aren't visible to userspace but can be read/written internally
+ * by ext4 with only some relatively small changes to ext4. This approach
+ * avoids having to depend on the EA_INODE feature and on rearchitecturing
+ * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
+ * to support encrypting xattrs. Note that the verity metadata *must* be
+ * encrypted when the file is, since it contains hashes of the plaintext data.
+ *
+ * Using a 64K boundary rather than a 4K one keeps things ready for
+ * architectures with 64K pages, and it doesn't necessarily waste space on-disk
+ * since there can be a hole between i_size and the start of the Merkle tree.
+ */
+
+#include <linux/quotaops.h>
+
+#include "ext4.h"
+#include "ext4_extents.h"
+#include "ext4_jbd2.h"
+
+static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
+{
+ return round_up(inode->i_size, 65536);
+}
+
+/*
+ * Read some verity metadata from the inode. __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+static int pagecache_read(struct inode *inode, void *buf, size_t count,
+ loff_t pos)
+{
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *addr;
+
+ page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
+ NULL);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ addr = kmap_atomic(page);
+ memcpy(buf, addr + offset_in_page(pos), n);
+ kunmap_atomic(addr);
+
+ put_page(page);
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+/*
+ * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
+ * kernel_write() can't be used because the file descriptor is readonly.
+ */
+static int pagecache_write(struct inode *inode, const void *buf, size_t count,
+ loff_t pos)
+{
+ if (pos + count > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ while (count) {
+ size_t n = min_t(size_t, count,
+ PAGE_SIZE - offset_in_page(pos));
+ struct page *page;
+ void *fsdata;
+ void *addr;
+ int res;
+
+ res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
+ &page, &fsdata);
+ if (res)
+ return res;
+
+ addr = kmap_atomic(page);
+ memcpy(addr + offset_in_page(pos), buf, n);
+ kunmap_atomic(addr);
+
+ res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
+ page, fsdata);
+ if (res < 0)
+ return res;
+ if (res != n)
+ return -EIO;
+
+ buf += n;
+ pos += n;
+ count -= n;
+ }
+ return 0;
+}
+
+static int ext4_begin_enable_verity(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_add() */
+ handle_t *handle;
+ int err;
+
+ if (ext4_verity_in_progress(inode))
+ return -EBUSY;
+
+ /*
+ * Since the file was opened readonly, we have to initialize the jbd
+ * inode and quotas here and not rely on ->open() doing it. This must
+ * be done before evicting the inline data.
+ */
+
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
+ return err;
+
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
+ err = ext4_convert_inline_data(inode);
+ if (err)
+ return err;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_warning_inode(inode,
+ "verity is only allowed on extent-based files");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * ext4 uses the last allocated block to find the verity descriptor, so
+ * we must remove any other blocks past EOF which might confuse things.
+ */
+ err = ext4_truncate(inode);
+ if (err)
+ return err;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ ext4_journal_stop(handle);
+ return err;
+}
+
+/*
+ * ext4 stores the verity descriptor beginning on the next filesystem block
+ * boundary after the Merkle tree. Then, the descriptor size is stored in the
+ * last 4 bytes of the last allocated filesystem block --- which is either the
+ * block in which the descriptor ends, or the next block after that if there
+ * weren't at least 4 bytes remaining.
+ *
+ * We can't simply store the descriptor in an xattr because it *must* be
+ * encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt
+ * xattrs. Also, if the descriptor includes a large signature blob it may be
+ * too large to store in an xattr without the EA_INODE feature.
+ */
+static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) +
+ merkle_tree_size, i_blocksize(inode));
+ const u64 desc_end = desc_pos + desc_size;
+ const __le32 desc_size_disk = cpu_to_le32(desc_size);
+ const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk),
+ i_blocksize(inode)) -
+ sizeof(desc_size_disk);
+ int err;
+
+ err = pagecache_write(inode, desc, desc_size, desc_pos);
+ if (err)
+ return err;
+
+ return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+}
+
+static int ext4_end_enable_verity(struct file *filp, const void *desc,
+ size_t desc_size, u64 merkle_tree_size)
+{
+ struct inode *inode = file_inode(filp);
+ const int credits = 2; /* superblock and inode for ext4_orphan_del() */
+ handle_t *handle;
+ int err = 0;
+ int err2;
+
+ if (desc != NULL) {
+ /* Succeeded; write the verity descriptor. */
+ err = ext4_write_verity_descriptor(inode, desc, desc_size,
+ merkle_tree_size);
+
+ /* Write all pages before clearing VERITY_IN_PROGRESS. */
+ if (!err)
+ err = filemap_write_and_wait(inode->i_mapping);
+ }
+
+ /* If we failed, truncate anything we wrote past i_size. */
+ if (desc == NULL || err)
+ ext4_truncate(inode);
+
+ /*
+ * We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
+ * deleting the inode from the orphan list, even if something failed.
+ * If everything succeeded, we'll also set the verity bit in the same
+ * transaction.
+ */
+
+ ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ err2 = ext4_orphan_del(handle, inode);
+ if (err2)
+ goto out_stop;
+
+ if (desc != NULL && !err) {
+ struct ext4_iloc iloc;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_stop;
+ ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
+ ext4_set_inode_flags(inode);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+out_stop:
+ ext4_journal_stop(handle);
+ return err ?: err2;
+}
+
+static int ext4_get_verity_descriptor_location(struct inode *inode,
+ size_t *desc_size_ret,
+ u64 *desc_pos_ret)
+{
+ struct ext4_ext_path *path;
+ struct ext4_extent *last_extent;
+ u32 end_lblk;
+ u64 desc_size_pos;
+ __le32 desc_size_disk;
+ u32 desc_size;
+ u64 desc_pos;
+ int err;
+
+ /*
+ * Descriptor size is in last 4 bytes of last allocated block.
+ * See ext4_write_verity_descriptor().
+ */
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ EXT4_ERROR_INODE(inode, "verity file doesn't use extents");
+ return -EFSCORRUPTED;
+ }
+
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ last_extent = path[path->p_depth].p_ext;
+ if (!last_extent) {
+ EXT4_ERROR_INODE(inode, "verity file has no extents");
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return -EFSCORRUPTED;
+ }
+
+ end_lblk = le32_to_cpu(last_extent->ee_block) +
+ ext4_ext_get_actual_len(last_extent);
+ desc_size_pos = (u64)end_lblk << inode->i_blkbits;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if (desc_size_pos < sizeof(desc_size_disk))
+ goto bad;
+ desc_size_pos -= sizeof(desc_size_disk);
+
+ err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
+ desc_size_pos);
+ if (err)
+ return err;
+ desc_size = le32_to_cpu(desc_size_disk);
+
+ /*
+ * The descriptor is stored just before the desc_size_disk, but starting
+ * on a filesystem block boundary.
+ */
+
+ if (desc_size > INT_MAX || desc_size > desc_size_pos)
+ goto bad;
+
+ desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode));
+ if (desc_pos < ext4_verity_metadata_pos(inode))
+ goto bad;
+
+ *desc_size_ret = desc_size;
+ *desc_pos_ret = desc_pos;
+ return 0;
+
+bad:
+ EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor");
+ return -EFSCORRUPTED;
+}
+
+static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
+ size_t buf_size)
+{
+ size_t desc_size = 0;
+ u64 desc_pos = 0;
+ int err;
+
+ err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos);
+ if (err)
+ return err;
+
+ if (buf_size) {
+ if (desc_size > buf_size)
+ return -ERANGE;
+ err = pagecache_read(inode, buf, desc_size, desc_pos);
+ if (err)
+ return err;
+ }
+ return desc_size;
+}
+
+static struct page *ext4_read_merkle_tree_page(struct inode *inode,
+ pgoff_t index)
+{
+ index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
+
+ return read_mapping_page(inode->i_mapping, index, NULL);
+}
+
+static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
+ u64 index, int log_blocksize)
+{
+ loff_t pos = ext4_verity_metadata_pos(inode) + (index << log_blocksize);
+
+ return pagecache_write(inode, buf, 1 << log_blocksize, pos);
+}
+
+const struct fsverity_operations ext4_verityops = {
+ .begin_enable_verity = ext4_begin_enable_verity,
+ .end_enable_verity = ext4_end_enable_verity,
+ .get_verity_descriptor = ext4_get_verity_descriptor,
+ .read_merkle_tree_page = ext4_read_merkle_tree_page,
+ .write_merkle_tree_block = ext4_write_merkle_tree_block,
+};
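To make the descriptor placement in the new verity.c concrete, here is a small worked layout under assumed numbers (4 KiB blocks, i_size = 1000000, a 12000-byte Merkle tree, a 100-byte descriptor; all values are illustrative, not taken from the patch):

	/*
	 * ext4_verity_metadata_pos() = round_up(1000000, 65536)   = 1048576
	 * Merkle tree occupies        1048576 .. 1060575
	 * desc_pos  = round_up(1048576 + 12000, 4096)             = 1060864
	 * descriptor occupies         1060864 .. 1060963
	 * desc_size_pos = round_up(1060964 + 4, 4096) - 4         = 1064956
	 *
	 * So the __le32 descriptor size lands in the last 4 bytes of the last
	 * allocated block (1060864 .. 1064959), which is exactly where
	 * ext4_get_verity_descriptor_location() looks for it when the file is
	 * read back.
	 */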
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 4380c86..491f9ee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -384,7 +384,7 @@
struct inode *inode;
int err;
- inode = ext4_iget(parent->i_sb, ea_ino);
+ inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(parent->i_sb,
@@ -522,14 +522,13 @@
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
- error = -ENODATA;
if (!EXT4_I(inode)->i_file_acl)
- goto cleanup;
+ return -ENODATA;
ea_idebug(inode, "reading block %llu",
(unsigned long long)EXT4_I(inode)->i_file_acl);
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh)
- goto cleanup;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
error = ext4_xattr_check_block(inode, bh);
@@ -696,26 +695,23 @@
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
- error = 0;
if (!EXT4_I(inode)->i_file_acl)
- goto cleanup;
+ return 0;
ea_idebug(inode, "reading block %llu",
(unsigned long long)EXT4_I(inode)->i_file_acl);
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- error = -EIO;
- if (!bh)
- goto cleanup;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
error = ext4_xattr_check_block(inode, bh);
if (error)
goto cleanup;
ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
- error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
-
+ error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer,
+ buffer_size);
cleanup:
brelse(bh);
-
return error;
}
@@ -830,9 +826,10 @@
}
if (EXT4_I(inode)->i_file_acl) {
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh) {
- ret = -EIO;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto out;
}
@@ -1031,10 +1028,8 @@
inode_lock(ea_inode);
ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
- if (ret) {
- iloc.bh = NULL;
+ if (ret)
goto out;
- }
ref_count = ext4_xattr_inode_get_ref(ea_inode);
ref_count += ref_change;
@@ -1080,12 +1075,10 @@
}
ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc);
- iloc.bh = NULL;
if (ret)
ext4_warning_inode(ea_inode,
"ext4_mark_iloc_dirty() failed ret=%d", ret);
out:
- brelse(iloc.bh);
inode_unlock(ea_inode);
return ret;
}
@@ -1490,7 +1483,8 @@
}
while (ce) {
- ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+ ea_inode = ext4_iget(inode->i_sb, ce->e_value,
+ EXT4_IGET_NORMAL);
if (!IS_ERR(ea_inode) &&
!is_bad_inode(ea_inode) &&
(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
@@ -1702,7 +1696,7 @@
/* No failures allowed past this point. */
- if (!s->not_found && here->e_value_size && here->e_value_offs) {
+ if (!s->not_found && here->e_value_size && !here->e_value_inum) {
/* Remove the old value. */
void *first_val = s->base + min_offs;
size_t offs = le16_to_cpu(here->e_value_offs);
@@ -1825,16 +1819,15 @@
if (EXT4_I(inode)->i_file_acl) {
/* The inode already has an extended attribute block. */
- bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
- error = -EIO;
- if (!bs->bh)
- goto cleanup;
+ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bs->bh))
+ return PTR_ERR(bs->bh);
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
error = ext4_xattr_check_block(inode, bs->bh);
if (error)
- goto cleanup;
+ return error;
/* Find the named attribute. */
bs->s.base = BHDR(bs->bh);
bs->s.first = BFIRST(bs->bh);
@@ -1843,13 +1836,10 @@
error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
i->name_index, i->name, 1);
if (error && error != -ENODATA)
- goto cleanup;
+ return error;
bs->s.not_found = error;
}
- error = 0;
-
-cleanup:
- return error;
+ return 0;
}
static int
@@ -2278,9 +2268,9 @@
if (!EXT4_I(inode)->i_file_acl)
return NULL;
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh)
- return ERR_PTR(-EIO);
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh))
+ return bh;
error = ext4_xattr_check_block(inode, bh);
if (error) {
brelse(bh);
@@ -2733,7 +2723,7 @@
base = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
min_offs = end - base;
- total_ino = sizeof(struct ext4_xattr_ibody_header);
+ total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
error = xattr_check_inode(inode, header, end);
if (error)
@@ -2750,10 +2740,11 @@
if (EXT4_I(inode)->i_file_acl) {
struct buffer_head *bh;
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- error = -EIO;
- if (!bh)
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ error = PTR_ERR(bh);
goto cleanup;
+ }
error = ext4_xattr_check_block(inode, bh);
if (error) {
brelse(bh);
@@ -2907,11 +2898,13 @@
}
if (EXT4_I(inode)->i_file_acl) {
- bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
- if (!bh) {
- EXT4_ERROR_INODE(inode, "block %llu read error",
- EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ error = PTR_ERR(bh);
+ if (error == -EIO)
+ EXT4_ERROR_INODE(inode, "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
+ bh = NULL;
goto cleanup;
}
error = ext4_xattr_check_block(inode, bh);
@@ -3064,8 +3057,11 @@
while (ce) {
struct buffer_head *bh;
- bh = sb_bread(inode->i_sb, ce->e_value);
- if (!bh) {
+ bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
+ if (IS_ERR(bh)) {
+ if (PTR_ERR(bh) == -ENOMEM)
+ return NULL;
+ bh = NULL;
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long)ce->e_value);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {