Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index cbb5ca8..86699c8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -4,12 +4,7 @@
# kernels after the removal of ext3 driver.
config EXT3_FS
tristate "The Extended 3 (ext3) filesystem"
- # These must match EXT4_FS selects...
select EXT4_FS
- select JBD2
- select CRC16
- select CRYPTO
- select CRYPTO_CRC32C
help
This config option is here only for backward compatibility. ext3
filesystem is now handled by the ext4 driver.
@@ -33,12 +28,12 @@
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
- # Please update EXT3_FS selects when changing these
select JBD2
select CRC16
select CRYPTO
select CRYPTO_CRC32C
select FS_IOMAP
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
help
This is the next generation of the ext3 filesystem.
@@ -104,5 +99,21 @@
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
- with a command such as:
- echo 1 > /sys/module/ext4/parameters/mballoc_debug
+ using dynamic debug control for mb_debug() / ext_debug() msgs.
+
+config EXT4_KUNIT_TESTS
+ tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS
+ depends on EXT4_FS && KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ This builds the ext4 KUnit tests.
+
+ KUnit tests run during boot and output the results to the debug log
+ in TAP format (https://testanything.org/). Only useful for kernel devs
+ running KUnit test harness and are not for inclusion into a production
+ build.
+
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index b17ddc2..49e7af6 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -9,8 +9,11 @@
extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \
indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \
mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \
- super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o
+ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \
+ xattr_user.o fast_commit.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-inode-test-objs += inode-test.o
+obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8c7bbf3..68aaed4 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -215,9 +215,8 @@
value, size, xattr_flags);
kfree(value);
- if (!error) {
+ if (!error)
set_cached_acl(inode, type, acl);
- }
return error;
}
@@ -243,6 +242,7 @@
handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
+ ext4_fc_start_update(inode);
if ((type == ACL_TYPE_ACCESS) && acl) {
error = posix_acl_update_mode(inode, &mode, &acl);
@@ -256,10 +256,11 @@
if (!error && update_mode) {
inode->i_mode = mode;
inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ error = ext4_mark_inode_dirty(handle, inode);
}
out_stop:
ext4_journal_stop(handle);
+ ext4_fc_stop_update(inode);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
return error;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 031ff3f..1afd60f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -368,7 +368,12 @@
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
@@ -379,7 +384,8 @@
if (buffer_verified(bh))
goto verified;
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ desc, bh) ||
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
ext4_mark_group_bitmap_corrupted(sb, block_group,
@@ -409,10 +415,11 @@
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
*
- * Return buffer_head on success or NULL in case of failure.
+ * Return buffer_head on success or an ERR_PTR in case of failure.
*/
struct buffer_head *
-ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
+ bool ignore_locked)
{
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -440,6 +447,12 @@
return ERR_PTR(-ENOMEM);
}
+ if (ignore_locked && buffer_locked(bh)) {
+ /* buffer under IO already, return if called for prefetching */
+ put_bh(bh);
+ return NULL;
+ }
+
if (bitmap_uptodate(bh))
goto verify;
@@ -486,10 +499,10 @@
* submit the buffer_head for reading
*/
set_buffer_new(bh);
- trace_ext4_read_block_bitmap_load(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
+ (ignore_locked ? REQ_RAHEAD : 0),
+ ext4_end_bitmap_read);
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -501,7 +514,7 @@
return ERR_PTR(err);
}
-/* Returns 0 on success, 1 on error */
+/* Returns 0 on success, -errno on error */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
struct buffer_head *bh)
{
@@ -513,10 +526,11 @@
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
- ext4_error(sb, "Cannot read block bitmap - "
- "block_group = %u, block_bitmap = %llu",
- block_group, (unsigned long long) bh->b_blocknr);
+ ext4_error_err(sb, EIO, "Cannot read block bitmap - "
+ "block_group = %u, block_bitmap = %llu",
+ block_group, (unsigned long long) bh->b_blocknr);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EIO;
@@ -532,7 +546,7 @@
struct buffer_head *bh;
int err;
- bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ bh = ext4_read_block_bitmap_nowait(sb, block_group, false);
if (IS_ERR(bh))
return bh;
err = ext4_wait_block_bitmap(sb, block_group, bh);
@@ -915,10 +929,11 @@
return bg_start;
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
- colour = (current->pid % 16) *
+ colour = (task_pid_nr(current) % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
else
- colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+ colour = (task_pid_nr(current) % 16) *
+ ((last_block - bg_start) / 16);
return bg_start + colour;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 97c56d0..8e6ca23 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -24,6 +24,7 @@
struct rb_node node;
ext4_fsblk_t start_blk;
unsigned int count;
+ u32 ino;
};
static struct kmem_cache *ext4_system_zone_cachep;
@@ -45,7 +46,8 @@
static inline int can_merge(struct ext4_system_zone *entry1,
struct ext4_system_zone *entry2)
{
- if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+ if ((entry1->start_blk + entry1->count) == entry2->start_blk &&
+ entry1->ino == entry2->ino)
return 1;
return 0;
}
@@ -66,7 +68,7 @@
*/
static int add_system_zone(struct ext4_system_blocks *system_blks,
ext4_fsblk_t start_blk,
- unsigned int count)
+ unsigned int count, u32 ino)
{
struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
@@ -89,6 +91,7 @@
return -ENOMEM;
new_entry->start_blk = start_blk;
new_entry->count = count;
+ new_entry->ino = ino;
new_node = &new_entry->node;
rb_link_node(new_node, parent, n);
@@ -123,10 +126,13 @@
{
struct rb_node *node;
struct ext4_system_zone *entry;
+ struct ext4_system_blocks *system_blks;
int first = 1;
printk(KERN_INFO "System zones: ");
- node = rb_first(&sbi->system_blks->root);
+ rcu_read_lock();
+ system_blks = rcu_dereference(sbi->s_system_blks);
+ node = rb_first(&system_blks->root);
while (node) {
entry = rb_entry(node, struct ext4_system_zone, node);
printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ",
@@ -134,47 +140,10 @@
first = 0;
node = rb_next(node);
}
+ rcu_read_unlock();
printk(KERN_CONT "\n");
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with filesystem metadata blocks.
- */
-static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
- struct ext4_system_blocks *system_blks,
- ext4_fsblk_t start_blk,
- unsigned int count)
-{
- struct ext4_system_zone *entry;
- struct rb_node *n;
-
- if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
- (start_blk + count < start_blk) ||
- (start_blk + count > ext4_blocks_count(sbi->s_es))) {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
-
- if (system_blks == NULL)
- return 1;
-
- n = system_blks->root.rb_node;
- while (n) {
- entry = rb_entry(n, struct ext4_system_zone, node);
- if (start_blk + count - 1 < entry->start_blk)
- n = n->rb_left;
- else if (start_blk >= (entry->start_blk + entry->count))
- n = n->rb_right;
- else {
- sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return 0;
- }
- }
- return 1;
-}
-
static int ext4_protect_reserved_inode(struct super_block *sb,
struct ext4_system_blocks *system_blks,
u32 ino)
@@ -204,17 +173,18 @@
if (n == 0) {
i++;
} else {
- if (!ext4_data_block_valid_rcu(sbi, system_blks,
- map.m_pblk, n)) {
- ext4_error(sb, "blocks %llu-%llu from inode %u "
- "overlap system zone", map.m_pblk,
- map.m_pblk + map.m_len - 1, ino);
- err = -EFSCORRUPTED;
+ err = add_system_zone(system_blks, map.m_pblk, n, ino);
+ if (err < 0) {
+ if (err == -EFSCORRUPTED) {
+ __ext4_error(sb, __func__, __LINE__,
+ -err, map.m_pblk,
+ "blocks %llu-%llu from inode %u overlap system zone",
+ map.m_pblk,
+ map.m_pblk + map.m_len - 1,
+ ino);
+ }
break;
}
- err = add_system_zone(system_blks, map.m_pblk, n);
- if (err < 0)
- break;
i += n;
}
}
@@ -257,22 +227,25 @@
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0)))
- add_system_zone(system_blks,
+ ((i < 5) || ((i % flex_size) == 0))) {
+ ret = add_system_zone(system_blks,
ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1);
+ ext4_bg_num_gdb(sb, i) + 1, 0);
+ if (ret)
+ goto err;
+ }
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(system_blks,
- ext4_block_bitmap(sb, gdp), 1);
+ ext4_block_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
- ext4_inode_bitmap(sb, gdp), 1);
+ ext4_inode_bitmap(sb, gdp), 1, 0);
if (ret)
goto err;
ret = add_system_zone(system_blks,
ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group);
+ sbi->s_itb_per_group, 0);
if (ret)
goto err;
}
@@ -288,7 +261,7 @@
* with ext4_data_block_valid() accessing the rbtree at the same
* time.
*/
- rcu_assign_pointer(sbi->system_blks, system_blks);
+ rcu_assign_pointer(sbi->s_system_blks, system_blks);
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);
@@ -313,19 +286,32 @@
{
struct ext4_system_blocks *system_blks;
- system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks,
+ system_blks = rcu_dereference_protected(EXT4_SB(sb)->s_system_blks,
lockdep_is_held(&sb->s_umount));
- rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL);
+ rcu_assign_pointer(EXT4_SB(sb)->s_system_blks, NULL);
if (system_blks)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
unsigned int count)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_system_blocks *system_blks;
- int ret;
+ struct ext4_system_zone *entry;
+ struct rb_node *n;
+ int ret = 1;
+
+ if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
+ (start_blk + count > ext4_blocks_count(sbi->s_es)))
+ return 0;
/*
* Lock the system zone to prevent it being released concurrently
@@ -333,9 +319,23 @@
* mount option.
*/
rcu_read_lock();
- system_blks = rcu_dereference(sbi->system_blks);
- ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk,
- count);
+ system_blks = rcu_dereference(sbi->s_system_blks);
+ if (system_blks == NULL)
+ goto out_rcu;
+
+ n = system_blks->root.rb_node;
+ while (n) {
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ if (start_blk + count - 1 < entry->start_blk)
+ n = n->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = n->rb_right;
+ else {
+ ret = (entry->ino == inode->i_ino);
+ break;
+ }
+ }
+out_rcu:
rcu_read_unlock();
return ret;
}
@@ -343,7 +343,6 @@
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
__le32 *bref = p;
unsigned int blk;
@@ -355,9 +354,7 @@
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
if (blk &&
- unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- blk, 1))) {
- es->s_last_error_block = cpu_to_le64(blk);
+ unlikely(!ext4_inode_block_valid(inode, blk, 1))) {
ext4_error_inode(inode, function, line, blk,
"invalid block");
return -EFSCORRUPTED;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0589e91..70a0f5e 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -72,6 +72,7 @@
const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
+ const int next_offset = ((char *) de - buf) + rlen;
if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
error_msg = "rec_len is smaller than minimal";
@@ -79,13 +80,11 @@
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (unlikely(((char *) de - buf) + rlen > size))
+ else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
- else if (unlikely(((char *) de - buf) + rlen >
- size - EXT4_DIR_REC_LEN(1) &&
- ((char *) de - buf) + rlen != size)) {
+ else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) &&
+ next_offset != size))
error_msg = "directory entry too close to block end";
- }
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
@@ -121,7 +120,7 @@
if (IS_ENCRYPTED(inode)) {
err = fscrypt_get_encryption_info(inode);
- if (err && err != -ENOKEY)
+ if (err)
return err;
}
@@ -149,7 +148,7 @@
}
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
@@ -393,7 +392,7 @@
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[0];
+ char name[];
};
/*
@@ -465,7 +464,6 @@
new_fn->name_len = ent_name->len;
new_fn->file_type = dirent->file_type;
memcpy(new_fn->name, ent_name->name, ent_name->len);
- new_fn->name[ent_name->len] = 0;
while (*p) {
parent = *p;
@@ -536,7 +534,7 @@
struct dir_private_info *info = file->private_data;
struct inode *inode = file_inode(file);
struct fname *fname;
- int ret;
+ int ret = 0;
if (!info) {
info = ext4_htree_create_dir_info(file, ctx->pos);
@@ -584,7 +582,7 @@
info->curr_minor_hash,
&info->next_hash);
if (ret < 0)
- return ret;
+ goto finished;
if (ret == 0) {
ctx->pos = ext4_get_htree_eof(file);
break;
@@ -615,7 +613,7 @@
}
finished:
info->last_pos = ctx->pos;
- return 0;
+ return ret < 0 ? ret : 0;
}
static int ext4_dir_open(struct inode * inode, struct file * filp)
@@ -671,68 +669,8 @@
};
#ifdef CONFIG_UNICODE
-static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
-{
- struct qstr qstr = {.name = str, .len = len };
- const struct dentry *parent = READ_ONCE(dentry->d_parent);
- const struct inode *inode = READ_ONCE(parent->d_inode);
- char strbuf[DNAME_INLINE_LEN];
-
- if (!inode || !IS_CASEFOLDED(inode) ||
- !EXT4_SB(inode->i_sb)->s_encoding) {
- if (len != name->len)
- return -1;
- return memcmp(str, name->name, len);
- }
-
- /*
- * If the dentry name is stored in-line, then it may be concurrently
- * modified by a rename. If this happens, the VFS will eventually retry
- * the lookup, so it doesn't matter what ->d_compare() returns.
- * However, it's unsafe to call utf8_strncasecmp() with an unstable
- * string. Therefore, we have to copy the name into a temporary buffer.
- */
- if (len <= DNAME_INLINE_LEN - 1) {
- memcpy(strbuf, str, len);
- strbuf[len] = 0;
- qstr.name = strbuf;
- /* prevent compiler from optimizing out the temporary buffer */
- barrier();
- }
-
- return ext4_ci_compare(inode, name, &qstr, false);
-}
-
-static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
-{
- const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
- const struct unicode_map *um = sbi->s_encoding;
- const struct inode *inode = READ_ONCE(dentry->d_inode);
- unsigned char *norm;
- int len, ret = 0;
-
- if (!inode || !IS_CASEFOLDED(inode) || !um)
- return 0;
-
- norm = kmalloc(PATH_MAX, GFP_ATOMIC);
- if (!norm)
- return -ENOMEM;
-
- len = utf8_casefold(um, str, norm, PATH_MAX);
- if (len < 0) {
- if (ext4_has_strict_mode(sbi))
- ret = -EINVAL;
- goto out;
- }
- str->hash = full_name_hash(dentry, norm, len);
-out:
- kfree(norm);
- return ret;
-}
-
const struct dentry_operations ext4_dentry_ops = {
- .d_hash = ext4_d_hash,
- .d_compare = ext4_d_compare,
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
};
#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ae2cb15..455eb34 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -27,7 +27,6 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
-#include <linux/version.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
@@ -36,6 +35,7 @@
#include <crypto/hash.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
+#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -80,14 +80,22 @@
#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-/*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
+ /*
+ * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
+ */
#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+
+/*
+ * Dynamic printk for controlled extents debugging.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+#define ext_debug(ino, fmt, ...) \
+ pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \
+ current->comm, task_pid_nr(current), \
+ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \
+ __func__, ##__VA_ARGS__)
#else
-#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
/* data type for block offset of block group */
@@ -142,6 +150,8 @@
#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED 0x2000
+/* Do strict check for free blocks while retrying block allocation */
+#define EXT4_MB_STRICT_CHECK 0x4000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -171,10 +181,10 @@
* well as to store the information returned by ext4_map_blocks(). It
* takes less room on the stack than a struct buffer_head.
*/
-#define EXT4_MAP_NEW (1 << BH_New)
-#define EXT4_MAP_MAPPED (1 << BH_Mapped)
-#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
-#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
+#define EXT4_MAP_NEW BIT(BH_New)
+#define EXT4_MAP_MAPPED BIT(BH_Mapped)
+#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
+#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
@@ -198,6 +208,12 @@
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -211,8 +227,7 @@
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -409,29 +424,60 @@
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
-#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */
+
+#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */
+
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
-#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
+#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
+/* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \
+ EXT4_UNRM_FL | \
+ EXT4_COMPR_FL | \
+ EXT4_SYNC_FL | \
+ EXT4_IMMUTABLE_FL | \
+ EXT4_APPEND_FL | \
+ EXT4_NODUMP_FL | \
+ EXT4_NOATIME_FL | \
+ EXT4_JOURNAL_DATA_FL | \
+ EXT4_NOTAIL_FL | \
+ EXT4_DIRSYNC_FL | \
+ EXT4_TOPDIR_FL | \
+ EXT4_EXTENTS_FL | \
+ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
+ EXT4_DAX_FL | \
+ EXT4_PROJINHERIT_FL | \
+ EXT4_CASEFOLD_FL)
-/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
+/* User visible flags */
+#define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \
+ EXT4_DIRTY_FL | \
+ EXT4_COMPRBLK_FL | \
+ EXT4_NOCOMPR_FL | \
+ EXT4_ENCRYPT_FL | \
+ EXT4_INDEX_FL | \
+ EXT4_VERITY_FL | \
+ EXT4_INLINE_DATA_FL)
+
+/* Flags we can manipulate with through FS_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
EXT4_IMMUTABLE_FL | \
EXT4_APPEND_FL | \
EXT4_NODUMP_FL | \
EXT4_NOATIME_FL | \
- EXT4_PROJINHERIT_FL)
+ EXT4_PROJINHERIT_FL | \
+ EXT4_DAX_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
- EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
+ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
+ EXT4_DAX_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
@@ -443,6 +489,10 @@
/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+/* Flags which are mutually exclusive to DAX */
+#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
+ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)
+
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
@@ -482,9 +532,11 @@
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
EXT4_INODE_VERITY = 20, /* Verity protected inode */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
- EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+/* 22 was formerly EXT4_INODE_EOFBLOCKS */
+ EXT4_INODE_DAX = 25, /* Inode is DAX */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
+ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
@@ -528,9 +580,9 @@
CHECK_FLAG_VALUE(EXTENTS);
CHECK_FLAG_VALUE(VERITY);
CHECK_FLAG_VALUE(EA_INODE);
- CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
CHECK_FLAG_VALUE(PROJINHERIT);
+ CHECK_FLAG_VALUE(CASEFOLD);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -605,8 +657,6 @@
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
- /* Request will not result in inode size update (user for fallocate) */
-#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
/* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100
/* Write zeros to newly created written extents */
@@ -628,6 +678,7 @@
*/
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
+#define EXT4_EX_NOFAIL 0x10000000
/*
* Flags used by ext4_free_blocks
@@ -643,8 +694,6 @@
/*
* ioctl commands
*/
-#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
-#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
@@ -661,17 +710,11 @@
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
-#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
-#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
-#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
/* ioctl codes 19--39 are reserved for fscrypt */
#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40)
#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32)
#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap)
-#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
-#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
-
#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
/*
@@ -696,8 +739,6 @@
/*
* ioctl commands in 32 bit emulation
*/
-#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS
#define EXT4_IOC32_GETVERSION _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
@@ -922,6 +963,7 @@
#endif /* defined(__KERNEL__) || defined(__linux__) */
#include "extents_status.h"
+#include "fast_commit.h"
/*
* Lock subclasses for i_data_sem in the ext4_inode_info structure.
@@ -979,6 +1021,28 @@
struct list_head i_orphan; /* unlinked but open inodes */
+ /* Fast commit related info */
+
+ struct list_head i_fc_list; /*
+ * inodes that need fast commit
+ * protected by sbi->s_fc_lock.
+ */
+
+ /* Start of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_start;
+
+ /* End of lblk range that needs to be committed in this fast commit */
+ ext4_lblk_t i_fc_lblk_len;
+
+ /* Number of ongoing updates on this inode */
+ atomic_t i_fc_updates;
+
+ /* Fast commit wait queue for this inode */
+ wait_queue_head_t i_fc_wait;
+
+ /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
+ struct mutex i_fc_lock;
+
/*
* i_disksize keeps track of what the inode size is ON DISK, not
* in memory. During truncate, i_size is set to the new size by
@@ -1028,6 +1092,7 @@
struct timespec64 i_crtime;
/* mballoc */
+ atomic_t i_prealloc_active;
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
@@ -1047,8 +1112,6 @@
/* allocation reservation info for delalloc */
/* In case of bigalloc, this refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
- ext4_lblk_t i_da_metadata_calc_last_lblock;
- int i_da_metadata_calc_len;
/* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree;
@@ -1100,6 +1163,7 @@
#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
#define EXT4_ERROR_FS 0x0002 /* Errors detected */
#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */
+#define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */
/*
* Misc. filesystem flags
@@ -1121,9 +1185,9 @@
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
#ifdef CONFIG_FS_DAX
-#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */
#else
-#define EXT4_MOUNT_DAX 0
+#define EXT4_MOUNT_DAX_ALWAYS 0
#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
@@ -1148,6 +1212,7 @@
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */
+#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1166,10 +1231,14 @@
blocks */
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
-
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
specified journal checksum */
+#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
+#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
+#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
+
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1338,7 +1407,8 @@
__u8 s_lastcheck_hi;
__u8 s_first_error_time_hi;
__u8 s_last_error_time_hi;
- __u8 s_pad[2];
+ __u8 s_first_error_errcode;
+ __u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_reserved[95]; /* Padding to the end of the block */
@@ -1349,16 +1419,8 @@
#ifdef __KERNEL__
-/*
- * run-time mount flags
- */
-#define EXT4_MF_MNTDIR_SAMPLED 0x0001
-#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
-#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
-
#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
- EXT4_MF_TEST_DUMMY_ENCRYPTION))
+#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
#else
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
@@ -1369,14 +1431,6 @@
#define EXT4_ENC_UTF8_12_1 1
/*
- * Flags for ext4_sb_info.s_encoding_flags.
- */
-#define EXT4_ENC_STRICT_MODE_FL (1 << 0)
-
-#define ext4_has_strict_mode(sbi) \
- (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
-
-/*
* fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
@@ -1399,7 +1453,7 @@
struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
- unsigned int s_mount_flags;
+ unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
@@ -1426,10 +1480,7 @@
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
-#ifdef CONFIG_UNICODE
- struct unicode_map *s_encoding;
- __u16 s_encoding_flags;
-#endif
+ struct buffer_head *s_mmp_bh;
/* Journaling */
struct journal_s *s_journal;
@@ -1439,14 +1490,14 @@
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *journal_bdev;
+ struct block_device *s_journal_bdev;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
int s_jquota_fmt; /* Format of quota to use */
#endif
unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
- struct ext4_system_blocks __rcu *system_blks;
+ struct ext4_system_blocks __rcu *s_system_blks;
#ifdef EXTENTS_STATS
/* ext4 extents stats */
@@ -1477,10 +1528,13 @@
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_mb_max_inode_prealloc;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
+ unsigned int s_mb_prefetch;
+ unsigned int s_mb_prefetch_limit;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -1548,6 +1602,11 @@
struct ratelimit_state s_err_ratelimit_state;
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
+ atomic_t s_warning_count;
+ atomic_t s_msg_count;
+
+ /* Encryption policy for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_policy s_dummy_enc_policy;
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
@@ -1555,6 +1614,40 @@
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
+#ifdef CONFIG_EXT4_DEBUG
+ unsigned long s_simulate_fail;
+#endif
+ /* Record the errseq of the backing block device */
+ errseq_t s_bdev_wb_err;
+ spinlock_t s_bdev_wb_lock;
+
+ /* Ext4 fast commit stuff */
+ atomic_t s_fc_subtid;
+ atomic_t s_fc_ineligible_updates;
+ /*
+ * After commit starts, the main queue gets locked, and the further
+ * updates get added in the staging queue.
+ */
+#define FC_Q_MAIN 0
+#define FC_Q_STAGING 1
+ struct list_head s_fc_q[2]; /* Inodes staged for fast commit
+ * that have data changes in them.
+ */
+ struct list_head s_fc_dentry_q[2]; /* directory entry updates */
+ unsigned int s_fc_bytes;
+ /*
+ * Main fast commit lock. This lock protects accesses to the
+ * following fields:
+ * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
+ */
+ spinlock_t s_fc_lock;
+ struct buffer_head *s_fc_bh;
+ struct ext4_fc_stats s_fc_stats;
+ u64 s_fc_avg_commit_time;
+#ifdef CONFIG_EXT4_DEBUG
+ int s_fc_debug_max_replay;
+#endif
+ struct ext4_fc_replay_state s_fc_replay_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1591,6 +1684,94 @@
})
/*
+ * run-time mount flags
+ */
+enum {
+ EXT4_MF_MNTDIR_SAMPLED,
+ EXT4_MF_FS_ABORTED, /* Fatal error detected */
+ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
+ EXT4_MF_FC_COMMITTING /* File system underoing a fast
+ * commit.
+ */
+};
+
+static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
+{
+ set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
+{
+ clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
+{
+ return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
+}
+
+
+/*
+ * Simulate_fail codes
+ */
+#define EXT4_SIM_BBITMAP_EIO 1
+#define EXT4_SIM_BBITMAP_CRC 2
+#define EXT4_SIM_IBITMAP_EIO 3
+#define EXT4_SIM_IBITMAP_CRC 4
+#define EXT4_SIM_INODE_EIO 5
+#define EXT4_SIM_INODE_CRC 6
+#define EXT4_SIM_DIRBLOCK_EIO 7
+#define EXT4_SIM_DIRBLOCK_CRC 8
+
+static inline bool ext4_simulate_fail(struct super_block *sb,
+ unsigned long code)
+{
+#ifdef CONFIG_EXT4_DEBUG
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (unlikely(sbi->s_simulate_fail == code)) {
+ sbi->s_simulate_fail = 0;
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline void ext4_simulate_fail_bh(struct super_block *sb,
+ struct buffer_head *bh,
+ unsigned long code)
+{
+ if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
+ clear_buffer_uptodate(bh);
+}
+
+/*
+ * Error number codes for s_{first,last}_error_errno
+ *
+ * Linux errno numbers are architecture specific, so we need to translate
+ * them into something which is architecture independent. We don't define
+ * codes for all errno's; just the ones which are most likely to be the cause
+ * of an ext4_error() call.
+ */
+#define EXT4_ERR_UNKNOWN 1
+#define EXT4_ERR_EIO 2
+#define EXT4_ERR_ENOMEM 3
+#define EXT4_ERR_EFSBADCRC 4
+#define EXT4_ERR_EFSCORRUPTED 5
+#define EXT4_ERR_ENOSPC 6
+#define EXT4_ERR_ENOKEY 7
+#define EXT4_ERR_EROFS 8
+#define EXT4_ERR_EFBIG 9
+#define EXT4_ERR_EEXIST 10
+#define EXT4_ERR_ERANGE 11
+#define EXT4_ERR_EOVERFLOW 12
+#define EXT4_ERR_EBUSY 13
+#define EXT4_ERR_ENOTDIR 14
+#define EXT4_ERR_ENOTEMPTY 15
+#define EXT4_ERR_ESHUTDOWN 16
+#define EXT4_ERR_EFAULT 17
+
+/*
* Inode dynamic state flags
*/
enum {
@@ -1600,12 +1781,12 @@
EXT4_STATE_NO_EXPAND, /* No space for expansion */
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
- EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1699,6 +1880,15 @@
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+/*
+ * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
+ * incompatible only if fast commit blocks are present in the FS. Since we
+ * clear the journal (and thus the fast commit blocks), we don't mark FS as
+ * incompatible. We also have a JBD2 incompat feature, which gets set when
+ * there are fast commit blocks present in the journal.
+ */
+#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
+#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1800,6 +1990,8 @@
EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
+EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -1913,6 +2105,7 @@
*/
#define EXT4_FLAGS_RESIZING 0
#define EXT4_FLAGS_SHUTDOWN 1
+#define EXT4_FLAGS_BDEV_IS_DAX 2
static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
{
@@ -1984,7 +2177,7 @@
__le32 inode; /* Inode number */
__le16 rec_len; /* Directory entry length */
__u8 name_len; /* Name length */
- __u8 file_type;
+ __u8 file_type; /* See file type macros EXT4_FT_* below */
char name[EXT4_NAME_LEN]; /* File name */
};
@@ -2221,9 +2414,15 @@
struct mutex li_list_mtx;
};
+enum ext4_li_mode {
+ EXT4_LI_MODE_PREFETCH_BBITMAP,
+ EXT4_LI_MODE_ITABLE,
+};
+
struct ext4_li_request {
struct super_block *lr_super;
- struct ext4_sb_info *lr_sbi;
+ enum ext4_li_mode lr_mode;
+ ext4_group_t lr_first_not_zeroed;
ext4_group_t lr_next_group;
struct list_head lr_request;
unsigned long lr_next_sched;
@@ -2354,7 +2553,8 @@
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
- ext4_group_t block_group);
+ ext4_group_t block_group,
+ bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh);
@@ -2526,6 +2726,7 @@
struct dx_hash_info *hinfo);
/* ialloc.c */
+extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
uid_t *owner, __u32 i_flags,
@@ -2551,6 +2752,37 @@
ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
+/* fast_commit.c */
+int ext4_fc_info_show(struct seq_file *seq, void *v);
+void ext4_fc_init(struct super_block *sb, journal_t *journal);
+void ext4_fc_init_inode(struct inode *inode);
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end);
+void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ struct dentry *dentry);
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
+void ext4_fc_start_ineligible(struct super_block *sb, int reason);
+void ext4_fc_stop_ineligible(struct super_block *sb);
+void ext4_fc_start_update(struct inode *inode);
+void ext4_fc_stop_update(struct inode *inode);
+void ext4_fc_del(struct inode *inode);
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
+void ext4_fc_replay_cleanup(struct super_block *sb);
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
+int __init ext4_fc_init_dentry_cache(void);
+void ext4_fc_destroy_dentry_cache(void);
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ int len, int replay);
+
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern long ext4_mb_stats;
@@ -2560,9 +2792,15 @@
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_discard_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
+extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int nr, int *cnt);
+extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr);
+
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
@@ -2574,8 +2812,12 @@
ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
+extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state);
/* inode.c */
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
@@ -2585,8 +2827,6 @@
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2624,13 +2864,14 @@
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
-extern void ext4_set_inode_flags(struct inode *);
+extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
@@ -2650,7 +2891,6 @@
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
@@ -2659,12 +2899,15 @@
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+extern void ext4_reset_inode_seed(struct inode *inode);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
+extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
extern int ext4_orphan_add(handle_t *, struct inode *);
@@ -2678,8 +2921,7 @@
struct ext4_filename *fname,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir);
-extern int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+extern int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2699,11 +2941,17 @@
/* super.c */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
sector_t block, int op_flags);
+extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block);
+extern void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io);
+extern int ext4_read_bh(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io);
+extern int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait);
+extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
-extern void *ext4_kvmalloc(size_t size, gfp_t flags);
-extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
@@ -2712,19 +2960,19 @@
ext4_group_t block_group,
unsigned int flags);
-extern __printf(4, 5)
-void __ext4_error(struct super_block *, const char *, unsigned int,
+extern __printf(6, 7)
+void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
const char *, ...);
-extern __printf(5, 6)
-void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
- const char *, ...);
+extern __printf(6, 7)
+void __ext4_error_inode(struct inode *, const char *, unsigned int,
+ ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
-extern __printf(4, 5)
-void __ext4_abort(struct super_block *, const char *, unsigned int,
+extern __printf(5, 6)
+void __ext4_abort(struct super_block *, const char *, unsigned int, int,
const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
@@ -2745,8 +2993,12 @@
#define EXT4_ERROR_INODE(inode, fmt, a...) \
ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
-#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
- ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \
+ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a)
+
+#define ext4_error_inode_block(inode, block, err, fmt, a...) \
+ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \
+ (fmt), ## a)
#define EXT4_ERROR_FILE(file, block, fmt, a...) \
ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
@@ -2754,13 +3006,18 @@
#ifdef CONFIG_PRINTK
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
- __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
+#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \
+ __ext4_error_inode((inode), (func), (line), (block), \
+ (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...) \
__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...) \
- __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
-#define ext4_abort(sb, fmt, ...) \
- __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+ __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
+#define ext4_error_err(sb, err, fmt, ...) \
+ __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
+#define ext4_abort(sb, err, fmt, ...) \
+ __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...) \
__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...) \
@@ -2778,7 +3035,12 @@
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error_inode(inode, "", 0, block, " "); \
+ __ext4_error_inode(inode, "", 0, block, 0, " "); \
+} while (0)
+#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_error_inode(inode, "", 0, block, err, " "); \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...) \
do { \
@@ -2788,12 +3050,17 @@
#define ext4_error(sb, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_error(sb, "", 0, " "); \
+ __ext4_error(sb, "", 0, 0, 0, " "); \
} while (0)
-#define ext4_abort(sb, fmt, ...) \
+#define ext4_error_err(sb, err, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
- __ext4_abort(sb, "", 0, " "); \
+ __ext4_error(sb, "", 0, err, 0, " "); \
+} while (0)
+#define ext4_abort(sb, err, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_abort(sb, "", 0, err, " "); \
} while (0)
#define ext4_warning(sb, fmt, ...) \
do { \
@@ -2820,12 +3087,6 @@
#endif
-extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
- __u32 compat);
-extern int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat);
-extern int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat);
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -2876,22 +3137,24 @@
return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
}
+#define ext4_read_incompat_64bit_val(es, name) \
+ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
+ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \
+ le32_to_cpu(es->name##_lo))
+
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_blocks_count);
}
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_r_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_r_blocks_count);
}
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
- return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) |
- le32_to_cpu(es->s_free_blocks_count_lo);
+ return ext4_read_incompat_64bit_val(es, s_free_blocks_count);
}
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
@@ -3017,6 +3280,9 @@
struct ext4_group_info {
unsigned long bb_state;
+#ifdef AGGRESSIVE_CHECK
+ unsigned long bb_check_counter;
+#endif
struct rb_root bb_free_root;
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
@@ -3041,6 +3307,7 @@
(1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
(1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
@@ -3055,6 +3322,8 @@
(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
+ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
#define EXT4_MAX_CONTENTION 8
#define EXT4_CONTENTION_THRESHOLD 2
@@ -3218,6 +3487,10 @@
extern int ext4_ci_compare(const struct inode *parent,
const struct qstr *fname,
const struct qstr *entry, bool quick);
+extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
+ struct inode *inode);
+extern int __ext4_link(struct inode *dir, struct inode *inode,
+ struct dentry *dentry);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
@@ -3238,9 +3511,8 @@
}
/* readpages.c */
-extern int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead);
+extern int ext4_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
@@ -3260,9 +3532,9 @@
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
-extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
- ext4_fsblk_t start_blk,
- unsigned int count);
+extern int ext4_inode_block_valid(struct inode *inode,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
@@ -3276,8 +3548,7 @@
*/
#define EXT_MAX_BLOCKS 0xffffffff
-extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
-extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3290,16 +3561,13 @@
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+ ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
- ext4_lblk_t lblocks);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
- struct ext4_extent *ex1,
- struct ext4_extent *ex2);
extern int ext4_ext_insert_extent(handle_t *, struct inode *,
struct ext4_ext_path **,
struct ext4_extent *, int);
@@ -3315,13 +3583,20 @@
struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
-extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
-extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred);
+extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
+extern int ext4_ext_replay_set_iblocks(struct inode *inode);
+extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk);
+extern int ext4_ext_clear_bb(struct inode *inode);
+
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3348,10 +3623,15 @@
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+/* mmp.c */
+extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);
+
/* verity.c */
extern const struct fsverity_operations ext4_verityops;
@@ -3405,6 +3685,8 @@
}
extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_overwrite_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index ca78fd7..44e5988 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -270,10 +270,5 @@
0xffff);
}
-#define ext4_ext_dirty(handle, inode, path) \
- __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
-int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
- struct inode *inode, struct ext4_ext_path *path);
-
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7c70b08..6ff7b40 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,6 +7,28 @@
#include <trace/events/ext4.h>
+int ext4_inode_journal_mode(struct inode *inode)
+{
+ if (EXT4_JOURNAL(inode) == NULL)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ /* We do not support data journalling with delayed allocation */
+ if (!S_ISREG(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+ !test_opt(inode->i_sb, DELALLOC))) {
+ /* We do not support data journalling for encrypted data */
+ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
+ }
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
+ BUG();
+}
+
/* Just increment the non-pointer handle value */
static handle_t *ext4_get_nojournal(void)
{
@@ -58,28 +80,30 @@
* take the FS itself readonly cleanly.
*/
if (journal && is_journal_aborted(journal)) {
- ext4_abort(sb, "Detected aborted journal");
+ ext4_abort(sb, -journal->j_errno, "Detected aborted journal");
return -EROFS;
}
return 0;
}
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks)
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds)
{
journal_t *journal;
int err;
- trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
journal = EXT4_SB(sb)->s_journal;
- if (!journal)
+ if (!journal || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return ext4_get_nojournal();
- return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
- type, line);
+ return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
+ GFP_NOFS, type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -119,8 +143,8 @@
return ext4_get_nojournal();
sb = handle->h_journal->j_private;
- trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
- _RET_IP_);
+ trace_ext4_journal_start_reserved(sb,
+ jbd2_handle_buffer_credits(handle), _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0) {
jbd2_journal_free_reserved(handle);
@@ -133,6 +157,21 @@
return handle;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+ if (jbd2_handle_buffer_credits(handle) >= check_cred &&
+ handle->h_revoke_credits >= revoke_cred)
+ return 0;
+ extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle));
+ revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);
+ return ext4_journal_extend(handle, extend_cred, revoke_cred);
+}
+
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh,
@@ -158,6 +197,28 @@
jbd2_journal_abort_handle(handle);
}
+static void ext4_check_bdev_write_error(struct super_block *sb)
+{
+ struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * If the block device has write error flag, it may have failed to
+ * async write out metadata buffers in the background. In this case,
+ * we could read old data from disk and write it out again, which
+ * may lead to on-disk filesystem inconsistency.
+ */
+ if (errseq_check(&mapping->wb_err, READ_ONCE(sbi->s_bdev_wb_err))) {
+ spin_lock(&sbi->s_bdev_wb_lock);
+ err = errseq_check_and_advance(&mapping->wb_err, &sbi->s_bdev_wb_err);
+ spin_unlock(&sbi->s_bdev_wb_lock);
+ if (err)
+ ext4_error_err(sb, -err,
+ "Error while async write back metadata");
+ }
+}
+
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
@@ -165,6 +226,9 @@
might_sleep();
+ if (bh->b_bdev->bd_super)
+ ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
@@ -234,7 +298,7 @@
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
- __ext4_abort(inode->i_sb, where, line,
+ __ext4_abort(inode->i_sb, where, line, -err,
"error %d when attempting revoke", err);
}
BUFFER_TRACE(bh, "exit");
@@ -278,7 +342,7 @@
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle), err);
return err;
}
ext4_error_inode(inode, where, line,
@@ -289,9 +353,11 @@
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle),
+ err);
}
} else {
+ set_buffer_uptodate(bh);
if (inode)
mark_buffer_dirty_inode(bh, inode);
else
@@ -299,13 +365,8 @@
if (inode && inode_needs_sync(inode)) {
sync_dirty_buffer(bh);
if (buffer_req(bh) && !buffer_uptodate(bh)) {
- struct ext4_super_block *es;
-
- es = EXT4_SB(inode->i_sb)->s_es;
- es->s_last_error_block =
- cpu_to_le64(bh->b_blocknr);
- ext4_error_inode(inode, where, line,
- bh->b_blocknr,
+ ext4_error_inode_err(inode, where, line,
+ bh->b_blocknr, EIO,
"IO error syncing itable block");
err = -EIO;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index ef8fcf7..00dc668 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -222,7 +222,10 @@
int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+#define ext4_mark_inode_dirty(__h, __i) \
+ __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+ const char *func, unsigned int line);
int ext4_expand_extra_isize(struct inode *inode,
unsigned int new_extra_isize,
@@ -261,7 +264,8 @@
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks);
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -288,28 +292,41 @@
return 0;
}
-static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
+ int blocks)
{
- if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
- return 0;
- return 1;
+ /* Freeing each metadata block can result in freeing one cluster */
+ return blocks * EXT4_SB(sb)->s_cluster_ratio;
+}
+
+static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
+{
+ return ext4_free_metadata_revoke_credits(sb, 8);
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits((inode)->i_sb))
-#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
- __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
+ ext4_trans_default_revoke_credits((inode)->i_sb))
+
+#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \
+ (revoke_creds))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int blocks, int rsv_blocks)
+ int blocks, int rsv_blocks,
+ int revoke_creds)
{
return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
- rsv_blocks);
+ rsv_blocks, revoke_creds);
}
#define ext4_journal_stop(handle) \
@@ -321,31 +338,73 @@
handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
int type);
-static inline void ext4_journal_free_reserved(handle_t *handle)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_free_reserved(handle);
-}
-
static inline handle_t *ext4_journal_current_handle(void)
{
return journal_current_handle();
}
-static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_extend(handle, nblocks);
+ return jbd2_journal_extend(handle, nblocks, revoke);
return 0;
}
-static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+static inline int ext4_journal_restart(handle_t *handle, int nblocks,
+ int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_restart(handle, nblocks);
+ return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
return 0;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred);
+
+
+/*
+ * Ensure @handle has at least @check_creds credits available. If not,
+ * transaction will be extended or restarted to contain at least @extend_cred
+ * credits. Before restarting transaction @fn is executed to allow for cleanup
+ * before the transaction is restarted.
+ *
+ * The return value is < 0 in case of error, 0 in case the handle has enough
+ * credits or transaction extension succeeded, 1 in case transaction had to be
+ * restarted.
+ */
+#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \
+ revoke_cred, fn) \
+({ \
+ __label__ __ensure_end; \
+ int err = __ext4_journal_ensure_credits((handle), (check_cred), \
+ (extend_cred), (revoke_cred)); \
+ \
+ if (err <= 0) \
+ goto __ensure_end; \
+ err = (fn); \
+ if (err < 0) \
+ goto __ensure_end; \
+ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
+ if (err == 0) \
+ err = 1; \
+__ensure_end: \
+ err; \
+})
+
+/*
+ * Ensure given handle has at least requested amount of credits available,
+ * possibly restarting transaction if needed. We also make sure the transaction
+ * has space for at least ext4_trans_default_revoke_credits(sb) revoke records
+ * as freeing one or two blocks is very common pattern and requesting this is
+ * very cheap.
+ */
+static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
+ int revoke_creds)
+{
+ return ext4_journal_ensure_credits_fn(handle, credits, credits,
+ revoke_creds, 0);
+}
+
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
@@ -401,26 +460,7 @@
#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */
-static inline int ext4_inode_journal_mode(struct inode *inode)
-{
- if (EXT4_JOURNAL(inode) == NULL)
- return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- /* We do not support data journalling with delayed allocation */
- if (!S_ISREG(inode->i_mode) ||
- test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
- /* We do not support data journalling for encrypted data */
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
- return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
- return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */
- }
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
- return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
- return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
- BUG();
-}
+int ext4_inode_journal_mode(struct inode *inode);
static inline int ext4_should_journal_data(struct inode *inode)
{
@@ -437,6 +477,19 @@
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
+static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
+{
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return 0;
+ if (!ext4_should_journal_data(inode))
+ return 0;
+ /*
+ * Data blocks in one extent are contiguous, just account for partial
+ * clusters at extent boundaries
+ */
+ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
+}
+
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
@@ -456,6 +509,9 @@
return 0;
if (ext4_should_journal_data(inode))
return 0;
+ /* temporary fix to prevent generic/422 test failures */
+ if (!test_opt(inode->i_sb, DELALLOC))
+ return 0;
return 1;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ae73e67..0fda305 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -28,6 +28,7 @@
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
+#include <linux/iomap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -83,13 +84,6 @@
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
-static int ext4_split_extent(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path **ppath,
- struct ext4_map_blocks *map,
- int split_flag,
- int flags);
-
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path **ppath,
@@ -97,32 +91,41 @@
int split_flag,
int flags);
-static int ext4_find_delayed_extent(struct inode *inode,
- struct extent_status *newes);
-
-static int ext4_ext_truncate_extend_restart(handle_t *handle,
- struct inode *inode,
- int needed)
+static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
- int err;
-
- if (!ext4_handle_valid(handle))
- return 0;
- if (handle->h_buffer_credits >= needed)
- return 0;
/*
- * If we need to extend the journal get a few extra blocks
- * while we're at it for efficiency's sake.
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
*/
- needed += 3;
- err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
- if (err <= 0)
- return err;
- err = ext4_truncate_restart_trans(handle, inode, needed);
- if (err == 0)
- err = -EAGAIN;
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
- return err;
+/*
+ * Make sure 'handle' has at least 'check_cred' credits. If not, restart
+ * transaction with 'restart_cred' credits. The function drops i_data_sem
+ * when restarting transaction and gets it after transaction is restarted.
+ *
+ * The function returns 0 on success, 1 if transaction had to be restarted,
+ * and < 0 in case of fatal error.
+ */
+int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred)
+{
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
+ revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
}
/*
@@ -133,14 +136,24 @@
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, path->p_bh);
+ err = ext4_journal_get_write_access(handle, path->p_bh);
+ /*
+ * The extent buffer's verified bit will be set again in
+ * __ext4_ext_dirty(). We could leave an inconsistent
+ * buffer if the extents updating procudure break off du
+ * to some error happens, force to check it again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -149,8 +162,9 @@
* - ENOMEM
* - EIO
*/
-int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
- struct inode *inode, struct ext4_ext_path *path)
+static int __ext4_ext_dirty(const char *where, unsigned int line,
+ handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
{
int err;
@@ -160,6 +174,9 @@
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
@@ -167,6 +184,9 @@
return err;
}
+#define ext4_ext_dirty(handle, inode, path) \
+ __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
+
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
@@ -290,58 +310,14 @@
{
struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+ int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
+
+ if (nofail)
+ flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
- EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
- (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
-}
-
-/*
- * Calculate the number of metadata blocks needed
- * to allocate @blocks
- * Worse case is one block per extent
- */
-int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- int idxs;
-
- idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
- / sizeof(struct ext4_extent_idx));
-
- /*
- * If the new delayed allocation block is contiguous with the
- * previous da block, it can share index blocks with the
- * previous block, so we only need to allocate a new index
- * block every idxs leaf blocks. At ldxs**2 blocks, we need
- * an additional index block, and at ldxs**3 blocks, yet
- * another index blocks.
- */
- if (ei->i_da_metadata_calc_len &&
- ei->i_da_metadata_calc_last_lblock+1 == lblock) {
- int num = 0;
-
- if ((ei->i_da_metadata_calc_len % idxs) == 0)
- num++;
- if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
- num++;
- if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
- num++;
- ei->i_da_metadata_calc_len = 0;
- } else
- ei->i_da_metadata_calc_len++;
- ei->i_da_metadata_calc_last_lblock++;
- return num;
- }
-
- /*
- * In the worst case we need a new set of index blocks at
- * every level of the inode's extent tree.
- */
- ei->i_da_metadata_calc_len = 1;
- ei->i_da_metadata_calc_last_lblock = lblock;
- return ext_depth(inode) + 1;
+ flags);
}
static int
@@ -377,7 +353,7 @@
*/
if (lblock + len <= lblock)
return 0;
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
+ return ext4_inode_block_valid(inode, block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
@@ -385,14 +361,18 @@
{
ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
+ return ext4_inode_block_valid(inode, block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
- struct ext4_extent_header *eh,
- int depth)
+ struct ext4_extent_header *eh,
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+ int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -401,34 +381,51 @@
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
- ext4_fsblk_t pblock = 0;
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
- pblock = ext4_ext_pblock(ext);
- es->s_last_error_block = cpu_to_le64(pblock);
+ *pblk = ext4_ext_pblock(ext);
return 0;
}
+ prev = lblock + ext4_ext_get_actual_len(ext) - 1;
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if ((lblock <= prev) && prev) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ prev = lblock;
}
}
return 1;
@@ -436,7 +433,7 @@
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -462,7 +459,7 @@
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -480,18 +477,19 @@
return 0;
corrupted:
- ext4_error_inode(inode, function, line, 0,
- "pblk %llu bad header/extent: %s - magic %x, "
- "entries %u, max %u(%u), depth %u(%u)",
- (unsigned long long) pblk, error_msg,
- le16_to_cpu(eh->eh_magic),
- le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
- max, le16_to_cpu(eh->eh_depth), depth);
+ ext4_error_inode_err(inode, function, line, 0, -err,
+ "pblk %llu bad header/extent: %s - magic %x, "
+ "entries %u, max %u(%u), depth %u(%u)",
+ (unsigned long long) pblk, error_msg,
+ le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries),
+ le16_to_cpu(eh->eh_max),
+ max, le16_to_cpu(eh->eh_depth), depth);
return err;
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -524,32 +522,34 @@
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
+ gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
+ ext4_fsblk_t pblk;
- bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
+ if (flags & EXT4_EX_NOFAIL)
+ gfp_flags |= __GFP_NOFAIL;
+
+ pblk = ext4_idx_pblock(idx);
+ bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL);
if (err < 0)
goto errout;
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- if (!ext4_has_feature_journal(inode->i_sb) ||
- (inode->i_ino !=
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
- if (err)
- goto errout;
- }
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
+ if (err)
+ goto errout;
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
@@ -565,8 +565,8 @@
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -586,6 +586,12 @@
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
+ /* Don't cache anything if there are no external extent blocks */
+ if (!depth) {
+ up_read(&ei->i_data_sem);
+ return ret;
+ }
+
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS);
if (path == NULL) {
@@ -593,9 +599,6 @@
return -ENOMEM;
}
- /* Don't cache anything if there are no external extent blocks */
- if (depth == 0)
- goto out;
path[0].p_hdr = ext_inode_hdr(inode);
ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
if (ret)
@@ -613,8 +616,7 @@
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -639,21 +641,22 @@
{
int k, l = path->p_depth;
- ext_debug("path:");
+ ext_debug(inode, "path:");
for (k = 0; k <= l; k++, path++) {
if (path->p_idx) {
- ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
- ext4_idx_pblock(path->p_idx));
+ ext_debug(inode, " %d->%llu",
+ le32_to_cpu(path->p_idx->ei_block),
+ ext4_idx_pblock(path->p_idx));
} else if (path->p_ext) {
- ext_debug(" %d:[%d]%d:%llu ",
+ ext_debug(inode, " %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext4_ext_pblock(path->p_ext));
} else
- ext_debug(" []");
+ ext_debug(inode, " []");
}
- ext_debug("\n");
+ ext_debug(inode, "\n");
}
static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
@@ -669,14 +672,14 @@
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
- ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
+ ext_debug(inode, "Displaying leaf extents\n");
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
- ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
+ ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
- ext_debug("\n");
+ ext_debug(inode, "\n");
}
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
@@ -689,10 +692,9 @@
struct ext4_extent_idx *idx;
idx = path[level].p_idx;
while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
- ext_debug("%d: move %d:%llu in new index %llu\n", level,
- le32_to_cpu(idx->ei_block),
- ext4_idx_pblock(idx),
- newblock);
+ ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
+ level, le32_to_cpu(idx->ei_block),
+ ext4_idx_pblock(idx), newblock);
idx++;
}
@@ -701,7 +703,7 @@
ex = path[depth].p_ext;
while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+ ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(ex->ee_block),
ext4_ext_pblock(ex),
ext4_ext_is_unwritten(ex),
@@ -724,11 +726,10 @@
if (!path)
return;
depth = path->p_depth;
- for (i = 0; i <= depth; i++, path++)
- if (path->p_bh) {
- brelse(path->p_bh);
- path->p_bh = NULL;
- }
+ for (i = 0; i <= depth; i++, path++) {
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+ }
}
/*
@@ -744,7 +745,7 @@
struct ext4_extent_idx *r, *l, *m;
- ext_debug("binsearch for %u(idx): ", block);
+ ext_debug(inode, "binsearch for %u(idx): ", block);
l = EXT_FIRST_INDEX(eh) + 1;
r = EXT_LAST_INDEX(eh);
@@ -754,13 +755,13 @@
r = m - 1;
else
l = m + 1;
- ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
- m, le32_to_cpu(m->ei_block),
- r, le32_to_cpu(r->ei_block));
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
+ r, le32_to_cpu(r->ei_block));
}
path->p_idx = l - 1;
- ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
#ifdef CHECK_BINSEARCH
@@ -770,8 +771,8 @@
chix = ix = EXT_FIRST_INDEX(eh);
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
- if (k != 0 &&
- le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+ if (k != 0 && le32_to_cpu(ix->ei_block) <=
+ le32_to_cpu(ix[-1].ei_block)) {
printk(KERN_DEBUG "k=%d, ix=0x%p, "
"first=0x%p\n", k,
ix, EXT_FIRST_INDEX(eh));
@@ -811,7 +812,7 @@
return;
}
- ext_debug("binsearch for %u: ", block);
+ ext_debug(inode, "binsearch for %u: ", block);
l = EXT_FIRST_EXTENT(eh) + 1;
r = EXT_LAST_EXTENT(eh);
@@ -822,13 +823,13 @@
r = m - 1;
else
l = m + 1;
- ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
- m, le32_to_cpu(m->ee_block),
- r, le32_to_cpu(r->ee_block));
+ ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
+ le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
+ r, le32_to_cpu(r->ee_block));
}
path->p_ext = l - 1;
- ext_debug(" -> %d:%llu:[%d]%d ",
+ ext_debug(inode, " -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_pblock(path->p_ext),
ext4_ext_is_unwritten(path->p_ext),
@@ -853,7 +854,7 @@
}
-int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
struct ext4_extent_header *eh;
@@ -864,7 +865,6 @@
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
eh->eh_generation = 0;
ext4_mark_inode_dirty(handle, inode);
- return 0;
}
struct ext4_ext_path *
@@ -876,6 +876,10 @@
struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
short int depth, i, ppos = 0;
int ret;
+ gfp_t gfp_flags = GFP_NOFS;
+
+ if (flags & EXT4_EX_NOFAIL)
+ gfp_flags |= __GFP_NOFAIL;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
@@ -896,7 +900,7 @@
if (!path) {
/* account possible depth increase */
path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
- GFP_NOFS);
+ gfp_flags);
if (unlikely(!path))
return ERR_PTR(-ENOMEM);
path[0].p_maxdepth = depth + 1;
@@ -909,7 +913,7 @@
ext4_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
- ext_debug("depth %d: num %d, max %d\n",
+ ext_debug(inode, "depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
ext4_ext_binsearch_idx(inode, path + ppos, block);
@@ -917,8 +921,7 @@
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -986,18 +989,20 @@
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
- ext_debug("insert new index %d after: %llu\n", logical, ptr);
+ ext_debug(inode, "insert new index %d after: %llu\n",
+ logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
- ext_debug("insert new index %d before: %llu\n", logical, ptr);
+ ext_debug(inode, "insert new index %d before: %llu\n",
+ logical, ptr);
ix = curp->p_idx;
}
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
- ext_debug("insert new index %d: "
+ ext_debug(inode, "insert new index %d: "
"move %d indices from 0x%p to 0x%p\n",
logical, len, ix, ix + 1);
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
@@ -1046,9 +1051,13 @@
ext4_fsblk_t newblock, oldblock;
__le32 border;
ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
+ gfp_t gfp_flags = GFP_NOFS;
int err = 0;
size_t ext_size = 0;
+ if (flags & EXT4_EX_NOFAIL)
+ gfp_flags |= __GFP_NOFAIL;
+
/* make decision: where to split? */
/* FIXME: now decision is simplest: at current extent */
@@ -1060,12 +1069,12 @@
}
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
- ext_debug("leaf will be split."
+ ext_debug(inode, "leaf will be split."
" next leaf starts at %d\n",
le32_to_cpu(border));
} else {
border = newext->ee_block;
- ext_debug("leaf will be added."
+ ext_debug(inode, "leaf will be added."
" next leaf starts at %d\n",
le32_to_cpu(border));
}
@@ -1082,12 +1091,12 @@
* We need this to handle errors and free blocks
* upon them.
*/
- ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS);
+ ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
if (!ablocks)
return -ENOMEM;
/* allocate all needed blocks */
- ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
+ ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
newext, &err, flags);
@@ -1174,7 +1183,7 @@
goto cleanup;
}
if (k)
- ext_debug("create %d intermediate indices\n", k);
+ ext_debug(inode, "create %d intermediate indices\n", k);
/* insert new index into current index block */
/* current depth stored in i var */
i = depth - 1;
@@ -1202,7 +1211,7 @@
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
- ext_debug("int.index at %d (block %llu): %u -> %llu\n",
+ ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
/* move remainder of path[i] to the new index block */
@@ -1216,7 +1225,7 @@
}
/* start copy indexes */
m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
- ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+ ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
EXT_MAX_INDEX(path[i].p_hdr));
ext4_ext_show_move(inode, path, newblock, i);
if (m) {
@@ -1353,13 +1362,13 @@
EXT_FIRST_INDEX(neh)->ei_block =
EXT_FIRST_EXTENT(neh)->ee_block;
}
- ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
+ ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
le16_add_cpu(&neh->eh_depth, 1);
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1502,22 +1511,21 @@
}
/*
- * search the closest allocated block to the right for *logical
- * and returns it at @logical + it's physical address at @phys
- * if *logical is the largest allocated block, the function
- * returns 0 at @phys
- * return value contains 0 (success) or error code
+ * Search the closest allocated block to the right for *logical
+ * and returns it at @logical + it's physical address at @phys.
+ * If not exists, return 0 and @phys is set to 0. We will return
+ * 1 which means we found an allocated block and ret_ex is valid.
+ * Or return a (< 0) error code.
*/
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent **ret_ex)
+ struct ext4_extent *ret_ex)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1584,20 +1592,17 @@
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1605,10 +1610,11 @@
found_extent:
*logical = le32_to_cpu(ex->ee_block);
*phys = ext4_ext_pblock(ex);
- *ret_ex = ex;
+ if (ret_ex)
+ *ret_ex = *ex;
if (bh)
put_bh(bh);
- return 0;
+ return 1;
}
/*
@@ -1630,17 +1636,16 @@
return EXT_MAX_BLOCKS;
while (depth >= 0) {
+ struct ext4_ext_path *p = &path[depth];
+
if (depth == path->p_depth) {
/* leaf */
- if (path[depth].p_ext &&
- path[depth].p_ext !=
- EXT_LAST_EXTENT(path[depth].p_hdr))
- return le32_to_cpu(path[depth].p_ext[1].ee_block);
+ if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
+ return le32_to_cpu(p->p_ext[1].ee_block);
} else {
/* index */
- if (path[depth].p_idx !=
- EXT_LAST_INDEX(path[depth].p_hdr))
- return le32_to_cpu(path[depth].p_idx[1].ei_block);
+ if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
+ return le32_to_cpu(p->p_idx[1].ei_block);
}
depth--;
}
@@ -1740,9 +1745,9 @@
return err;
}
-int
-ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
- struct ext4_extent *ex2)
+static int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2)
{
unsigned short ext1_ee_len, ext2_ee_len;
@@ -1756,23 +1761,11 @@
le32_to_cpu(ex2->ee_block))
return 0;
- /*
- * To allow future support for preallocated extents to be added
- * as an RO_COMPAT feature, refuse to merge to extents if
- * this can result in the top bit of ee_len being set.
- */
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
- /*
- * The check for IO to unwritten extent is somewhat racy as we
- * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
- * dropping i_data_sem. But reserved blocks should save us in that
- * case.
- */
+
if (ext4_ext_is_unwritten(ex1) &&
- (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
- atomic_read(&EXT4_I(inode)->i_unwritten) ||
- (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+ ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1850,7 +1843,8 @@
* group descriptor to release the extent tree block. If we
* can't get the journal credits, give up.
*/
- if (ext4_journal_extend(handle, 2))
+ if (ext4_journal_extend(handle, 2,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
return;
/*
@@ -1874,13 +1868,14 @@
}
/*
- * This function tries to merge the @ex extent to neighbours in the tree.
- * return 1 if merge left else 0.
+ * This function tries to merge the @ex extent to neighbours in the tree, then
+ * tries to collapse the extent tree into the inode.
*/
static void ext4_ext_try_to_merge(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex) {
+ struct ext4_extent *ex)
+{
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
@@ -1951,7 +1946,7 @@
/*
* ext4_ext_insert_extent:
- * tries to merge requsted extent into the existing extent or
+ * tries to merge requested extent into the existing extent or
* inserts requested extent as new one into the tree,
* creating new leaf in the no-space case.
*/
@@ -2006,7 +2001,7 @@
/* Try to append newex to the ex */
if (ext4_can_extents_be_merged(inode, ex, newext)) {
- ext_debug("append [%d]%d block to %u:[%d]%d"
+ ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
"(from %llu)\n",
ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
@@ -2031,7 +2026,7 @@
prepend:
/* Try to prepend newex to the ex */
if (ext4_can_extents_be_merged(inode, newext, ex)) {
- ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
+ ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
"(from %llu)\n",
le32_to_cpu(newext->ee_block),
ext4_ext_is_unwritten(newext),
@@ -2069,20 +2064,20 @@
if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
next = ext4_ext_next_leaf_block(path);
if (next != EXT_MAX_BLOCKS) {
- ext_debug("next leaf block - %u\n", next);
+ ext_debug(inode, "next leaf block - %u\n", next);
BUG_ON(npath != NULL);
- npath = ext4_find_extent(inode, next, NULL, 0);
+ npath = ext4_find_extent(inode, next, NULL, gb_flags);
if (IS_ERR(npath))
return PTR_ERR(npath);
BUG_ON(npath->p_depth != path->p_depth);
eh = npath[depth].p_hdr;
if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
- ext_debug("next leaf isn't full(%d)\n",
+ ext_debug(inode, "next leaf isn't full(%d)\n",
le16_to_cpu(eh->eh_entries));
path = npath;
goto has_space;
}
- ext_debug("next leaf has no free space(%d,%d)\n",
+ ext_debug(inode, "next leaf has no free space(%d,%d)\n",
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
}
@@ -2108,7 +2103,7 @@
if (!nearex) {
/* there is no extent in this leaf, create first one */
- ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
+ ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
ext4_ext_is_unwritten(newext),
@@ -2118,7 +2113,7 @@
if (le32_to_cpu(newext->ee_block)
> le32_to_cpu(nearex->ee_block)) {
/* Insert after */
- ext_debug("insert %u:%llu:[%d]%d before: "
+ ext_debug(inode, "insert %u:%llu:[%d]%d before: "
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
@@ -2129,7 +2124,7 @@
} else {
/* Insert before */
BUG_ON(newext->ee_block == nearex->ee_block);
- ext_debug("insert %u:%llu:[%d]%d after: "
+ ext_debug(inode, "insert %u:%llu:[%d]%d after: "
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
@@ -2139,7 +2134,7 @@
}
len = EXT_LAST_EXTENT(eh) - nearex + 1;
if (len > 0) {
- ext_debug("insert %u:%llu:[%d]%d: "
+ ext_debug(inode, "insert %u:%llu:[%d]%d: "
"move %d extents from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
@@ -2176,155 +2171,6 @@
return err;
}
-static int ext4_fill_fiemap_extents(struct inode *inode,
- ext4_lblk_t block, ext4_lblk_t num,
- struct fiemap_extent_info *fieinfo)
-{
- struct ext4_ext_path *path = NULL;
- struct ext4_extent *ex;
- struct extent_status es;
- ext4_lblk_t next, next_del, start = 0, end = 0;
- ext4_lblk_t last = block + num;
- int exists, depth = 0, err = 0;
- unsigned int flags = 0;
- unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
-
- while (block < last && block != EXT_MAX_BLOCKS) {
- num = last - block;
- /* find extent for this block */
- down_read(&EXT4_I(inode)->i_data_sem);
-
- path = ext4_find_extent(inode, block, &path, 0);
- if (IS_ERR(path)) {
- up_read(&EXT4_I(inode)->i_data_sem);
- err = PTR_ERR(path);
- path = NULL;
- break;
- }
-
- depth = ext_depth(inode);
- if (unlikely(path[depth].p_hdr == NULL)) {
- up_read(&EXT4_I(inode)->i_data_sem);
- EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- err = -EFSCORRUPTED;
- break;
- }
- ex = path[depth].p_ext;
- next = ext4_ext_next_allocated_block(path);
-
- flags = 0;
- exists = 0;
- if (!ex) {
- /* there is no extent yet, so try to allocate
- * all requested space */
- start = block;
- end = block + num;
- } else if (le32_to_cpu(ex->ee_block) > block) {
- /* need to allocate space before found extent */
- start = block;
- end = le32_to_cpu(ex->ee_block);
- if (block + num < end)
- end = block + num;
- } else if (block >= le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex)) {
- /* need to allocate space after found extent */
- start = block;
- end = block + num;
- if (end >= next)
- end = next;
- } else if (block >= le32_to_cpu(ex->ee_block)) {
- /*
- * some part of requested space is covered
- * by found extent
- */
- start = block;
- end = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
- if (block + num < end)
- end = block + num;
- exists = 1;
- } else {
- BUG();
- }
- BUG_ON(end <= start);
-
- if (!exists) {
- es.es_lblk = start;
- es.es_len = end - start;
- es.es_pblk = 0;
- } else {
- es.es_lblk = le32_to_cpu(ex->ee_block);
- es.es_len = ext4_ext_get_actual_len(ex);
- es.es_pblk = ext4_ext_pblock(ex);
- if (ext4_ext_is_unwritten(ex))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
- }
-
- /*
- * Find delayed extent and update es accordingly. We call
- * it even in !exists case to find out whether es is the
- * last existing extent or not.
- */
- next_del = ext4_find_delayed_extent(inode, &es);
- if (!exists && next_del) {
- exists = 1;
- flags |= (FIEMAP_EXTENT_DELALLOC |
- FIEMAP_EXTENT_UNKNOWN);
- }
- up_read(&EXT4_I(inode)->i_data_sem);
-
- if (unlikely(es.es_len == 0)) {
- EXT4_ERROR_INODE(inode, "es.es_len == 0");
- err = -EFSCORRUPTED;
- break;
- }
-
- /*
- * This is possible iff next == next_del == EXT_MAX_BLOCKS.
- * we need to check next == EXT_MAX_BLOCKS because it is
- * possible that an extent is with unwritten and delayed
- * status due to when an extent is delayed allocated and
- * is allocated by fallocate status tree will track both of
- * them in a extent.
- *
- * So we could return a unwritten and delayed extent, and
- * its block is equal to 'next'.
- */
- if (next == next_del && next == EXT_MAX_BLOCKS) {
- flags |= FIEMAP_EXTENT_LAST;
- if (unlikely(next_del != EXT_MAX_BLOCKS ||
- next != EXT_MAX_BLOCKS)) {
- EXT4_ERROR_INODE(inode,
- "next extent == %u, next "
- "delalloc extent = %u",
- next, next_del);
- err = -EFSCORRUPTED;
- break;
- }
- }
-
- if (exists) {
- err = fiemap_fill_next_extent(fieinfo,
- (__u64)es.es_lblk << blksize_bits,
- (__u64)es.es_pblk << blksize_bits,
- (__u64)es.es_len << blksize_bits,
- flags);
- if (err < 0)
- break;
- if (err == 1) {
- err = 0;
- break;
- }
- }
-
- block = es.es_lblk + es.es_len;
- }
-
- ext4_ext_drop_refs(path);
- kfree(path);
- return err;
-}
-
static int ext4_fill_es_cache_info(struct inode *inode,
ext4_lblk_t block, ext4_lblk_t num,
struct fiemap_extent_info *fieinfo)
@@ -2432,7 +2278,7 @@
return;
hole_len = min(es.es_lblk - hole_start, hole_len);
}
- ext_debug(" -> %u:%u\n", hole_start, hole_len);
+ ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
EXTENT_STATUS_HOLE);
}
@@ -2469,7 +2315,7 @@
err = ext4_ext_dirty(handle, inode, path);
if (err)
return err;
- ext_debug("index is empty, remove it, free block %llu\n", leaf);
+ ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
trace_ext4_ext_rm_idx(inode, leaf);
ext4_free_blocks(handle, inode, NULL, leaf, 1,
@@ -2737,7 +2583,7 @@
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
- int depth = ext_depth(inode), credits;
+ int depth = ext_depth(inode), credits, revoke_credits;
struct ext4_extent_header *eh;
ext4_lblk_t a, b;
unsigned num;
@@ -2748,7 +2594,7 @@
ext4_fsblk_t pblk;
/* the header must be checked already in ext4_ext_remove_space() */
- ext_debug("truncate since %u in leaf to %u\n", start, end);
+ ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
if (!path[depth].p_hdr)
path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
eh = path[depth].p_hdr;
@@ -2774,7 +2620,7 @@
else
unwritten = 0;
- ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
+ ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
unwritten, ex_ee_len);
path[depth].p_ext = ex;
@@ -2782,7 +2628,7 @@
b = ex_ee_block+ex_ee_len - 1 < end ?
ex_ee_block+ex_ee_len - 1 : end;
- ext_debug(" border %u:%u\n", a, b);
+ ext_debug(inode, " border %u:%u\n", a, b);
/* If this extent is beyond the end of the hole, skip it */
if (end < ex_ee_block) {
@@ -2829,10 +2675,23 @@
credits += (ext_depth(inode)) + 1;
}
credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+ /*
+ * We may end up freeing some index blocks and data from the
+ * punched range. Note that partial clusters are accounted for
+ * by ext4_free_data_revoke_credits().
+ */
+ revoke_credits =
+ ext4_free_metadata_revoke_credits(inode->i_sb,
+ ext_depth(inode)) +
+ ext4_free_data_revoke_credits(inode, b - a + 1);
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- if (err)
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ credits, revoke_credits);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2878,7 +2737,7 @@
if (err)
goto out;
- ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
+ ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
ext4_ext_pblock(ex));
ex--;
ex_ee_block = le32_to_cpu(ex->ee_block);
@@ -2955,10 +2814,12 @@
partial.lblk = 0;
partial.state = initial;
- ext_debug("truncate since %u to %u\n", start, end);
+ ext_debug(inode, "truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+ depth + 1,
+ ext4_free_metadata_revoke_credits(inode->i_sb, depth));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2978,7 +2839,8 @@
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
+ path = ext4_find_extent(inode, end, NULL,
+ EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -3043,8 +2905,8 @@
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- &ex);
- if (err)
+ NULL);
+ if (err < 0)
goto out;
if (pblk) {
partial.pclu = EXT4_B2C(sbi, pblk);
@@ -3064,7 +2926,7 @@
le16_to_cpu(path[k].p_hdr->eh_entries)+1;
} else {
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
- GFP_NOFS);
+ GFP_NOFS | __GFP_NOFAIL);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
@@ -3094,7 +2956,7 @@
/* this is index block */
if (!path[i].p_hdr) {
- ext_debug("initialize header\n");
+ ext_debug(inode, "initialize header\n");
path[i].p_hdr = ext_block_hdr(path[i].p_bh);
}
@@ -3102,7 +2964,7 @@
/* this level hasn't been touched yet */
path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
- ext_debug("init index ptr: hdr 0x%p, num %d\n",
+ ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
path[i].p_hdr,
le16_to_cpu(path[i].p_hdr->eh_entries));
} else {
@@ -3110,18 +2972,18 @@
path[i].p_idx--;
}
- ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+ ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
i, EXT_FIRST_INDEX(path[i].p_hdr),
path[i].p_idx);
if (ext4_ext_more_to_rm(path + i)) {
struct buffer_head *bh;
/* go to the next level */
- ext_debug("move to level %d (block %llu)\n",
+ ext_debug(inode, "move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1,
+ EXT4_EX_NOCACHE);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -3152,7 +3014,7 @@
brelse(path[i].p_bh);
path[i].p_bh = NULL;
i--;
- ext_debug("return to level %d\n", i);
+ ext_debug(inode, "return to level %d\n", i);
}
}
@@ -3294,7 +3156,7 @@
*
*
* Splits extent [a, b] into two extents [a, @split) and [@split, b], states
- * of which are deterimined by split_flag.
+ * of which are determined by split_flag.
*
* There are two cases:
* a> the extent are splitted into two extent.
@@ -3320,8 +3182,7 @@
BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
(EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
- ext_debug("ext4_split_extents_at: inode %lu, logical"
- "block %llu\n", inode->i_ino, (unsigned long long)split);
+ ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
ext4_ext_show_leaf(inode, path);
@@ -3429,6 +3290,10 @@
fix_extent_len:
ex->ee_len = orig_ex.ee_len;
+ /*
+ * Ignore ext4_ext_dirty return value since we are already in error path
+ * and err is a non-zero error code.
+ */
ext4_ext_dirty(handle, inode, path + path->p_depth);
return err;
out:
@@ -3488,7 +3353,7 @@
* Update path is required because previous ext4_split_extent_at() may
* result in split of original leaf or extent zeroout.
*/
- path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
+ path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
if (IS_ERR(path))
return PTR_ERR(path);
depth = ext_depth(inode);
@@ -3557,9 +3422,8 @@
int err = 0;
int split_flag = EXT4_EXT_DATA_VALID2;
- ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map_len);
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
+ (unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb);
eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
@@ -3691,7 +3555,7 @@
}
if (allocated) {
/* Mark the block containing both extents as dirty */
- ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + depth);
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
@@ -3710,9 +3574,6 @@
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
- if (IS_ENCRYPTED(inode))
- max_zeroout = 0;
-
/*
* five cases:
* 1. split the extent into three extents.
@@ -3814,8 +3675,7 @@
unsigned int ee_len;
int split_flag = 0, depth;
- ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
- __func__, inode->i_ino,
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)map->m_lblk, map->m_len);
eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
@@ -3824,7 +3684,7 @@
eof_block = map->m_lblk + map->m_len;
/*
* It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
+ * zeroout only if extent is fully inside i_size or new_size.
*/
depth = ext_depth(inode);
ex = path[depth].p_ext;
@@ -3861,8 +3721,7 @@
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
/* If extent is larger than requested it is a clear sign that we still
@@ -3907,64 +3766,11 @@
return err;
}
-/*
- * Handle EOFBLOCKS_FL flag, clearing it if necessary
- */
-static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
- ext4_lblk_t lblk,
- struct ext4_ext_path *path,
- unsigned int len)
-{
- int i, depth;
- struct ext4_extent_header *eh;
- struct ext4_extent *last_ex;
-
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
- return 0;
-
- depth = ext_depth(inode);
- eh = path[depth].p_hdr;
-
- /*
- * We're going to remove EOFBLOCKS_FL entirely in future so we
- * do not care for this case anymore. Simply remove the flag
- * if there are no extents.
- */
- if (unlikely(!eh->eh_entries))
- goto out;
- last_ex = EXT_LAST_EXTENT(eh);
- /*
- * We should clear the EOFBLOCKS_FL flag if we are writing the
- * last block in the last extent in the file. We test this by
- * first checking to see if the caller to
- * ext4_ext_get_blocks() was interested in the last block (or
- * a block beyond the last block) in the current extent. If
- * this turns out to be false, we can bail out from this
- * function immediately.
- */
- if (lblk + len < le32_to_cpu(last_ex->ee_block) +
- ext4_ext_get_actual_len(last_ex))
- return 0;
- /*
- * If the caller does appear to be planning to write at or
- * beyond the end of the current extent, we then test to see
- * if the current extent is the last extent in the file, by
- * checking to make sure it was reached via the rightmost node
- * at each level of the tree.
- */
- for (i = depth-1; i >= 0; i--)
- if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
- return 0;
-out:
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path **ppath,
- unsigned int allocated)
+ unsigned int *allocated)
{
struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
@@ -3985,8 +3791,7 @@
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- ext_debug("%s: inode %lu, logical"
- "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ ext_debug(inode, "logical block %llu, max_blocks %u\n",
(unsigned long long)ee_block, ee_len);
if (ee_block != map->m_lblk || ee_len > map->m_len) {
@@ -4024,14 +3829,12 @@
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
- if (err)
- return err;
+
map->m_flags |= EXT4_MAP_UNWRITTEN;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
- return allocated;
+ if (*allocated > map->m_len)
+ *allocated = map->m_len;
+ map->m_len = *allocated;
+ return 0;
}
static int
@@ -4040,14 +3843,13 @@
struct ext4_ext_path **ppath, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
{
- struct ext4_ext_path *path = *ppath;
+ struct ext4_ext_path __maybe_unused *path = *ppath;
int ret = 0;
int err = 0;
- ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
- "block %llu, max_blocks %u, flags %x, allocated %u\n",
- inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
- flags, allocated);
+ ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
+ (unsigned long long)map->m_lblk, map->m_len, flags,
+ allocated);
ext4_ext_show_leaf(inode, path);
/*
@@ -4059,41 +3861,38 @@
trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
allocated, newblock);
- /* get_block() before submit the IO, split the extent */
+ /* get_block() before submitting IO, split the extent */
if (flags & EXT4_GET_BLOCKS_PRE_IO) {
ret = ext4_split_convert_extents(handle, inode, map, ppath,
flags | EXT4_GET_BLOCKS_CONVERT);
- if (ret <= 0)
- goto out;
+ if (ret < 0) {
+ err = ret;
+ goto out2;
+ }
+ /*
+ * shouldn't get a 0 return when splitting an extent unless
+ * m_len is 0 (bug) or extent has been corrupted
+ */
+ if (unlikely(ret == 0)) {
+ EXT4_ERROR_INODE(inode,
+ "unexpected ret == 0, m_len = %u",
+ map->m_len);
+ err = -EFSCORRUPTED;
+ goto out2;
+ }
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
- if (flags & EXT4_GET_BLOCKS_ZERO) {
- if (allocated > map->m_len)
- allocated = map->m_len;
- err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
- allocated);
- if (err < 0)
- goto out2;
- }
- ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
+ err = ext4_convert_unwritten_extents_endio(handle, inode, map,
ppath);
- if (ret >= 0) {
- ext4_update_inode_fsync_trans(handle, inode, 1);
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, map->m_len);
- } else
- err = ret;
- map->m_flags |= EXT4_MAP_MAPPED;
- map->m_pblk = newblock;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
- goto out2;
+ if (err < 0)
+ goto out2;
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ goto map_out;
}
- /* buffered IO case */
+ /* buffered IO cases */
/*
* repeat fallocate creation request
* we already have an unwritten extent
@@ -4116,35 +3915,39 @@
goto out1;
}
- /* buffered write, writepage time, convert*/
+ /*
+ * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
+ * For buffered writes, at writepage time, etc. Convert a
+ * discovered unwritten extent to written.
+ */
ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out:
- if (ret <= 0) {
+ if (ret < 0) {
err = ret;
goto out2;
- } else
- allocated = ret;
- map->m_flags |= EXT4_MAP_NEW;
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_len = allocated;
+ }
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ /*
+ * shouldn't get a 0 return when converting an unwritten extent
+ * unless m_len is 0 (bug) or extent has been corrupted
+ */
+ if (unlikely(ret == 0)) {
+ EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
+ map->m_len);
+ err = -EFSCORRUPTED;
+ goto out2;
+ }
+out:
+ allocated = ret;
+ map->m_flags |= EXT4_MAP_NEW;
map_out:
map->m_flags |= EXT4_MAP_MAPPED;
- if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
- err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
- map->m_len);
- if (err < 0)
- goto out2;
- }
out1:
+ map->m_pblk = newblock;
if (allocated > map->m_len)
allocated = map->m_len;
- ext4_ext_show_leaf(inode, path);
- map->m_pblk = newblock;
map->m_len = allocated;
+ ext4_ext_show_leaf(inode, path);
out2:
return err ? err : allocated;
}
@@ -4260,7 +4063,7 @@
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
* (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
*
- * return > 0, number of of blocks already mapped/allocated
+ * return > 0, number of blocks already mapped/allocated
* if create == 0 and these are pre-allocated blocks
* buffer head is unmapped
* otherwise blocks are mapped
@@ -4274,18 +4077,16 @@
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent newex, *ex, *ex2;
+ struct ext4_extent newex, *ex, ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth, ret;
+ ext4_fsblk_t newblock = 0, pblk;
+ int err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
ext4_lblk_t cluster_offset;
- bool map_from_cluster = false;
- ext_debug("blocks %u/%u requested for inode %lu\n",
- map->m_lblk, map->m_len, inode->i_ino);
+ ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
@@ -4293,7 +4094,7 @@
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
- goto out2;
+ goto out;
}
depth = ext_depth(inode);
@@ -4309,7 +4110,7 @@
(unsigned long) map->m_lblk, depth,
path[depth].p_block);
err = -EFSCORRUPTED;
- goto out2;
+ goto out;
}
ex = path[depth].p_ext;
@@ -4332,8 +4133,8 @@
newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
allocated = ee_len - (map->m_lblk - ee_block);
- ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
- ee_block, ee_len, newblock);
+ ext_debug(inode, "%u fit into %u:%d -> %llu\n",
+ map->m_lblk, ee_block, ee_len, newblock);
/*
* If the extent is initialized check whether the
@@ -4341,12 +4142,18 @@
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- allocated = convert_initialized_extent(
- handle, inode, map, &path,
- allocated);
- goto out2;
- } else if (!ext4_ext_is_unwritten(ex))
+ err = convert_initialized_extent(handle,
+ inode, map, &path, &allocated);
goto out;
+ } else if (!ext4_ext_is_unwritten(ex)) {
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+ ext4_ext_show_leaf(inode, path);
+ goto out;
+ }
ret = ext4_ext_handle_unwritten_extents(
handle, inode, map, &path, flags,
@@ -4355,7 +4162,7 @@
err = ret;
else
allocated = ret;
- goto out2;
+ goto out;
}
}
@@ -4380,7 +4187,7 @@
map->m_pblk = 0;
map->m_len = min_t(unsigned int, map->m_len, hole_len);
- goto out2;
+ goto out;
}
/*
@@ -4397,7 +4204,6 @@
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4405,20 +4211,18 @@
ar.lleft = map->m_lblk;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
- goto out2;
+ goto out;
ar.lright = map->m_lblk;
- ex2 = NULL;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
- if (err)
- goto out2;
+ if (err < 0)
+ goto out;
/* Check if the extent after searching to the right implies a
* cluster we can use. */
- if ((sbi->s_cluster_ratio > 1) && ex2 &&
- get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
+ if ((sbi->s_cluster_ratio > 1) && err &&
+ get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map_from_cluster = true;
goto got_allocated_blocks;
}
@@ -4472,59 +4276,52 @@
ar.flags |= EXT4_MB_USE_RESERVED;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
- goto out2;
- ext_debug("allocate new block: goal %llu, found %llu/%u\n",
- ar.goal, newblock, allocated);
- free_on_err = 1;
+ goto out;
allocated_clusters = ar.len;
ar.len = EXT4_C2B(sbi, ar.len) - offset;
+ ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
+ ar.goal, newblock, ar.len, allocated);
if (ar.len > allocated)
ar.len = allocated;
got_allocated_blocks:
/* try to insert new extent into found leaf and return */
- ext4_ext_store_pblock(&newex, newblock + offset);
+ pblk = newblock + offset;
+ ext4_ext_store_pblock(&newex, pblk);
newex.ee_len = cpu_to_le16(ar.len);
/* Mark unwritten */
- if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
}
- err = 0;
- if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
- err = check_eofblocks_fl(handle, inode, map->m_lblk,
- path, ar.len);
- if (!err)
- err = ext4_ext_insert_extent(handle, inode, &path,
- &newex, flags);
+ err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
+ if (err) {
+ if (allocated_clusters) {
+ int fb_flags = 0;
- if (err && free_on_err) {
- int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
- EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
- /* free data blocks we just allocated */
- /* not a good idea to call discard here directly,
- * but otherwise we'd need to call it every free() */
- ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, NULL, newblock,
- EXT4_C2B(sbi, allocated_clusters), fb_flags);
- goto out2;
+ /*
+ * free data blocks we just allocated.
+ * not a good idea to call discard here directly,
+ * but otherwise we'd need to call it every free().
+ */
+ ext4_discard_preallocations(inode, 0);
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
+ ext4_free_blocks(handle, inode, NULL, newblock,
+ EXT4_C2B(sbi, allocated_clusters),
+ fb_flags);
+ }
+ goto out;
}
- /* previous routine could use block we allocated */
- newblock = ext4_ext_pblock(&newex);
- allocated = ext4_ext_get_actual_len(&newex);
- if (allocated > map->m_len)
- allocated = map->m_len;
- map->m_flags |= EXT4_MAP_NEW;
-
/*
* Reduce the reserved cluster count to reflect successful deferred
* allocation of delayed allocated clusters or direct allocation of
* clusters discovered to be delayed allocated. Once allocated, a
* cluster is not included in the reserved count.
*/
- if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+ if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
/*
* When allocating delayed allocated clusters, simply
@@ -4563,14 +4360,13 @@
ext4_update_inode_fsync_trans(handle, inode, 1);
else
ext4_update_inode_fsync_trans(handle, inode, 0);
-out:
- if (allocated > map->m_len)
- allocated = map->m_len;
+
+ map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
+ map->m_pblk = pblk;
+ map->m_len = ar.len;
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- map->m_flags |= EXT4_MAP_MAPPED;
- map->m_pblk = newblock;
- map->m_len = allocated;
-out2:
+out:
ext4_ext_drop_refs(path);
kfree(path);
@@ -4609,7 +4405,14 @@
}
if (err)
return err;
- return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+retry_remove_space:
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+ if (err == -ENOMEM) {
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry_remove_space;
+ }
+ return err;
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
@@ -4619,7 +4422,7 @@
struct inode *inode = file_inode(file);
handle_t *handle;
int ret = 0;
- int ret2 = 0;
+ int ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
struct ext4_map_blocks map;
@@ -4678,15 +4481,12 @@
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
- } else {
- if (epos > inode->i_size)
- ext4_set_inode_flag(inode,
- EXT4_INODE_EOFBLOCKS);
}
- ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
- ret2 = ext4_journal_stop(handle);
- if (ret2)
+ ret3 = ext4_journal_stop(handle);
+ ret2 = ret3 ? ret3 : ret2;
+ if (unlikely(ret2))
break;
}
if (ret == -ENOSPC &&
@@ -4698,6 +4498,10 @@
return ret > 0 ? ret2 : ret;
}
+static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+
+static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
@@ -4715,9 +4519,6 @@
trace_ext4_zero_range(inode, offset, len, mode);
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
/* Call ext4_force_commit to flush all data in case of data=journal. */
if (ext4_should_journal_data(inode)) {
ret = ext4_force_commit(inode->i_sb);
@@ -4726,7 +4527,7 @@
}
/*
- * Round up offset. This is not fallocate, we neet to zero out
+ * Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
* range.
@@ -4749,7 +4550,7 @@
inode_lock(inode);
/*
- * Indirect files do not support unwritten extnets
+ * Indirect files do not support unwritten extents
*/
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
@@ -4757,7 +4558,7 @@
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > i_size_read(inode) ||
+ (offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
@@ -4766,8 +4567,6 @@
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- if (mode & FALLOC_FL_KEEP_SIZE)
- flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
/* Wait all existing dio workers, newcomers will block on i_mutex */
inode_dio_wait(inode);
@@ -4834,18 +4633,11 @@
}
inode->i_mtime = inode->i_ctime = current_time(inode);
- if (new_size) {
+ if (new_size)
ext4_update_inode_size(inode, new_size);
- } else {
- /*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if ((offset + len) > i_size_read(inode))
- ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- }
- ext4_mark_inode_dirty(handle, inode);
-
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
/* Zero out partial block at the edges of the range */
ret = ext4_zero_partial_blocks(handle, inode, offset, len);
if (ret >= 0)
@@ -4854,6 +4646,7 @@
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
+out_handle:
ext4_journal_stop(handle);
out_mutex:
inode_unlock(inode);
@@ -4882,14 +4675,9 @@
* range since we would need to re-encrypt blocks with a
* different IV or XTS tweak (which are based on the logical
* block number).
- *
- * XXX It's not clear why zero range isn't working, but we'll
- * leave it disabled for encrypted inodes for now. This is a
- * bug we should fix....
*/
if (IS_ENCRYPTED(inode) &&
- (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
- FALLOC_FL_ZERO_RANGE)))
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
/* Return error if mode is not supported */
@@ -4898,29 +4686,36 @@
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return ext4_punch_hole(inode, offset, len);
+ ext4_fc_start_update(inode);
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ ret = ext4_punch_hole(inode, offset, len);
+ goto exit;
+ }
ret = ext4_convert_inline_data(inode);
if (ret)
- return ret;
+ goto exit;
- if (mode & FALLOC_FL_COLLAPSE_RANGE)
- return ext4_collapse_range(inode, offset, len);
+ if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ ret = ext4_collapse_range(inode, offset, len);
+ goto exit;
+ }
- if (mode & FALLOC_FL_INSERT_RANGE)
- return ext4_insert_range(inode, offset, len);
+ if (mode & FALLOC_FL_INSERT_RANGE) {
+ ret = ext4_insert_range(inode, offset, len);
+ goto exit;
+ }
- if (mode & FALLOC_FL_ZERO_RANGE)
- return ext4_zero_range(file, offset, len, mode);
-
+ if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = ext4_zero_range(file, offset, len, mode);
+ goto exit;
+ }
trace_ext4_fallocate_enter(inode, offset, len, mode);
lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- if (mode & FALLOC_FL_KEEP_SIZE)
- flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
inode_lock(inode);
@@ -4933,7 +4728,7 @@
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > i_size_read(inode) ||
+ (offset + len > inode->i_size ||
offset + len > EXT4_I(inode)->i_disksize)) {
new_size = offset + len;
ret = inode_newsize_ok(inode, new_size);
@@ -4949,12 +4744,14 @@
goto out;
if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
}
out:
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+exit:
+ ext4_fc_stop_update(inode);
return ret;
}
@@ -4972,26 +4769,15 @@
loff_t offset, ssize_t len)
{
unsigned int max_blocks;
- int ret = 0;
- int ret2 = 0;
+ int ret = 0, ret2 = 0, ret3 = 0;
struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- /*
- * This is somewhat ugly but the idea is clear: When transaction is
- * reserved, everything goes into it. Otherwise we rather start several
- * smaller transactions for conversion of each extent separately.
- */
- if (handle) {
- handle = ext4_journal_start_reserved(handle,
- EXT4_HT_EXT_CONVERT);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- credits = 0;
- } else {
+ if (!handle) {
/*
* credits to insert 1 extent into extent tree
*/
@@ -5016,75 +4802,57 @@
"ext4_ext_map_blocks returned %d",
inode->i_ino, map.m_lblk,
map.m_len, ret);
- ext4_mark_inode_dirty(handle, inode);
- if (credits)
- ret2 = ext4_journal_stop(handle);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
+
if (ret <= 0 || ret2)
break;
}
- if (!credits)
- ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
-/*
- * If newes is not existing extent (newes->ec_pblk equals zero) find
- * delayed extent at start of newes and update newes accordingly and
- * return start of the next delayed extent.
- *
- * If newes is existing extent (newes->ec_pblk is not equal zero)
- * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newes unmodified.
- */
-static int ext4_find_delayed_extent(struct inode *inode,
- struct extent_status *newes)
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
{
- struct extent_status es;
- ext4_lblk_t block, next_del;
+ int ret = 0, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
- if (newes->es_pblk == 0) {
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- newes->es_lblk,
- newes->es_lblk + newes->es_len - 1,
- &es);
-
- /*
- * No extent in extent-tree contains block @newes->es_pblk,
- * then the block may stay in 1)a hole or 2)delayed-extent.
- */
- if (es.es_len == 0)
- /* A hole found. */
- return 0;
-
- if (es.es_lblk > newes->es_lblk) {
- /* A hole found. */
- newes->es_len = min(es.es_lblk - newes->es_lblk,
- newes->es_len);
- return 0;
- }
-
- newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
+ /*
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
}
- block = newes->es_lblk + newes->es_len;
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block,
- EXT_MAX_BLOCKS, &es);
- if (es.es_len == 0)
- next_del = EXT_MAX_BLOCKS;
- else
- next_del = es.es_lblk;
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
- return next_del;
+ if (handle)
+ err = ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : err;
}
-static int ext4_xattr_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo)
+static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
{
__u64 physical = 0;
- __u64 length;
- __u32 flags = FIEMAP_EXTENT_LAST;
+ __u64 length = 0;
int blockbits = inode->i_sb->s_blocksize_bits;
int error = 0;
+ u16 iomap_type;
/* in-inode? */
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
@@ -5099,41 +4867,70 @@
EXT4_I(inode)->i_extra_isize;
physical += offset;
length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
- flags |= FIEMAP_EXTENT_DATA_INLINE;
brelse(iloc.bh);
- } else { /* external block */
+ iomap_type = IOMAP_INLINE;
+ } else if (EXT4_I(inode)->i_file_acl) { /* external block */
physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
+ iomap_type = IOMAP_MAPPED;
+ } else {
+ /* no in-inode or external block for xattr, so return -ENOENT */
+ error = -ENOENT;
+ goto out;
}
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, 0, physical,
- length, flags);
- return (error < 0 ? error : 0);
+ iomap->addr = physical;
+ iomap->offset = 0;
+ iomap->length = length;
+ iomap->type = iomap_type;
+ iomap->flags = 0;
+out:
+ return error;
}
-static int _ext4_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len,
- int (*fill)(struct inode *, ext4_lblk_t,
- ext4_lblk_t,
- struct fiemap_extent_info *))
+static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
- ext4_lblk_t start_blk;
- u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR;
+ int error;
+ error = ext4_iomap_xattr_fiemap(inode, iomap);
+ if (error == 0 && (offset >= iomap->length))
+ error = -ENOENT;
+ return error;
+}
+
+static const struct iomap_ops ext4_iomap_xattr_ops = {
+ .iomap_begin = ext4_iomap_xattr_begin,
+};
+
+static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
+{
+ u64 maxbytes;
+
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ maxbytes = inode->i_sb->s_maxbytes;
+ else
+ maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+
+ if (*len == 0)
+ return -EINVAL;
+ if (start > maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if (*len > maxbytes || (maxbytes - *len) < start)
+ *len = maxbytes - start;
+ return 0;
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
int error = 0;
- if (ext4_has_inline_data(inode)) {
- int has_inline = 1;
-
- error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
- start, len);
-
- if (has_inline)
- return error;
- }
-
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
@@ -5141,48 +4938,31 @@
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
- /* fallback to generic here if not in extents fmt */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
- fill == ext4_fill_fiemap_extents)
- return generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
-
- if (fill == ext4_fill_es_cache_info)
- ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
- if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
- return -EBADR;
+ /*
+ * For bitmap files the maximum size limit could be smaller than
+ * s_maxbytes, so check len here manually instead of just relying on the
+ * generic check.
+ */
+ error = ext4_fiemap_check_ranges(inode, start, &len);
+ if (error)
+ return error;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
- error = ext4_xattr_fiemap(inode, fieinfo);
- } else {
- ext4_lblk_t len_blks;
- __u64 last_blk;
-
- start_blk = start >> inode->i_sb->s_blocksize_bits;
- last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
- if (last_blk >= EXT_MAX_BLOCKS)
- last_blk = EXT_MAX_BLOCKS-1;
- len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
-
- /*
- * Walk the extent tree gathering extent information
- * and pushing extents back to the user.
- */
- error = fill(inode, start_blk, len_blks, fieinfo);
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+ return iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
}
- return error;
-}
-int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len)
-{
- return _ext4_fiemap(inode, fieinfo, start, len,
- ext4_fill_fiemap_extents);
+ return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
+ ext4_lblk_t start_blk, len_blks;
+ __u64 last_blk;
+ int error = 0;
+
if (ext4_has_inline_data(inode)) {
int has_inline;
@@ -5193,42 +4973,32 @@
return 0;
}
- return _ext4_fiemap(inode, fieinfo, start, len,
- ext4_fill_es_cache_info);
-}
-
-
-/*
- * ext4_access_path:
- * Function to access the path buffer for marking it dirty.
- * It also checks if there are sufficient credits left in the journal handle
- * to update path.
- */
-static int
-ext4_access_path(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path)
-{
- int credits, err;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- /*
- * Check if need to extend journal credits
- * 3 for leaf, sb, and inode plus 2 (bmap and group
- * descriptor) for each block group; assume two block
- * groups
- */
- if (handle->h_buffer_credits < 7) {
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- /* EAGAIN is success */
- if (err && err != -EAGAIN)
- return err;
+ if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ error = ext4_ext_precache(inode);
+ if (error)
+ return error;
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
- err = ext4_ext_get_access(handle, inode, path);
- return err;
+ error = fiemap_prep(inode, fieinfo, start, &len, 0);
+ if (error)
+ return error;
+
+ error = ext4_fiemap_check_ranges(inode, start, &len);
+ if (error)
+ return error;
+
+ start_blk = start >> inode->i_sb->s_blocksize_bits;
+ last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
+ if (last_blk >= EXT_MAX_BLOCKS)
+ last_blk = EXT_MAX_BLOCKS-1;
+ len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
+
+ /*
+ * Walk the extent tree gathering extent information
+ * and pushing extents back to the user.
+ */
+ return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
}
/*
@@ -5244,7 +5014,8 @@
{
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
- bool update = 0;
+ bool update = false;
+ int credits, restart_credits;
depth = path->p_depth;
while (depth >= 0) {
@@ -5254,14 +5025,27 @@
return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ /* leaf + sb + inode */
+ credits = 3;
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+ update = true;
+ /* extent tree + sb + inode */
+ credits = depth + 2;
+ }
- err = ext4_access_path(handle, inode, path + depth);
+ restart_credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ restart_credits, 0);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
+ goto out;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
- update = 1;
-
while (ex_start <= ex_last) {
if (SHIFT == SHIFT_LEFT) {
le32_add_cpu(&ex_start->ee_block,
@@ -5291,7 +5075,7 @@
}
/* Update index too */
- err = ext4_access_path(handle, inode, path + depth);
+ err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
@@ -5330,6 +5114,7 @@
int ret = 0, depth;
struct ext4_extent *extent;
ext4_lblk_t stop, *iterator, ex_start, ex_end;
+ ext4_lblk_t tmp = EXT_MAX_BLOCKS;
/* Let path point to the last extent */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
@@ -5383,11 +5168,15 @@
* till we reach stop. In case of right shift, iterator points to stop
* and it is decreased till we reach start.
*/
+again:
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
iterator = &stop;
+ if (tmp != EXT_MAX_BLOCKS)
+ *iterator = tmp;
+
/*
* Its safe to start updating extents. Start and stop are unsigned, so
* in case of right shift if extent with 0 block is reached, iterator
@@ -5416,6 +5205,7 @@
}
}
+ tmp = *iterator;
if (SHIFT == SHIFT_LEFT) {
extent = EXT_LAST_EXTENT(path[depth].p_hdr);
*iterator = le32_to_cpu(extent->ee_block) +
@@ -5434,6 +5224,9 @@
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
handle, SHIFT);
+ /* iterator can be NULL which means we should break */
+ if (ret == -EAGAIN)
+ goto again;
if (ret)
break;
}
@@ -5448,7 +5241,7 @@
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
ext4_lblk_t punch_start, punch_stop;
@@ -5465,12 +5258,8 @@
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
- /* Collapse range works only on fs block size aligned offsets. */
- if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
- len & (EXT4_CLUSTER_SIZE(sb) - 1))
- return -EINVAL;
-
- if (!S_ISREG(inode->i_mode))
+ /* Collapse range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
trace_ext4_collapse_range(inode, offset, len);
@@ -5490,7 +5279,7 @@
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= i_size_read(inode)) {
+ if (offset + len >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
@@ -5543,9 +5332,10 @@
ret = PTR_ERR(handle);
goto out_mmap;
}
+ ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, punch_start,
EXT_MAX_BLOCKS - punch_start);
@@ -5559,7 +5349,7 @@
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
@@ -5568,7 +5358,7 @@
goto out_stop;
}
- new_size = i_size_read(inode) - len;
+ new_size = inode->i_size - len;
i_size_write(inode, new_size);
EXT4_I(inode)->i_disksize = new_size;
@@ -5576,11 +5366,12 @@
if (IS_SYNC(inode))
ext4_handle_sync(handle);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
ext4_journal_stop(handle);
+ ext4_fc_stop_ineligible(sb);
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
@@ -5596,7 +5387,7 @@
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
{
struct super_block *sb = inode->i_sb;
handle_t *handle;
@@ -5615,14 +5406,10 @@
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
- /* Insert range works only on fs block size aligned offsets. */
- if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
- len & (EXT4_CLUSTER_SIZE(sb) - 1))
+ /* Insert range works only on fs cluster size aligned regions. */
+ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
trace_ext4_insert_range(inode, offset, len);
offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -5642,14 +5429,14 @@
goto out_mutex;
}
- /* Check for wrap through zero */
- if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size) {
ret = -EFBIG;
goto out_mutex;
}
- /* Offset should be less than i_size */
- if (offset >= i_size_read(inode)) {
+ /* Offset must be less than i_size */
+ if (offset >= inode->i_size) {
ret = -EINVAL;
goto out_mutex;
}
@@ -5685,6 +5472,7 @@
ret = PTR_ERR(handle);
goto out_mmap;
}
+ ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
@@ -5695,7 +5483,7 @@
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
@@ -5759,6 +5547,7 @@
out_stop:
ext4_journal_stop(handle);
+ ext4_fc_stop_ineligible(sb);
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
out_mutex:
@@ -5829,7 +5618,7 @@
}
ex1 = path1[path1->p_depth].p_ext;
ex2 = path2[path2->p_depth].p_ext;
- /* Do we have somthing to swap ? */
+ /* Do we have something to swap ? */
if (unlikely(!ex2 || !ex1))
goto finish;
@@ -6040,3 +5829,277 @@
return err ? err : mapped;
}
+
+/*
+ * Updates physical block address and unwritten status of extent
+ * starting at lblk start and of len. If such an extent doesn't exist,
+ * this function splits the extent tree appropriately to create an
+ * extent like this. This function is called in the fast commit
+ * replay path. Returns 0 on success and error on failure.
+ */
+int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
+ int len, int unwritten, ext4_fsblk_t pblk)
+{
+ struct ext4_ext_path *path = NULL, *ppath;
+ struct ext4_extent *ex;
+ int ret;
+
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ret = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (le32_to_cpu(ex->ee_block) != start ||
+ ext4_ext_get_actual_len(ex) != len) {
+ /* We need to split this extent to match our extent first */
+ ppath = path;
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ kfree(path);
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return -1;
+ ppath = path;
+ ex = path[path->p_depth].p_ext;
+ WARN_ON(le32_to_cpu(ex->ee_block) != start);
+ if (ext4_ext_get_actual_len(ex) != len) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_force_split_extent_at(NULL, inode, &ppath,
+ start + len, 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ kfree(path);
+ path = ext4_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return -EINVAL;
+ ex = path[path->p_depth].p_ext;
+ }
+ }
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_store_pblock(ex, pblk);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return ret;
+}
+
+/* Try to shrink the extent tree */
+void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t old_cur, cur = 0;
+
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ return;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ ext4_mark_inode_dirty(NULL, inode);
+ return;
+ }
+ old_cur = cur;
+ cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ if (cur <= old_cur)
+ cur = old_cur + 1;
+ ext4_ext_try_to_merge(NULL, inode, path, ex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ext4_mark_inode_dirty(NULL, inode);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+}
+
+/* Check if *cur is a hole and if it is, skip it */
+static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
+{
+ int ret;
+ struct ext4_map_blocks map;
+
+ map.m_lblk = *cur;
+ map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret != 0)
+ return 0;
+ *cur = *cur + map.m_len;
+ return 0;
+}
+
+/* Count number of blocks used by this inode and update i_blocks */
+int ext4_ext_replay_set_iblocks(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL, *path2 = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int numblks = 0, i, ret = 0;
+ ext4_fsblk_t cmp1, cmp2;
+ struct ext4_map_blocks map;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ goto out;
+ }
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Count the number of data blocks */
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0)
+ numblks += ret;
+ cur = cur + map.m_len;
+ }
+
+ /*
+ * Count the number of extent tree blocks. We do it by looking up
+ * two successive extents and determining the difference between
+ * their paths. When path is different for 2 successive extents
+ * we compare the blocks in the path at each level and increment
+ * iblocks by total number of differences found.
+ */
+ cur = 0;
+ ret = skip_hole(inode, &cur);
+ if (ret < 0)
+ goto out;
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ goto out;
+ numblks += path->p_depth;
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ while (cur < end) {
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ break;
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return 0;
+ }
+ cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
+ ext4_ext_get_actual_len(ex));
+ ret = skip_hole(inode, &cur);
+ if (ret < 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ break;
+ }
+ path2 = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path2)) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ break;
+ }
+ ex = path2[path2->p_depth].p_ext;
+ for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
+ cmp1 = cmp2 = 0;
+ if (i <= path->p_depth)
+ cmp1 = path[i].p_bh ?
+ path[i].p_bh->b_blocknr : 0;
+ if (i <= path2->p_depth)
+ cmp2 = path2[i].p_bh ?
+ path2[i].p_bh->b_blocknr : 0;
+ if (cmp1 != cmp2 && cmp2 != 0)
+ numblks++;
+ }
+ ext4_ext_drop_refs(path);
+ ext4_ext_drop_refs(path2);
+ kfree(path);
+ kfree(path2);
+ }
+
+out:
+ inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
+ ext4_mark_inode_dirty(NULL, inode);
+ return 0;
+}
+
+int ext4_ext_clear_bb(struct inode *inode)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent *ex;
+ ext4_lblk_t cur = 0, end;
+ int j, ret = 0;
+ struct ext4_map_blocks map;
+
+ /* Determin the size of the file first */
+ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+ EXT4_EX_NOCACHE);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ ex = path[path->p_depth].p_ext;
+ if (!ex) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return 0;
+ }
+ end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ cur = 0;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ if (!IS_ERR_OR_NULL(path)) {
+ for (j = 0; j < path->p_depth; j++) {
+
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ 0, path[j].p_block, 1, 1);
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ map.m_lblk, map.m_pblk, map.m_len, 1);
+ }
+ cur = cur + map.m_len;
+ }
+
+ return 0;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 43fba01..9a3a899 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -311,6 +311,9 @@
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
trace_ext4_es_find_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
@@ -361,6 +364,9 @@
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_range(inode, matching_fn, lblk, end);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -404,6 +410,9 @@
{
bool ret;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return false;
+
read_lock(&EXT4_I(inode)->i_es_lock);
ret = __es_scan_clu(inode, matching_fn, lblk);
read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -812,6 +821,9 @@
int err = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -873,6 +885,9 @@
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
@@ -908,6 +923,9 @@
struct rb_node *node;
int found = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_lookup_extent_enter(inode, lblk);
es_debug("lookup extent in block %u\n", lblk);
@@ -1054,7 +1072,7 @@
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
/* record the first block of the first delonly extent seen */
- if (rc->first_do_lblk_found == false) {
+ if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
}
@@ -1419,6 +1437,9 @@
int err = 0;
int reserved = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
@@ -1967,6 +1988,9 @@
struct extent_status newes;
int err = 0;
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 825313c..4ec30a7 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -209,6 +209,12 @@
return es->es_pblk & ~ES_MASK;
}
+static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
+{
+ ext4_fsblk_t pblock = ext4_es_pblock(es);
+ return pblock == ~ES_MASK ? 0 : pblock;
+}
+
static inline void ext4_es_store_pblock(struct extent_status *es,
ext4_fsblk_t pb)
{
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
new file mode 100644
index 0000000..501e607
--- /dev/null
+++ b/fs/ext4/fast_commit.c
@@ -0,0 +1,2181 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * fs/ext4/fast_commit.c
+ *
+ * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+ *
+ * Ext4 fast commits routines.
+ */
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "mballoc.h"
+
+/*
+ * Ext4 Fast Commits
+ * -----------------
+ *
+ * Ext4 fast commits implement fine grained journalling for Ext4.
+ *
+ * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
+ * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
+ * TLV during the recovery phase. For the scenarios for which we currently
+ * don't have replay code, fast commit falls back to full commits.
+ * Fast commits record delta in one of the following three categories.
+ *
+ * (A) Directory entry updates:
+ *
+ * - EXT4_FC_TAG_UNLINK - records directory entry unlink
+ * - EXT4_FC_TAG_LINK - records directory entry link
+ * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
+ *
+ * (B) File specific data range updates:
+ *
+ * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
+ * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
+ *
+ * (C) Inode metadata (mtime / ctime etc):
+ *
+ * - EXT4_FC_TAG_INODE - record the inode that should be replayed
+ * during recovery. Note that iblocks field is
+ * not replayed and instead derived during
+ * replay.
+ * Commit Operation
+ * ----------------
+ * With fast commits, we maintain all the directory entry operations in the
+ * order in which they are issued in an in-memory queue. This queue is flushed
+ * to disk during the commit operation. We also maintain a list of inodes
+ * that need to be committed during a fast commit in another in memory queue of
+ * inodes. During the commit operation, we commit in the following order:
+ *
+ * [1] Lock inodes for any further data updates by setting COMMITTING state
+ * [2] Submit data buffers of all the inodes
+ * [3] Wait for [2] to complete
+ * [4] Commit all the directory entry updates in the fast commit space
+ * [5] Commit all the changed inode structures
+ * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * section for more details).
+ * [7] Wait for [4], [5] and [6] to complete.
+ *
+ * All the inode updates must call ext4_fc_start_update() before starting an
+ * update. If such an ongoing update is present, fast commit waits for it to
+ * complete. The completion of such an update is marked by
+ * ext4_fc_stop_update().
+ *
+ * Fast Commit Ineligibility
+ * -------------------------
+ * Not all operations are supported by fast commits today (e.g extended
+ * attributes). Fast commit ineligiblity is marked by calling one of the
+ * two following functions:
+ *
+ * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
+ * back to full commit. This is useful in case of transient errors.
+ *
+ * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
+ * the fast commits happening between ext4_fc_start_ineligible() and
+ * ext4_fc_stop_ineligible() and one fast commit after the call to
+ * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
+ * make one more fast commit to fall back to full commit after stop call so
+ * that it guaranteed that the fast commit ineligible operation contained
+ * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
+ * followed by at least 1 full commit.
+ *
+ * Atomicity of commits
+ * --------------------
+ * In order to guarantee atomicity during the commit operation, fast commit
+ * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
+ * tag contains CRC of the contents and TID of the transaction after which
+ * this fast commit should be applied. Recovery code replays fast commit
+ * logs only if there's at least 1 valid tail present. For every fast commit
+ * operation, there is 1 tail. This means, we may end up with multiple tails
+ * in the fast commit space. Here's an example:
+ *
+ * - Create a new file A and remove existing file B
+ * - fsync()
+ * - Append contents to file A
+ * - Truncate file A
+ * - fsync()
+ *
+ * The fast commit space at the end of above operations would look like this:
+ * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
+ * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
+ *
+ * Replay code should thus check for all the valid tails in the FC area.
+ *
+ * TODOs
+ * -----
+ * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
+ * eligible update must be protected within ext4_fc_start_update() and
+ * ext4_fc_stop_update(). These routines are called at much higher
+ * routines. This can be made more fine grained by combining with
+ * ext4_journal_start().
+ *
+ * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
+ *
+ * 3) Handle more ineligible cases.
+ */
+
+#include <trace/events/ext4.h>
+static struct kmem_cache *ext4_fc_dentry_cachep;
+
+static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ BUFFER_TRACE(bh, "");
+ if (uptodate) {
+ ext4_debug("%s: Block %lld up-to-date",
+ __func__, bh->b_blocknr);
+ set_buffer_uptodate(bh);
+ } else {
+ ext4_debug("%s: Block %lld not up-to-date",
+ __func__, bh->b_blocknr);
+ clear_buffer_uptodate(bh);
+ }
+
+ unlock_buffer(bh);
+}
+
+static inline void ext4_fc_reset_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ei->i_fc_lblk_start = 0;
+ ei->i_fc_lblk_len = 0;
+}
+
+void ext4_fc_init_inode(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ ext4_fc_reset_inode(inode);
+ ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+ INIT_LIST_HEAD(&ei->i_fc_list);
+ init_waitqueue_head(&ei->i_fc_wait);
+ atomic_set(&ei->i_fc_updates, 0);
+}
+
+/* This function must be called with sbi->s_fc_lock held. */
+static void ext4_fc_wait_committing_inode(struct inode *inode)
+__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
+{
+ wait_queue_head_t *wq;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+}
+
+/*
+ * Inform Ext4's fast about start of an inode update
+ *
+ * This function is called by the high level call VFS callbacks before
+ * performing any inode update. This function blocks if there's an ongoing
+ * fast commit on the inode in question.
+ */
+void ext4_fc_start_update(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+restart:
+ spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ if (list_empty(&ei->i_fc_list))
+ goto out;
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+ ext4_fc_wait_committing_inode(inode);
+ goto restart;
+ }
+out:
+ atomic_inc(&ei->i_fc_updates);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+}
+
+/*
+ * Stop inode update and wake up waiting fast commits if any.
+ */
+void ext4_fc_stop_update(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (atomic_dec_and_test(&ei->i_fc_updates))
+ wake_up_all(&ei->i_fc_wait);
+}
+
+/*
+ * Remove inode from fast commit list. If the inode is being committed
+ * we wait until inode commit is done.
+ */
+void ext4_fc_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+restart:
+ spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ if (list_empty(&ei->i_fc_list)) {
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ return;
+ }
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+ ext4_fc_wait_committing_inode(inode);
+ goto restart;
+ }
+ list_del_init(&ei->i_fc_list);
+ spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+}
+
+/*
+ * Mark file system as fast commit ineligible. This means that next commit
+ * operation would result in a full jbd2 commit.
+ */
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+}
+
+/*
+ * Start a fast commit ineligible update. Any commits that happen while
+ * such an operation is in progress fall back to full commits.
+ */
+void ext4_fc_start_ineligible(struct super_block *sb, int reason)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ WARN_ON(reason >= EXT4_FC_REASON_MAX);
+ sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
+ atomic_inc(&sbi->s_fc_ineligible_updates);
+}
+
+/*
+ * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
+ * to ensure that after stopping the ineligible update, at least one full
+ * commit takes place.
+ */
+void ext4_fc_stop_ineligible(struct super_block *sb)
+{
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
+}
+
+static inline int ext4_fc_is_ineligible(struct super_block *sb)
+{
+ return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
+ atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
+}
+
+/*
+ * Generic fast commit tracking function. If this is the first time this we are
+ * called after a full commit, we initialize fast commit fields and then call
+ * __fc_track_fn() with update = 0. If we have already been called after a full
+ * commit, we pass update = 1. Based on that, the track function can determine
+ * if it needs to track a field for the first time or if it needs to just
+ * update the previously tracked value.
+ *
+ * If enqueue is set, this function enqueues the inode in fast commit list.
+ */
+static int ext4_fc_track_template(
+ handle_t *handle, struct inode *inode,
+ int (*__fc_track_fn)(struct inode *, void *, bool),
+ void *args, int enqueue)
+{
+ bool update = false;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ tid_t tid = 0;
+ int ret;
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return -EOPNOTSUPP;
+
+ if (ext4_fc_is_ineligible(inode->i_sb))
+ return -EINVAL;
+
+ tid = handle->h_transaction->t_tid;
+ mutex_lock(&ei->i_fc_lock);
+ if (tid == ei->i_sync_tid) {
+ update = true;
+ } else {
+ ext4_fc_reset_inode(inode);
+ ei->i_sync_tid = tid;
+ }
+ ret = __fc_track_fn(inode, args, update);
+ mutex_unlock(&ei->i_fc_lock);
+
+ if (!enqueue)
+ return ret;
+
+ spin_lock(&sbi->s_fc_lock);
+ if (list_empty(&EXT4_I(inode)->i_fc_list))
+ list_add_tail(&EXT4_I(inode)->i_fc_list,
+ (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
+ &sbi->s_fc_q[FC_Q_STAGING] :
+ &sbi->s_fc_q[FC_Q_MAIN]);
+ spin_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+struct __track_dentry_update_args {
+ struct dentry *dentry;
+ int op;
+};
+
+/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
+static int __track_dentry_update(struct inode *inode, void *arg, bool update)
+{
+ struct ext4_fc_dentry_update *node;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct __track_dentry_update_args *dentry_update =
+ (struct __track_dentry_update_args *)arg;
+ struct dentry *dentry = dentry_update->dentry;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ mutex_unlock(&ei->i_fc_lock);
+ node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
+ if (!node) {
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
+ mutex_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+
+ node->fcd_op = dentry_update->op;
+ node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_ino = inode->i_ino;
+ if (dentry->d_name.len > DNAME_INLINE_LEN) {
+ node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
+ if (!node->fcd_name.name) {
+ kmem_cache_free(ext4_fc_dentry_cachep, node);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_NOMEM);
+ mutex_lock(&ei->i_fc_lock);
+ return -ENOMEM;
+ }
+ memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
+ dentry->d_name.len);
+ } else {
+ memcpy(node->fcd_iname, dentry->d_name.name,
+ dentry->d_name.len);
+ node->fcd_name.name = node->fcd_iname;
+ }
+ node->fcd_name.len = dentry->d_name.len;
+
+ spin_lock(&sbi->s_fc_lock);
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
+ list_add_tail(&node->fcd_list,
+ &sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ else
+ list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ spin_unlock(&sbi->s_fc_lock);
+ mutex_lock(&ei->i_fc_lock);
+
+ return 0;
+}
+
+void __ext4_fc_track_unlink(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_UNLINK;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_unlink(inode, dentry, ret);
+}
+
+void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
+{
+ __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
+}
+
+void __ext4_fc_track_link(handle_t *handle,
+ struct inode *inode, struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_LINK;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_link(inode, dentry, ret);
+}
+
+void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
+{
+ __ext4_fc_track_link(handle, d_inode(dentry), dentry);
+}
+
+void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
+ struct dentry *dentry)
+{
+ struct __track_dentry_update_args args;
+ int ret;
+
+ args.dentry = dentry;
+ args.op = EXT4_FC_TAG_CREAT;
+
+ ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
+ (void *)&args, 0);
+ trace_ext4_fc_track_create(inode, dentry, ret);
+}
+
+void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
+{
+ __ext4_fc_track_create(handle, d_inode(dentry), dentry);
+}
+
+/* __track_fn for inode tracking */
+static int __track_inode(struct inode *inode, void *arg, bool update)
+{
+ if (update)
+ return -EEXIST;
+
+ EXT4_I(inode)->i_fc_lblk_len = 0;
+
+ return 0;
+}
+
+void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
+{
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (ext4_should_journal_data(inode)) {
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA);
+ return;
+ }
+
+ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
+ trace_ext4_fc_track_inode(inode, ret);
+}
+
+struct __track_range_args {
+ ext4_lblk_t start, end;
+};
+
+/* __track_fn for tracking data updates */
+static int __track_range(struct inode *inode, void *arg, bool update)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_lblk_t oldstart;
+ struct __track_range_args *__arg =
+ (struct __track_range_args *)arg;
+
+ if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
+ ext4_debug("Special inode %ld being modified\n", inode->i_ino);
+ return -ECANCELED;
+ }
+
+ oldstart = ei->i_fc_lblk_start;
+
+ if (update && ei->i_fc_lblk_len > 0) {
+ ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
+ ei->i_fc_lblk_len =
+ max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
+ ei->i_fc_lblk_start + 1;
+ } else {
+ ei->i_fc_lblk_start = __arg->start;
+ ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
+ }
+
+ return 0;
+}
+
+void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
+{
+ struct __track_range_args args;
+ int ret;
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ args.start = start;
+ args.end = end;
+
+ ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
+
+ trace_ext4_fc_track_range(inode, start, end, ret);
+}
+
+static void ext4_fc_submit_bh(struct super_block *sb)
+{
+ int write_flags = REQ_SYNC;
+ struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
+
+ /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
+ if (test_opt(sb, BARRIER))
+ write_flags |= REQ_FUA | REQ_PREFLUSH;
+ lock_buffer(bh);
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = ext4_end_buffer_io_sync;
+ submit_bh(REQ_OP_WRITE, write_flags, bh);
+ EXT4_SB(sb)->s_fc_bh = NULL;
+}
+
+/* Ext4 commit path routines */
+
+/* memzero and update CRC */
+static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
+ u32 *crc)
+{
+ void *ret;
+
+ ret = memset(dst, 0, len);
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
+ return ret;
+}
+
+/*
+ * Allocate len bytes on a fast commit buffer.
+ *
+ * During the commit time this function is used to manage fast commit
+ * block space. We don't split a fast commit log onto different
+ * blocks. So this function makes sure that if there's not enough space
+ * on the current block, the remaining space in the current block is
+ * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
+ * new block is from jbd2 and CRC is updated to reflect the padding
+ * we added.
+ */
+static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
+{
+ struct ext4_fc_tl *tl;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh;
+ int bsize = sbi->s_journal->j_blocksize;
+ int ret, off = sbi->s_fc_bytes % bsize;
+ int pad_len;
+
+ /*
+ * After allocating len, we should have space at least for a 0 byte
+ * padding.
+ */
+ if (len + sizeof(struct ext4_fc_tl) > bsize)
+ return NULL;
+
+ if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
+ /*
+ * Only allocate from current buffer if we have enough space for
+ * this request AND we have space to add a zero byte padding.
+ */
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ sbi->s_fc_bytes += len;
+ return sbi->s_fc_bh->b_data + off;
+ }
+ /* Need to add PAD tag */
+ tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
+ tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
+ tl->fc_len = cpu_to_le16(pad_len);
+ if (crc)
+ *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
+ if (pad_len > 0)
+ ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+ ext4_fc_submit_bh(sb);
+
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ return sbi->s_fc_bh->b_data;
+}
+
+/* memcpy to fc reserved space and update CRC */
+static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
+ int len, u32 *crc)
+{
+ if (crc)
+ *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
+ return memcpy(dst, src, len);
+}
+
+/*
+ * Complete a fast commit by writing tail tag.
+ *
+ * Writing tail tag marks the end of a fast commit. In order to guarantee
+ * atomicity, after writing tail tag, even if there's space remaining
+ * in the block, next commit shouldn't use it. That's why tail tag
+ * has the length as that of the remaining space on the block.
+ */
+static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ int off, bsize = sbi->s_journal->j_blocksize;
+ u8 *dst;
+
+ /*
+ * ext4_fc_reserve_space takes care of allocating an extra block if
+ * there's no enough space on this block for accommodating this tail.
+ */
+ dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
+ if (!dst)
+ return -ENOSPC;
+
+ off = sbi->s_fc_bytes % bsize;
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
+ tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
+
+ ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
+ dst += sizeof(tl);
+ tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
+ ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ dst += sizeof(tail.fc_tid);
+ tail.fc_crc = cpu_to_le32(crc);
+ ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+
+ ext4_fc_submit_bh(sb);
+
+ return 0;
+}
+
+/*
+ * Adds tag, length, value and updates CRC. Returns true if tlv was added.
+ * Returns false if there's not enough space.
+ */
+static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
+ u32 *crc)
+{
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
+ if (!dst)
+ return false;
+
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(len);
+
+ ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
+ ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
+
+ return true;
+}
+
+/* Same as above, but adds dentry tlv. */
+static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
+ int parent_ino, int ino, int dlen,
+ const unsigned char *dname,
+ u32 *crc)
+{
+ struct ext4_fc_dentry_info fcd;
+ struct ext4_fc_tl tl;
+ u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
+ crc);
+
+ if (!dst)
+ return false;
+
+ fcd.fc_parent_ino = cpu_to_le32(parent_ino);
+ fcd.fc_ino = cpu_to_le32(ino);
+ tl.fc_tag = cpu_to_le16(tag);
+ tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
+ ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
+ dst += sizeof(tl);
+ ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ dst += sizeof(fcd);
+ ext4_fc_memcpy(sb, dst, dname, dlen, crc);
+ dst += dlen;
+
+ return true;
+}
+
+/*
+ * Writes inode in the fast commit space under TLV with tag @tag.
+ * Returns 0 on success, error on failure.
+ */
+static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+ int ret;
+ struct ext4_iloc iloc;
+ struct ext4_fc_inode fc_inode;
+ struct ext4_fc_tl tl;
+ u8 *dst;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+ inode_len += ei->i_extra_isize;
+
+ fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
+ tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+
+ dst = ext4_fc_reserve_space(inode->i_sb,
+ sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
+ if (!dst)
+ return -ECANCELED;
+
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
+ return -ECANCELED;
+ dst += sizeof(tl);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
+ return -ECANCELED;
+ dst += sizeof(fc_inode);
+ if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
+ inode_len, crc))
+ return -ECANCELED;
+
+ return 0;
+}
+
+/*
+ * Writes updated data ranges for the inode in question. Updates CRC.
+ * Returns 0 on success, error otherwise.
+ */
+static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
+{
+ ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_map_blocks map;
+ struct ext4_fc_add_range fc_ext;
+ struct ext4_fc_del_range lrange;
+ struct ext4_extent *ex;
+ int ret;
+
+ mutex_lock(&ei->i_fc_lock);
+ if (ei->i_fc_lblk_len == 0) {
+ mutex_unlock(&ei->i_fc_lock);
+ return 0;
+ }
+ old_blk_size = ei->i_fc_lblk_start;
+ new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+ ei->i_fc_lblk_len = 0;
+ mutex_unlock(&ei->i_fc_lock);
+
+ cur_lblk_off = old_blk_size;
+ jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
+ __func__, cur_lblk_off, new_blk_size, inode->i_ino);
+
+ while (cur_lblk_off <= new_blk_size) {
+ map.m_lblk = cur_lblk_off;
+ map.m_len = new_blk_size - cur_lblk_off + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return -ECANCELED;
+
+ if (map.m_len == 0) {
+ cur_lblk_off++;
+ continue;
+ }
+
+ if (ret == 0) {
+ lrange.fc_ino = cpu_to_le32(inode->i_ino);
+ lrange.fc_lblk = cpu_to_le32(map.m_lblk);
+ lrange.fc_len = cpu_to_le32(map.m_len);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+ sizeof(lrange), (u8 *)&lrange, crc))
+ return -ENOSPC;
+ } else {
+ unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
+ EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
+
+ /* Limit the number of blocks in one extent */
+ map.m_len = min(max, map.m_len);
+
+ fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+ ex = (struct ext4_extent *)&fc_ext.fc_ex;
+ ex->ee_block = cpu_to_le32(map.m_lblk);
+ ex->ee_len = cpu_to_le16(map.m_len);
+ ext4_ext_store_pblock(ex, map.m_pblk);
+ if (map.m_flags & EXT4_MAP_UNWRITTEN)
+ ext4_ext_mark_unwritten(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+ if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+ sizeof(fc_ext), (u8 *)&fc_ext, crc))
+ return -ENOSPC;
+ }
+
+ cur_lblk_off += map.m_len;
+ }
+
+ return 0;
+}
+
+
+/* Submit data for all the fast commit inodes */
+static int ext4_fc_submit_inode_data_all(journal_t *journal)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ struct list_head *pos;
+ int ret = 0;
+
+ spin_lock(&sbi->s_fc_lock);
+ ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+ list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
+ ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
+ while (atomic_read(&ei->i_fc_updates)) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&ei->i_fc_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&ei->i_fc_updates)) {
+ spin_unlock(&sbi->s_fc_lock);
+ schedule();
+ spin_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(&ei->i_fc_wait, &wait);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ ret = jbd2_submit_inode_data(ei->jinode);
+ if (ret)
+ return ret;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ return ret;
+}
+
+/* Wait for completion of data for all the fast commit inodes */
+static int ext4_fc_wait_inode_data_all(journal_t *journal)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *pos, *n;
+ int ret = 0;
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ if (!ext4_test_inode_state(&pos->vfs_inode,
+ EXT4_STATE_FC_COMMITTING))
+ continue;
+ spin_unlock(&sbi->s_fc_lock);
+
+ ret = jbd2_wait_inode_data(journal, pos->jinode);
+ if (ret)
+ return ret;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ return 0;
+}
+
+/* Commit all the directory entry updates */
+static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
+__acquires(&sbi->s_fc_lock)
+__releases(&sbi->s_fc_lock)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_dentry_update *fc_dentry;
+ struct inode *inode;
+ struct list_head *pos, *n, *fcd_pos, *fcd_n;
+ struct ext4_inode_info *ei;
+ int ret;
+
+ if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
+ return 0;
+ list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
+ fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
+ fcd_list);
+ if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
+ spin_unlock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(
+ sb, fc_dentry->fcd_op,
+ fc_dentry->fcd_parent, fc_dentry->fcd_ino,
+ fc_dentry->fcd_name.len,
+ fc_dentry->fcd_name.name, crc)) {
+ ret = -ENOSPC;
+ goto lock_and_exit;
+ }
+ spin_lock(&sbi->s_fc_lock);
+ continue;
+ }
+
+ inode = NULL;
+ list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
+ ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
+ inode = &ei->vfs_inode;
+ break;
+ }
+ }
+ /*
+ * If we don't find inode in our list, then it was deleted,
+ * in which case, we don't need to record it's create tag.
+ */
+ if (!inode)
+ continue;
+ spin_unlock(&sbi->s_fc_lock);
+
+ /*
+ * We first write the inode and then the create dirent. This
+ * allows the recovery code to create an unnamed inode first
+ * and then link it to a directory entry. This allows us
+ * to use namei.c routines almost as is and simplifies
+ * the recovery code.
+ */
+ ret = ext4_fc_write_inode(inode, crc);
+ if (ret)
+ goto lock_and_exit;
+
+ ret = ext4_fc_write_inode_data(inode, crc);
+ if (ret)
+ goto lock_and_exit;
+
+ if (!ext4_fc_add_dentry_tlv(
+ sb, fc_dentry->fcd_op,
+ fc_dentry->fcd_parent, fc_dentry->fcd_ino,
+ fc_dentry->fcd_name.len,
+ fc_dentry->fcd_name.name, crc)) {
+ ret = -ENOSPC;
+ goto lock_and_exit;
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ }
+ return 0;
+lock_and_exit:
+ spin_lock(&sbi->s_fc_lock);
+ return ret;
+}
+
+static int ext4_fc_perform_commit(journal_t *journal)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_head head;
+ struct list_head *pos;
+ struct inode *inode;
+ struct blk_plug plug;
+ int ret = 0;
+ u32 crc = 0;
+
+ ret = ext4_fc_submit_inode_data_all(journal);
+ if (ret)
+ return ret;
+
+ ret = ext4_fc_wait_inode_data_all(journal);
+ if (ret)
+ return ret;
+
+ /*
+ * If file system device is different from journal device, issue a cache
+ * flush before we start writing fast commit blocks.
+ */
+ if (journal->j_fs_dev != journal->j_dev)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
+
+ blk_start_plug(&plug);
+ if (sbi->s_fc_bytes == 0) {
+ /*
+ * Add a head tag only if this is the first fast commit
+ * in this TID.
+ */
+ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
+ head.fc_tid = cpu_to_le32(
+ sbi->s_journal->j_running_transaction->t_tid);
+ if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
+ (u8 *)&head, &crc)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ ret = ext4_fc_commit_dentry_updates(journal, &crc);
+ if (ret) {
+ spin_unlock(&sbi->s_fc_lock);
+ goto out;
+ }
+
+ list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
+ iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ inode = &iter->vfs_inode;
+ if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ continue;
+
+ spin_unlock(&sbi->s_fc_lock);
+ ret = ext4_fc_write_inode_data(inode, &crc);
+ if (ret)
+ goto out;
+ ret = ext4_fc_write_inode(inode, &crc);
+ if (ret)
+ goto out;
+ spin_lock(&sbi->s_fc_lock);
+ }
+ spin_unlock(&sbi->s_fc_lock);
+
+ ret = ext4_fc_write_tail(sb, crc);
+
+out:
+ blk_finish_plug(&plug);
+ return ret;
+}
+
+/*
+ * The main commit entry point. Performs a fast commit for transaction
+ * commit_tid if needed. If it's not possible to perform a fast commit
+ * due to various reasons, we fall back to full commit. Returns 0
+ * on success, error otherwise.
+ */
+int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
+{
+ struct super_block *sb = (struct super_block *)(journal->j_private);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int nblks = 0, ret, bsize = journal->j_blocksize;
+ int subtid = atomic_read(&sbi->s_fc_subtid);
+ int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
+ ktime_t start_time, commit_time;
+
+ trace_ext4_fc_commit_start(sb);
+
+ start_time = ktime_get();
+
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+ (ext4_fc_is_ineligible(sb))) {
+ reason = EXT4_FC_REASON_INELIGIBLE;
+ goto out;
+ }
+
+restart_fc:
+ ret = jbd2_fc_begin_commit(journal, commit_tid);
+ if (ret == -EALREADY) {
+ /* There was an ongoing commit, check if we need to restart */
+ if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
+ commit_tid > journal->j_commit_sequence)
+ goto restart_fc;
+ reason = EXT4_FC_REASON_ALREADY_COMMITTED;
+ goto out;
+ } else if (ret) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_FC_START_FAILED;
+ goto out;
+ }
+
+ fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
+ ret = ext4_fc_perform_commit(journal);
+ if (ret < 0) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_FC_FAILED;
+ goto out;
+ }
+ nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
+ ret = jbd2_fc_wait_bufs(journal, nblks);
+ if (ret < 0) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_FC_FAILED;
+ goto out;
+ }
+ atomic_inc(&sbi->s_fc_subtid);
+ jbd2_fc_end_commit(journal);
+out:
+ /* Has any ineligible update happened since we started? */
+ if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
+ sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
+ reason = EXT4_FC_REASON_INELIGIBLE;
+ }
+
+ spin_lock(&sbi->s_fc_lock);
+ if (reason != EXT4_FC_REASON_OK &&
+ reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
+ sbi->s_fc_stats.fc_ineligible_commits++;
+ } else {
+ sbi->s_fc_stats.fc_num_commits++;
+ sbi->s_fc_stats.fc_numblks += nblks;
+ }
+ spin_unlock(&sbi->s_fc_lock);
+ nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
+ trace_ext4_fc_commit_stop(sb, nblks, reason);
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in the commit time
+ */
+ if (likely(sbi->s_fc_avg_commit_time))
+ sbi->s_fc_avg_commit_time = (commit_time +
+ sbi->s_fc_avg_commit_time * 3) / 4;
+ else
+ sbi->s_fc_avg_commit_time = commit_time;
+ jbd_debug(1,
+ "Fast commit ended with blks = %d, reason = %d, subtid - %d",
+ nblks, reason, subtid);
+ if (reason == EXT4_FC_REASON_FC_FAILED)
+ return jbd2_fc_end_commit_fallback(journal);
+ if (reason == EXT4_FC_REASON_FC_START_FAILED ||
+ reason == EXT4_FC_REASON_INELIGIBLE)
+ return jbd2_complete_transaction(journal, commit_tid);
+ return 0;
+}
+
+/*
+ * Fast commit cleanup routine. This is called after every fast commit and
+ * full commit. full is true if we are called after a full commit.
+ */
+static void ext4_fc_cleanup(journal_t *journal, int full)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *iter;
+ struct ext4_fc_dentry_update *fc_dentry;
+ struct list_head *pos, *n;
+
+ if (full && sbi->s_fc_bh)
+ sbi->s_fc_bh = NULL;
+
+ jbd2_fc_release_bufs(journal);
+
+ spin_lock(&sbi->s_fc_lock);
+ list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
+ iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
+ list_del_init(&iter->i_fc_list);
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ ext4_fc_reset_inode(&iter->vfs_inode);
+ /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ smp_mb();
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+#endif
+ }
+
+ while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
+ fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
+ struct ext4_fc_dentry_update,
+ fcd_list);
+ list_del_init(&fc_dentry->fcd_list);
+ spin_unlock(&sbi->s_fc_lock);
+
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+ spin_lock(&sbi->s_fc_lock);
+ }
+
+ list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
+ &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
+ &sbi->s_fc_q[FC_Q_MAIN]);
+
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+
+ if (full)
+ sbi->s_fc_bytes = 0;
+ spin_unlock(&sbi->s_fc_lock);
+ trace_ext4_fc_stats(sb);
+}
+
+/* Ext4 Replay Path Routines */
+
+/* Helper struct for dentry replay routines */
+struct dentry_info_args {
+ int parent_ino, dname_len, ino, inode_len;
+ char *dname;
+};
+
+static inline void tl_to_darg(struct dentry_info_args *darg,
+ struct ext4_fc_tl *tl, u8 *val)
+{
+ struct ext4_fc_dentry_info fcd;
+
+ memcpy(&fcd, val, sizeof(fcd));
+
+ darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
+ darg->ino = le32_to_cpu(fcd.fc_ino);
+ darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
+ darg->dname_len = le16_to_cpu(tl->fc_len) -
+ sizeof(struct ext4_fc_dentry_info);
+}
+
+/* Unlink replay function */
+static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct inode *inode, *old_parent;
+ struct qstr entry;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl, val);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ entry.name = darg.dname;
+ entry.len = darg.dname_len;
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "Inode %d not found", darg.ino);
+ return 0;
+ }
+
+ old_parent = ext4_iget(sb, darg.parent_ino,
+ EXT4_IGET_NORMAL);
+ if (IS_ERR(old_parent)) {
+ jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
+ iput(inode);
+ return 0;
+ }
+
+ ret = __ext4_unlink(NULL, old_parent, &entry, inode);
+ /* -ENOENT ok coz it might not exist anymore. */
+ if (ret == -ENOENT)
+ ret = 0;
+ iput(old_parent);
+ iput(inode);
+ return ret;
+}
+
+static int ext4_fc_replay_link_internal(struct super_block *sb,
+ struct dentry_info_args *darg,
+ struct inode *inode)
+{
+ struct inode *dir = NULL;
+ struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
+ struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
+ int ret = 0;
+
+ dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+ dir = NULL;
+ goto out;
+ }
+
+ dentry_dir = d_obtain_alias(dir);
+ if (IS_ERR(dentry_dir)) {
+ jbd_debug(1, "Failed to obtain dentry");
+ dentry_dir = NULL;
+ goto out;
+ }
+
+ dentry_inode = d_alloc(dentry_dir, &qstr_dname);
+ if (!dentry_inode) {
+ jbd_debug(1, "Inode dentry not created.");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = __ext4_link(dir, inode, dentry_inode);
+ /*
+ * It's possible that link already existed since data blocks
+ * for the dir in question got persisted before we crashed OR
+ * we replayed this tag and crashed before the entire replay
+ * could complete.
+ */
+ if (ret && ret != -EEXIST) {
+ jbd_debug(1, "Failed to link\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ if (dentry_dir) {
+ d_drop(dentry_dir);
+ dput(dentry_dir);
+ } else if (dir) {
+ iput(dir);
+ }
+ if (dentry_inode) {
+ d_drop(dentry_inode);
+ dput(dentry_inode);
+ }
+
+ return ret;
+}
+
+/* Link replay function */
+static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct inode *inode;
+ struct dentry_info_args darg;
+ int ret = 0;
+
+ tl_to_darg(&darg, tl, val);
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record all the modified inodes during replay. We use this later to setup
+ * block bitmaps correctly.
+ */
+static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
+{
+ struct ext4_fc_replay_state *state;
+ int i;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++)
+ if (state->fc_modified_inodes[i] == ino)
+ return 0;
+ if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
+ state->fc_modified_inodes = krealloc(
+ state->fc_modified_inodes,
+ sizeof(int) * (state->fc_modified_inodes_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!state->fc_modified_inodes)
+ return -ENOMEM;
+ state->fc_modified_inodes_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ }
+ state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
+ return 0;
+}
+
+/*
+ * Inode replay function
+ */
+static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct ext4_fc_inode fc_inode;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode *raw_fc_inode;
+ struct inode *inode = NULL;
+ struct ext4_iloc iloc;
+ int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+ struct ext4_extent_header *eh;
+
+ memcpy(&fc_inode, val, sizeof(fc_inode));
+
+ ino = le32_to_cpu(fc_inode.fc_ino);
+ trace_ext4_fc_replay(sb, tag, ino, 0, 0);
+
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (!IS_ERR(inode)) {
+ ext4_ext_clear_bb(inode);
+ iput(inode);
+ }
+ inode = NULL;
+
+ ret = ext4_fc_record_modified_inode(sb, ino);
+ if (ret)
+ goto out;
+
+ raw_fc_inode = (struct ext4_inode *)
+ (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
+ ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
+ if (ret)
+ goto out;
+
+ inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
+ raw_inode = ext4_raw_inode(&iloc);
+
+ memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
+ memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
+ inode_len - offsetof(struct ext4_inode, i_generation));
+ if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
+ eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
+ if (eh->eh_magic != EXT4_EXT_MAGIC) {
+ memset(eh, 0, sizeof(*eh));
+ eh->eh_magic = EXT4_EXT_MAGIC;
+ eh->eh_max = cpu_to_le16(
+ (sizeof(raw_inode->i_block) -
+ sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent));
+ }
+ } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
+ memcpy(raw_inode->i_block, raw_fc_inode->i_block,
+ sizeof(raw_inode->i_block));
+ }
+
+ /* Immediately update the inode on disk. */
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ if (ret)
+ goto out;
+ ret = sync_dirty_buffer(iloc.bh);
+ if (ret)
+ goto out;
+ ret = ext4_mark_inode_used(sb, ino);
+ if (ret)
+ goto out;
+
+ /* Given that we just wrote the inode on disk, this SHOULD succeed. */
+ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "Inode not found.");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Our allocator could have made different decisions than before
+ * crashing. This should be fixed but until then, we calculate
+ * the number of blocks the inode.
+ */
+ ext4_ext_replay_set_iblocks(inode);
+
+ inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
+ ext4_reset_inode_seed(inode);
+
+ ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+ sync_dirty_buffer(iloc.bh);
+ brelse(iloc.bh);
+out:
+ iput(inode);
+ if (!ret)
+ blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+
+ return 0;
+}
+
+/*
+ * Dentry create replay function.
+ *
+ * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
+ * inode for which we are trying to create a dentry here, should already have
+ * been replayed before we start here.
+ */
+static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ int ret = 0;
+ struct inode *inode = NULL;
+ struct inode *dir = NULL;
+ struct dentry_info_args darg;
+
+ tl_to_darg(&darg, tl, val);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
+ darg.parent_ino, darg.dname_len);
+
+ /* This takes care of update group descriptor and other metadata */
+ ret = ext4_mark_inode_used(sb, darg.ino);
+ if (ret)
+ goto out;
+
+ inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "inode %d not found.", darg.ino);
+ inode = NULL;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (S_ISDIR(inode->i_mode)) {
+ /*
+ * If we are creating a directory, we need to make sure that the
+ * dot and dot dot dirents are setup properly.
+ */
+ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
+ if (IS_ERR(dir)) {
+ jbd_debug(1, "Dir %d not found.", darg.ino);
+ goto out;
+ }
+ ret = ext4_init_new_dir(NULL, dir, inode);
+ iput(dir);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+ if (ret)
+ goto out;
+ set_nlink(inode, 1);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ if (inode)
+ iput(inode);
+ return ret;
+}
+
+/*
+ * Record physical disk regions which are in use as per fast commit area,
+ * and used by inodes during replay phase. Our simple replay phase
+ * allocator excludes these regions from allocation.
+ */
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
+{
+ struct ext4_fc_replay_state *state;
+ struct ext4_fc_alloc_region *region;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ /*
+ * during replay phase, the fc_regions_valid may not same as
+ * fc_regions_used, update it when do new additions.
+ */
+ if (replay && state->fc_regions_used != state->fc_regions_valid)
+ state->fc_regions_used = state->fc_regions_valid;
+ if (state->fc_regions_used == state->fc_regions_size) {
+ state->fc_regions_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
+ state->fc_regions = krealloc(
+ state->fc_regions,
+ state->fc_regions_size *
+ sizeof(struct ext4_fc_alloc_region),
+ GFP_KERNEL);
+ if (!state->fc_regions)
+ return -ENOMEM;
+ }
+ region = &state->fc_regions[state->fc_regions_used++];
+ region->ino = ino;
+ region->lblk = lblk;
+ region->pblk = pblk;
+ region->len = len;
+
+ if (replay)
+ state->fc_regions_valid++;
+
+ return 0;
+}
+
+/* Replay add range tag */
+static int ext4_fc_replay_add_range(struct super_block *sb,
+ struct ext4_fc_tl *tl, u8 *val)
+{
+ struct ext4_fc_add_range fc_add_ex;
+ struct ext4_extent newex, *ex;
+ struct inode *inode;
+ ext4_lblk_t start, cur;
+ int remaining, len;
+ ext4_fsblk_t start_pblk;
+ struct ext4_map_blocks map;
+ struct ext4_ext_path *path = NULL;
+ int ret;
+
+ memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
+ ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
+ le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex));
+
+ inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "Inode not found.");
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
+
+ start = le32_to_cpu(ex->ee_block);
+ start_pblk = ext4_ext_pblock(ex);
+ len = ext4_ext_get_actual_len(ex);
+
+ cur = start;
+ remaining = len;
+ jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+ start, start_pblk, len, ext4_ext_is_unwritten(ex),
+ inode->i_ino);
+
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+ map.m_pblk = 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ /* Range is not mapped */
+ path = ext4_find_extent(inode, cur, NULL, 0);
+ if (IS_ERR(path))
+ goto out;
+ memset(&newex, 0, sizeof(newex));
+ newex.ee_block = cpu_to_le32(cur);
+ ext4_ext_store_pblock(
+ &newex, start_pblk + cur - start);
+ newex.ee_len = cpu_to_le16(map.m_len);
+ if (ext4_ext_is_unwritten(ex))
+ ext4_ext_mark_unwritten(&newex);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_insert_extent(
+ NULL, inode, &path, &newex, 0);
+ up_write((&EXT4_I(inode)->i_data_sem));
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret)
+ goto out;
+ goto next;
+ }
+
+ if (start_pblk + cur - start != map.m_pblk) {
+ /*
+ * Logical to physical mapping changed. This can happen
+ * if this range was removed and then reallocated to
+ * map to new physical blocks during a fast commit.
+ */
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex),
+ start_pblk + cur - start);
+ if (ret)
+ goto out;
+ /*
+ * Mark the old blocks as free since they aren't used
+ * anymore. We maintain an array of all the modified
+ * inodes. In case these blocks are still used at either
+ * a different logical range in the same inode or in
+ * some different inode, we will mark them as allocated
+ * at the end of the FC replay using our array of
+ * modified inodes.
+ */
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ goto next;
+ }
+
+ /* Range is mapped and needs a state change */
+ jbd_debug(1, "Converting from %ld to %d %lld",
+ map.m_flags & EXT4_MAP_UNWRITTEN,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+ ext4_ext_is_unwritten(ex), map.m_pblk);
+ if (ret)
+ goto out;
+ /*
+ * We may have split the extent tree while toggling the state.
+ * Try to shrink the extent tree now.
+ */
+ ext4_ext_replay_shrink_inode(inode, start + len);
+next:
+ cur += map.m_len;
+ remaining -= map.m_len;
+ }
+ ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
+ sb->s_blocksize_bits);
+out:
+ iput(inode);
+ return 0;
+}
+
+/* Replay DEL_RANGE tag */
+static int
+ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
+ u8 *val)
+{
+ struct inode *inode;
+ struct ext4_fc_del_range lrange;
+ struct ext4_map_blocks map;
+ ext4_lblk_t cur, remaining;
+ int ret;
+
+ memcpy(&lrange, val, sizeof(lrange));
+ cur = le32_to_cpu(lrange.fc_lblk);
+ remaining = le32_to_cpu(lrange.fc_len);
+
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
+ le32_to_cpu(lrange.fc_ino), cur, remaining);
+
+ inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
+ return 0;
+ }
+
+ ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
+
+ jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+ inode->i_ino, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_len));
+ while (remaining > 0) {
+ map.m_lblk = cur;
+ map.m_len = remaining;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ remaining -= ret;
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ } else {
+ remaining -= map.m_len;
+ cur += map.m_len;
+ }
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_lblk) +
+ le32_to_cpu(lrange.fc_len) - 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out;
+ ext4_ext_replay_shrink_inode(inode,
+ i_size_read(inode) >> sb->s_blocksize_bits);
+ ext4_mark_inode_dirty(NULL, inode);
+out:
+ iput(inode);
+ return 0;
+}
+
+static inline const char *tag2str(u16 tag)
+{
+ switch (tag) {
+ case EXT4_FC_TAG_LINK:
+ return "TAG_ADD_ENTRY";
+ case EXT4_FC_TAG_UNLINK:
+ return "TAG_DEL_ENTRY";
+ case EXT4_FC_TAG_ADD_RANGE:
+ return "TAG_ADD_RANGE";
+ case EXT4_FC_TAG_CREAT:
+ return "TAG_CREAT_DENTRY";
+ case EXT4_FC_TAG_DEL_RANGE:
+ return "TAG_DEL_RANGE";
+ case EXT4_FC_TAG_INODE:
+ return "TAG_INODE";
+ case EXT4_FC_TAG_PAD:
+ return "TAG_PAD";
+ case EXT4_FC_TAG_TAIL:
+ return "TAG_TAIL";
+ case EXT4_FC_TAG_HEAD:
+ return "TAG_HEAD";
+ default:
+ return "TAG_ERROR";
+ }
+}
+
+static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+{
+ struct ext4_fc_replay_state *state;
+ struct inode *inode;
+ struct ext4_ext_path *path = NULL;
+ struct ext4_map_blocks map;
+ int i, ret, j;
+ ext4_lblk_t cur, end;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_modified_inodes_used; i++) {
+ inode = ext4_iget(sb, state->fc_modified_inodes[i],
+ EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ jbd_debug(1, "Inode %d not found.",
+ state->fc_modified_inodes[i]);
+ continue;
+ }
+ cur = 0;
+ end = EXT_MAX_BLOCKS;
+ while (cur < end) {
+ map.m_lblk = cur;
+ map.m_len = end - cur;
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ break;
+
+ if (ret > 0) {
+ path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+ if (!IS_ERR(path)) {
+ for (j = 0; j < path->p_depth; j++)
+ ext4_mb_mark_bb(inode->i_sb,
+ path[j].p_block, 1, 1);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ cur += ret;
+ ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
+ map.m_len, 1);
+ } else {
+ cur = cur + (map.m_len ? map.m_len : 1);
+ }
+ }
+ iput(inode);
+ }
+}
+
+/*
+ * Check if block is in excluded regions for block allocation. The simple
+ * allocator that runs during replay phase is calls this function to see
+ * if it is okay to use a block.
+ */
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
+{
+ int i;
+ struct ext4_fc_replay_state *state;
+
+ state = &EXT4_SB(sb)->s_fc_replay_state;
+ for (i = 0; i < state->fc_regions_valid; i++) {
+ if (state->fc_regions[i].ino == 0 ||
+ state->fc_regions[i].len == 0)
+ continue;
+ if (blk >= state->fc_regions[i].pblk &&
+ blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+ return true;
+ }
+ return false;
+}
+
+/* Cleanup function called after replay */
+void ext4_fc_replay_cleanup(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ sbi->s_mount_state &= ~EXT4_FC_REPLAY;
+ kfree(sbi->s_fc_replay_state.fc_regions);
+ kfree(sbi->s_fc_replay_state.fc_modified_inodes);
+}
+
+/*
+ * Recovery Scan phase handler
+ *
+ * This function is called during the scan phase and is responsible
+ * for doing following things:
+ * - Make sure the fast commit area has valid tags for replay
+ * - Count number of tags that need to be replayed by the replay handler
+ * - Verify CRC
+ * - Create a list of excluded blocks for allocation during replay phase
+ *
+ * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
+ * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
+ * to indicate that scan has finished and JBD2 can now start replay phase.
+ * It returns a negative error to indicate that there was an error. At the end
+ * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
+ * to indicate the number of tags that need to replayed during the replay phase.
+ */
+static int ext4_fc_replay_scan(journal_t *journal,
+ struct buffer_head *bh, int off,
+ tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_replay_state *state;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_add_range ext;
+ struct ext4_fc_tl tl;
+ struct ext4_fc_tail tail;
+ __u8 *start, *end, *cur, *val;
+ struct ext4_fc_head head;
+ struct ext4_extent *ex;
+
+ state = &sbi->s_fc_replay_state;
+
+ start = (u8 *)bh->b_data;
+ end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+ if (state->fc_replay_expected_off == 0) {
+ state->fc_cur_tag = 0;
+ state->fc_replay_num_tags = 0;
+ state->fc_crc = 0;
+ state->fc_regions = NULL;
+ state->fc_regions_valid = state->fc_regions_used =
+ state->fc_regions_size = 0;
+ /* Check if we can stop early */
+ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
+ != EXT4_FC_TAG_HEAD)
+ return 0;
+ }
+
+ if (off != state->fc_replay_expected_off) {
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+
+ state->fc_replay_expected_off++;
+ for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
+ memcpy(&tl, cur, sizeof(tl));
+ val = cur + sizeof(tl);
+ jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
+ tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
+ switch (le16_to_cpu(tl.fc_tag)) {
+ case EXT4_FC_TAG_ADD_RANGE:
+ memcpy(&ext, val, sizeof(ext));
+ ex = (struct ext4_extent *)&ext.fc_ex;
+ ret = ext4_fc_record_regions(sb,
+ le32_to_cpu(ext.fc_ino),
+ le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
+ ext4_ext_get_actual_len(ex), 0);
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ fallthrough;
+ case EXT4_FC_TAG_DEL_RANGE:
+ case EXT4_FC_TAG_LINK:
+ case EXT4_FC_TAG_UNLINK:
+ case EXT4_FC_TAG_CREAT:
+ case EXT4_FC_TAG_INODE:
+ case EXT4_FC_TAG_PAD:
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ sizeof(tl) + le16_to_cpu(tl.fc_len));
+ break;
+ case EXT4_FC_TAG_TAIL:
+ state->fc_cur_tag++;
+ memcpy(&tail, val, sizeof(tail));
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ sizeof(tl) +
+ offsetof(struct ext4_fc_tail,
+ fc_crc));
+ if (le32_to_cpu(tail.fc_tid) == expected_tid &&
+ le32_to_cpu(tail.fc_crc) == state->fc_crc) {
+ state->fc_replay_num_tags = state->fc_cur_tag;
+ state->fc_regions_valid =
+ state->fc_regions_used;
+ } else {
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -EFSBADCRC;
+ }
+ state->fc_crc = 0;
+ break;
+ case EXT4_FC_TAG_HEAD:
+ memcpy(&head, val, sizeof(head));
+ if (le32_to_cpu(head.fc_features) &
+ ~EXT4_FC_SUPPORTED_FEATURES) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ if (le32_to_cpu(head.fc_tid) != expected_tid) {
+ ret = JBD2_FC_REPLAY_STOP;
+ break;
+ }
+ state->fc_cur_tag++;
+ state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ sizeof(tl) + le16_to_cpu(tl.fc_len));
+ break;
+ default:
+ ret = state->fc_replay_num_tags ?
+ JBD2_FC_REPLAY_STOP : -ECANCELED;
+ }
+ if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
+ break;
+ }
+
+out_err:
+ trace_ext4_fc_replay_scan(sb, ret, off);
+ return ret;
+}
+
+/*
+ * Main recovery path entry point.
+ * The meaning of return codes is similar as above.
+ */
+static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
+ enum passtype pass, int off, tid_t expected_tid)
+{
+ struct super_block *sb = journal->j_private;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_fc_tl tl;
+ __u8 *start, *end, *cur, *val;
+ int ret = JBD2_FC_REPLAY_CONTINUE;
+ struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
+ struct ext4_fc_tail tail;
+
+ if (pass == PASS_SCAN) {
+ state->fc_current_pass = PASS_SCAN;
+ return ext4_fc_replay_scan(journal, bh, off, expected_tid);
+ }
+
+ if (state->fc_current_pass != pass) {
+ state->fc_current_pass = pass;
+ sbi->s_mount_state |= EXT4_FC_REPLAY;
+ }
+ if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
+ jbd_debug(1, "Replay stops\n");
+ ext4_fc_set_bitmaps_and_counters(sb);
+ return 0;
+ }
+
+#ifdef CONFIG_EXT4_DEBUG
+ if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
+ pr_warn("Dropping fc block %d because max_replay set\n", off);
+ return JBD2_FC_REPLAY_STOP;
+ }
+#endif
+
+ start = (u8 *)bh->b_data;
+ end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+ for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
+ memcpy(&tl, cur, sizeof(tl));
+ val = cur + sizeof(tl);
+
+ if (state->fc_replay_num_tags == 0) {
+ ret = JBD2_FC_REPLAY_STOP;
+ ext4_fc_set_bitmaps_and_counters(sb);
+ break;
+ }
+ jbd_debug(3, "Replay phase, tag:%s\n",
+ tag2str(le16_to_cpu(tl.fc_tag)));
+ state->fc_replay_num_tags--;
+ switch (le16_to_cpu(tl.fc_tag)) {
+ case EXT4_FC_TAG_LINK:
+ ret = ext4_fc_replay_link(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_UNLINK:
+ ret = ext4_fc_replay_unlink(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_ADD_RANGE:
+ ret = ext4_fc_replay_add_range(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_CREAT:
+ ret = ext4_fc_replay_create(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_DEL_RANGE:
+ ret = ext4_fc_replay_del_range(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_INODE:
+ ret = ext4_fc_replay_inode(sb, &tl, val);
+ break;
+ case EXT4_FC_TAG_PAD:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
+ le16_to_cpu(tl.fc_len), 0);
+ break;
+ case EXT4_FC_TAG_TAIL:
+ trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
+ le16_to_cpu(tl.fc_len), 0);
+ memcpy(&tail, val, sizeof(tail));
+ WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
+ break;
+ case EXT4_FC_TAG_HEAD:
+ break;
+ default:
+ trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
+ le16_to_cpu(tl.fc_len), 0);
+ ret = -ECANCELED;
+ break;
+ }
+ if (ret < 0)
+ break;
+ ret = JBD2_FC_REPLAY_CONTINUE;
+ }
+ return ret;
+}
+
+void ext4_fc_init(struct super_block *sb, journal_t *journal)
+{
+ /*
+ * We set replay callback even if fast commit disabled because we may
+ * could still have fast commit blocks that need to be replayed even if
+ * fast commit has now been turned off.
+ */
+ journal->j_fc_replay_callback = ext4_fc_replay;
+ if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
+ return;
+ journal->j_fc_cleanup_callback = ext4_fc_cleanup;
+}
+
+static const char *fc_ineligible_reasons[] = {
+ "Extended attributes changed",
+ "Cross rename",
+ "Journal flag changed",
+ "Insufficient memory",
+ "Swap boot",
+ "Resize",
+ "Dir renamed",
+ "Falloc range op",
+ "Data journalling",
+ "FC Commit Failed"
+};
+
+int ext4_fc_info_show(struct seq_file *seq, void *v)
+{
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
+ struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+ int i;
+
+ if (v != SEQ_START_TOKEN)
+ return 0;
+
+ seq_printf(seq,
+ "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
+ stats->fc_num_commits, stats->fc_ineligible_commits,
+ stats->fc_numblks,
+ div_u64(sbi->s_fc_avg_commit_time, 1000));
+ seq_puts(seq, "Ineligible reasons:\n");
+ for (i = 0; i < EXT4_FC_REASON_MAX; i++)
+ seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
+ stats->fc_ineligible_reason_count[i]);
+
+ return 0;
+}
+
+int __init ext4_fc_init_dentry_cache(void)
+{
+ ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
+ SLAB_RECLAIM_ACCOUNT);
+
+ if (ext4_fc_dentry_cachep == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ext4_fc_destroy_dentry_cache(void)
+{
+ kmem_cache_destroy(ext4_fc_dentry_cachep);
+}
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
new file mode 100644
index 0000000..d8d0998
--- /dev/null
+++ b/fs/ext4/fast_commit.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __FAST_COMMIT_H__
+#define __FAST_COMMIT_H__
+
+/* Fast commit tags */
+#define EXT4_FC_TAG_ADD_RANGE 0x0001
+#define EXT4_FC_TAG_DEL_RANGE 0x0002
+#define EXT4_FC_TAG_CREAT 0x0003
+#define EXT4_FC_TAG_LINK 0x0004
+#define EXT4_FC_TAG_UNLINK 0x0005
+#define EXT4_FC_TAG_INODE 0x0006
+#define EXT4_FC_TAG_PAD 0x0007
+#define EXT4_FC_TAG_TAIL 0x0008
+#define EXT4_FC_TAG_HEAD 0x0009
+
+#define EXT4_FC_SUPPORTED_FEATURES 0x0
+
+/* On disk fast commit tlv value structures */
+
+/* Fast commit on disk tag length structure */
+struct ext4_fc_tl {
+ __le16 fc_tag;
+ __le16 fc_len;
+};
+
+/* Value structure for tag EXT4_FC_TAG_HEAD. */
+struct ext4_fc_head {
+ __le32 fc_features;
+ __le32 fc_tid;
+};
+
+/* Value structure for EXT4_FC_TAG_ADD_RANGE. */
+struct ext4_fc_add_range {
+ __le32 fc_ino;
+ __u8 fc_ex[12];
+};
+
+/* Value structure for tag EXT4_FC_TAG_DEL_RANGE. */
+struct ext4_fc_del_range {
+ __le32 fc_ino;
+ __le32 fc_lblk;
+ __le32 fc_len;
+};
+
+/*
+ * This is the value structure for tags EXT4_FC_TAG_CREAT, EXT4_FC_TAG_LINK
+ * and EXT4_FC_TAG_UNLINK.
+ */
+struct ext4_fc_dentry_info {
+ __le32 fc_parent_ino;
+ __le32 fc_ino;
+ u8 fc_dname[0];
+};
+
+/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+struct ext4_fc_inode {
+ __le32 fc_ino;
+ __u8 fc_raw_inode[0];
+};
+
+/* Value structure for tag EXT4_FC_TAG_TAIL. */
+struct ext4_fc_tail {
+ __le32 fc_tid;
+ __le32 fc_crc;
+};
+
+/*
+ * In memory list of dentry updates that are performed on the file
+ * system used by fast commit code.
+ */
+struct ext4_fc_dentry_update {
+ int fcd_op; /* Type of update create / unlink / link */
+ int fcd_parent; /* Parent inode number */
+ int fcd_ino; /* Inode number */
+ struct qstr fcd_name; /* Dirent name */
+ unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
+ struct list_head fcd_list;
+};
+
+/*
+ * Fast commit reason codes
+ */
+enum {
+ /*
+ * Commit status codes:
+ */
+ EXT4_FC_REASON_OK = 0,
+ EXT4_FC_REASON_INELIGIBLE,
+ EXT4_FC_REASON_ALREADY_COMMITTED,
+ EXT4_FC_REASON_FC_START_FAILED,
+ EXT4_FC_REASON_FC_FAILED,
+
+ /*
+ * Fast commit ineligiblity reasons:
+ */
+ EXT4_FC_REASON_XATTR = 0,
+ EXT4_FC_REASON_CROSS_RENAME,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE,
+ EXT4_FC_REASON_NOMEM,
+ EXT4_FC_REASON_SWAP_BOOT,
+ EXT4_FC_REASON_RESIZE,
+ EXT4_FC_REASON_RENAME_DIR,
+ EXT4_FC_REASON_FALLOC_RANGE,
+ EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_COMMIT_FAILED,
+ EXT4_FC_REASON_MAX
+};
+
+struct ext4_fc_stats {
+ unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
+ unsigned long fc_num_commits;
+ unsigned long fc_ineligible_commits;
+ unsigned long fc_numblks;
+};
+
+#define EXT4_FC_REPLAY_REALLOC_INCREMENT 4
+
+/*
+ * Physical block regions added to different inodes due to fast commit
+ * recovery. These are set during the SCAN phase. During the replay phase,
+ * our allocator excludes these from its allocation. This ensures that
+ * we don't accidentally allocating a block that is going to be used by
+ * another inode.
+ */
+struct ext4_fc_alloc_region {
+ ext4_lblk_t lblk;
+ ext4_fsblk_t pblk;
+ int ino, len;
+};
+
+/*
+ * Fast commit replay state.
+ */
+struct ext4_fc_replay_state {
+ int fc_replay_num_tags;
+ int fc_replay_expected_off;
+ int fc_current_pass;
+ int fc_cur_tag;
+ int fc_crc;
+ struct ext4_fc_alloc_region *fc_regions;
+ int fc_regions_size, fc_regions_used, fc_regions_valid;
+ int *fc_modified_inodes;
+ int fc_modified_inodes_used, fc_modified_inodes_size;
+};
+
+#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
+
+
+#endif /* __FAST_COMMIT_H__ */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1513e90..7b28d44 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,10 +29,58 @@
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
+#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ return false;
+ if (fsverity_active(inode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_has_inline_data(inode))
+ return false;
+ return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock_shared(inode);
+ /*
+ * Fallback to buffered I/O if the operation being performed on
+ * the inode is not supported by direct I/O. The IOCB_DIRECT
+ * flag needs to be cleared here in order to ensure that the
+ * direct I/O path within generic_file_read_iter() is not
+ * taken.
+ */
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ return generic_file_read_iter(iocb, to);
+ }
+
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -65,16 +113,21 @@
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
- if (IS_DAX(file_inode(iocb->ki_filp)))
+ if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_read_iter(iocb, to);
+
return generic_file_read_iter(iocb, to);
}
@@ -92,10 +145,9 @@
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
- {
+ !EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -104,13 +156,6 @@
return 0;
}
-static void ext4_unwritten_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -120,19 +165,25 @@
* threads are at work on the same unwritten block, they must be synchronized
* or one thread will zero the other's data, causing corruption.
*/
-static int
-ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
+static bool
+ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
- int blockmask = sb->s_blocksize - 1;
-
- if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
- return 0;
+ unsigned long blockmask = sb->s_blocksize - 1;
if ((pos | iov_iter_alignment(from)) & blockmask)
- return 1;
+ return true;
- return 0;
+ return false;
+}
+
+static bool
+ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
+{
+ if (offset + len > i_size_read(inode) ||
+ offset + len > EXT4_I(inode)->i_disksize)
+ return true;
+ return false;
}
/* Is IO overwriting allocated and initialized blocks? */
@@ -158,18 +209,19 @@
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
}
-static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -181,15 +233,392 @@
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
+
return iov_iter_count(from);
}
+static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret, count;
+
+ count = ext4_generic_write_checks(iocb, from);
+ if (count <= 0)
+ return count;
+
+ ret = file_modified(iocb->ki_filp);
+ if (ret)
+ return ret;
+ return count;
+}
+
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ ext4_fc_start_update(inode);
+ inode_lock(inode);
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+
+out:
+ inode_unlock(inode);
+ ext4_fc_stop_update(inode);
+ if (likely(ret > 0)) {
+ iocb->ki_pos += ret;
+ ret = generic_write_sync(iocb, ret);
+ }
+
+ return ret;
+}
+
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+ ssize_t written, size_t count)
+{
+ handle_t *handle;
+ bool truncate = false;
+ u8 blkbits = inode->i_blkbits;
+ ext4_lblk_t written_blk, end_blk;
+ int ret;
+
+ /*
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But, the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
+ */
+ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+ if (offset + count <= EXT4_I(inode)->i_disksize) {
+ /*
+ * We need to ensure that the inode is removed from the orphan
+ * list if it has been added prematurely, due to writeback of
+ * delalloc blocks.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ return written;
+ }
+
+ if (written < 0)
+ goto truncate;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ written = PTR_ERR(handle);
+ goto truncate;
+ }
+
+ if (ext4_update_inode_size(inode, offset + written)) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret)) {
+ written = ret;
+ ext4_journal_stop(handle);
+ goto truncate;
+ }
+ }
+
+ /*
+ * We may need to truncate allocated but not written blocks beyond EOF.
+ */
+ written_blk = ALIGN(offset + written, 1 << blkbits);
+ end_blk = ALIGN(offset + count, 1 << blkbits);
+ if (written_blk < end_blk && ext4_can_truncate(inode))
+ truncate = true;
+
+ /*
+ * Remove the inode from the orphan list if it has been extended and
+ * everything went OK.
+ */
+ if (!truncate && inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+
+ if (truncate) {
+truncate:
+ ext4_truncate_failed_write(inode);
+ /*
+ * If the truncate operation failed early, then the inode may
+ * still be on the orphan list. In that case, we need to try
+ * remove the inode from the in-memory linked list.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return written;
+}
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned int flags)
+{
+ loff_t pos = iocb->ki_pos;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (error)
+ return error;
+
+ if (size && flags & IOMAP_DIO_UNWRITTEN) {
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
+ if (error < 0)
+ return error;
+ }
+ /*
+ * If we are extending the file, we have to update i_size here before
+ * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
+ * buffered reads could zero out too much from page cache pages. Update
+ * of on-disk size will happen later in ext4_dio_write_iter() where
+ * we have enough information to also perform orphan list handling etc.
+ * Note that we perform all extending writes synchronously under
+ * i_rwsem held exclusively so i_size update is safe here in that case.
+ * If the write was not extending, we cannot see pos > i_size here
+ * because operations reducing i_size like truncate wait for all
+ * outstanding DIO before updating i_size.
+ */
+ pos += size;
+ if (pos > i_size_read(inode))
+ i_size_write(inode, pos);
+
+ return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+ .end_io = ext4_dio_write_end_io,
+};
+
+/*
+ * The intention here is to start with shared lock acquired then see if any
+ * condition requires an exclusive inode lock. If yes, then we restart the
+ * whole operation by releasing the shared lock and acquiring exclusive lock.
+ *
+ * - For unaligned_io we never take shared lock as it may cause data corruption
+ * when two unaligned IO tries to modify the same block e.g. while zeroing.
+ *
+ * - For extending writes case we don't take the shared lock, since it requires
+ * updating inode i_disksize and/or orphan handling with exclusive lock.
+ *
+ * - shared locking will only be true mostly with overwrites. Otherwise we will
+ * switch to exclusive i_rwsem lock.
+ */
+static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
+ bool *ilock_shared, bool *extend)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t offset;
+ size_t count;
+ ssize_t ret;
+
+restart:
+ ret = ext4_generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ offset = iocb->ki_pos;
+ count = ret;
+ if (ext4_extending_io(inode, offset, count))
+ *extend = true;
+ /*
+ * Determine whether the IO operation will overwrite allocated
+ * and initialized blocks.
+ * We need exclusive i_rwsem for changing security info
+ * in file_modified().
+ */
+ if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
+ !ext4_overwrite_io(inode, offset, count))) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ inode_unlock_shared(inode);
+ *ilock_shared = false;
+ inode_lock(inode);
+ goto restart;
+ }
+
+ ret = file_modified(file);
+ if (ret < 0)
+ goto out;
+
+ return count;
+out:
+ if (*ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+ return ret;
+}
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret;
+ handle_t *handle;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ loff_t offset = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+ const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
+ bool extend = false, unaligned_io = false;
+ bool ilock_shared = true;
+
+ /*
+ * We initially start with shared inode lock unless it is
+ * unaligned IO which needs exclusive lock anyways.
+ */
+ if (ext4_unaligned_io(inode, from, offset)) {
+ unaligned_io = true;
+ ilock_shared = false;
+ }
+ /*
+ * Quick check here without any i_rwsem lock to see if it is extending
+ * IO. A more reliable check is done in ext4_dio_write_checks() with
+ * proper locking in place.
+ */
+ if (offset + count > i_size_read(inode))
+ ilock_shared = false;
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (ilock_shared) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ }
+ } else {
+ if (ilock_shared)
+ inode_lock_shared(inode);
+ else
+ inode_lock(inode);
+ }
+
+ /* Fallback to buffered I/O if the inode does not support direct I/O. */
+ if (!ext4_dio_supported(inode)) {
+ if (ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+ return ext4_buffered_write_iter(iocb, from);
+ }
+
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+ if (ret <= 0)
+ return ret;
+
+ /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
+ if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ offset = iocb->ki_pos;
+ count = ret;
+
+ /*
+ * Unaligned direct IO must be serialized among each other as zeroing
+ * of partial blocks of two competing unaligned IOs can result in data
+ * corruption.
+ *
+ * So we make sure we don't allow any unaligned IO in flight.
+ * For IOs where we need not wait (like unaligned non-AIO DIO),
+ * below inode_dio_wait() may anyway become a no-op, since we start
+ * with exclusive lock.
+ */
+ if (unaligned_io)
+ inode_dio_wait(inode);
+
+ if (extend) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ ext4_fc_start_update(inode);
+ ret = ext4_orphan_add(handle, inode);
+ ext4_fc_stop_update(inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+
+ ext4_journal_stop(handle);
+ }
+
+ if (ilock_shared)
+ iomap_ops = &ext4_iomap_overwrite_ops;
+ ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_io || extend);
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+ if (ilock_shared)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+
+ if (ret >= 0 && iov_iter_count(from)) {
+ ssize_t err;
+ loff_t endbyte;
+
+ offset = iocb->ki_pos;
+ err = ext4_buffered_write_iter(iocb, from);
+ if (err < 0)
+ return err;
+
+ /*
+ * We need to ensure that the pages within the page cache for
+ * the range covered by this I/O are written to disk and
+ * invalidated. This is in attempt to preserve the expected
+ * direct I/O semantics in the case we fallback to buffered I/O
+ * to complete off the I/O request.
+ */
+ ret += err;
+ endbyte = offset + err - 1;
+ err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+ offset, endbyte);
+ if (!err)
+ invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+ offset >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ }
+
+ return ret;
+}
+
#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ size_t count;
+ loff_t offset;
+ handle_t *handle;
+ bool extend = false;
+ struct inode *inode = file_inode(iocb->ki_filp);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))
@@ -197,17 +626,35 @@
} else {
inode_lock(inode);
}
+
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
- ret = file_remove_privs(iocb->ki_filp);
- if (ret)
- goto out;
- ret = file_update_time(iocb->ki_filp);
- if (ret)
- goto out;
+
+ offset = iocb->ki_pos;
+ count = iov_iter_count(from);
+
+ if (offset + count > EXT4_I(inode)->i_disksize) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+
+ extend = true;
+ ext4_journal_stop(handle);
+ }
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
inode_unlock(inode);
if (ret > 0)
@@ -220,10 +667,6 @@
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
- int o_direct = iocb->ki_flags & IOCB_DIRECT;
- int unaligned_aio = 0;
- int overwrite = 0;
- ssize_t ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
@@ -232,59 +675,10 @@
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
-
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- inode_lock(inode);
- }
-
- ret = ext4_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
-
- /*
- * Unaligned direct AIO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned AIOs can result in data
- * corruption.
- */
- if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb) &&
- ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
- unaligned_aio = 1;
- ext4_unwritten_wait(inode);
- }
-
- iocb->private = &overwrite;
- /* Check whether we do a DIO overwrite or not */
- if (o_direct && !unaligned_aio) {
- if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
- if (ext4_should_dioread_nolock(inode))
- overwrite = 1;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
- }
-
- ret = __generic_file_write_iter(iocb, from);
- /*
- * Unaligned direct AIO must be the only IO in flight. Otherwise
- * overlapping aligned IO after unaligned might result in data
- * corruption.
- */
- if (ret == -EIOCBQUEUED && unaligned_aio)
- ext4_unwritten_wait(inode);
- inode_unlock(inode);
-
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-
- return ret;
-
-out:
- inode_unlock(inode);
- return ret;
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_write_iter(iocb, from);
+ else
+ return ext4_buffered_write_iter(iocb, from);
}
#ifdef CONFIG_FS_DAX
@@ -403,13 +797,13 @@
handle_t *handle;
int err;
- if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+ if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
/*
* Sample where the filesystem has been mounted and
* store it in the superblock for sysadmin convenience
@@ -442,7 +836,7 @@
return err;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
@@ -471,7 +865,7 @@
return ret;
}
- filp->f_mode |= FMODE_NOWAIT;
+ filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
return dquot_file_open(inode, filp);
}
@@ -496,12 +890,14 @@
maxbytes, i_size_read(inode));
case SEEK_HOLE:
inode_lock_shared(inode);
- offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_hole(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
case SEEK_DATA:
inode_lock_shared(inode);
- offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_data(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
}
@@ -515,6 +911,7 @@
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 37347ba..4493ef0 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -280,7 +280,7 @@
/* Fabricate an rmap entry for the external log device. */
irec.fmr_physical = journal->j_blk_offset;
- irec.fmr_length = journal->j_maxlen;
+ irec.fmr_length = journal->j_total_len;
irec.fmr_owner = EXT4_FMR_OWN_LOG;
irec.fmr_flags = 0;
@@ -354,8 +354,8 @@
/* Compare two fsmap items. */
static int ext4_getfsmap_compare(void *priv,
- struct list_head *a,
- struct list_head *b)
+ const struct list_head *a,
+ const struct list_head *b)
{
struct ext4_fsmap *fa;
struct ext4_fsmap *fb;
@@ -574,8 +574,8 @@
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev &&
+ fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
return true;
return false;
}
@@ -645,9 +645,9 @@
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 8a28d47..a42ca95 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -78,6 +78,43 @@
return ret;
}
+static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ int ret, err;
+
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (!(inode->i_state & I_DIRTY_ALL))
+ return ret;
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ return ret;
+
+ err = sync_inode_metadata(inode, 1);
+ if (!ret)
+ ret = err;
+
+ if (!ret)
+ ret = ext4_sync_parent(inode);
+ if (test_opt(inode->i_sb, BARRIER))
+ *needs_barrier = true;
+
+ return ret;
+}
+
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ *needs_barrier = true;
+
+ return ext4_fc_commit(journal, commit_tid);
+}
+
/*
* akpm: A new design for ext4_sync_file().
*
@@ -89,17 +126,14 @@
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*/
-
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0, err;
- tid_t commit_tid;
bool needs_barrier = false;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -109,23 +143,15 @@
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
- if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))
ret = -EROFS;
goto out;
}
- if (!journal) {
- ret = __generic_file_fsync(file, start, end, datasync);
- if (!ret)
- ret = ext4_sync_parent(inode);
- if (test_opt(inode->i_sb, BARRIER))
- goto issue_flush;
- goto out;
- }
-
ret = file_write_and_wait_range(file, start, end);
if (ret)
- return ret;
+ goto out;
+
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -140,19 +166,15 @@
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
+ if (!sbi->s_journal)
+ ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
+ else if (ext4_should_journal_data(inode))
ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
+ else
+ ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
- commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(journal, commit_tid))
- needs_barrier = true;
- ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- issue_flush:
- err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL);
if (!ret)
ret = err;
}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3e13379..a92eb79 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -233,7 +233,7 @@
break;
case DX_HASH_HALF_MD4_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
@@ -247,7 +247,7 @@
break;
case DX_HASH_TEA_UNSIGNED:
str2hashbuf = str2hashbuf_unsigned;
- /* fall through */
+ fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
@@ -275,7 +275,7 @@
struct dx_hash_info *hinfo)
{
#ifdef CONFIG_UNICODE
- const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
+ const struct unicode_map *um = dir->i_sb->s_encoding;
int r, dlen;
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 83846cc..875af32 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -82,7 +82,12 @@
struct buffer_head *bh)
{
ext4_fsblk_t blk;
- struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_group_info *grp;
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return 0;
+
+ grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return 0;
@@ -94,7 +99,8 @@
goto verified;
blk = ext4_inode_bitmap(sb, desc);
if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8)) {
+ EXT4_INODES_PER_GROUP(sb) / 8) ||
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, blk);
@@ -112,7 +118,7 @@
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
*
- * Return buffer_head of bitmap on success or NULL.
+ * Return buffer_head of bitmap on success, or an ERR_PTR on error.
*/
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
@@ -188,15 +194,13 @@
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- bh->b_end_io = ext4_end_bitmap_read;
- get_bh(bh);
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
- wait_on_buffer(bh);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
+ ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
if (!buffer_uptodate(bh)) {
put_bh(bh);
- ext4_error(sb, "Cannot read inode bitmap - "
- "block_group = %u, inode_bitmap = %llu",
- block_group, bitmap_blk);
+ ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
+ "block_group = %u, inode_bitmap = %llu",
+ block_group, bitmap_blk);
ext4_mark_group_bitmap_corrupted(sb, block_group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EIO);
@@ -282,15 +286,17 @@
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
- grp = ext4_get_group_info(sb, block_group);
if (IS_ERR(bitmap_bh)) {
fatal = PTR_ERR(bitmap_bh);
bitmap_bh = NULL;
goto error_return;
}
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
- fatal = -EFSCORRUPTED;
- goto error_return;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
+ goto error_return;
+ }
}
BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -316,14 +322,16 @@
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
- percpu_counter_dec(&sbi->s_dirs_counter);
+ if (percpu_counter_initialized(&sbi->s_dirs_counter))
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
struct flex_groups *fg;
@@ -708,22 +716,198 @@
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
struct buffer_head *bitmap, unsigned long *ino)
{
+ bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL;
+ unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb);
+
next:
*ino = ext4_find_next_zero_bit((unsigned long *)
bitmap->b_data,
EXT4_INODES_PER_GROUP(sb), *ino);
if (*ino >= EXT4_INODES_PER_GROUP(sb))
- return 0;
+ goto not_found;
- if ((EXT4_SB(sb)->s_journal == NULL) &&
- recently_deleted(sb, group, *ino)) {
+ if (check_recently_deleted && recently_deleted(sb, group, *ino)) {
+ recently_deleted_ino = *ino;
*ino = *ino + 1;
if (*ino < EXT4_INODES_PER_GROUP(sb))
goto next;
+ goto not_found;
+ }
+ return 1;
+not_found:
+ if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb))
return 0;
+ /*
+ * Not reusing recently deleted inodes is mostly a preference. We don't
+ * want to report ENOSPC or skew allocation patterns because of that.
+ * So return even recently deleted inode if we could find better in the
+ * given range.
+ */
+ *ino = recently_deleted_ino;
+ return 1;
+}
+
+int ext4_mark_inode_used(struct super_block *sb, int ino)
+{
+ unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+ struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
+ struct ext4_group_desc *gdp;
+ ext4_group_t group;
+ int bit;
+ int err = -EFSCORRUPTED;
+
+ if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
+ goto out;
+
+ group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ if (IS_ERR(inode_bitmap_bh))
+ return PTR_ERR(inode_bitmap_bh);
+
+ if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
+ err = 0;
+ goto out;
}
- return 1;
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+ if (!gdp || !group_desc_bh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ext4_set_bit(bit, inode_bitmap_bh->b_data);
+
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ err = sync_dirty_buffer(inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
+ /* We may have to initialize the block bitmap if it isn't already */
+ if (ext4_has_group_desc_csum(sb) &&
+ gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ struct buffer_head *block_bitmap_bh;
+
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
+ goto out;
+ }
+
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
+ sync_dirty_buffer(block_bitmap_bh);
+
+ /* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, group, gdp,
+ block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
+
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+ }
+
+ /* Update the relevant bg descriptor fields */
+ if (ext4_has_group_desc_csum(sb)) {
+ int free;
+
+ ext4_lock_group(sb, group); /* while we modify the bg desc */
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ */
+ if (bit >= free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - bit - 1));
+ } else {
+ ext4_lock_group(sb, group);
+ }
+
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (ext4_has_group_desc_csum(sb)) {
+ ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+ EXT4_INODES_PER_GROUP(sb) / 8);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+
+ ext4_unlock_group(sb, group);
+ err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
+ sync_dirty_buffer(group_desc_bh);
+out:
+ return err;
+}
+
+static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
+ bool encrypt)
+{
+ struct super_block *sb = dir->i_sb;
+ int nblocks = 0;
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never more than 1k.
+ * In practice they are under 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */,
+ NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ return nblocks;
}
/*
@@ -755,8 +939,8 @@
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
- struct ext4_group_info *grp;
- int encrypt = 0;
+ struct ext4_group_info *grp = NULL;
+ bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -768,59 +952,6 @@
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
- !(i_flags & EXT4_EA_INODE_FL)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return ERR_PTR(err);
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-ENOKEY);
- encrypt = 1;
- }
-
- if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p))
- return ERR_CAST(p);
- if (p) {
- int acl_size = p->a_count * sizeof(ext4_acl_entry);
-
- nblocks += (S_ISDIR(mode) ? 2 : 1) *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, acl_size,
- true /* is_create */);
- posix_acl_release(p);
- }
-#endif
-
-#ifdef CONFIG_SECURITY
- {
- int num_security_xattrs = 1;
-
-#ifdef CONFIG_INTEGRITY
- num_security_xattrs++;
-#endif
- /*
- * We assume that security xattrs are never
- * more than 1k. In practice they are under
- * 128 bytes.
- */
- nblocks += num_security_xattrs *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, 1024,
- true /* is_create */);
- }
-#endif
- if (encrypt)
- nblocks += __ext4_xattr_set_credits(sb,
- NULL /* inode */, NULL /* block_bh */,
- FSCRYPT_SET_CONTEXT_MAX_SIZE,
- true /* is_create */);
- }
-
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
@@ -850,10 +981,25 @@
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+ if (!(i_flags & EXT4_EA_INODE_FL)) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto out;
+ }
+
err = dquot_initialize(inode);
if (err)
goto out;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+ ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
+ if (ret2 < 0) {
+ err = ret2;
+ goto out;
+ }
+ nblocks += ret2;
+ }
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -893,15 +1039,21 @@
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
- grp = ext4_get_group_info(sb, group);
- /* Skip groups with already-known suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
- goto next_group;
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ /*
+ * Skip groups with already-known suspicious inode
+ * tables
+ */
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ goto next_group;
+ }
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+ if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
@@ -920,11 +1072,11 @@
goto next_group;
}
- if (!handle) {
+ if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks,
- 0);
+ handle_type, nblocks, 0,
+ ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
@@ -1024,9 +1176,15 @@
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_group_info *grp = NULL;
- down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ grp = ext4_get_group_info(sb, group);
+ down_read(&grp->alloc_sem); /*
+ * protect vs itable
+ * lazyinit
+ */
+ }
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
@@ -1042,7 +1200,8 @@
if (ino > free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
- up_read(&grp->alloc_sem);
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
+ up_read(&grp->alloc_sem);
} else {
ext4_lock_group(sb, group);
}
@@ -1100,7 +1259,7 @@
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
@@ -1133,7 +1292,8 @@
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
- if (ext4_has_feature_inline_data(sb))
+ if (ext4_has_feature_inline_data(sb) &&
+ (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1146,7 +1306,7 @@
* prevent its deduplication.
*/
if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
+ err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
@@ -1227,8 +1387,10 @@
inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
- ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
- ino, err);
+ ext4_error_err(sb, -err,
+ "couldn't read orphan inode %lu (err %d)",
+ ino, err);
+ brelse(bitmap_bh);
return inode;
}
@@ -1439,7 +1601,7 @@
if (ret < 0)
goto err_out;
if (barrier)
- blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+ blkdev_issue_flush(sb->s_bdev, GFP_NOFS);
skip_zeroout:
ext4_lock_group(sb, group);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36699a1..05efa68 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -163,7 +163,7 @@
}
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL) < 0) {
put_bh(bh);
goto failure;
}
@@ -331,11 +331,14 @@
for (i = 0; i <= indirect_blks; i++) {
if (i == indirect_blks) {
new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
- } else
+ } else {
ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
ar->inode, ar->goal,
ar->flags & EXT4_MB_DELALLOC_RESERVED,
NULL, &err);
+ /* Simplify error cleanup... */
+ branch[i+1].bh = NULL;
+ }
if (err) {
i--;
goto failed;
@@ -377,18 +380,25 @@
}
return 0;
failed:
+ if (i == indirect_blks) {
+ /* Free data blocks */
+ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+ ar->len, 0);
+ i--;
+ }
for (; i >= 0; i--) {
/*
* We want to ext4_forget() only freshly allocated indirect
- * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
- * buffer at branch[0].bh is indirect block / inode already
- * existing before ext4_alloc_branch() was called.
+ * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
+ * (buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called). Also
+ * because blocks are freshly allocated, we don't need to
+ * revoke them which is why we don't set
+ * EXT4_FREE_BLOCKS_METADATA.
*/
- if (i > 0 && i != indirect_blks && branch[i].bh)
- ext4_forget(handle, 1, ar->inode, branch[i].bh,
- branch[i].bh->b_blocknr);
- ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
- (i == indirect_blks) ? ar->len : 1, 0);
+ ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
+ new_blocks[i], 1,
+ branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
}
return err;
}
@@ -457,7 +467,9 @@
/*
* OK, we spliced it into the inode itself on a direct block.
*/
- ext4_mark_inode_dirty(handle, ar->inode);
+ err = ext4_mark_inode_dirty(handle, ar->inode);
+ if (unlikely(err))
+ goto err_out;
jbd_debug(5, "splicing direct\n");
}
return err;
@@ -581,7 +593,8 @@
if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto out;
}
/* Set up for the direct block allocation */
@@ -650,32 +663,6 @@
}
/*
- * Calculate the number of metadata blocks need to reserve
- * to allocate a new block at @lblocks for non extent file based file
- */
-int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
- int blk_bits;
-
- if (lblock < EXT4_NDIR_BLOCKS)
- return 0;
-
- lblock -= EXT4_NDIR_BLOCKS;
-
- if (ei->i_da_metadata_calc_len &&
- (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
- ei->i_da_metadata_calc_len++;
- return 0;
- }
- ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
- ei->i_da_metadata_calc_len = 1;
- blk_bits = order_base_2(lblock);
- return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
-}
-
-/*
* Calculate number of indirect blocks touched by mapping @nrblocks logically
* contiguous blocks
*/
@@ -689,27 +676,63 @@
return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
+static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int *dropped)
+{
+ int err;
+
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ return err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ return err;
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode, 0);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
+
/*
* Truncate transactions can be complex and absolutely huge. So we need to
* be able to restart the transaction at a conventient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * extend fails, we restart transaction.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_ind_truncate_ensure_credits(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh,
+ int revoke_creds)
{
- if (!ext4_handle_valid(handle))
- return 0;
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
- return 0;
- return 1;
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_blocks_for_truncate(inode), revoke_creds,
+ ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (ret <= 0)
+ return ret;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(ret))
+ return ret;
+ }
+ return 0;
}
/*
@@ -836,35 +859,17 @@
else if (ext4_should_journal_data(inode))
flags |= EXT4_FREE_BLOCKS_FORGET;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
- count)) {
+ if (!ext4_inode_block_valid(inode, block_to_free, count)) {
EXT4_ERROR_INODE(inode, "attempt to clear invalid "
"blocks %llu len %lu",
(unsigned long long) block_to_free, count);
return 1;
}
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (unlikely(err))
- goto out_err;
- }
- err = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(err))
- goto out_err;
- err = ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- if (unlikely(err))
- goto out_err;
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (unlikely(err))
- goto out_err;
- }
- }
+ err = ext4_ind_truncate_ensure_credits(handle, inode, bh,
+ ext4_free_data_revoke_credits(inode, count));
+ if (err < 0)
+ goto out_err;
for (p = first; p < last; p++)
*p = 0;
@@ -999,8 +1004,7 @@
if (!nr)
continue; /* A hole */
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
- nr, 1)) {
+ if (!ext4_inode_block_valid(inode, nr, 1)) {
EXT4_ERROR_INODE(inode,
"invalid indirect mapped "
"block %lu (level %d)",
@@ -1009,14 +1013,14 @@
}
/* Go read the buffer for the next level down */
- bh = sb_bread(inode->i_sb, nr);
+ bh = ext4_sb_bread(inode->i_sb, nr, 0);
/*
* A read failure? Report error and clear slot
* (should be rare).
*/
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, nr,
+ if (IS_ERR(bh)) {
+ ext4_error_inode_block(inode, nr, -PTR_ERR(bh),
"Read failure");
continue;
}
@@ -1030,7 +1034,7 @@
brelse(bh);
/*
- * Everything below this this pointer has been
+ * Everything below this pointer has been
* released. Now let this top-of-subtree go.
*
* We want the freeing of this indirect block to be
@@ -1047,11 +1051,11 @@
*/
if (ext4_handle_is_aborted(handle))
return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- }
+ if (ext4_ind_truncate_ensure_credits(handle, inode,
+ NULL,
+ ext4_free_metadata_revoke_credits(
+ inode->i_sb, 1)) < 0)
+ return;
/*
* The forget flag here is critical because if
@@ -1177,21 +1181,21 @@
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
nr = i_data[EXT4_DIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
nr = i_data[EXT4_TIND_BLOCK];
if (nr) {
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
@@ -1431,7 +1435,7 @@
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
i_data[EXT4_IND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_IND_BLOCK:
if (++n >= n2)
break;
@@ -1440,7 +1444,7 @@
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
i_data[EXT4_DIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_DIND_BLOCK:
if (++n >= n2)
break;
@@ -1449,7 +1453,7 @@
ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
i_data[EXT4_TIND_BLOCK] = 0;
}
- /* fall through */
+ fallthrough;
case EXT4_TIND_BLOCK:
;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 46151bd..ae1f0c5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -98,9 +98,9 @@
error = ext4_get_inode_loc(inode, &iloc);
if (error) {
- ext4_error_inode(inode, __func__, __LINE__, 0,
- "can't get inode location %lu",
- inode->i_ino);
+ ext4_error_inode_err(inode, __func__, __LINE__, 0, -error,
+ "can't get inode location %lu",
+ inode->i_ino);
return 0;
}
@@ -276,7 +276,7 @@
len = 0;
}
- /* Insert the the xttr entry. */
+ /* Insert the xttr entry. */
i.value = value;
i.value_len = len;
@@ -354,7 +354,7 @@
if (error)
goto out;
- /* Update the xttr entry. */
+ /* Update the xattr entry. */
i.value = value;
i.value_len = len;
@@ -733,18 +733,13 @@
void *kaddr;
struct ext4_iloc iloc;
- if (unlikely(copied < len)) {
- if (!PageUptodate(page)) {
- copied = 0;
- goto out;
- }
- }
+ if (unlikely(copied < len) && !PageUptodate(page))
+ return 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret) {
ext4_std_error(inode->i_sb, ret);
- copied = 0;
- goto out;
+ return ret;
}
ext4_write_lock_xattr(inode, &no_expand);
@@ -757,7 +752,7 @@
(void) ext4_find_inline_data_nolock(inode);
kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
kunmap_atomic(kaddr);
SetPageUptodate(page);
/* clear page dirty so that writepages wouldn't work for us. */
@@ -766,7 +761,7 @@
ext4_write_unlock_xattr(inode, &no_expand);
brelse(iloc.bh);
mark_inode_dirty(inode);
-out:
+
return copied;
}
@@ -855,7 +850,7 @@
/*
* Prepare the write for the inline data.
- * If the the data can be written into the inode, we just read
+ * If the data can be written into the inode, we just read
* the page and make it uptodate, and start the journal.
* Otherwise read the page, makes it dirty so that it can be
* handle in writepages(the i_disksize update is left to the
@@ -1125,7 +1120,15 @@
struct ext4_iloc *iloc,
void *buf, int inline_size)
{
- ext4_create_inline_data(handle, inode, inline_size);
+ int ret;
+
+ ret = ext4_create_inline_data(handle, inode, inline_size);
+ if (ret) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ return;
+ }
ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
@@ -1266,7 +1269,7 @@
int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode)
{
- int ret, inline_size, no_expand;
+ int ret, ret2, inline_size, no_expand;
void *inline_start;
struct ext4_iloc iloc;
@@ -1320,7 +1323,9 @@
out:
ext4_write_unlock_xattr(dir, &no_expand);
- ext4_mark_inode_dirty(handle, dir);
+ ret2 = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
brelse(iloc.bh);
return ret;
}
@@ -1710,7 +1715,7 @@
if (err)
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ err = ext4_generic_delete_entry(dir, de_del, bh,
inline_start, inline_size, 0);
if (err)
goto out;
@@ -1767,8 +1772,9 @@
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
- EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
- err, dir->i_ino);
+ EXT4_ERROR_INODE_ERR(dir, -err,
+ "error %d getting inode %lu block",
+ err, dir->i_ino);
return true;
}
@@ -1861,47 +1867,6 @@
return error;
}
-int ext4_inline_data_fiemap(struct inode *inode,
- struct fiemap_extent_info *fieinfo,
- int *has_inline, __u64 start, __u64 len)
-{
- __u64 physical = 0;
- __u64 inline_len;
- __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
- FIEMAP_EXTENT_LAST;
- int error = 0;
- struct ext4_iloc iloc;
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- *has_inline = 0;
- goto out;
- }
- inline_len = min_t(size_t, ext4_get_inline_size(inode),
- i_size_read(inode));
- if (start >= inline_len)
- goto out;
- if (start + len < inline_len)
- inline_len = start + len;
- inline_len -= start;
-
- error = ext4_get_inode_loc(inode, &iloc);
- if (error)
- goto out;
-
- physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
- physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
- physical += offsetof(struct ext4_inode, i_block);
-
- brelse(iloc.bh);
-out:
- up_read(&EXT4_I(inode)->xattr_sem);
- if (physical)
- error = fiemap_fill_next_extent(fieinfo, start, physical,
- inline_len, flags);
- return (error < 0 ? error : 0);
-}
-
int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
{
handle_t *handle;
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
new file mode 100644
index 0000000..d62d802
--- /dev/null
+++ b/fs/ext4/inode-test.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 inode that verify the seconds part of [a/c/m]
+ * timestamps in ext4 inode structs are decoded correctly.
+ */
+
+#include <kunit/test.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+
+#include "ext4.h"
+
+/*
+ * For constructing the nonnegative timestamp lower bound value.
+ * binary: 00000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_0 0L
+/*
+ * For constructing the nonnegative timestamp upper bound value.
+ * binary: 01111111 11111111 11111111 11111111
+ *
+ */
+#define UPPER_MSB_0 0x7fffffffL
+/*
+ * For constructing the negative timestamp lower bound value.
+ * binary: 10000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */
+/*
+ * For constructing the negative timestamp upper bound value.
+ * binary: 11111111 11111111 11111111 11111111
+ */
+#define UPPER_MSB_1 (-1L)
+/*
+ * Upper bound for nanoseconds value supported by the encoding.
+ * binary: 00111111 11111111 11111111 11111111
+ */
+#define MAX_NANOSECONDS ((1L << 30) - 1)
+
+#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x"
+
+#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits"
+#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits"
+#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits"
+#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits"
+#define LOWER_BOUND_NEG_LO_1_CASE\
+ "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NEG_LO_1_CASE\
+ "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NONNEG_LO_1_CASE\
+ "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NONNEG_LO_1_CASE\
+ "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NEG_HI_1_CASE\
+ "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NEG_HI_1_CASE\
+ "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on"
+#define LOWER_BOUND_NONNEG_HI_1_CASE\
+ "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_CASE\
+ "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\
+ "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 1 ns"
+#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\
+ "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns"
+#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on"
+#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on"
+
+struct timestamp_expectation {
+ const char *test_case_name;
+ struct timespec64 expected;
+ u32 extra_bits;
+ bool msb_set;
+ bool lower_bound;
+};
+
+static time64_t get_32bit_time(const struct timestamp_expectation * const test)
+{
+ if (test->msb_set) {
+ if (test->lower_bound)
+ return LOWER_MSB_1;
+
+ return UPPER_MSB_1;
+ }
+
+ if (test->lower_bound)
+ return LOWER_MSB_0;
+ return UPPER_MSB_0;
+}
+
+
+/*
+ * Test data is derived from the table in the Inode Timestamps section of
+ * Documentation/filesystems/ext4/inodes.rst.
+ */
+static void inode_test_xtimestamp_decoding(struct kunit *test)
+{
+ const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+ };
+
+ struct timespec64 timestamp;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_data); ++i) {
+ timestamp.tv_sec = get_32bit_time(&test_data[i]);
+ ext4_decode_extra_time(×tamp,
+ cpu_to_le32(test_data[i].extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_data[i].expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_data[i].test_case_name,
+ test_data[i].msb_set,
+ test_data[i].lower_bound,
+ test_data[i].extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_data[i].expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_data[i].test_case_name,
+ test_data[i].msb_set,
+ test_data[i].lower_bound,
+ test_data[i].extra_bits);
+ }
+}
+
+static struct kunit_case ext4_inode_test_cases[] = {
+ KUNIT_CASE(inode_test_xtimestamp_decoding),
+ {}
+};
+
+static struct kunit_suite ext4_inode_test_suite = {
+ .name = "ext4_inode_test",
+ .test_cases = ext4_inode_test_cases,
+};
+
+kunit_test_suites(&ext4_inode_test_suite);
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1429d01..d59474a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -48,8 +48,6 @@
#include <trace/events/ext4.h>
-#define MPAGE_DA_EXTENT_TAIL 0x01
-
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
@@ -103,8 +101,8 @@
return provided == calculated;
}
-static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
- struct ext4_inode_info *ei)
+void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+ struct ext4_inode_info *ei)
{
__u32 csum;
@@ -164,32 +162,6 @@
}
/*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
- int nblocks)
-{
- int ret;
-
- /*
- * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
- * moment, get_block can be called only for blocks inside i_size since
- * page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
- */
- BUG_ON(EXT4_JOURNAL(inode) == NULL);
- jbd_debug(2, "restarting handle %p\n", handle);
- up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, nblocks);
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- return ret;
-}
-
-/*
* Called at the last iput() if i_nlink is zero.
*/
void ext4_evict_inode(struct inode *inode)
@@ -250,6 +222,16 @@
truncate_inode_pages_final(&inode->i_data);
/*
+ * For inodes with journalled data, transaction commit could have
+ * dirtied the inode. Flush worker is ignoring it because of I_FREEING
+ * flag but we still need to remove the inode from the writeback lists.
+ */
+ if (!list_empty_careful(&inode->i_io_list)) {
+ WARN_ON_ONCE(!ext4_should_journal_data(inode));
+ inode_io_list_del(inode);
+ }
+
+ /*
* Protect us against freezing - iput() caller didn't have to have any
* protection against it. When we are in a running transaction though,
* we are already protected against freezing and we cannot grab further
@@ -304,9 +286,9 @@
if (inode->i_blocks) {
err = ext4_truncate(inode);
if (err) {
- ext4_error(inode->i_sb,
- "couldn't truncate inode %lu (err %d)",
- inode->i_ino, err);
+ ext4_error_err(inode->i_sb, -err,
+ "couldn't truncate inode %lu (err %d)",
+ inode->i_ino, err);
goto stop_handle;
}
}
@@ -354,6 +336,8 @@
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ if (!list_empty(&EXT4_I(inode)->i_fc_list))
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@@ -410,7 +394,7 @@
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
}
static int __check_block_validity(struct inode *inode, const char *func,
@@ -421,8 +405,7 @@
(inode->i_ino ==
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
return 0;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
- map->m_len)) {
+ if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
"(length %d)", (unsigned long) map->m_lblk,
@@ -437,7 +420,7 @@
{
int ret;
- if (IS_ENCRYPTED(inode))
+ if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
return fscrypt_zeroout_range(inode, lblk, pblk, len);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
@@ -469,11 +452,9 @@
*/
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
up_read((&EXT4_I(inode)->i_data_sem));
@@ -530,9 +511,8 @@
#endif
map->m_flags = 0;
- ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
- "logical block %lu\n", inode->i_ino, flags, map->m_len,
- (unsigned long) map->m_lblk);
+ ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
+ flags, map->m_len, (unsigned long) map->m_lblk);
/*
* ext4_map_blocks returns an int, and m_len is an unsigned int
@@ -545,7 +525,8 @@
return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
- if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -578,11 +559,9 @@
*/
down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- retval = ext4_ext_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
- retval = ext4_ind_map_blocks(handle, inode, map, flags &
- EXT4_GET_BLOCKS_KEEP_SIZE);
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
if (retval > 0) {
unsigned int status;
@@ -763,6 +742,12 @@
return ret;
}
}
+ if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+ map->m_flags & EXT4_MAP_MAPPED))
+ ext4_fc_track_range(handle, inode, map->m_lblk,
+ map->m_lblk + map->m_len - 1);
+ if (retval < 0)
+ ext_debug(inode, "failed with err %d\n", retval);
return retval;
}
@@ -845,136 +830,6 @@
#define DIO_MAX_BLOCKS 4096
/*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int flags)
-{
- int dio_credits;
- handle_t *handle;
- int retries = 0;
- int ret;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
- bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
- dio_credits = ext4_chunk_trans_blocks(inode,
- bh_result->b_size >> inode->i_blkbits);
-retry:
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = _ext4_get_block(inode, iblock, bh_result, flags);
- ext4_journal_stop(handle);
-
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- if (!create)
- return _ext4_get_block(inode, iblock, bh, 0);
- return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * When doing DIO using unwritten extents, we need io_end to convert
- * unwritten extents to written on IO completion. We allocate io_end
- * once we spot unwritten extent and store it in b_private. Generic
- * DIO code keeps b_private set and furthermore passes the value to
- * our completion callback in 'private' argument.
- */
- if (!ret && buffer_unwritten(bh_result)) {
- if (!bh_result->b_private) {
- ext4_io_end_t *io_end;
-
- io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!io_end)
- return -ENOMEM;
- bh_result->b_private = io_end;
- ext4_set_io_unwritten_flag(inode, io_end);
- }
- set_buffer_defer_completion(bh_result);
- }
-
- return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * Mark inode as having pending DIO writes to unwritten extents.
- * ext4_direct_IO_write() checks this flag and converts extents to
- * written.
- */
- if (!ret && buffer_unwritten(bh_result))
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
- return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
-
- ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
- inode->i_ino, create);
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
- return ret;
-}
-
-
-/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -985,7 +840,8 @@
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
int err;
- J_ASSERT(handle != NULL || create == 0);
+ J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || handle != NULL || create == 0);
map.m_lblk = block;
map.m_len = 1;
@@ -1001,7 +857,8 @@
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ || (handle != NULL));
/*
* Now that we do not always journal data, we should
@@ -1038,18 +895,20 @@
ext4_lblk_t block, int map_flags)
{
struct buffer_head *bh;
+ int ret;
bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
}
/* Read a contiguous batch of blocks. */
@@ -1070,8 +929,7 @@
for (i = 0; i < bh_count; i++)
/* Note that NULL bhs[i] is valid because of holes. */
if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
- &bhs[i]);
+ ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
if (!wait)
return 0;
@@ -1241,7 +1099,7 @@
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
+ ext4_read_bh_lock(bh, 0, false);
wait[nr_wait++] = bh;
}
}
@@ -1255,7 +1113,7 @@
}
if (unlikely(err)) {
page_zero_new_buffers(page, from, to);
- } else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
+ } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) {
int err2;
@@ -1439,6 +1297,7 @@
goto errout;
}
copied = ret;
+ ret = 0;
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
@@ -1463,15 +1322,16 @@
* filesystems.
*/
if (i_size_changed || inline_data)
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
+errout:
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
* inode->i_size. So truncate them
*/
ext4_orphan_add(handle, inode);
-errout:
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1554,6 +1414,7 @@
goto errout;
}
copied = ret;
+ ret = 0;
} else if (unlikely(copied < len) && !PageUptodate(page)) {
copied = 0;
ext4_journalled_zero_new_buffers(handle, page, from, to);
@@ -1583,6 +1444,7 @@
ret = ret2;
}
+errout:
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
@@ -1590,7 +1452,6 @@
*/
ext4_orphan_add(handle, inode);
-errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
@@ -1693,6 +1554,7 @@
struct ext4_map_blocks map;
struct ext4_io_submit io_submit; /* IO submission data */
unsigned int do_map:1;
+ unsigned int scanned_until_end:1;
};
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
@@ -1708,6 +1570,7 @@
if (mpd->first_page >= mpd->next_page)
return;
+ mpd->scanned_until_end = 0;
index = mpd->first_page;
end = mpd->next_page - 1;
if (invalidate) {
@@ -1782,6 +1645,7 @@
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
bool allocated = false;
+ bool reserved = false;
/*
* If the cluster containing lblk is shared with a delayed,
@@ -1798,6 +1662,7 @@
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else { /* bigalloc */
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
if (!ext4_es_scan_clu(inode,
@@ -1810,6 +1675,7 @@
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
goto errout;
+ reserved = true;
} else {
allocated = true;
}
@@ -1820,6 +1686,8 @@
}
ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
+ if (ret && reserved)
+ ext4_da_release_space(inode, 1);
errout:
return ret;
@@ -1848,8 +1716,7 @@
invalid_block = ~0;
map->m_flags = 0;
- ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
- "logical block %lu\n", inode->i_ino, map->m_len,
+ ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
/* Lookup extent status tree firstly */
@@ -2071,6 +1938,9 @@
}
if (ret == 0)
ret = err;
+ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+ if (ret == 0)
+ ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
err = ext4_journal_stop(handle);
if (!ret)
@@ -2245,7 +2115,7 @@
return err;
}
-#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
+#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
/*
* mballoc gives us at most this number of blocks...
@@ -2355,7 +2225,84 @@
if (err < 0)
return err;
}
- return lblk < blocks;
+ if (lblk >= blocks) {
+ mpd->scanned_until_end = 1;
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * mpage_process_page - update page buffers corresponding to changed extent and
+ * may submit fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ * mapping or not.
+ * Scan given page buffers corresponding to changed extent and update buffer
+ * state according to new extent state.
+ * We map delalloc buffers to their physical location, clear unwritten bits.
+ * If the given page is not fully mapped, we update @map to the next extent in
+ * the given page that needs mapping & return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+ ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+ bool *map_bh)
+{
+ struct buffer_head *head, *bh;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ ext4_lblk_t lblk = *m_lblk;
+ ext4_fsblk_t pblock = *m_pblk;
+ int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+
+ err = mpage_process_page_bufs(mpd, head, bh, lblk);
+ if (err > 0)
+ err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec)) {
+ err = PTR_ERR(io_end_vec);
+ goto out;
+ }
+ io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
+ }
+ *map_bh = true;
+ goto out;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+ *map_bh = false;
+out:
+ *m_lblk = lblk;
+ *m_pblk = pblock;
+ return err;
}
/*
@@ -2377,12 +2324,12 @@
struct pagevec pvec;
int nr_pages, i;
struct inode *inode = mpd->inode;
- struct buffer_head *head, *bh;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
- sector_t pblock;
+ ext4_fsblk_t pblock;
int err;
+ bool map_bh = false;
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2398,50 +2345,19 @@
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- bh = head = page_buffers(page);
- do {
- if (lblk < mpd->map.m_lblk)
- continue;
- if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
- /*
- * Buffer after end of mapped extent.
- * Find next buffer in the page to map.
- */
- mpd->map.m_len = 0;
- mpd->map.m_flags = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
- err = mpage_process_page_bufs(mpd, head,
- bh, lblk);
- pagevec_release(&pvec);
- if (err > 0)
- err = 0;
- return err;
- }
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock++;
- }
- clear_buffer_unwritten(bh);
- } while (lblk++, (bh = bh->b_this_page) != head);
-
+ err = mpage_process_page(mpd, page, &lblk, &pblock,
+ &map_bh);
/*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
+ * If map_bh is true, means page may require further bh
+ * mapping, or maybe the page was submitted for IO.
+ * So we return to call further extent mapping.
*/
- mpd->io_submit.io_end->size += PAGE_SIZE;
+ if (err < 0 || map_bh)
+ goto out;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
- if (err < 0) {
- pagevec_release(&pvec);
- return err;
- }
+ if (err < 0)
+ goto out;
}
pagevec_release(&pvec);
}
@@ -2449,6 +2365,9 @@
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2480,7 +2399,7 @@
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
- if (map->m_flags & (1 << BH_Delay))
+ if (map->m_flags & BIT(BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
@@ -2528,16 +2447,20 @@
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec;
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec))
+ return PTR_ERR(io_end_vec);
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
struct super_block *sb = inode->i_sb;
if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@@ -2594,10 +2517,11 @@
EXT4_I(inode)->i_disksize = disksize;
up_write(&EXT4_I(inode)->i_data_sem);
err2 = ext4_mark_inode_dirty(handle, inode);
- if (err2)
- ext4_error(inode->i_sb,
- "Failed to mark inode %lu dirty",
- inode->i_ino);
+ if (err2) {
+ ext4_error_err(inode->i_sb, -err2,
+ "Failed to mark inode %lu dirty",
+ inode->i_ino);
+ }
if (!err)
err = err2;
}
@@ -2663,7 +2587,7 @@
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag);
if (nr_pages == 0)
- goto out;
+ break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -2718,6 +2642,7 @@
pagevec_release(&pvec);
cond_resched();
}
+ mpd->scanned_until_end = 1;
return 0;
out:
pagevec_release(&pvec);
@@ -2736,7 +2661,6 @@
struct inode *inode = mapping->host;
int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- bool done;
struct blk_plug plug;
bool give_up_on_write = false;
@@ -2770,7 +2694,7 @@
* the stack trace.
*/
if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
- sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
ret = -EROFS;
goto out_writepages;
}
@@ -2822,7 +2746,6 @@
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
- done = false;
blk_start_plug(&plug);
/*
@@ -2832,6 +2755,7 @@
* started.
*/
mpd.do_map = 0;
+ mpd.scanned_until_end = 0;
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
if (!mpd.io_submit.io_end) {
ret = -ENOMEM;
@@ -2847,7 +2771,7 @@
if (ret < 0)
goto unplug;
- while (!done && mpd.first_page <= mpd.last_page) {
+ while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
if (!mpd.io_submit.io_end) {
@@ -2882,26 +2806,15 @@
trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
ret = mpage_prepare_extent_to_map(&mpd);
- if (!ret) {
- if (mpd.map.m_len)
- ret = mpage_map_and_submit_extent(handle, &mpd,
+ if (!ret && mpd.map.m_len)
+ ret = mpage_map_and_submit_extent(handle, &mpd,
&give_up_on_write);
- else {
- /*
- * We scanned the whole range (or exhausted
- * nr_to_write), submitted what was mapped and
- * didn't find anything needing mapping. We are
- * done.
- */
- done = true;
- }
- }
/*
* Caution: If the handle is synchronous,
* ext4_journal_stop() can wait for transaction commit
* to finish which may depend on writeback of pages to
* complete or on page lock to be released. In that
- * case, we have to wait until after after we have
+ * case, we have to wait until after we have
* submitted all the IO, released page locks we hold,
* and dropped io_end reference (for extent conversion
* to be able to complete) before stopping the handle.
@@ -2982,7 +2895,7 @@
percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+ ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
percpu_up_read(&sbi->s_writepages_rwsem);
@@ -3181,37 +3094,39 @@
end = start + copied - 1;
/*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range upto i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize upto i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block (ext4_da_should_update_i_disksize()
+ * check), we need to update i_disksize here as neither
+ * ext4_writepage() nor certain ext4_writepages() paths not
+ * allocating blocks update i_disksize.
+ *
+ * Note that we defer inode dirtying to generic_write_end() /
+ * ext4_da_write_inline_data_end().
*/
new_i_size = pos + copied;
- if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ if (copied && new_i_size > inode->i_size) {
if (ext4_has_inline_data(inode) ||
- ext4_da_should_update_i_disksize(page, end)) {
+ ext4_da_should_update_i_disksize(page, end))
ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
}
if (write_mode != CONVERT_INLINE_DATA &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
ext4_has_inline_data(inode))
- ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ ret = ext4_da_write_inline_data_end(inode, pos, len, copied,
page);
else
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+ ret = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
- copied = ret2;
- if (ret2 < 0)
- ret = ret2;
+ copied = ret;
ret2 = ext4_journal_stop(handle);
- if (!ret)
+ if (unlikely(ret2 && !ret))
ret = ret2;
return ret ? ret : copied;
@@ -3327,7 +3242,7 @@
return 0;
}
- return generic_block_bmap(mapping, block, ext4_get_block);
+ return iomap_bmap(mapping, block, &ext4_iomap_ops);
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -3341,23 +3256,20 @@
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
- false);
+ return ext4_mpage_readpages(inode, NULL, page);
return ret;
}
-static int
-ext4_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static void ext4_readahead(struct readahead_control *rac)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = rac->mapping->host;
- /* If the file has inline data, no need to do readpages. */
+ /* If the file has inline data, no need to do readahead. */
if (ext4_has_inline_data(inode))
- return 0;
+ return;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ ext4_mpage_readpages(inode, rac, NULL);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3406,7 +3318,7 @@
if (PageChecked(page))
return 0;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, page);
else
return try_to_free_buffers(page);
}
@@ -3415,217 +3327,207 @@
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (journal)
- return !jbd2_transaction_committed(journal,
- EXT4_I(inode)->i_datasync_tid);
+ if (journal) {
+ if (jbd2_transaction_committed(journal,
+ EXT4_I(inode)->i_datasync_tid))
+ return false;
+ if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+ return !list_empty(&EXT4_I(inode)->i_fc_list);
+ return true;
+ }
+
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned flags, struct iomap *iomap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+ struct ext4_map_blocks *map, loff_t offset,
+ loff_t length)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int blkbits = inode->i_blkbits;
- unsigned long first_block, last_block;
- struct ext4_map_blocks map;
- bool delalloc = false;
- int ret;
-
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
- return -EINVAL;
- first_block = offset >> blkbits;
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
- EXT4_MAX_LOGICAL_BLOCK);
-
- if (flags & IOMAP_REPORT) {
- if (ext4_has_inline_data(inode)) {
- ret = ext4_inline_data_iomap(inode, iomap);
- if (ret != -EAGAIN) {
- if (ret == 0 && offset >= iomap->length)
- ret = -ENOENT;
- return ret;
- }
- }
- } else {
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
- return -ERANGE;
- }
-
- map.m_lblk = first_block;
- map.m_len = last_block - first_block + 1;
-
- if (flags & IOMAP_REPORT) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
-
- if (ret == 0) {
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map.m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end) {
- /* entire range is a hole */
- } else if (es.es_lblk > map.m_lblk) {
- /* range starts with a hole */
- map.m_len = es.es_lblk - map.m_lblk;
- } else {
- ext4_lblk_t offs = 0;
-
- if (es.es_lblk < map.m_lblk)
- offs = map.m_lblk - es.es_lblk;
- map.m_lblk = es.es_lblk + offs;
- map.m_len = es.es_len - offs;
- delalloc = true;
- }
- }
- } else if (flags & IOMAP_WRITE) {
- int dio_credits;
- handle_t *handle;
- int retries = 0;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
- /*
- * Either we allocate blocks and then we don't get unwritten
- * extent so we have reserved enough credits, or the blocks
- * are already allocated and unwritten and in that case
- * extent conversion fits in the credits as well.
- */
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0) {
- ext4_journal_stop(handle);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
- }
-
- /*
- * If we added blocks beyond i_size, we need to make sure they
- * will get truncated if we crash before updating i_size in
- * ext4_iomap_end(). For faults we don't need to do that (and
- * even cannot because for orphan list operations inode_lock is
- * required) - if we happen to instantiate block beyond i_size,
- * it is because we race with truncate which has already added
- * the inode to the orphan list.
- */
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
- int err;
-
- err = ext4_orphan_add(handle, inode);
- if (err < 0) {
- ext4_journal_stop(handle);
- return err;
- }
- }
- ext4_journal_stop(handle);
- } else {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
- }
+ u8 blkbits = inode->i_blkbits;
/*
* Writes that span EOF might trigger an I/O size update on completion,
- * so consider them to be dirty for the purposes of O_DSYNC, even if
- * there is no other metadata changes being made or are pending here.
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
+ * there is no other metadata changes being made or are pending.
*/
iomap->flags = 0;
if (ext4_inode_datasync_dirty(inode) ||
offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = sbi->s_daxdev;
- iomap->offset = (u64)first_block << blkbits;
- iomap->length = (u64)map.m_len << blkbits;
- if (ret == 0) {
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
- } else {
- if (map.m_flags & EXT4_MAP_MAPPED) {
- iomap->type = IOMAP_MAPPED;
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- iomap->type = IOMAP_UNWRITTEN;
- } else {
- WARN_ON_ONCE(1);
- return -EIO;
- }
- iomap->addr = (u64)map.m_pblk << blkbits;
- }
-
- if (map.m_flags & EXT4_MAP_NEW)
+ if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ iomap->offset = (u64) map->m_lblk << blkbits;
+ iomap->length = (u64) map->m_len << blkbits;
+
+ if ((map->m_flags & EXT4_MAP_MAPPED) &&
+ !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ iomap->flags |= IOMAP_F_MERGED;
+
+ /*
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+ * set. In order for any allocated unwritten extents to be converted
+ * into written extents correctly within the ->end_io() handler, we
+ * need to ensure that the iomap->type is set appropriately. Hence, the
+ * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has
+ * been set first.
+ */
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+ iomap->type = IOMAP_UNWRITTEN;
+ iomap->addr = (u64) map->m_pblk << blkbits;
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (u64) map->m_pblk << blkbits;
+ } else {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ }
+}
+
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int flags)
+{
+ handle_t *handle;
+ u8 blkbits = inode->i_blkbits;
+ int ret, dio_credits, m_flags = 0, retries = 0;
+
+ /*
+ * Trim the mapping request to the maximum value that we can map at
+ * once for direct I/O.
+ */
+ if (map->m_len > DIO_MAX_BLOCKS)
+ map->m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+retry:
+ /*
+ * Either we allocate blocks and then don't get an unwritten extent, so
+ * in that case we have reserved enough credits. Or, the blocks are
+ * already allocated and unwritten. In that case, the extent conversion
+ * fits into the credits as well.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ /*
+ * DAX and direct I/O are the only two operations that are currently
+ * supported with IOMAP_WRITE.
+ */
+ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+ if (IS_DAX(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+ /*
+ * We use i_size instead of i_disksize here because delalloc writeback
+ * can complete at any point during the I/O and subsequently push the
+ * i_disksize out to i_size. This could be beyond where direct I/O is
+ * happening and thus expose allocated blocks to direct I/O reads.
+ */
+ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE;
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+
+ /*
+ * We cannot fill holes in indirect tree based inodes as that could
+ * expose stale data in the case of a crash. Use the magic error code
+ * to fallback to buffered I/O.
+ */
+ if (!m_flags && !ret)
+ ret = -ENOTBLK;
+
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ return ret;
+}
+
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
+
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
+
+ /*
+ * Calculate the first and last logical blocks respectively.
+ */
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ if (flags & IOMAP_WRITE) {
+ /*
+ * We check here if the blocks are already allocated, then we
+ * don't need to start a journal txn and we can directly return
+ * the mapping information. This could boost performance
+ * especially in multi-threaded overwrite requests.
+ */
+ if (offset + length <= i_size_read(inode)) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
+ goto out;
+ }
+ ret = ext4_iomap_alloc(inode, &map, flags);
+ } else {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ }
+
+ if (ret < 0)
+ return ret;
+out:
+ ext4_set_iomap(inode, iomap, &map, offset, length);
+
return 0;
}
+static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ int ret;
+
+ /*
+ * Even for writes we don't need to allocate blocks, so just pretend
+ * we are reading to save overhead of starting a transaction.
+ */
+ flags &= ~IOMAP_WRITE;
+ ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
+ WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
+ return ret;
+}
+
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
- int ret = 0;
- handle_t *handle;
- int blkbits = inode->i_blkbits;
- bool truncate = false;
-
- if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
- return 0;
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto orphan_del;
- }
- if (ext4_update_inode_size(inode, offset + written))
- ext4_mark_inode_dirty(handle, inode);
/*
- * We may need to truncate allocated but not written blocks beyond EOF.
+ * Check to see whether an error occurred while writing out the data to
+ * the allocated blocks. If so, return the magic error code so that we
+ * fallback to buffered I/O and attempt to complete the remainder of
+ * the I/O. Any blocks that may have been allocated in preparation for
+ * the direct I/O will be reused during buffered I/O.
*/
- if (iomap->offset + iomap->length >
- ALIGN(inode->i_size, 1 << blkbits)) {
- ext4_lblk_t written_blk, end_blk;
+ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ return -ENOTBLK;
- written_blk = (offset + written) >> blkbits;
- end_blk = (offset + length) >> blkbits;
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
- }
- /*
- * Remove inode from orphan list if we were extending a inode and
- * everything went fine.
- */
- if (!truncate && inode->i_nlink &&
- !list_empty(&EXT4_I(inode)->i_orphan))
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- if (truncate) {
- ext4_truncate_failed_write(inode);
-orphan_del:
- /*
- * If truncate failed early the inode might still be on the
- * orphan list; we need to make sure the inode is removed from
- * the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
- return ret;
+ return 0;
}
const struct iomap_ops ext4_iomap_ops = {
@@ -3633,280 +3535,94 @@
.iomap_end = ext4_iomap_end,
};
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private)
+const struct iomap_ops ext4_iomap_overwrite_ops = {
+ .iomap_begin = ext4_iomap_overwrite_begin,
+ .iomap_end = ext4_iomap_end,
+};
+
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+ struct ext4_map_blocks *map)
{
- ext4_io_end_t *io_end = private;
+ struct extent_status es;
+ ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
- /* if not async direct IO just return */
- if (!io_end)
- return 0;
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map->m_lblk, end, &es);
- ext_debug("ext4_end_io_dio(): io_end 0x%p "
- "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- io_end, io_end->inode->i_ino, iocb, offset, size);
+ if (!es.es_len || es.es_lblk > end)
+ return false;
+
+ if (es.es_lblk > map->m_lblk) {
+ map->m_len = es.es_lblk - map->m_lblk;
+ return false;
+ }
+
+ offset = map->m_lblk - es.es_lblk;
+ map->m_len = es.es_len - offset;
+
+ return true;
+}
+
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ bool delalloc = false;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
+
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_inline_data_iomap(inode, iomap);
+ if (ret != -EAGAIN) {
+ if (ret == 0 && offset >= iomap->length)
+ ret = -ENOENT;
+ return ret;
+ }
+ }
/*
- * Error during AIO DIO. We cannot convert unwritten extents as the
- * data was not written. Just clear the unwritten flag and drop io_end.
+ * Calculate the first and last logical block respectively.
*/
- if (size <= 0) {
- ext4_clear_io_unwritten_flag(io_end);
- size = 0;
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ /*
+ * Fiemap callers may call for offset beyond s_bitmap_maxbytes.
+ * So handle it here itself instead of querying ext4_map_blocks().
+ * Since ext4_map_blocks() will warn about it and will return
+ * -EIO error.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (offset >= sbi->s_bitmap_maxbytes) {
+ map.m_flags = 0;
+ goto set_iomap;
+ }
}
- io_end->offset = offset;
- io_end->size = size;
- ext4_put_io_end(io_end);
+
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ delalloc = ext4_iomap_is_delalloc(inode, &map);
+
+set_iomap:
+ ext4_set_iomap(inode, iomap, &map, offset, length);
+ if (delalloc && iomap->type == IOMAP_HOLE)
+ iomap->type = IOMAP_DELALLOC;
return 0;
}
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- ssize_t ret;
- loff_t offset = iocb->ki_pos;
- size_t count = iov_iter_count(iter);
- int overwrite = 0;
- get_block_t *get_block_func = NULL;
- int dio_flags = 0;
- loff_t final_size = offset + count;
- int orphan = 0;
- handle_t *handle;
-
- if (final_size > inode->i_size || final_size > ei->i_disksize) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ext4_update_i_disksize(inode, inode->i_size);
- ext4_journal_stop(handle);
- }
-
- BUG_ON(iocb->private == NULL);
-
- /*
- * Make all waiters for direct IO properly wait also for extent
- * conversion. This also disallows race between truncate() and
- * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
- */
- inode_dio_begin(inode);
-
- /* If we do a overwrite dio, i_mutex locking can be released */
- overwrite = *((int *)iocb->private);
-
- if (overwrite)
- inode_unlock(inode);
-
- /*
- * For extent mapped files we could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as unwritten to prevent
- * parallel buffered read to expose the stale data before DIO complete
- * the data IO.
- *
- * As to previously fallocated extents, ext4 get_block will just simply
- * mark the buffer mapped but still keep the extents unwritten.
- *
- * For non AIO case, we will convert those unwritten extents to written
- * after return back from blockdev_direct_IO. That way we save us from
- * allocating io_end structure and also the overhead of offloading
- * the extent convertion to a workqueue.
- *
- * For async DIO, the conversion needs to be deferred when the
- * IO is completed. The ext4 end_io callback function will be
- * called to take care of the conversion work. Here for async
- * case, we allocate an io_end structure to hook to the iocb.
- */
- iocb->private = NULL;
- if (overwrite)
- get_block_func = ext4_dio_get_block_overwrite;
- else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
- round_down(offset, i_blocksize(inode)) >= inode->i_size) {
- get_block_func = ext4_dio_get_block;
- dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
- } else if (is_sync_kiocb(iocb)) {
- get_block_func = ext4_dio_get_block_unwritten_sync;
- dio_flags = DIO_LOCKING;
- } else {
- get_block_func = ext4_dio_get_block_unwritten_async;
- dio_flags = DIO_LOCKING;
- }
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- get_block_func, ext4_end_io_dio, NULL,
- dio_flags);
-
- if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(NULL, inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
-
- inode_dio_end(inode);
- /* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite)
- inode_lock(inode);
-
- if (ret < 0 && final_size > inode->i_size)
- ext4_truncate_failed_write(inode);
-
- /* Handle extending of i_size after direct IO write */
- if (orphan) {
- int err;
-
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /*
- * We wrote the data but cannot extend
- * i_size. Bail out. In async io case, we do
- * not return error here because we have
- * already submmitted the corresponding
- * bio. Returning error here makes the caller
- * think that this IO is done and failed
- * resulting in race with bio's completion
- * handler.
- */
- if (!ret)
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
-
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size || end > ei->i_disksize) {
- ext4_update_i_disksize(inode, end);
- if (end > inode->i_size)
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
- }
-out:
- return ret;
-}
-
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
- ssize_t ret;
- loff_t offset = iocb->ki_pos;
- loff_t size = i_size_read(inode);
-
- if (offset >= size)
- return 0;
-
- /*
- * Shared inode_lock is enough for us - it protects against concurrent
- * writes & truncates and since we take care of writing back page cache,
- * we are protected against page writeback as well.
- */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!inode_trylock_shared(inode))
- return -EAGAIN;
- } else {
- inode_lock_shared(inode);
- }
-
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret)
- goto out_unlock;
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
- inode_unlock_shared(inode);
- return ret;
-}
-
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- size_t count = iov_iter_count(iter);
- loff_t offset = iocb->ki_pos;
- ssize_t ret;
-
-#ifdef CONFIG_FS_ENCRYPTION
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
- return 0;
-#endif
- if (fsverity_active(inode))
- return 0;
-
- /*
- * If we are doing data journalling we don't support O_DIRECT
- */
- if (ext4_should_journal_data(inode))
- return 0;
-
- /* Let buffer I/O handle the inline data case. */
- if (ext4_has_inline_data(inode))
- return 0;
-
- trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == READ)
- ret = ext4_direct_IO_read(iocb, iter);
- else
- ret = ext4_direct_IO_write(iocb, iter);
- trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
- return ret;
-}
+const struct iomap_ops ext4_iomap_report_ops = {
+ .iomap_begin = ext4_iomap_begin_report,
+};
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
@@ -3934,9 +3650,16 @@
return __set_page_dirty_buffers(page);
}
+static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
+ struct file *file, sector_t *span)
+{
+ return iomap_swapfile_activate(sis, file, span,
+ &ext4_iomap_report_ops);
+}
+
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3945,15 +3668,16 @@
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3962,14 +3686,15 @@
.bmap = ext4_bmap,
.invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
@@ -3978,10 +3703,11 @@
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = ext4_iomap_swap_activate,
};
static const struct address_space_operations ext4_dax_aops = {
@@ -3990,6 +3716,7 @@
.set_page_dirty = noop_set_page_dirty,
.bmap = ext4_bmap,
.invalidatepage = noop_invalidatepage,
+ .swap_activate = ext4_iomap_swap_activate,
};
void ext4_set_aops(struct inode *inode)
@@ -4063,17 +3790,18 @@
set_buffer_uptodate(bh);
if (!buffer_uptodate(bh)) {
- err = -EIO;
- ll_rw_block(REQ_OP_READ, 0, 1, &bh);
- wait_on_buffer(bh);
- /* Uhhuh. Read error. Complain and punt. */
- if (!buffer_uptodate(bh))
+ err = ext4_read_bh_lock(bh, 0, true);
+ if (err)
goto unlock;
- if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
/* We expect the key to be set. */
BUG_ON(!fscrypt_has_encryption_key(inode));
- WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks(
- page, blocksize, bh_offset(bh)));
+ err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
+ bh_offset(bh));
+ if (err) {
+ clear_buffer_uptodate(bh);
+ goto unlock;
+ }
}
}
if (ext4_should_journal_data(inode)) {
@@ -4213,6 +3941,8 @@
loff_t len)
{
handle_t *handle;
+ int ret;
+
loff_t size = i_size_read(inode);
WARN_ON(!inode_is_locked(inode));
@@ -4226,10 +3956,10 @@
if (IS_ERR(handle))
return PTR_ERR(handle);
ext4_update_i_disksize(inode, size);
- ext4_mark_inode_dirty(handle, inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
- return 0;
+ return ret;
}
static void ext4_wait_dax_page(struct ext4_inode_info *ei)
@@ -4281,10 +4011,7 @@
loff_t first_block_offset, last_block_offset;
handle_t *handle;
unsigned int credits;
- int ret = 0;
-
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
+ int ret = 0, ret2 = 0;
trace_ext4_punch_hole(inode, offset, length, 0);
@@ -4385,7 +4112,7 @@
if (stop_block > first_block) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ret = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
@@ -4403,11 +4130,14 @@
up_write(&EXT4_I(inode)->i_data_sem);
}
+ ext4_fc_track_range(handle, inode, first_block, stop_block);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret2))
+ ret = ret2;
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out_stop:
@@ -4476,7 +4206,7 @@
{
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int credits;
- int err = 0;
+ int err = 0, err2;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
@@ -4490,9 +4220,7 @@
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return 0;
-
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ goto out_trace;
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
@@ -4501,16 +4229,14 @@
int has_inline = 1;
err = ext4_inline_data_truncate(inode, &has_inline);
- if (err)
- return err;
- if (has_inline)
- return 0;
+ if (err || has_inline)
+ goto out_trace;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
- return 0;
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4519,8 +4245,10 @@
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_trace;
+ }
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4540,7 +4268,7 @@
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
@@ -4566,9 +4294,12 @@
ext4_orphan_del(handle, inode);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err2 && !err))
+ err = err2;
ext4_journal_stop(handle);
+out_trace:
trace_ext4_truncate_exit(inode);
return err;
}
@@ -4579,22 +4310,22 @@
* data in memory that is needed to recreate the on-disk version of this
* inode.
*/
-static int __ext4_get_inode_loc(struct inode *inode,
- struct ext4_iloc *iloc, int in_mem)
+static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc, int in_mem,
+ ext4_fsblk_t *ret_block)
{
struct ext4_group_desc *gdp;
struct buffer_head *bh;
- struct super_block *sb = inode->i_sb;
ext4_fsblk_t block;
struct blk_plug plug;
int inodes_per_block, inode_offset;
iloc->bh = NULL;
- if (inode->i_ino < EXT4_ROOT_INO ||
- inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+ if (ino < EXT4_ROOT_INO ||
+ ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
return -EFSCORRUPTED;
- iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
if (!gdp)
return -EIO;
@@ -4603,7 +4334,7 @@
* Figure out the offset within the block group inode table
*/
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
- inode_offset = ((inode->i_ino - 1) %
+ inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
@@ -4611,19 +4342,12 @@
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
+ if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO))
+ goto simulate_eio;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
- /*
- * If the buffer has the write error flag, we have failed
- * to write out another inode in the same block. In this
- * case, we don't have to read the block because we may
- * read the old inode data successfully.
- */
- if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
- set_buffer_uptodate(bh);
-
- if (buffer_uptodate(bh)) {
+ if (ext4_buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
goto has_buffer;
@@ -4694,7 +4418,7 @@
if (end > table)
end = table;
while (b <= end)
- sb_breadahead_unmovable(sb, b++);
+ ext4_sb_breadahead_unmovable(sb, b++);
}
/*
@@ -4702,15 +4426,14 @@
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
*/
- trace_ext4_load_inode(inode);
- get_bh(bh);
- bh->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh);
+ trace_ext4_load_inode(sb, ino);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
blk_finish_plug(&plug);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
+ simulate_eio:
+ if (ret_block)
+ *ret_block = block;
brelse(bh);
return -EIO;
}
@@ -4720,16 +4443,50 @@
return 0;
}
-int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+ struct ext4_iloc *iloc)
{
- /* We have all inode data except xattrs in memory here. */
- return __ext4_get_inode_loc(inode, iloc,
- !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc, 0,
+ &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
}
-static bool ext4_should_use_dax(struct inode *inode)
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
- if (!test_opt(inode->i_sb, DAX))
+ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ /* We have all inode data except xattrs in memory here. */
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, iloc,
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR), &err_blk);
+
+ if (ret == -EIO)
+ ext4_error_inode_block(inode, err_blk, EIO,
+ "unable to read itable block");
+
+ return ret;
+}
+
+
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+ struct ext4_iloc *iloc)
+{
+ return __ext4_get_inode_loc(sb, ino, iloc, 0, NULL);
+}
+
+static bool ext4_should_enable_dax(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (test_opt2(inode->i_sb, DAX_NEVER))
return false;
if (!S_ISREG(inode->i_mode))
return false;
@@ -4741,14 +4498,21 @@
return false;
if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
return false;
- return true;
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
+ return false;
+ if (test_opt(inode->i_sb, DAX_ALWAYS))
+ return true;
+
+ return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}
-void ext4_set_inode_flags(struct inode *inode)
+void ext4_set_inode_flags(struct inode *inode, bool init)
{
unsigned int flags = EXT4_I(inode)->i_flags;
unsigned int new_fl = 0;
+ WARN_ON_ONCE(IS_DAX(inode) && init);
+
if (flags & EXT4_SYNC_FL)
new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
@@ -4759,8 +4523,13 @@
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (ext4_should_use_dax(inode))
+
+ /* Because of the way inode_set_flags() works we must preserve S_DAX
+ * here if already set. */
+ new_fl |= (inode->i_flags & S_DAX);
+ if (init && ext4_should_enable_dax(inode))
new_fl |= S_DAX;
+
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
@@ -4861,7 +4630,7 @@
(ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
return ERR_PTR(-ESTALE);
- __ext4_error(sb, function, line,
+ __ext4_error(sb, function, line, EFSCORRUPTED, 0,
"inode #%lu: comm %s: iget: illegal inode #",
ino, current->comm);
return ERR_PTR(-EFSCORRUPTED);
@@ -4876,7 +4645,7 @@
ei = EXT4_I(inode);
iloc.bh = NULL;
- ret = __ext4_get_inode_loc(inode, &iloc, 0);
+ ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (ret < 0)
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
@@ -4922,9 +4691,11 @@
sizeof(gen));
}
- if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
- ext4_error_inode(inode, function, line, 0,
- "iget: checksum invalid");
+ if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
+ (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
+ ext4_error_inode_err(inode, function, line, 0,
+ EFSBADCRC, "iget: checksum invalid");
ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4973,7 +4744,7 @@
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, true);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
@@ -5012,6 +4783,7 @@
for (block = 0; block < EXT4_N_BLOCKS; block++)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ ext4_fc_init_inode(&ei->vfs_inode);
/*
* Set transaction id's of transactions that have to be committed
@@ -5069,7 +4841,7 @@
ret = 0;
if (ei->i_file_acl &&
- !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad extended attribute block %llu",
ei->i_file_acl);
@@ -5077,9 +4849,10 @@
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
/* validate the block references in the inode */
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
+ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode)))) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_check_inode(inode);
else
@@ -5188,21 +4961,22 @@
return 0;
}
-struct other_inode {
- unsigned long orig_ino;
- struct ext4_inode *raw_inode;
-};
-
-static int other_inode_match(struct inode * inode, unsigned long ino,
- void *data)
+static void __ext4_update_other_inode_time(struct super_block *sb,
+ unsigned long orig_ino,
+ unsigned long ino,
+ struct ext4_inode *raw_inode)
{
- struct other_inode *oi = (struct other_inode *) data;
+ struct inode *inode;
- if ((inode->i_ino != ino) ||
- (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ inode = find_inode_by_ino_rcu(sb, ino);
+ if (!inode)
+ return;
+
+ if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
I_DIRTY_INODE)) ||
((inode->i_state & I_DIRTY_TIME) == 0))
- return 0;
+ return;
+
spin_lock(&inode->i_lock);
if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
I_DIRTY_INODE)) == 0) &&
@@ -5213,16 +4987,15 @@
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
- EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
- EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
- EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
- ext4_inode_csum_set(inode, oi->raw_inode, ei);
+ EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
+ ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
- trace_ext4_other_inode_update_time(inode, oi->orig_ino);
- return -1;
+ trace_ext4_other_inode_update_time(inode, orig_ino);
+ return;
}
spin_unlock(&inode->i_lock);
- return -1;
}
/*
@@ -5232,24 +5005,24 @@
static void ext4_update_other_inodes_time(struct super_block *sb,
unsigned long orig_ino, char *buf)
{
- struct other_inode oi;
unsigned long ino;
int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
int inode_size = EXT4_INODE_SIZE(sb);
- oi.orig_ino = orig_ino;
/*
* Calculate the first inode in the inode table block. Inode
* numbers are one-based. That is, the first inode in a block
* (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
*/
ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
+ rcu_read_lock();
for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
if (ino == orig_ino)
continue;
- oi.raw_inode = (struct ext4_inode *) buf;
- (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+ __ext4_update_other_inode_time(sb, orig_ino, ino,
+ (struct ext4_inode *)buf);
}
+ rcu_read_unlock();
}
/*
@@ -5459,12 +5232,12 @@
if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
- err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+ err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
} else {
struct ext4_iloc iloc;
- err = __ext4_get_inode_loc(inode, &iloc, 0);
+ err = __ext4_get_inode_loc_noinmem(inode, &iloc);
if (err)
return err;
/*
@@ -5474,8 +5247,8 @@
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
- EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
- "IO error syncing inode");
+ ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
+ "IO error syncing inode");
err = -EIO;
}
brelse(iloc.bh);
@@ -5588,6 +5361,7 @@
if (error)
return error;
}
+ ext4_fc_start_update(inode);
if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
(ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
handle_t *handle;
@@ -5611,6 +5385,7 @@
if (error) {
ext4_journal_stop(handle);
+ ext4_fc_stop_update(inode);
return error;
}
/* Update corresponding info in inode so that everything is in
@@ -5621,6 +5396,10 @@
inode->i_gid = attr->ia_gid;
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+ if (unlikely(error)) {
+ ext4_fc_stop_update(inode);
+ return error;
+ }
}
if (attr->ia_valid & ATTR_SIZE) {
@@ -5631,11 +5410,15 @@
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (attr->ia_size > sbi->s_bitmap_maxbytes)
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+ ext4_fc_stop_update(inode);
return -EFBIG;
+ }
}
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode)) {
+ ext4_fc_stop_update(inode);
return -EINVAL;
+ }
if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
inode_inc_iversion(inode);
@@ -5659,7 +5442,7 @@
rc = ext4_break_layouts(inode);
if (rc) {
up_write(&EXT4_I(inode)->i_mmap_sem);
- return rc;
+ goto err_out;
}
if (attr->ia_size != inode->i_size) {
@@ -5680,6 +5463,20 @@
inode->i_mtime = current_time(inode);
inode->i_ctime = inode->i_mtime;
}
+
+ if (shrink)
+ ext4_fc_track_range(handle, inode,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits,
+ EXT_MAX_BLOCKS - 1);
+ else
+ ext4_fc_track_range(
+ handle, inode,
+ (oldsize > 0 ? oldsize - 1 : oldsize) >>
+ inode->i_sb->s_blocksize_bits,
+ (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
+ inode->i_sb->s_blocksize_bits);
+
down_write(&EXT4_I(inode)->i_data_sem);
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
@@ -5738,9 +5535,11 @@
rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
- ext4_std_error(inode->i_sb, error);
+ if (error)
+ ext4_std_error(inode->i_sb, error);
if (!error)
error = rc;
+ ext4_fc_stop_update(inode);
return error;
}
@@ -5752,7 +5551,8 @@
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
+ if ((request_mask & STATX_BTIME) &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
@@ -5769,12 +5569,15 @@
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (flags & EXT4_NODUMP_FL)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (flags & EXT4_VERITY_FL)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
- STATX_ATTR_NODUMP);
+ STATX_ATTR_NODUMP |
+ STATX_ATTR_VERITY);
generic_fillattr(inode, stat);
return 0;
@@ -5918,6 +5721,8 @@
put_bh(iloc->bh);
return -EIO;
}
+ ext4_fc_track_inode(handle, inode);
+
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
@@ -6032,9 +5837,8 @@
* If this is felt to be critical, then e2fsck should be run to
* force a large enough s_min_extra_isize.
*/
- if (ext4_handle_valid(handle) &&
- jbd2_journal_extend(handle,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+ if (ext4_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
return -ENOSPC;
if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
@@ -6103,7 +5907,8 @@
* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
* we start and wait on commits.
*/
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+ const char *func, unsigned int line)
{
struct ext4_iloc iloc;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -6113,13 +5918,18 @@
trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
- return err;
+ goto out;
if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
iloc, handle);
- return ext4_mark_iloc_dirty(handle, inode, &iloc);
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+out:
+ if (unlikely(err))
+ ext4_error_inode_err(inode, func, line, 0, err,
+ "mark_inode_dirty error");
+ return err;
}
/*
@@ -6236,6 +6046,8 @@
if (IS_ERR(handle))
return PTR_ERR(handle);
+ ext4_fc_mark_ineligible(inode->i_sb,
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
err = ext4_mark_inode_dirty(handle, inode);
ext4_handle_sync(handle);
ext4_journal_stop(handle);
@@ -6276,9 +6088,17 @@
if (err)
goto out_ret;
+ /*
+ * On data journalling we skip straight to the transaction handle:
+ * there's no delalloc; page truncated will be checked later; the
+ * early return w/ all buffers mapped (calculates size/len) can't
+ * be used; and there's no dioread_nolock, so only ext4_get_block.
+ */
+ if (ext4_should_journal_data(inode))
+ goto retry_alloc;
+
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
- !ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
err = block_page_mkwrite(vma, vmf,
@@ -6304,6 +6124,9 @@
/*
* Return if we have all the buffers mapped. This avoids the need to do
* journal_start/journal_stop which can block and take a long time
+ *
+ * This cannot be done for data journalling, as we have to add the
+ * inode to the transaction's list to writeprotect pages on commit.
*/
if (page_has_buffers(page)) {
if (!ext4_walk_page_buffers(NULL, page_buffers(page),
@@ -6328,16 +6151,43 @@
ret = VM_FAULT_SIGBUS;
goto out;
}
- err = block_page_mkwrite(vma, vmf, get_block);
- if (!err && ext4_should_journal_data(inode)) {
- if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_SIZE, NULL, do_journal_get_write_access)) {
- unlock_page(page);
- ret = VM_FAULT_SIGBUS;
- ext4_journal_stop(handle);
- goto out;
+ /*
+ * Data journalling can't use block_page_mkwrite() because it
+ * will set_buffer_dirty() before do_journal_get_write_access()
+ * thus might hit warning messages for dirty metadata buffers.
+ */
+ if (!ext4_should_journal_data(inode)) {
+ err = block_page_mkwrite(vma, vmf, get_block);
+ } else {
+ lock_page(page);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (page->mapping != mapping || page_offset(page) > size) {
+ ret = VM_FAULT_NOPAGE;
+ goto out_error;
}
- ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
+ else
+ len = PAGE_SIZE;
+
+ err = __block_write_begin(page, 0, len, ext4_get_block);
+ if (!err) {
+ ret = VM_FAULT_SIGBUS;
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
+ 0, len, NULL, do_journal_get_write_access))
+ goto out_error;
+ if (ext4_walk_page_buffers(handle, page_buffers(page),
+ 0, len, NULL, write_end_fn))
+ goto out_error;
+ if (ext4_jbd2_inode_add_write(handle, inode,
+ page_offset(page), len))
+ goto out_error;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ } else {
+ unlock_page(page);
+ }
}
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -6348,6 +6198,10 @@
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
+out_error:
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ goto out;
}
vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ba13fbb..413bf3d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -86,7 +86,7 @@
i_size_write(inode2, isize);
}
-static void reset_inode_seed(struct inode *inode)
+void ext4_reset_inode_seed(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -165,6 +165,7 @@
err = -EINVAL;
goto err_out;
}
+ ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
@@ -199,10 +200,10 @@
inode->i_generation = prandom_u32();
inode_bl->i_generation = prandom_u32();
- reset_inode_seed(inode);
- reset_inode_seed(inode_bl);
+ ext4_reset_inode_seed(inode);
+ ext4_reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
@@ -247,6 +248,7 @@
err_out1:
ext4_journal_stop(handle);
+ ext4_fc_stop_ineligible(sb);
ext4_double_up_write_data_sem(inode, inode_bl);
err_out:
@@ -292,6 +294,44 @@
return 0;
}
+static void ext4_dax_dontcache(struct inode *inode, unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return;
+
+ if (test_opt2(inode->i_sb, DAX_NEVER) ||
+ test_opt(inode->i_sb, DAX_ALWAYS))
+ return;
+
+ if ((ei->i_flags ^ flags) & EXT4_DAX_FL)
+ d_mark_dontcache(inode);
+}
+
+static bool dax_compatible(struct inode *inode, unsigned int oldflags,
+ unsigned int flags)
+{
+ /* Allow the DAX flag to be changed on inline directories */
+ if (S_ISDIR(inode->i_mode)) {
+ flags &= ~EXT4_INLINE_DATA_FL;
+ oldflags &= ~EXT4_INLINE_DATA_FL;
+ }
+
+ if (flags & EXT4_DAX_FL) {
+ if ((oldflags & EXT4_DAX_MUT_EXCL) ||
+ ext4_test_inode_state(inode,
+ EXT4_STATE_VERITY_IN_PROGRESS)) {
+ return false;
+ }
+ }
+
+ if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL))
+ return false;
+
+ return true;
+}
+
static int ext4_ioctl_setflags(struct inode *inode,
unsigned int flags)
{
@@ -300,7 +340,6 @@
int err = -EPERM, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
- unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
@@ -309,9 +348,6 @@
oldflags = ei->i_flags;
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
err = vfs_ioc_setflags_prepare(inode, oldflags, flags);
if (err)
goto flags_out;
@@ -320,25 +356,19 @@
* The JOURNAL_DATA flag can only be changed by
* the relevant capability.
*/
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
+
+ if (!dax_compatible(inode, oldflags, flags)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
- if (flags & EXT4_EOFBLOCKS_FL) {
- /* we don't support adding EOFBLOCKS flag */
- if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (oldflags & EXT4_EOFBLOCKS_FL) {
- err = ext4_truncate(inode);
- if (err)
- goto flags_out;
- }
-
if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
if (!ext4_has_feature_casefold(sb)) {
err = -EOPNOTSUPP;
@@ -381,6 +411,8 @@
if (err)
goto flags_err;
+ ext4_dax_dontcache(inode, flags);
+
for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
if (!(mask & EXT4_FL_USER_MODIFIABLE))
continue;
@@ -393,7 +425,8 @@
ext4_clear_inode_flag(inode, i);
}
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
+
inode->i_ctime = current_time(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -402,17 +435,18 @@
if (err)
goto flags_out;
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
/*
* Changes to the journaling mode can cause unsafe changes to
- * S_DAX if we are using the DAX mount option.
+ * S_DAX if the inode is DAX
*/
- if (test_opt(inode->i_sb, DAX)) {
+ if (IS_DAX(inode)) {
err = -EBUSY;
goto flags_out;
}
- err = ext4_change_inode_journal_flag(inode, jflag);
+ err = ext4_change_inode_journal_flag(inode,
+ flags & EXT4_JOURNAL_DATA_FL);
if (err)
goto flags_out;
}
@@ -539,12 +573,15 @@
xflags |= FS_XFLAG_NOATIME;
if (iflags & EXT4_PROJINHERIT_FL)
xflags |= FS_XFLAG_PROJINHERIT;
+ if (iflags & EXT4_DAX_FL)
+ xflags |= FS_XFLAG_DAX;
return xflags;
}
#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
- FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \
+ FS_XFLAG_DAX)
/* Transfer xflags flags to internal */
static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
@@ -563,6 +600,8 @@
iflags |= EXT4_NOATIME_FL;
if (xflags & FS_XFLAG_PROJINHERIT)
iflags |= EXT4_PROJINHERIT_FL;
+ if (xflags & FS_XFLAG_DAX)
+ iflags |= EXT4_DAX_FL;
return iflags;
}
@@ -745,29 +784,6 @@
fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid);
}
-/* copied from fs/ioctl.c */
-static int fiemap_check_ranges(struct super_block *sb,
- u64 start, u64 len, u64 *new_len)
-{
- u64 maxbytes = (u64) sb->s_maxbytes;
-
- *new_len = len;
-
- if (len == 0)
- return -EINVAL;
-
- if (start > maxbytes)
- return -EFBIG;
-
- /*
- * Shrink request scope to what the fs can actually handle.
- */
- if (len > maxbytes || (maxbytes - len) < start)
- *new_len = maxbytes - start;
-
- return 0;
-}
-
/* So that the fiemap access checks can't overflow on 32 bit machines. */
#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
@@ -777,8 +793,6 @@
struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
struct inode *inode = file_inode(filp);
- struct super_block *sb = inode->i_sb;
- u64 len;
int error;
if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap)))
@@ -787,24 +801,12 @@
if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
return -EINVAL;
- error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
- &len);
- if (error)
- return error;
-
fieinfo.fi_flags = fiemap.fm_flags;
fieinfo.fi_extents_max = fiemap.fm_extent_count;
fieinfo.fi_extents_start = ufiemap->fm_extents;
- if (fiemap.fm_extent_count != 0 &&
- !access_ok(fieinfo.fi_extents_start,
- fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
- return -EFAULT;
-
- if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
- filemap_write_and_wait(inode->i_mapping);
-
- error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len);
+ error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start,
+ fiemap.fm_length);
fiemap.fm_flags = fieinfo.fi_flags;
fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap)))
@@ -813,7 +815,7 @@
return error;
}
-long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
@@ -825,12 +827,12 @@
switch (cmd) {
case FS_IOC_GETFSMAP:
return ext4_ioc_getfsmap(sb, (void __user *)arg);
- case EXT4_IOC_GETFLAGS:
+ case FS_IOC_GETFLAGS:
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (S_ISREG(inode->i_mode))
flags &= ~EXT4_PROJINHERIT_FL;
return put_user(flags, (int __user *) arg);
- case EXT4_IOC_SETFLAGS: {
+ case FS_IOC_SETFLAGS: {
int err;
if (!inode_owner_or_capable(inode))
@@ -1080,6 +1082,7 @@
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
@@ -1120,8 +1123,6 @@
sizeof(range)))
return -EFAULT;
- range.minlen = max((unsigned int)range.minlen,
- q->limits.discard_granularity);
ret = ext4_trim_fs(sb, &range);
if (ret < 0)
return ret;
@@ -1135,12 +1136,12 @@
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
- case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+ case FS_IOC_GET_ENCRYPTION_PWSALT: {
#ifdef CONFIG_FS_ENCRYPTION
int err, err2;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1183,7 +1184,7 @@
return -EOPNOTSUPP;
#endif
}
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
@@ -1213,6 +1214,11 @@
return -EOPNOTSUPP;
return fscrypt_ioctl_get_key_status(filp, (void __user *)arg);
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ if (!ext4_has_feature_encrypt(sb))
+ return -EOPNOTSUPP;
+ return fscrypt_ioctl_get_nonce(filp, (void __user *)arg);
+
case EXT4_IOC_CLEAR_ES_CACHE:
{
if (!inode_owner_or_capable(inode))
@@ -1240,7 +1246,7 @@
case EXT4_IOC_GET_ES_CACHE:
return ext4_ioctl_get_es_cache(filp, arg);
- case EXT4_IOC_FSGETXATTR:
+ case FS_IOC_FSGETXATTR:
{
struct fsxattr fa;
@@ -1251,7 +1257,7 @@
return -EFAULT;
return 0;
}
- case EXT4_IOC_FSSETXATTR:
+ case FS_IOC_FSSETXATTR:
{
struct fsxattr fa, old_fa;
int err;
@@ -1312,16 +1318,27 @@
}
}
+long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ long ret;
+
+ ext4_fc_start_update(file_inode(filp));
+ ret = __ext4_ioctl(filp, cmd, arg);
+ ext4_fc_stop_update(file_inode(filp));
+
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
- case EXT4_IOC32_GETFLAGS:
- cmd = EXT4_IOC_GETFLAGS;
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
break;
- case EXT4_IOC32_SETFLAGS:
- cmd = EXT4_IOC_SETFLAGS;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
break;
case EXT4_IOC32_GETVERSION:
cmd = EXT4_IOC_GETVERSION;
@@ -1363,15 +1380,17 @@
}
case EXT4_IOC_MOVE_EXT:
case EXT4_IOC_RESIZE_FS:
+ case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
- case EXT4_IOC_SET_ENCRYPTION_POLICY:
- case EXT4_IOC_GET_ENCRYPTION_PWSALT:
- case EXT4_IOC_GET_ENCRYPTION_POLICY:
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ case FS_IOC_GET_ENCRYPTION_PWSALT:
+ case FS_IOC_GET_ENCRYPTION_POLICY:
case FS_IOC_GET_ENCRYPTION_POLICY_EX:
case FS_IOC_ADD_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY:
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ case FS_IOC_GET_ENCRYPTION_NONCE:
case EXT4_IOC_SHUTDOWN:
case FS_IOC_GETFSMAP:
case FS_IOC_ENABLE_VERITY:
@@ -1379,6 +1398,8 @@
case EXT4_IOC_CLEAR_ES_CACHE:
case EXT4_IOC_GETSTATE:
case EXT4_IOC_GET_ES_CACHE:
+ case FS_IOC_FSGETXATTR:
+ case FS_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b67ea97..110c258 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -18,13 +18,6 @@
#include <linux/backing-dev.h>
#include <trace/events/ext4.h>
-#ifdef CONFIG_EXT4_DEBUG
-ushort ext4_mballoc_debug __read_mostly;
-
-module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
-MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
-#endif
-
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -131,7 +124,7 @@
* /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * smallest multiple of the stripe value (sbi->s_stripe) which is
* greater than the default mb_group_prealloc.
*
* The regular allocator (using the buddy cache) supports a few tunables.
@@ -356,6 +349,36 @@
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+
+/*
+ * The algorithm using this percpu seq counter goes below:
+ * 1. We sample the percpu discard_pa_seq counter before trying for block
+ * allocation in ext4_mb_new_blocks().
+ * 2. We increment this percpu discard_pa_seq counter when we either allocate
+ * or free these blocks i.e. while marking those blocks as used/free in
+ * mb_mark_used()/mb_free_blocks().
+ * 3. We also increment this percpu seq counter when we successfully identify
+ * that the bb_prealloc_list is not empty and hence proceed for discarding
+ * of those PAs inside ext4_mb_discard_group_preallocations().
+ *
+ * Now to make sure that the regular fast path of block allocation is not
+ * affected, as a small optimization we only sample the percpu seq counter
+ * on that cpu. Only when the block allocation fails and when freed blocks
+ * found were 0, that is when we sample percpu seq counter for all cpus using
+ * below function ext4_get_discard_pa_seq_sum(). This happens after making
+ * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty.
+ */
+static DEFINE_PER_CPU(u64, discard_pa_seq);
+static inline u64 ext4_get_discard_pa_seq_sum(void)
+{
+ int __cpu;
+ u64 __seq = 0;
+
+ for_each_possible_cpu(__cpu)
+ __seq += per_cpu(discard_pa_seq, __cpu);
+ return __seq;
+}
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -493,6 +516,8 @@
static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
{
+ if (unlikely(e4b->bd_info->bb_bitmap == NULL))
+ return;
if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
unsigned char *b1, *b2;
int i;
@@ -511,6 +536,31 @@
}
}
+static void mb_group_bb_bitmap_alloc(struct super_block *sb,
+ struct ext4_group_info *grp, ext4_group_t group)
+{
+ struct buffer_head *bh;
+
+ grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!grp->bb_bitmap)
+ return;
+
+ bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR_OR_NULL(bh)) {
+ kfree(grp->bb_bitmap);
+ grp->bb_bitmap = NULL;
+ return;
+ }
+
+ memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
+ put_bh(bh);
+}
+
+static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
+{
+ kfree(grp->bb_bitmap);
+}
+
#else
static inline void mb_free_blocks_double(struct inode *inode,
struct ext4_buddy *e4b, int first, int count)
@@ -526,6 +576,17 @@
{
return;
}
+
+static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
+ struct ext4_group_info *grp, ext4_group_t group)
+{
+ return;
+}
+
+static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
+{
+ return;
+}
#endif
#ifdef AGGRESSIVE_CHECK
@@ -558,11 +619,8 @@
void *buddy;
void *buddy2;
- {
- static int mb_check_counter;
- if (mb_check_counter++ % 100 != 0)
- return 0;
- }
+ if (e4b->bd_info->bb_check_counter++ % 10)
+ return 0;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -820,14 +878,14 @@
char *bitmap;
struct ext4_group_info *grinfo;
- mb_debug(1, "init page %lu\n", page->index);
-
inode = page->mapping->host;
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = i_blocksize(inode);
blocks_per_page = PAGE_SIZE / blocksize;
+ mb_debug(sb, "init page %lu\n", page->index);
+
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
groups_per_page = 1;
@@ -861,13 +919,13 @@
bh[i] = NULL;
continue;
}
- bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
if (IS_ERR(bh[i])) {
err = PTR_ERR(bh[i]);
bh[i] = NULL;
goto out;
}
- mb_debug(1, "read bitmap for group %u\n", group);
+ mb_debug(sb, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
@@ -912,7 +970,7 @@
if ((first_block + i) & 1) {
/* this is block of buddy */
BUG_ON(incore == NULL);
- mb_debug(1, "put buddy for group %u in page %lu/%x\n",
+ mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
@@ -932,7 +990,7 @@
} else {
/* this is block of bitmap */
BUG_ON(incore != NULL);
- mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
+ mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
trace_ext4_mb_bitmap_load(sb, group);
@@ -1038,7 +1096,7 @@
int ret = 0;
might_sleep();
- mb_debug(1, "init group %u\n", group);
+ mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
/*
* This ensures that we don't reinit the buddy cache
@@ -1110,7 +1168,7 @@
struct inode *inode = sbi->s_buddy_cache;
might_sleep();
- mb_debug(1, "load group %u\n", group);
+ mb_debug(sb, "load group %u\n", group);
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
@@ -1218,9 +1276,6 @@
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- BUG_ON(e4b->bd_bitmap_page == NULL);
- BUG_ON(e4b->bd_buddy_page == NULL);
-
return 0;
err:
@@ -1336,9 +1391,6 @@
}
}
-/*
- * _________________________________________________________________ */
-
static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
{
if (mb_test_bit(*bit + side, bitmap)) {
@@ -1430,6 +1482,7 @@
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
+ this_cpu_inc(discard_pa_seq);
e4b->bd_info->bb_free += count;
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
@@ -1449,14 +1502,16 @@
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u); block bitmap corrupt.",
- block);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ ext4_mark_group_bitmap_corrupted(
+ sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ }
mb_regenerate_buddy(e4b);
goto done;
}
@@ -1572,6 +1627,7 @@
mb_check_buddy(e4b);
mb_mark_used_double(e4b, start, len);
+ this_cpu_inc(discard_pa_seq);
e4b->bd_info->bb_free -= len;
if (e4b->bd_info->bb_first_free == start)
e4b->bd_info->bb_first_free += len;
@@ -1671,11 +1727,15 @@
sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
spin_unlock(&sbi->s_md_lock);
}
-}
+ /*
+ * As we've just preallocated more space than
+ * user requested originally, we store allocated
+ * space in a special descriptor.
+ */
+ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ ext4_mb_new_preallocation(ac);
-/*
- * regular allocator, for general purposes allocation
- */
+}
static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b,
@@ -1919,7 +1979,7 @@
ext4_mb_use_best_found(ac, e4b);
- BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+ BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
if (EXT4_SB(sb)->s_mb_stats)
atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@@ -1956,7 +2016,7 @@
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
- * we have free blocks
+ * have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
@@ -2036,39 +2096,29 @@
}
/*
- * This is now called BEFORE we load the buddy bitmap.
+ * This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
- * for the allocation or not. In addition it can also return negative
- * error code when something goes wrong.
+ * for the allocation or not.
*/
-static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
- unsigned free, fragments;
+ ext4_grpblk_t free, fragments;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return false;
+
free = grp->bb_free;
if (free == 0)
- return 0;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
- return 0;
-
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
- return 0;
-
- /* We only do this if the grp has never been initialized */
- if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
- if (ret)
- return ret;
- }
+ return false;
fragments = grp->bb_fragments;
if (fragments == 0)
- return 0;
+ return false;
switch (cr) {
case 0:
@@ -2078,42 +2128,185 @@
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
- return 0;
+ return false;
- if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
- (free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
+ if (free < ac->ac_g_ex.fe_len)
+ return false;
+
+ if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1)
+ return true;
if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
+ return false;
- return 1;
+ return true;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
- return 1;
+ return true;
break;
case 2:
if (free >= ac->ac_g_ex.fe_len)
- return 1;
+ return true;
break;
case 3:
- return 1;
+ return true;
default:
BUG();
}
- return 0;
+ return false;
+}
+
+/*
+ * This could return negative error code if something goes wrong
+ * during ext4_mb_init_group(). This should not be called with
+ * ext4_lock_group() held.
+ */
+static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr)
+{
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
+ ext4_grpblk_t free;
+ int ret = 0;
+
+ if (should_lock)
+ ext4_lock_group(sb, group);
+ free = grp->bb_free;
+ if (free == 0)
+ goto out;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ goto out;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ goto out;
+ if (should_lock)
+ ext4_unlock_group(sb, group);
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, group, NULL);
+ int ret;
+
+ /* cr=0/1 is a very optimistic search to find large
+ * good chunks almost for free. If buddy data is not
+ * ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg,
+ * since this gets used for metadata block allocation,
+ * and we want to make sure we locate metadata blocks
+ * in the first block group in the flex_bg if possible.
+ */
+ if (cr < 2 &&
+ (!sbi->s_log_groups_per_flex ||
+ ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
+ return 0;
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ if (should_lock)
+ ext4_lock_group(sb, group);
+ ret = ext4_mb_good_group(ac, group, cr);
+out:
+ if (should_lock)
+ ext4_unlock_group(sb, group);
+ return ret;
+}
+
+/*
+ * Start prefetching @nr block bitmaps starting at @group.
+ * Return the next group which needs to be prefetched.
+ */
+ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
+ unsigned int nr, int *cnt)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct buffer_head *bh;
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ /*
+ * Prefetch block groups with free blocks; but don't
+ * bother if it is marked uninitialized on disk, since
+ * it won't require I/O to read. Also only try to
+ * prefetch once, so we avoid getblk() call, which can
+ * be expensive.
+ */
+ if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ bh = ext4_read_block_bitmap_nowait(sb, group, true);
+ if (bh && !IS_ERR(bh)) {
+ if (!buffer_uptodate(bh) && cnt)
+ (*cnt)++;
+ brelse(bh);
+ }
+ }
+ if (++group >= ngroups)
+ group = 0;
+ }
+ blk_finish_plug(&plug);
+ return group;
+}
+
+/*
+ * Prefetching reads the block bitmap into the buffer cache; but we
+ * need to make sure that the buddy bitmap in the page cache has been
+ * initialized. Note that ext4_mb_init_group() will block if the I/O
+ * is not yet completed, or indeed if it was not initiated by
+ * ext4_mb_prefetch did not start the I/O.
+ *
+ * TODO: We should actually kick off the buddy bitmap setup in a work
+ * queue when the buffer I/O is completed, so that we don't block
+ * waiting for the block allocation bitmap read to finish when
+ * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
+ */
+void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
+ unsigned int nr)
+{
+ while (nr-- > 0) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
+ NULL);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+ if (!group)
+ group = ext4_get_groups_count(sb);
+ group--;
+ grp = ext4_get_group_info(sb, group);
+
+ if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ ext4_free_group_clusters(sb, gdp) > 0 &&
+ !(ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ if (ext4_mb_init_group(sb, group, GFP_NOFS))
+ break;
+ }
+ }
}
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t ngroups, group, i;
- int cr;
+ ext4_group_t prefetch_grp = 0, ngroups, group, i;
+ int cr = -1;
int err = 0, first_err = 0;
+ unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
+ int lost;
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
@@ -2133,8 +2326,8 @@
goto out;
/*
- * ac->ac2_order is set only if the fe_len is a power of 2
- * if ac2_order is set we also set criteria to 0 so that we
+ * ac->ac_2order is set only if the fe_len is a power of 2
+ * if ac->ac_2order is set we also set criteria to 0 so that we
* try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
@@ -2178,6 +2371,7 @@
* from the goal value specified
*/
group = ac->ac_g_ex.fe_group;
+ prefetch_grp = group;
for (i = 0; i < ngroups; group++, i++) {
int ret = 0;
@@ -2189,8 +2383,31 @@
if (group >= ngroups)
group = 0;
+ /*
+ * Batch reads of the block allocation bitmaps
+ * to get multiple READs in flight; limit
+ * prefetching at cr=0/1, otherwise mballoc can
+ * spend a lot of time loading imperfect groups
+ */
+ if ((prefetch_grp == group) &&
+ (cr > 1 ||
+ prefetch_ios < sbi->s_mb_prefetch_limit)) {
+ unsigned int curr_ios = prefetch_ios;
+
+ nr = sbi->s_mb_prefetch;
+ if (ext4_has_feature_flex_bg(sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = min(nr, sbi->s_mb_prefetch);
+ }
+ prefetch_grp = ext4_mb_prefetch(sb, group,
+ nr, &prefetch_ios);
+ if (prefetch_ios == curr_ios)
+ nr = 0;
+ }
+
/* This now checks without needing the buddy page */
- ret = ext4_mb_good_group(ac, group, cr);
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
if (ret <= 0) {
if (!first_err)
first_err = ret;
@@ -2208,11 +2425,9 @@
* block group
*/
ret = ext4_mb_good_group(ac, group, cr);
- if (ret <= 0) {
+ if (ret == 0) {
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
- if (!first_err)
- first_err = ret;
continue;
}
@@ -2239,28 +2454,38 @@
* We've been searching too long. Let's try to allocate
* the best chunk we've found so far
*/
-
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
* found block(s)
- printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
*/
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
cr = 3;
- atomic_inc(&sbi->s_mb_lost_chunks);
goto repeat;
}
}
out:
if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
err = first_err;
+
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+ ac->ac_flags, cr, err);
+
+ if (nr)
+ ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+
return err;
}
@@ -2333,7 +2558,7 @@
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
sg.info.bb_counters[i] : 0);
- seq_printf(seq, " ]\n");
+ seq_puts(seq, " ]\n");
return 0;
}
@@ -2453,20 +2678,7 @@
meta_group_info[i]->bb_free_root = RB_ROOT;
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[i]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_NOFS);
- BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
- bh = ext4_read_block_bitmap(sb, group);
- BUG_ON(IS_ERR_OR_NULL(bh));
- memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
return 0;
exit_group_info:
@@ -2520,6 +2732,34 @@
goto err_freebuddy;
}
+ if (ext4_has_feature_flex_bg(sb)) {
+ /* a single flex group is supposed to be read by a single IO.
+ * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
+ * unsigned integer, so the maximum shift is 32.
+ */
+ if (sbi->s_es->s_log_groups_per_flex >= 32) {
+ ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
+ goto err_freebuddy;
+ }
+ sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
+ BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
+ sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
+ } else {
+ sbi->s_mb_prefetch = 32;
+ }
+ if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch = ext4_get_groups_count(sb);
+ /* now many real IOs to prefetch within a single allocation at cr=0
+ * given cr=0 is an CPU-related optimization we shouldn't try to
+ * load too many groups, at some point we should start to use what
+ * we've got in memory.
+ * with an average random access time 5ms, it'd take a second to get
+ * 200 groups (* N with flex_bg), so let's make this limit 4
+ */
+ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
+ if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
+ sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
+
return 0;
err_freebuddy:
@@ -2643,6 +2883,7 @@
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -2703,7 +2944,7 @@
}
/* need to called with the ext4 group lock held */
-static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
+static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
struct list_head *cur, *tmp;
@@ -2715,9 +2956,7 @@
count++;
kmem_cache_free(ext4_pspace_cachep, pa);
}
- if (count)
- mb_debug(1, "mballoc: %u PAs left\n", count);
-
+ return count;
}
int ext4_mb_release(struct super_block *sb)
@@ -2728,16 +2967,18 @@
struct ext4_group_info *grinfo, ***group_info;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ int count;
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
-#ifdef DOUBLE_CHECK
- kfree(grinfo->bb_bitmap);
-#endif
+ mb_group_bb_bitmap_free(grinfo);
ext4_lock_group(sb, i);
- ext4_mb_cleanup_pa(grinfo);
+ count = ext4_mb_cleanup_pa(grinfo);
+ if (count)
+ mb_debug(sb, "mballoc: %d PAs left\n",
+ count);
ext4_unlock_group(sb, i);
kmem_cache_free(cachep, grinfo);
}
@@ -2810,7 +3051,7 @@
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
@@ -2850,7 +3091,8 @@
kmem_cache_free(ext4_free_data_cachep, entry);
ext4_mb_unload_buddy(&e4b);
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+ mb_debug(sb, "freed %d blocks in %d structures\n", count,
+ count2);
}
/*
@@ -2910,23 +3152,26 @@
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
SLAB_RECLAIM_ACCOUNT);
if (ext4_pspace_cachep == NULL)
- return -ENOMEM;
+ goto out;
ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_ac_cachep == NULL) {
- kmem_cache_destroy(ext4_pspace_cachep);
- return -ENOMEM;
- }
+ if (ext4_ac_cachep == NULL)
+ goto out_pa_free;
ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_data_cachep == NULL) {
- kmem_cache_destroy(ext4_pspace_cachep);
- kmem_cache_destroy(ext4_ac_cachep);
- return -ENOMEM;
- }
+ if (ext4_free_data_cachep == NULL)
+ goto out_ac_free;
+
return 0;
+
+out_ac_free:
+ kmem_cache_destroy(ext4_ac_cachep);
+out_pa_free:
+ kmem_cache_destroy(ext4_pspace_cachep);
+out:
+ return -ENOMEM;
}
void ext4_exit_mballoc(void)
@@ -2993,7 +3238,7 @@
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (!ext4_data_block_valid(sbi, block, len)) {
+ if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
"fs metadata", block, block+len);
/* File system mounted not to panic on error
@@ -3063,6 +3308,84 @@
}
/*
+ * Idempotent helper for Ext4 fast commit replay path to set the state of
+ * blocks in bitmaps and update counters.
+ */
+void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
+ int len, int state)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int i, clen, err;
+ int already;
+
+ clen = EXT4_B2C(sbi, len);
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ goto out_err;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ goto out_err;
+
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
+ already++;
+
+ if (state)
+ ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen + already;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen - already;
+
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ ext4_unlock_group(sb, group);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+
+ atomic64_sub(len,
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ goto out_err;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+
+out_err:
+ brelse(bitmap_bh);
+}
+
+/*
* here we normalize request for locality group
* Group request are normalized to s_mb_group_prealloc, which goes to
* s_strip if we set the same via mount option.
@@ -3078,8 +3401,7 @@
BUG_ON(lg == NULL);
ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
- mb_debug(1, "#%u: goal %u blocks for locality group\n",
- current->pid, ac->ac_g_ex.fe_len);
+ mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
}
/*
@@ -3277,8 +3599,8 @@
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
- (unsigned) orig_size, (unsigned) start);
+ mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
+ orig_size, start);
}
static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
@@ -3367,7 +3689,7 @@
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
- mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
+ mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
}
/*
@@ -3391,7 +3713,8 @@
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
- mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
+ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
+ pa->pa_lstart-len, len, pa);
}
/*
@@ -3426,7 +3749,7 @@
/*
* search goal blocks in preallocated space
*/
-static noinline_for_stack int
+static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
@@ -3438,7 +3761,7 @@
/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
- return 0;
+ return false;
/* first, try per-file preallocation */
rcu_read_lock();
@@ -3465,7 +3788,7 @@
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 10;
rcu_read_unlock();
- return 1;
+ return true;
}
spin_unlock(&pa->pa_lock);
}
@@ -3473,12 +3796,12 @@
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
- return 0;
+ return false;
/* inode may have no locality group for some reason */
lg = ac->ac_lg;
if (lg == NULL)
- return 0;
+ return false;
order = fls(ac->ac_o_ex.fe_len) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
@@ -3507,9 +3830,9 @@
if (cpa) {
ext4_mb_use_group_pa(ac, cpa);
ac->ac_criteria = 20;
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -3574,7 +3897,27 @@
ext4_set_bits(bitmap, start, len);
preallocated += len;
}
- mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
+ mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
+}
+
+static void ext4_mb_mark_pa_deleted(struct super_block *sb,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_inode_info *ei;
+
+ if (pa->pa_deleted) {
+ ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
+ pa->pa_type, pa->pa_pstart, pa->pa_lstart,
+ pa->pa_len);
+ return;
+ }
+
+ pa->pa_deleted = 1;
+
+ if (pa->pa_type == MB_INODE_PA) {
+ ei = EXT4_I(pa->pa_inode);
+ atomic_dec(&ei->i_prealloc_active);
+ }
}
static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3609,7 +3952,7 @@
return;
}
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
@@ -3650,7 +3993,7 @@
/*
* creates new preallocated space for given inode
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3663,10 +4006,9 @@
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;
if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
int winl;
@@ -3710,15 +4052,14 @@
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
- mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
+ pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
@@ -3730,21 +4071,18 @@
pa->pa_obj_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
-
- return 0;
+ atomic_inc(&ei->i_prealloc_active);
}
/*
* creates new preallocated space for locality group inodes belongs to
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
@@ -3756,11 +4094,9 @@
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+ BUG_ON(ac->ac_pa == NULL);
- BUG_ON(ext4_pspace_cachep == NULL);
- pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
- if (pa == NULL)
- return -ENOMEM;
+ pa = ac->ac_pa;
/* preallocation can change ac_b_ex, thus we store actually
* allocated blocks for history */
@@ -3770,15 +4106,14 @@
pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
- atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
- mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
+ pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_group_pa(ac, pa);
ext4_mb_use_group_pa(ac, pa);
@@ -3791,26 +4126,20 @@
pa->pa_obj_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL;
- ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
/*
* We will later add the new pa to the right bucket
* after updating the pa_free in ext4_mb_release_context
*/
- return 0;
}
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
- int err;
-
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
- err = ext4_mb_new_group_pa(ac);
+ ext4_mb_new_group_pa(ac);
else
- err = ext4_mb_new_inode_pa(ac);
- return err;
+ ext4_mb_new_inode_pa(ac);
}
/*
@@ -3845,7 +4174,7 @@
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- mb_debug(1, " free preallocated %u/%u in group %u\n",
+ mb_debug(sb, "free preallocated %u/%u in group %u\n",
(unsigned) ext4_group_first_block_no(sb, group) + bit,
(unsigned) next - bit, (unsigned) group);
free += next - bit;
@@ -3859,10 +4188,10 @@
}
if (free != pa->pa_free) {
ext4_msg(e4b->bd_sb, KERN_CRIT,
- "pa %p: logic %lu, phys. %lu, len %lu",
+ "pa %p: logic %lu, phys. %lu, len %d",
pa, (unsigned long) pa->pa_lstart,
(unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
+ pa->pa_len);
ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
free, pa->pa_free);
/*
@@ -3905,7 +4234,7 @@
*/
static noinline_for_stack int
ext4_mb_discard_group_preallocations(struct super_block *sb,
- ext4_group_t group, int needed)
+ ext4_group_t group, int *busy)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
@@ -3913,20 +4242,19 @@
struct list_head list;
struct ext4_buddy e4b;
int err;
- int busy = 0;
int free = 0;
- mb_debug(1, "discard preallocation for group %u\n", group);
-
+ mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
- return 0;
+ goto out_dbg;
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
- return 0;
+ ext4_error_err(sb, -err,
+ "Error %d reading block bitmap for %u",
+ err, group);
+ goto out_dbg;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -3934,21 +4262,17 @@
ext4_warning(sb, "Error %d loading buddy information for %u",
err, group);
put_bh(bitmap_bh);
- return 0;
+ goto out_dbg;
}
- if (needed == 0)
- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
-
INIT_LIST_HEAD(&list);
-repeat:
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
spin_unlock(&pa->pa_lock);
- busy = 1;
+ *busy = 1;
continue;
}
if (pa->pa_deleted) {
@@ -3957,7 +4281,10 @@
}
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
+
+ if (!free)
+ this_cpu_inc(discard_pa_seq);
/* we can trust pa_free ... */
free += pa->pa_free;
@@ -3968,20 +4295,6 @@
list_add(&pa->u.pa_tmp_list, &list);
}
- /* if we still need more blocks and some PAs were used, try again */
- if (free < needed && busy) {
- busy = 0;
- ext4_unlock_group(sb, group);
- cond_resched();
- goto repeat;
- }
-
- /* found anything to free? */
- if (list_empty(&list)) {
- BUG_ON(free != 0);
- goto out;
- }
-
/* now free all selected PAs */
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
@@ -3999,10 +4312,12 @@
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
+out_dbg:
+ mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
+ free, group, grp->bb_free);
return free;
}
@@ -4015,7 +4330,7 @@
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -4031,16 +4346,24 @@
return;
}
- mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
- trace_ext4_discard_preallocations(inode);
+ if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ mb_debug(sb, "discard preallocation for inode %lu\n",
+ inode->i_ino);
+ trace_ext4_discard_preallocations(inode,
+ atomic_read(&ei->i_prealloc_active), needed);
INIT_LIST_HEAD(&list);
+ if (needed == 0)
+ needed = UINT_MAX;
+
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list)) {
- pa = list_entry(ei->i_prealloc_list.next,
+ while (!list_empty(&ei->i_prealloc_list) && needed) {
+ pa = list_entry(ei->i_prealloc_list.prev,
struct ext4_prealloc_space, pa_inode_list);
BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
spin_lock(&pa->pa_lock);
@@ -4057,10 +4380,11 @@
}
if (pa->pa_deleted == 0) {
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
list_add(&pa->u.pa_tmp_list, &list);
+ needed--;
continue;
}
@@ -4092,16 +4416,16 @@
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
err = PTR_ERR(bitmap_bh);
- ext4_error(sb, "Error %d reading block bitmap for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4119,22 +4443,74 @@
}
}
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa;
+
+ BUG_ON(ext4_pspace_cachep == NULL);
+ pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
+ if (!pa)
+ return -ENOMEM;
+ atomic_set(&pa->pa_count, 1);
+ ac->ac_pa = pa;
+ return 0;
+}
+
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+
+ BUG_ON(!pa);
+ ac->ac_pa = NULL;
+ WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+ kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
#ifdef CONFIG_EXT4_DEBUG
+static inline void ext4_mb_show_pa(struct super_block *sb)
+{
+ ext4_group_t i, ngroups;
+
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ return;
+
+ ngroups = ext4_get_groups_count(sb);
+ mb_debug(sb, "groups: ");
+ for (i = 0; i < ngroups; i++) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+ struct ext4_prealloc_space *pa;
+ ext4_grpblk_t start;
+ struct list_head *cur;
+ ext4_lock_group(sb, i);
+ list_for_each(cur, &grp->bb_prealloc_list) {
+ pa = list_entry(cur, struct ext4_prealloc_space,
+ pa_group_list);
+ spin_lock(&pa->pa_lock);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart,
+ NULL, &start);
+ spin_unlock(&pa->pa_lock);
+ mb_debug(sb, "PA:%u:%d:%d\n", i, start,
+ pa->pa_len);
+ }
+ ext4_unlock_group(sb, i);
+ mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
+ grp->bb_fragments);
+ }
+}
+
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- ext4_group_t ngroups, i;
- if (!ext4_mballoc_debug ||
- (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
return;
- ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ mb_debug(sb, "Can't allocate:"
" Allocation context details:");
- ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
+ mb_debug(sb, "status %u flags 0x%x",
ac->ac_status, ac->ac_flags);
- ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
- "goal %lu/%lu/%lu@%lu, "
+ mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
"best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
(unsigned long)ac->ac_o_ex.fe_start,
@@ -4149,37 +4525,17 @@
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
- ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
- ngroups = ext4_get_groups_count(sb);
- for (i = 0; i < ngroups; i++) {
- struct ext4_group_info *grp = ext4_get_group_info(sb, i);
- struct ext4_prealloc_space *pa;
- ext4_grpblk_t start;
- struct list_head *cur;
- ext4_lock_group(sb, i);
- list_for_each(cur, &grp->bb_prealloc_list) {
- pa = list_entry(cur, struct ext4_prealloc_space,
- pa_group_list);
- spin_lock(&pa->pa_lock);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart,
- NULL, &start);
- spin_unlock(&pa->pa_lock);
- printk(KERN_ERR "PA:%u:%d:%u \n", i,
- start, pa->pa_len);
- }
- ext4_unlock_group(sb, i);
-
- if (grp->bb_free == 0)
- continue;
- printk(KERN_ERR "%u: %d/%d \n",
- i, grp->bb_free, grp->bb_fragments);
- }
- printk(KERN_ERR "\n");
+ mb_debug(sb, "%u found", ac->ac_found);
+ ext4_mb_show_pa(sb);
}
#else
+static inline void ext4_mb_show_pa(struct super_block *sb)
+{
+ return;
+}
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
+ ext4_mb_show_pa(ac->ac_sb);
return;
}
#endif
@@ -4278,11 +4634,11 @@
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
- /* we have to define context: we'll we work with a file or
+ /* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
- mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
+ mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
"left: %u/%u, right %u/%u to %swritable\n",
(unsigned) ar->len, (unsigned) ar->logical,
(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4303,13 +4659,14 @@
struct list_head discard_list;
struct ext4_prealloc_space *pa, *tmp;
- mb_debug(1, "discard locality group preallocation\n");
+ mb_debug(sb, "discard locality group preallocation\n");
INIT_LIST_HEAD(&discard_list);
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/*
@@ -4328,7 +4685,7 @@
BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
- pa->pa_deleted = 1;
+ ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
list_del_rcu(&pa->pa_inode_list);
@@ -4354,8 +4711,8 @@
err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
GFP_NOFS|__GFP_NOFAIL);
if (err) {
- ext4_error(sb, "Error %d loading buddy information for %u",
- err, group);
+ ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
+ err, group);
continue;
}
ext4_lock_group(sb, group);
@@ -4392,7 +4749,8 @@
/* Add the prealloc space to lg */
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
- pa_inode_list) {
+ pa_inode_list,
+ lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
spin_unlock(&tmp_pa->pa_lock);
@@ -4426,10 +4784,29 @@
}
/*
+ * if per-inode prealloc list is too long, trim some PA
+ */
+static void ext4_mb_trim_inode_pa(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int count, delta;
+
+ count = atomic_read(&ei->i_prealloc_active);
+ delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
+ if (count > sbi->s_mb_max_inode_prealloc + delta) {
+ count -= sbi->s_mb_max_inode_prealloc;
+ ext4_discard_preallocations(inode, count);
+ }
+}
+
+/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct inode *inode = ac->ac_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -4441,21 +4818,31 @@
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
+
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- }
- if (pa) {
- /*
- * We want to add the pa to the right bucket.
- * Remove it from the list and while adding
- * make sure the list to which we are adding
- * doesn't grow big.
- */
- if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
+
+ if (pa->pa_type == MB_INODE_PA) {
+ /*
+ * treat per-inode prealloc list as a lru list, then try
+ * to trim the least recently used PA.
+ */
spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock);
- ext4_mb_add_n_trim(ac);
}
+
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -4465,6 +4852,7 @@
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
+ ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -4472,18 +4860,56 @@
{
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
- int freed = 0;
+ int freed = 0, busy = 0;
+ int retry = 0;
trace_ext4_mb_discard_preallocations(sb, needed);
+
+ if (needed == 0)
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+ repeat:
for (i = 0; i < ngroups && needed > 0; i++) {
- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
+ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
freed += ret;
needed -= ret;
+ cond_resched();
+ }
+
+ if (needed > 0 && busy && ++retry < 3) {
+ busy = 0;
+ goto repeat;
}
return freed;
}
+static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
+ struct ext4_allocation_context *ac, u64 *seq)
+{
+ int freed;
+ u64 seq_retry = 0;
+ bool ret = false;
+
+ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
+ if (freed) {
+ ret = true;
+ goto out_dbg;
+ }
+ seq_retry = ext4_get_discard_pa_seq_sum();
+ if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
+ ac->ac_flags |= EXT4_MB_STRICT_CHECK;
+ *seq = seq_retry;
+ ret = true;
+ }
+
+out_dbg:
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ return ret;
+}
+
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp);
+
/*
* Main entry point into mballoc to allocate blocks
* it tries to use preallocation first, then falls back
@@ -4492,19 +4918,21 @@
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
- int freed;
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ u64 seq;
might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ if (sbi->s_mount_state & EXT4_FC_REPLAY)
+ return ext4_mb_new_blocks_simple(handle, ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -4523,6 +4951,7 @@
ar->len = ar->len >> 1;
}
if (!ar->len) {
+ ext4_mb_show_pa(sb);
*errp = -ENOSPC;
return 0;
}
@@ -4560,26 +4989,32 @@
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+ seq = this_cpu_read(discard_pa_seq);
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
+
+ *errp = ext4_mb_pa_alloc(ac);
+ if (*errp)
+ goto errout;
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp)
- goto discard_and_exit;
-
- /* as we've just preallocated more space than
- * user requested originally, we store allocated
- * space in a special descriptor */
- if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- *errp = ext4_mb_new_preallocation(ac);
+ /*
+ * pa allocated above is added to grp->bb_prealloc_list only
+ * when we were able to allocate some block i.e. when
+ * ac->ac_status == AC_STATUS_FOUND.
+ * And error from above mean ac->ac_status != AC_STATUS_FOUND
+ * So we have to free this pa here itself.
+ */
if (*errp) {
- discard_and_exit:
+ ext4_mb_pa_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
+ if (ac->ac_status == AC_STATUS_FOUND &&
+ ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+ ext4_mb_pa_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4591,9 +5026,13 @@
ar->len = ac->ac_b_ex.fe_len;
}
} else {
- freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
- if (freed)
+ if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
+ /*
+ * If block allocation fails then the pa allocated above
+ * needs to be freed here itself.
+ */
+ ext4_mb_pa_free(ac);
*errp = -ENOSPC;
}
@@ -4722,6 +5161,110 @@
return 0;
}
+/*
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
+ * linearly starting at the goal block and also excludes the blocks which
+ * are going to be in use after fast commit replay.
+ */
+static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
+ struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (; group < ext4_get_groups_count(sb); group++) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ ext4_get_group_no_and_offset(sb,
+ max(ext4_group_first_block_no(sb, group), goal),
+ NULL, &blkoff);
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
+ blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) + i)) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i < max)
+ break;
+ }
+
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+
+ block = ext4_group_first_block_no(sb, group) + i;
+ ext4_mb_mark_bb(sb, block, 1, 1);
+ ar->len = 1;
+
+ return block;
+}
+
+static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
+ unsigned long count)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *gdp_bh;
+ ext4_group_t group;
+ ext4_grpblk_t blkoff;
+ int already_freed = 0, err, i;
+
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return;
+ }
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ return;
+
+ for (i = 0; i < count; i++) {
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
+ already_freed++;
+ }
+ mb_clear_bits(bitmap_bh->b_data, blkoff, count);
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ return;
+ ext4_free_group_clusters_set(
+ sb, gdp, ext4_free_group_clusters(sb, gdp) +
+ count - already_freed);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(bitmap_bh);
+ sync_dirty_buffer(gdp_bh);
+ brelse(bitmap_bh);
+}
+
/**
* ext4_free_blocks() -- Free given blocks and update quota
* @handle: handle for this transaction
@@ -4748,6 +5291,13 @@
int err = 0;
int ret;
+ sbi = EXT4_SB(sb);
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, count);
+ return;
+ }
+
might_sleep();
if (bh) {
if (block)
@@ -4756,9 +5306,8 @@
block = bh->b_blocknr;
}
- sbi = EXT4_SB(sb);
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_data_block_valid(sbi, block, count)) {
+ !ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
"block = %llu, count = %lu", block, count);
goto error_return;
@@ -5270,6 +5819,7 @@
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct ext4_group_info *grp;
ext4_group_t group, first_group, last_group;
ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
@@ -5288,6 +5838,13 @@
start >= max_blks ||
range->len < sb->s_blocksize)
return -EINVAL;
+ /* No point to try to trim less than discard granularity */
+ if (range->minlen < q->limits.discard_granularity) {
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ q->limits.discard_granularity >> sb->s_blocksize_bits);
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
+ goto out;
+ }
if (end >= max_blks)
end = max_blks - 1;
if (end <= first_data_blk)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f1..e75b474 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -24,19 +24,15 @@
#include "ext4.h"
/*
+ * mb_debug() dynamic printk msgs could be used to debug mballoc code.
*/
#ifdef CONFIG_EXT4_DEBUG
-extern ushort ext4_mballoc_debug;
-
-#define mb_debug(n, fmt, ...) \
-do { \
- if ((n) <= ext4_mballoc_debug) { \
- printk(KERN_DEBUG "(%s, %d): %s: " fmt, \
- __FILE__, __LINE__, __func__, ##__VA_ARGS__); \
- } \
-} while (0)
+#define mb_debug(sb, fmt, ...) \
+ pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \
+ current->comm, task_pid_nr(current), sb->s_id, \
+ __FILE__, __LINE__, __func__, ##__VA_ARGS__)
#else
-#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
@@ -77,6 +73,10 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512
+/*
+ * maximum length of inode prealloc list
+ */
+#define MB_DEFAULT_MAX_INODE_PREALLOC 512
struct ext4_free_data {
/* this links the free block information from sb_info */
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index be4ee3d..4991281 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -50,29 +50,9 @@
needed = ext4_ext_calc_credits_for_single_extent(inode,
lb->last_block - lb->first_block + 1, path);
- /*
- * Make sure the credit we accumalated is not really high
- */
- if (needed && ext4_handle_has_enough_credits(handle,
- EXT4_RESERVE_TRANS_BLOCKS)) {
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- } else if (needed) {
- retval = ext4_journal_extend(handle, needed);
- if (retval) {
- /*
- * IF not able to extend the journal restart the journal
- */
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- }
- }
+ retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
+ if (retval < 0)
+ goto err_out;
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
@@ -196,42 +176,30 @@
}
-static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
-{
- int retval = 0, needed;
-
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- /*
- * We are freeing a blocks. During this we touch
- * superblock, group descriptor and block bitmap.
- * So allocate a credit of 3. We may update
- * quota (user and group).
- */
- needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- if (ext4_journal_extend(handle, needed) != 0)
- retval = ext4_journal_restart(handle, needed);
-
- return retval;
-}
-
static int free_dind_blocks(handle_t *handle,
struct inode *inode, __le32 i_data)
{
int i;
__le32 *tmp_idata;
struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+ int err;
- bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0) {
+ put_bh(bh);
+ return err;
+ }
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(tmp_idata[i]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -239,7 +207,10 @@
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0)
+ return err;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -270,7 +241,10 @@
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -283,7 +257,11 @@
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(i_data[0]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -309,7 +287,7 @@
static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
struct inode *tmp_inode)
{
- int retval;
+ int retval, retval2 = 0;
__le32 i_data[3];
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
@@ -318,12 +296,9 @@
* One credit accounted for writing the
* i_data field of the original inode
*/
- retval = ext4_journal_extend(handle, 1);
- if (retval) {
- retval = ext4_journal_restart(handle, 1);
- if (retval)
- goto err_out;
- }
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto err_out;
i_data[0] = ei->i_data[EXT4_IND_BLOCK];
i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
@@ -367,7 +342,9 @@
* i_blocks when freeing the indirect meta-data blocks
*/
retval = free_ind_block(handle, inode, i_data);
- ext4_mark_inode_dirty(handle, inode);
+ retval2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(retval2 && !retval))
+ retval = retval2;
err_out:
return retval;
@@ -391,15 +368,20 @@
ix = EXT_FIRST_INDEX(eh);
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
retval = free_ext_idx(handle, inode, ix);
- if (retval)
- break;
+ if (retval) {
+ put_bh(bh);
+ return retval;
+ }
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- return retval;
+ return 0;
}
/*
@@ -455,12 +437,12 @@
percpu_down_write(&sbi->s_writepages_rwsem);
/*
- * Worst case we can touch the allocation bitmaps, a bgd
- * block, and a block to link in the orphan list. We do need
- * need to worry about credits for modifying the quota inode.
+ * Worst case we can touch the allocation bitmaps and a block
+ * group descriptor block. We do need need to worry about
+ * credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
- 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
+ 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
@@ -477,6 +459,13 @@
ext4_journal_stop(handle);
goto out_unlock;
}
+ /*
+ * Use the correct seed for checksum (i.e. the seed from 'inode'). This
+ * is so that the metadata blocks will have the correct checksum after
+ * the migration.
+ */
+ ei = EXT4_I(inode);
+ EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
* Set the i_nlink to zero so it will be deleted later
@@ -485,7 +474,6 @@
clear_nlink(tmp_inode);
ext4_ext_tree_init(handle, tmp_inode);
- ext4_orphan_add(handle, tmp_inode);
ext4_journal_stop(handle);
/*
@@ -510,17 +498,10 @@
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle)) {
- /*
- * It is impossible to update on-disk structures without
- * a handle, so just rollback in-core changes and live other
- * work to orphan_list_cleanup()
- */
- ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
goto out_tmp_inode;
}
- ei = EXT4_I(inode);
i_data = ei->i_data;
memset(&lb, 0, sizeof(lb));
@@ -577,9 +558,9 @@
}
/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
- if (ext4_journal_extend(handle, 1) != 0)
- ext4_journal_restart(handle, 1);
-
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto out_stop;
/*
* Mark the tmp_inode as of size zero
*/
@@ -597,6 +578,7 @@
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
+out_stop:
ext4_journal_stop(handle);
out_tmp_inode:
unlock_new_inode(tmp_inode);
@@ -620,7 +602,7 @@
ext4_lblk_t start, end;
ext4_fsblk_t blk;
handle_t *handle;
- int ret;
+ int ret, ret2 = 0;
if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
@@ -674,7 +656,9 @@
memset(ei->i_data, 0, sizeof(ei->i_data));
for (i = start; i <= end; i++)
ei->i_data[i] = cpu_to_le32(blk++);
- ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 9d00e0d..bc364c1 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -56,7 +56,7 @@
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
- return 1;
+ return -EIO;
return 0;
}
@@ -85,15 +85,11 @@
}
}
- get_bh(*bh);
lock_buffer(*bh);
- (*bh)->b_end_io = end_buffer_read_sync;
- submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh);
- wait_on_buffer(*bh);
- if (!buffer_uptodate(*bh)) {
- ret = -EIO;
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
+ if (ret)
goto warn_exit;
- }
+
mmp = (struct mmp_struct *)((*bh)->b_data);
if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
@@ -131,9 +127,9 @@
*/
static int kmmpd(void *data)
{
- struct super_block *sb = ((struct mmpd_data *) data)->sb;
- struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct super_block *sb = (struct super_block *) data;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh;
struct mmp_struct *mmp;
ext4_fsblk_t mmp_block;
u32 seq = 0;
@@ -160,7 +156,12 @@
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop()) {
+ while (!kthread_should_stop() && !sb_rdonly(sb)) {
+ if (!ext4_has_feature_mmp(sb)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ goto wait_to_exit;
+ }
if (++seq > EXT4_MMP_SEQ_MAX)
seq = 1;
@@ -174,21 +175,13 @@
* (s_mmp_update_interval * 60) seconds.
*/
if (retval) {
- if ((failed_writes % 60) == 0)
- ext4_error(sb, "Error writing to MMP block");
+ if ((failed_writes % 60) == 0) {
+ ext4_error_err(sb, -retval,
+ "Error writing to MMP block");
+ }
failed_writes++;
}
- if (!(le32_to_cpu(es->s_feature_incompat) &
- EXT4_FEATURE_INCOMPAT_MMP)) {
- ext4_warning(sb, "kmmpd being stopped since MMP feature"
- " has been disabled.");
- goto exit_thread;
- }
-
- if (sb_rdonly(sb))
- break;
-
diff = jiffies - last_update_time;
if (diff < mmp_update_interval * HZ)
schedule_timeout_interruptible(mmp_update_interval *
@@ -206,9 +199,10 @@
retval = read_mmp_block(sb, &bh_check, mmp_block);
if (retval) {
- ext4_error(sb, "error reading MMP data: %d",
- retval);
- goto exit_thread;
+ ext4_error_err(sb, -retval,
+ "error reading MMP data: %d",
+ retval);
+ goto wait_to_exit;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -219,10 +213,10 @@
"Error while updating MMP info. "
"The filesystem seems to have been"
" multiply mounted.");
- ext4_error(sb, "abort");
+ ext4_error_err(sb, EBUSY, "abort");
put_bh(bh_check);
retval = -EBUSY;
- goto exit_thread;
+ goto wait_to_exit;
}
put_bh(bh_check);
}
@@ -245,13 +239,25 @@
retval = write_mmp_block(sb, bh);
-exit_thread:
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(data);
- brelse(bh);
+wait_to_exit:
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop())
+ schedule();
+ }
+ set_current_state(TASK_RUNNING);
return retval;
}
+void ext4_stop_mmpd(struct ext4_sb_info *sbi)
+{
+ if (sbi->s_mmp_tsk) {
+ kthread_stop(sbi->s_mmp_tsk);
+ brelse(sbi->s_mmp_bh);
+ sbi->s_mmp_tsk = NULL;
+ }
+}
+
/*
* Get a random new sequence number but make sure it is not greater than
* EXT4_MMP_SEQ_MAX.
@@ -276,7 +282,6 @@
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *bh = NULL;
struct mmp_struct *mmp = NULL;
- struct mmpd_data *mmpd_data;
u32 seq;
unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
unsigned int wait_time = 0;
@@ -365,24 +370,17 @@
goto failed;
}
- mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL);
- if (!mmpd_data) {
- ext4_warning(sb, "not enough memory for mmpd_data");
- goto failed;
- }
- mmpd_data->sb = sb;
- mmpd_data->bh = bh;
+ EXT4_SB(sb)->s_mmp_bh = bh;
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s",
(int)sizeof(mmp->mmp_bdevname),
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
EXT4_SB(sb)->s_mmp_tsk = NULL;
- kfree(mmpd_data);
ext4_warning(sb, "Unable to create kmmpd thread for %s.",
sb->s_id);
goto failed;
@@ -394,5 +392,3 @@
brelse(bh);
return 1;
}
-
-
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 30ce3dc..64a5797 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -215,7 +215,7 @@
for (i = 0; i < nr; i++) {
bh = arr[i];
if (!bh_uptodate_or_lock(bh)) {
- err = bh_submit_read(bh);
+ err = ext4_read_bh(bh, 0, NULL);
if (err)
return err;
}
@@ -422,8 +422,8 @@
block_len_in_page, 0, &err2);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
- EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
- "Unable to copy data block,"
+ ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset),
+ EIO, "Unable to copy data block,"
" data will be lost.");
*err = -EIO;
}
@@ -686,8 +686,8 @@
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode);
- ext4_discard_preallocations(donor_inode);
+ ext4_discard_preallocations(orig_inode, 0);
+ ext4_discard_preallocations(donor_inode, 0);
}
ext4_ext_drop_refs(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9905720..f71de6c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,10 @@
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
- bh = ext4_bread(NULL, inode, block, 0);
+ if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+ bh = ERR_PTR(-EIO);
+ else
+ bh = ext4_bread(NULL, inode, block, 0);
if (IS_ERR(bh)) {
__ext4_warning(inode->i_sb, func, line,
"inode #%lu: lblock %lu: comm %s: "
@@ -153,21 +156,25 @@
* caller is sure it should be an index block.
*/
if (is_dx_block && type == INDEX) {
- if (ext4_dx_csum_verify(inode, dirent))
+ if (ext4_dx_csum_verify(inode, dirent) &&
+ !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
- ext4_error_inode(inode, func, line, block,
- "Directory index failed checksum");
+ ext4_error_inode_err(inode, func, line, block,
+ EFSBADCRC,
+ "Directory index failed checksum");
brelse(bh);
return ERR_PTR(-EFSBADCRC);
}
}
if (!is_dx_block) {
- if (ext4_dirblock_csum_verify(inode, bh))
+ if (ext4_dirblock_csum_verify(inode, bh) &&
+ !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC))
set_buffer_verified(bh);
else {
- ext4_error_inode(inode, func, line, block,
- "Directory block failed checksum");
+ ext4_error_inode_err(inode, func, line, block,
+ EFSBADCRC,
+ "Directory block failed checksum");
brelse(bh);
return ERR_PTR(-EFSBADCRC);
}
@@ -226,13 +233,13 @@
u8 unused_flags;
}
info;
- struct dx_entry entries[0];
+ struct dx_entry entries[];
};
struct dx_node
{
struct fake_dirent fake;
- struct dx_entry entries[0];
+ struct dx_entry entries[];
};
@@ -656,8 +663,7 @@
/* Directory is encrypted */
res = fscrypt_fname_alloc_buffer(
- dir, len,
- &fname_crypto_str);
+ len, &fname_crypto_str);
if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
@@ -1002,7 +1008,6 @@
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
-#ifdef CONFIG_FS_ENCRYPTION
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
err = fscrypt_get_encryption_info(dir);
@@ -1010,14 +1015,14 @@
brelse(bh);
return err;
}
- err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
+ &fname_crypto_str);
if (err < 0) {
brelse(bh);
return err;
}
}
-#endif
+
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
@@ -1065,9 +1070,7 @@
}
errout:
brelse(bh);
-#ifdef CONFIG_FS_ENCRYPTION
fscrypt_fname_free_buffer(&fname_crypto_str);
-#endif
return count;
}
@@ -1282,8 +1285,8 @@
int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
const struct qstr *entry, bool quick)
{
- const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
- const struct unicode_map *um = sbi->s_encoding;
+ const struct super_block *sb = parent->i_sb;
+ const struct unicode_map *um = sb->s_encoding;
int ret;
if (quick)
@@ -1295,7 +1298,7 @@
/* Handle invalid character sequence as either an error
* or as an opaque byte sequence.
*/
- if (ext4_has_strict_mode(sbi))
+ if (sb_has_strict_encoding(sb))
return -EINVAL;
if (name->len != entry->len)
@@ -1312,7 +1315,7 @@
{
int len;
- if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+ if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) {
cf_name->name = NULL;
return;
}
@@ -1321,7 +1324,7 @@
if (!cf_name->name)
return;
- len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+ len = utf8_casefold(dir->i_sb->s_encoding,
iname, cf_name->name,
EXT4_NAME_LEN);
if (len <= 0) {
@@ -1358,7 +1361,7 @@
#endif
#ifdef CONFIG_UNICODE
- if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+ if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) {
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
.len = fname->cf_name.len};
@@ -1528,8 +1531,9 @@
goto next;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
- (unsigned long) block);
+ EXT4_ERROR_INODE_ERR(dir, EIO,
+ "reading directory lblock %lu",
+ (unsigned long) block);
brelse(bh);
ret = ERR_PTR(-EIO);
goto cleanup_and_exit;
@@ -1538,8 +1542,9 @@
!is_dx_internal_node(dir, block,
(struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
- EXT4_ERROR_INODE(dir, "checksumming directory "
- "block %lu", (unsigned long)block);
+ EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
+ "checksumming directory "
+ "block %lu", (unsigned long)block);
brelse(bh);
ret = ERR_PTR(-EFSBADCRC);
goto cleanup_and_exit;
@@ -1997,7 +2002,7 @@
{
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
- int err;
+ int err, err2;
if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
@@ -2032,12 +2037,12 @@
dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
- ext4_mark_inode_dirty(handle, dir);
+ err2 = ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirblock(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- return 0;
+ return err ? err : err2;
}
/*
@@ -2175,7 +2180,6 @@
struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct super_block *sb;
- struct ext4_sb_info *sbi;
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@@ -2187,7 +2191,6 @@
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
- sbi = EXT4_SB(sb);
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
@@ -2196,8 +2199,8 @@
return -ENOKEY;
#ifdef CONFIG_UNICODE
- if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
+ if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
+ sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
@@ -2228,7 +2231,9 @@
}
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
- ext4_mark_inode_dirty(handle, dir);
+ retval = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(retval))
+ goto out;
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
@@ -2457,8 +2462,7 @@
* ext4_generic_delete_entry deletes a directory entry by merging it
* with the previous entry
*/
-int ext4_generic_delete_entry(handle_t *handle,
- struct inode *dir,
+int ext4_generic_delete_entry(struct inode *dir,
struct ext4_dir_entry_2 *de_del,
struct buffer_head *bh,
void *entry_buf,
@@ -2519,8 +2523,7 @@
if (unlikely(err))
goto out;
- err = ext4_generic_delete_entry(handle, dir, de_del,
- bh, bh->b_data,
+ err = ext4_generic_delete_entry(dir, de_del, bh, bh->b_data,
dir->i_sb->s_blocksize, csum_size);
if (err)
goto out;
@@ -2548,7 +2551,7 @@
* for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set
* on regular files) and to avoid creating huge/slow non-HTREE directories.
*/
-static void ext4_inc_count(handle_t *handle, struct inode *inode)
+static void ext4_inc_count(struct inode *inode)
{
inc_nlink(inode);
if (is_dx(inode) &&
@@ -2560,25 +2563,36 @@
* If a directory had nlink == 1, then we should let it be 1. This indicates
* directory has >EXT4_LINK_MAX subdirs.
*/
-static void ext4_dec_count(handle_t *handle, struct inode *inode)
+static void ext4_dec_count(struct inode *inode)
{
if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
drop_nlink(inode);
}
+/*
+ * Add non-directory inode to a directory. On success, the inode reference is
+ * consumed by dentry is instantiation. This is also indicated by clearing of
+ * *inodep pointer. On failure, the caller is responsible for dropping the
+ * inode reference in the safe context.
+ */
static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
+ struct dentry *dentry, struct inode **inodep)
{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = *inodep;
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
d_instantiate_new(dentry, inode);
- return 0;
+ *inodep = NULL;
+ return err;
}
drop_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
- iput(inode);
return err;
}
@@ -2612,12 +2626,14 @@
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2644,12 +2660,14 @@
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
+ if (!err)
+ ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2723,7 +2741,7 @@
return ext4_next_entry(de, blocksize);
}
-static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
@@ -2768,7 +2786,7 @@
{
handle_t *handle;
struct inode *inode;
- int err, credits, retries = 0;
+ int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
@@ -2799,23 +2817,30 @@
if (err) {
out_clear_inode:
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
- ext4_mark_inode_dirty(handle, inode);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err2))
+ err = err2;
+ ext4_journal_stop(handle);
iput(inode);
- goto out_stop;
+ goto out_retry;
}
- ext4_inc_count(handle, dir);
+ ext4_inc_count(dir);
+
ext4_update_dx_flag(dir);
err = ext4_mark_inode_dirty(handle, dir);
if (err)
goto out_clear_inode;
d_instantiate_new(dentry, inode);
+ ext4_fc_track_create(handle, dentry);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
out_stop:
if (handle)
ext4_journal_stop(handle);
+out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -3138,10 +3163,13 @@
inode->i_size = 0;
ext4_orphan_add(handle, inode);
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
- ext4_dec_count(handle, dir);
+ retval = ext4_mark_inode_dirty(handle, inode);
+ if (retval)
+ goto end_rmdir;
+ ext4_dec_count(dir);
ext4_update_dx_flag(dir);
- ext4_mark_inode_dirty(handle, dir);
+ ext4_fc_track_unlink(handle, dentry);
+ retval = ext4_mark_inode_dirty(handle, dir);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
@@ -3161,67 +3189,93 @@
return retval;
}
-static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
+ struct inode *inode)
{
- int retval;
- struct inode *inode;
+ int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- handle_t *handle = NULL;
+ int skip_remove_dentry = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
- return -EIO;
-
- trace_ext4_unlink_enter(dir, dentry);
- /* Initialize quotas before so that eventual writes go
- * in separate transaction */
- retval = dquot_initialize(dir);
- if (retval)
- return retval;
- retval = dquot_initialize(d_inode(dentry));
- if (retval)
- return retval;
-
- retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
+
if (!bh)
- goto end_unlink;
+ return -ENOENT;
- inode = d_inode(dentry);
-
- retval = -EFSCORRUPTED;
- if (le32_to_cpu(de->inode) != inode->i_ino)
- goto end_unlink;
-
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- handle = NULL;
- goto end_unlink;
+ if (le32_to_cpu(de->inode) != inode->i_ino) {
+ /*
+ * It's okay if we find dont find dentry which matches
+ * the inode. That's because it might have gotten
+ * renamed to a different inode number
+ */
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ skip_remove_dentry = 1;
+ else
+ goto out;
}
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- retval = ext4_delete_entry(handle, dir, de, bh);
- if (retval)
- goto end_unlink;
- dir->i_ctime = dir->i_mtime = current_time(dir);
- ext4_update_dx_flag(dir);
- ext4_mark_inode_dirty(handle, dir);
+ if (!skip_remove_dentry) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto out;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ ext4_update_dx_flag(dir);
+ retval = ext4_mark_inode_dirty(handle, dir);
+ if (retval)
+ goto out;
+ } else {
+ retval = 0;
+ }
if (inode->i_nlink == 0)
ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
+ d_name->len, d_name->name);
else
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ retval = ext4_mark_inode_dirty(handle, inode);
+out:
+ brelse(bh);
+ return retval;
+}
+
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
+{
+ handle_t *handle;
+ int retval;
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ return -EIO;
+
+ trace_ext4_unlink_enter(dir, dentry);
+ /*
+ * Initialize quotas before so that eventual writes go
+ * in separate transaction
+ */
+ retval = dquot_initialize(dir);
+ if (retval)
+ goto out_trace;
+ retval = dquot_initialize(d_inode(dentry));
+ if (retval)
+ goto out_trace;
+
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_trace;
+ }
+
+ retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
+ if (!retval)
+ ext4_fc_track_unlink(handle, dentry);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -3232,11 +3286,10 @@
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
-
-end_unlink:
- brelse(bh);
if (handle)
ext4_journal_stop(handle);
+
+out_trace:
trace_ext4_unlink_exit(dentry, retval);
return retval;
}
@@ -3316,7 +3369,8 @@
*/
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
- ext4_journal_stop(handle);
+ if (handle)
+ ext4_journal_stop(handle);
handle = NULL;
if (err)
goto err_drop_inode;
@@ -3351,12 +3405,11 @@
inode->i_size = disk_link.len - 1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
+ err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
+ if (inode)
+ iput(inode);
goto out_free_encrypted_link;
err_drop_inode:
@@ -3371,12 +3424,49 @@
return err;
}
+int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry)
+{
+ handle_t *handle;
+ int err, retries = 0;
+retry:
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
+
+ inode->i_ctime = current_time(inode);
+ ext4_inc_count(inode);
+ ihold(inode);
+
+ err = ext4_add_entry(handle, dentry, inode);
+ if (!err) {
+ err = ext4_mark_inode_dirty(handle, inode);
+ /* this can happen only for tmpfile being
+ * linked the first time
+ */
+ if (inode->i_nlink == 1)
+ ext4_orphan_del(handle, inode);
+ d_instantiate(dentry, inode);
+ ext4_fc_track_link(handle, dentry);
+ } else {
+ drop_nlink(inode);
+ iput(inode);
+ }
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+}
+
static int ext4_link(struct dentry *old_dentry,
struct inode *dir, struct dentry *dentry)
{
- handle_t *handle;
struct inode *inode = d_inode(old_dentry);
- int err, retries = 0;
+ int err;
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
@@ -3393,41 +3483,9 @@
err = dquot_initialize(dir);
if (err)
return err;
-
-retry:
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- if (IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
- inode->i_ctime = current_time(inode);
- ext4_inc_count(handle, inode);
- ihold(inode);
-
- err = ext4_add_entry(handle, dentry, inode);
- if (!err) {
- ext4_mark_inode_dirty(handle, inode);
- /* this can happen only for tmpfile being
- * linked the first time
- */
- if (inode->i_nlink == 1)
- ext4_orphan_del(handle, inode);
- d_instantiate(dentry, inode);
- } else {
- drop_nlink(inode);
- iput(inode);
- }
- ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
- return err;
+ return __ext4_link(dir, inode, dentry);
}
-
/*
* Try to find buffer head where contains the parent block.
* It should be the inode block if it is inlined or the 1st block
@@ -3522,7 +3580,7 @@
static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
unsigned ino, unsigned file_type)
{
- int retval;
+ int retval, retval2;
BUFFER_TRACE(ent->bh, "get write access");
retval = ext4_journal_get_write_access(handle, ent->bh);
@@ -3534,17 +3592,16 @@
inode_inc_iversion(ent->dir);
ent->dir->i_ctime = ent->dir->i_mtime =
current_time(ent->dir);
- ext4_mark_inode_dirty(handle, ent->dir);
+ retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
- retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
- if (unlikely(retval)) {
- ext4_std_error(ent->dir->i_sb, retval);
- return retval;
+ retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh);
+ if (unlikely(retval2)) {
+ ext4_std_error(ent->dir->i_sb, retval2);
+ return retval2;
}
}
-
- return 0;
+ return retval;
}
static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
@@ -3625,9 +3682,9 @@
{
if (ent->dir_nlink_delta) {
if (ent->dir_nlink_delta == -1)
- ext4_dec_count(handle, ent->dir);
+ ext4_dec_count(ent->dir);
else
- ext4_inc_count(handle, ent->dir);
+ ext4_inc_count(ent->dir);
ext4_mark_inode_dirty(handle, ent->dir);
}
}
@@ -3802,7 +3859,10 @@
EXT4_FT_CHRDEV);
if (retval)
goto end_rename;
- ext4_mark_inode_dirty(handle, whiteout);
+ retval = ext4_mark_inode_dirty(handle, whiteout);
+ if (unlikely(retval))
+ goto end_rename;
+
}
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
@@ -3823,7 +3883,9 @@
* rename.
*/
old.inode->i_ctime = current_time(old.inode);
- ext4_mark_inode_dirty(handle, old.inode);
+ retval = ext4_mark_inode_dirty(handle, old.inode);
+ if (unlikely(retval))
+ goto end_rename;
if (!whiteout) {
/*
@@ -3833,7 +3895,7 @@
}
if (new.inode) {
- ext4_dec_count(handle, new.inode);
+ ext4_dec_count(new.inode);
new.inode->i_ctime = current_time(new.inode);
}
old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
@@ -3843,21 +3905,45 @@
if (retval)
goto end_rename;
- ext4_dec_count(handle, old.dir);
+ ext4_dec_count(old.dir);
if (new.inode) {
/* checked ext4_empty_dir above, can't have another
* parent, ext4_dec_count() won't work for many-linked
* dirs */
clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new.dir);
+ ext4_inc_count(new.dir);
ext4_update_dx_flag(new.dir);
- ext4_mark_inode_dirty(handle, new.dir);
+ retval = ext4_mark_inode_dirty(handle, new.dir);
+ if (unlikely(retval))
+ goto end_rename;
}
}
- ext4_mark_inode_dirty(handle, old.dir);
+ retval = ext4_mark_inode_dirty(handle, old.dir);
+ if (unlikely(retval))
+ goto end_rename;
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ /*
+ * We disable fast commits here that's because the
+ * replay code is not yet capable of changing dot dot
+ * dirents in directories.
+ */
+ ext4_fc_mark_ineligible(old.inode->i_sb,
+ EXT4_FC_REASON_RENAME_DIR);
+ } else {
+ if (new.inode)
+ ext4_fc_track_unlink(handle, new.dentry);
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+ if (whiteout)
+ __ext4_fc_track_create(handle, whiteout, old.dentry);
+ }
+
if (new.inode) {
- ext4_mark_inode_dirty(handle, new.inode);
+ retval = ext4_mark_inode_dirty(handle, new.inode);
+ if (unlikely(retval))
+ goto end_rename;
if (!new.inode->i_nlink)
ext4_orphan_add(handle, new.inode);
}
@@ -3997,9 +4083,14 @@
ctime = current_time(old.inode);
old.inode->i_ctime = ctime;
new.inode->i_ctime = ctime;
- ext4_mark_inode_dirty(handle, old.inode);
- ext4_mark_inode_dirty(handle, new.inode);
-
+ retval = ext4_mark_inode_dirty(handle, old.inode);
+ if (unlikely(retval))
+ goto end_rename;
+ retval = ext4_mark_inode_dirty(handle, new.inode);
+ if (unlikely(retval))
+ goto end_rename;
+ ext4_fc_mark_ineligible(new.inode->i_sb,
+ EXT4_FC_REASON_CROSS_RENAME);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 2cc9f21..defd2e1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -87,11 +125,10 @@
}
bh = head = page_buffers(page);
/*
- * We check all buffers in the page under BH_Uptodate_Lock
+ * We check all buffers in the page under b_uptodate_lock
* to avoid races with other end io clearing async_write flags
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ spin_lock_irqsave(&head->b_uptodate_lock, flags);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
@@ -103,8 +140,7 @@
if (bio->bi_status)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
fscrypt_free_bounce_page(bounce_page);
end_page_writeback(page);
@@ -125,6 +161,7 @@
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -136,29 +173,26 @@
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
-static int ext4_end_io(ext4_io_end_t *io)
+static int ext4_end_io_end(ext4_io_end_t *io_end)
{
- struct inode *inode = io->inode;
- loff_t offset = io->offset;
- ssize_t size = io->size;
- handle_t *handle = io->handle;
+ struct inode *inode = io_end->inode;
+ handle_t *handle = io_end->handle;
int ret = 0;
- ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
- io, inode->i_ino, io->list.next, io->list.prev);
+ io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+ io_end->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
- ext4_clear_io_unwritten_flag(io);
- ext4_release_io_end(io);
+ ext4_clear_io_unwritten_flag(io_end);
+ ext4_release_io_end(io_end);
return ret;
}
@@ -166,21 +200,21 @@
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
+ ext4_io_end_t *io_end, *io_end0, *io_end1;
if (list_empty(head))
return;
ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
- list_for_each_entry(io, head, list) {
- cur = &io->list;
+ list_for_each_entry(io_end, head, list) {
+ cur = &io_end->list;
before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
+ io_end0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
+ io_end1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
+ io_end, inode->i_ino, io_end0, io_end1);
}
#endif
}
@@ -207,7 +241,7 @@
static int ext4_do_flush_completed_IO(struct inode *inode,
struct list_head *head)
{
- ext4_io_end_t *io;
+ ext4_io_end_t *io_end;
struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -219,11 +253,11 @@
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
- io = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
- list_del_init(&io->list);
+ io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io_end->list);
- err = ext4_end_io(io);
+ err = ext4_end_io_end(io_end);
if (unlikely(!ret && err))
ret = err;
}
@@ -242,19 +276,22 @@
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
- ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
- if (io) {
- io->inode = inode;
- INIT_LIST_HEAD(&io->list);
- atomic_set(&io->count, 1);
+ ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+ if (io_end) {
+ io_end->inode = inode;
+ INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
+ atomic_set(&io_end->count, 1);
}
- return io;
+ return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -268,9 +305,8 @@
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_extents(io_end->handle,
- io_end->inode, io_end->offset,
- io_end->size);
+ err = ext4_convert_unwritten_io_end_vec(io_end->handle,
+ io_end);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
@@ -307,10 +343,8 @@
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+ "starting block %llu)",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,
@@ -358,14 +392,17 @@
io->io_end = NULL;
}
-static int io_submit_init_bio(struct ext4_io_submit *io,
- struct buffer_head *bh)
+static void io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
struct bio *bio;
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- if (!bio)
- return -ENOMEM;
+ fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -373,24 +410,22 @@
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
wbc_init_bio(io->io_wbc, bio);
- return 0;
}
-static int io_submit_add_bh(struct ext4_io_submit *io,
- struct inode *inode,
- struct page *page,
- struct buffer_head *bh)
+static void io_submit_add_bh(struct ext4_io_submit *io,
+ struct inode *inode,
+ struct page *page,
+ struct buffer_head *bh)
{
int ret;
- if (io->io_bio && bh->b_blocknr != io->io_next_block) {
+ if (io->io_bio && (bh->b_blocknr != io->io_next_block ||
+ !fscrypt_mergeable_bio_bh(io->io_bio, bh))) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init_bio(io, bh);
- if (ret)
- return ret;
+ io_submit_init_bio(io, bh);
io->io_bio->bi_write_hint = inode->i_write_hint;
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
@@ -398,7 +433,6 @@
goto submit_and_retry;
wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
io->io_next_block++;
- return 0;
}
int ext4_bio_write_page(struct ext4_io_submit *io,
@@ -474,7 +508,7 @@
* (e.g. holes) to be unnecessarily encrypted, but this is rare and
* can't happen in the common case of blocksize == PAGE_SIZE.
*/
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
+ if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
unsigned int enc_bytes = round_up(len, i_blocksize(inode));
@@ -500,8 +534,14 @@
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry_encrypt;
}
- bounce_page = NULL;
- goto out;
+
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ redirty_page_for_writepage(wbc, page);
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ goto unlock;
}
}
@@ -509,30 +549,13 @@
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
- if (ret) {
- /*
- * We only get here on ENOMEM. Not much else
- * we can do but mark the page as dirty, and
- * better luck next time.
- */
- break;
- }
+ io_submit_add_bh(io, inode,
+ bounce_page ? bounce_page : page, bh);
nr_submitted++;
clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
- /* Error stopped previous loop? Clean up buffers... */
- if (ret) {
- out:
- fscrypt_free_bounce_page(bounce_page);
- printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
- redirty_page_for_writepage(wbc, page);
- do {
- clear_buffer_async_write(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
+unlock:
unlock_page(page);
/* Nothing submitted - we have to end page writeback */
if (!nr_submitted)
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a5f55fe..f014c5e 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -7,8 +7,8 @@
*
* This was originally taken from fs/mpage.c
*
- * The intent is the ext4_mpage_readpages() function here is intended
- * to replace mpage_readpages() in the general case, not just for
+ * The ext4_mpage_readpages() function here is intended to
+ * replace mpage_readahead() in the general case, not just for
* encrypted files. It has some limitations (see below), where it
* will fall back to read_block_full_page(), but these limitations
* should only be hit when page_size != block_size.
@@ -140,7 +140,7 @@
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
case STEP_VERITY:
if (ctx->enabled_steps & (1 << STEP_VERITY)) {
INIT_WORK(&ctx->work, verity_work);
@@ -148,7 +148,7 @@
return;
}
ctx->cur_step++;
- /* fall-through */
+ fallthrough;
default:
__read_end_io(ctx->bio);
}
@@ -189,28 +189,27 @@
idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}
-static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode,
- struct bio *bio,
- pgoff_t first_idx)
+static void ext4_set_bio_post_read_ctx(struct bio *bio,
+ const struct inode *inode,
+ pgoff_t first_idx)
{
unsigned int post_read_steps = 0;
- struct bio_post_read_ctx *ctx = NULL;
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
+ if (fscrypt_inode_uses_fs_layer_crypto(inode))
post_read_steps |= 1 << STEP_DECRYPT;
if (ext4_need_verity(inode, first_idx))
post_read_steps |= 1 << STEP_VERITY;
if (post_read_steps) {
- ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
+ /* Due to the mempool, this never fails. */
+ struct bio_post_read_ctx *ctx =
+ mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
+
ctx->bio = bio;
ctx->enabled_steps = post_read_steps;
bio->bi_private = ctx;
}
- return ctx;
}
static inline loff_t ext4_readpage_limit(struct inode *inode)
@@ -222,17 +221,16 @@
return i_size_read(inode);
}
-int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+int ext4_mpage_readpages(struct inode *inode,
+ struct readahead_control *rac, struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
- struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
+ sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
@@ -242,6 +240,7 @@
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
+ unsigned int nr_pages = rac ? readahead_count(rac) : 1;
map.m_pblk = 0;
map.m_lblk = 0;
@@ -252,20 +251,16 @@
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
- if (pages) {
- page = lru_to_page(pages);
-
+ if (rac) {
+ page = readahead_page(rac);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping)))
- goto next_page;
}
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
+ block_in_file = next_block =
+ (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
@@ -365,30 +360,27 @@
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
- if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ if (bio && (last_block_in_bio != blocks[0] - 1 ||
+ !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
}
if (bio == NULL) {
- struct bio_post_read_ctx *ctx;
-
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio)
- goto set_error_page;
- ctx = get_bio_post_read_ctx(inode, bio, page->index);
- if (IS_ERR(ctx)) {
- bio_put(bio);
- bio = NULL;
- goto set_error_page;
- }
+ fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
+ GFP_KERNEL);
+ ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
- bio->bi_private = ctx;
bio_set_op_attrs(bio, REQ_OP_READ,
- is_readahead ? REQ_RAHEAD : 0);
+ rac ? REQ_RAHEAD : 0);
}
length = first_hole << blkbits;
@@ -413,10 +405,9 @@
else
unlock_page(page);
next_page:
- if (pages)
+ if (rac)
put_page(page);
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
submit_bio(bio);
return 0;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ad1d4c8..6513079 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -74,6 +74,11 @@
return -EPERM;
}
+ if (ext4_has_feature_sparse_super2(sb)) {
+ ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2");
+ return -EOPNOTSUPP;
+ }
+
if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
&EXT4_SB(sb)->s_ext4_flags))
ret = -EBUSY;
@@ -415,28 +420,10 @@
return bh;
}
-/*
- * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh)
+static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits)
{
- int err;
-
- if (ext4_handle_has_enough_credits(handle, thresh))
- return 0;
-
- err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
- if (err < 0)
- return err;
- if (err) {
- err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
- if (err)
- return err;
- }
-
- return 0;
+ return ext4_journal_ensure_credits_fn(handle, credits,
+ EXT4_MAX_TRANS_DATA, 0, 0);
}
/*
@@ -478,8 +465,8 @@
continue;
}
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
@@ -571,8 +558,8 @@
struct buffer_head *gdb;
ext4_debug("update backup group %#04llx\n", block);
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
gdb = sb_getblk(sb, block);
@@ -629,8 +616,8 @@
/* Initialize block bitmap of the @group */
block = group_data[i].block_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
bh = bclean(handle, sb, block);
@@ -658,8 +645,8 @@
/* Initialize inode bitmap of the @group */
block = group_data[i].inode_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
/* Mark unused entries in inode bitmap used */
bh = bclean(handle, sb, block);
@@ -871,9 +858,8 @@
if (unlikely(err))
goto errout;
- n_group_desc = ext4_kvmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (!n_group_desc) {
err = -ENOMEM;
ext4_warning(sb, "not enough memory for %lu groups",
@@ -949,9 +935,8 @@
gdb_bh = ext4_sb_bread(sb, gdblock, 0);
if (IS_ERR(gdb_bh))
return PTR_ERR(gdb_bh);
- n_group_desc = ext4_kvmalloc((gdb_num + 1) *
- sizeof(struct buffer_head *),
- GFP_NOFS);
+ n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
if (!n_group_desc) {
brelse(gdb_bh);
err = -ENOMEM;
@@ -1142,10 +1127,8 @@
ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
- if (ext4_handle_valid(handle) &&
- handle->h_buffer_credits == 0 &&
- ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
- (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
break;
if (meta_bg == 0)
@@ -1267,7 +1250,7 @@
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (bh_submit_read(bh) < 0) {
+ if (ext4_read_bh(bh, 0, NULL) < 0) {
brelse(bh);
return NULL;
}
@@ -1830,8 +1813,8 @@
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
@@ -1956,8 +1939,8 @@
int meta_bg;
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, n_blocks_count - 1);
- if (!bh) {
+ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
+ if (IS_ERR(bh)) {
ext4_warning(sb, "can't read last block, resize aborted");
return -ENOSPC;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ce8372c..9e210bc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -43,7 +43,7 @@
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
-
+#include <linux/part_stat.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -93,11 +93,11 @@
* i_mmap_rwsem (inode->i_mmap_rwsem)!
*
* page fault path:
- * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
+ * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
* page lock -> i_data_sem (rw)
*
* buffered write path:
- * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> mmap_lock
* sb_start_write -> i_mutex -> transaction start -> page lock ->
* i_data_sem (rw)
*
@@ -107,7 +107,7 @@
* i_data_sem (rw)
*
* direct IO:
- * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> mmap_lock
* sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
*
* writepages:
@@ -141,27 +141,115 @@
MODULE_ALIAS("ext3");
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+
+static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io)
+{
+ /*
+ * buffer's verified bit is no longer valid after reading from
+ * disk again due to write out error, clear it to make sure we
+ * recheck the buffer contents.
+ */
+ clear_buffer_verified(bh);
+
+ bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ, op_flags, bh);
+}
+
+void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags,
+ bh_end_io_t *end_io)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return;
+ }
+ __ext4_read_bh(bh, op_flags, end_io);
+}
+
+int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io)
+{
+ BUG_ON(!buffer_locked(bh));
+
+ if (ext4_buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return 0;
+ }
+
+ __ext4_read_bh(bh, op_flags, end_io);
+
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return 0;
+ return -EIO;
+}
+
+int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
+{
+ if (trylock_buffer(bh)) {
+ if (wait)
+ return ext4_read_bh(bh, op_flags, NULL);
+ ext4_read_bh_nowait(bh, op_flags, NULL);
+ return 0;
+ }
+ if (wait) {
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return 0;
+ return -EIO;
+ }
+ return 0;
+}
+
/*
- * This works like sb_bread() except it uses ERR_PTR for error
+ * This works like __bread_gfp() except it uses ERR_PTR for error
* returns. Currently with sb_bread it's impossible to distinguish
* between ENOMEM and EIO situations (since both result in a NULL
* return.
*/
-struct buffer_head *
-ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
+ sector_t block, int op_flags,
+ gfp_t gfp)
{
- struct buffer_head *bh = sb_getblk(sb, block);
+ struct buffer_head *bh;
+ int ret;
+ bh = sb_getblk_gfp(sb, block, gfp);
if (bh == NULL)
return ERR_PTR(-ENOMEM);
- if (buffer_uptodate(bh))
+ if (ext4_buffer_uptodate(bh))
return bh;
- ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return bh;
- put_bh(bh);
- return ERR_PTR(-EIO);
+
+ ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true);
+ if (ret) {
+ put_bh(bh);
+ return ERR_PTR(ret);
+ }
+ return bh;
+}
+
+struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
+ int op_flags)
+{
+ return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE);
+}
+
+struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
+ sector_t block)
+{
+ return __ext4_sb_bread_gfp(sb, block, 0, 0);
+}
+
+void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
+{
+ struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
+
+ if (likely(bh)) {
+ ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+ brelse(bh);
+ }
}
static int ext4_verify_csum_type(struct super_block *sb,
@@ -204,26 +292,6 @@
es->s_checksum = ext4_superblock_csum(sb, es);
}
-void *ext4_kvmalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags, PAGE_KERNEL);
- return ret;
-}
-
-void *ext4_kvzalloc(size_t size, gfp_t flags)
-{
- void *ret;
-
- ret = kzalloc(size, flags | __GFP_NOWARN);
- if (!ret)
- ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
- return ret;
-}
-
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
{
@@ -355,10 +423,12 @@
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-static void __save_error_info(struct super_block *sb, const char *func,
- unsigned int line)
+static void __save_error_info(struct super_block *sb, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ int err;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
if (bdev_read_only(sb->s_bdev))
@@ -367,6 +437,62 @@
ext4_update_tstamp(es, s_last_error_time);
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es->s_last_error_line = cpu_to_le32(line);
+ es->s_last_error_ino = cpu_to_le32(ino);
+ es->s_last_error_block = cpu_to_le64(block);
+ switch (error) {
+ case EIO:
+ err = EXT4_ERR_EIO;
+ break;
+ case ENOMEM:
+ err = EXT4_ERR_ENOMEM;
+ break;
+ case EFSBADCRC:
+ err = EXT4_ERR_EFSBADCRC;
+ break;
+ case 0:
+ case EFSCORRUPTED:
+ err = EXT4_ERR_EFSCORRUPTED;
+ break;
+ case ENOSPC:
+ err = EXT4_ERR_ENOSPC;
+ break;
+ case ENOKEY:
+ err = EXT4_ERR_ENOKEY;
+ break;
+ case EROFS:
+ err = EXT4_ERR_EROFS;
+ break;
+ case EFBIG:
+ err = EXT4_ERR_EFBIG;
+ break;
+ case EEXIST:
+ err = EXT4_ERR_EEXIST;
+ break;
+ case ERANGE:
+ err = EXT4_ERR_ERANGE;
+ break;
+ case EOVERFLOW:
+ err = EXT4_ERR_EOVERFLOW;
+ break;
+ case EBUSY:
+ err = EXT4_ERR_EBUSY;
+ break;
+ case ENOTDIR:
+ err = EXT4_ERR_ENOTDIR;
+ break;
+ case ENOTEMPTY:
+ err = EXT4_ERR_ENOTEMPTY;
+ break;
+ case ESHUTDOWN:
+ err = EXT4_ERR_ESHUTDOWN;
+ break;
+ case EFAULT:
+ err = EXT4_ERR_EFAULT;
+ break;
+ default:
+ err = EXT4_ERR_UNKNOWN;
+ }
+ es->s_last_error_errcode = err;
if (!es->s_first_error_time) {
es->s_first_error_time = es->s_last_error_time;
es->s_first_error_time_hi = es->s_last_error_time_hi;
@@ -375,6 +501,7 @@
es->s_first_error_line = cpu_to_le32(line);
es->s_first_error_ino = es->s_last_error_ino;
es->s_first_error_block = es->s_last_error_block;
+ es->s_first_error_errcode = es->s_last_error_errcode;
}
/*
* Start the daily error reporting function if it hasn't been
@@ -385,10 +512,11 @@
le32_add_cpu(&es->s_error_count, 1);
}
-static void save_error_info(struct super_block *sb, const char *func,
- unsigned int line)
+static void save_error_info(struct super_block *sb, int error,
+ __u32 ino, __u64 block,
+ const char *func, unsigned int line)
{
- __save_error_info(sb, func, line);
+ __save_error_info(sb, error, ino, block, func, line);
if (!bdev_read_only(sb->s_bdev))
ext4_commit_super(sb, 1);
}
@@ -432,6 +560,89 @@
spin_unlock(&sbi->s_md_lock);
}
+/*
+ * This writepage callback for write_cache_pages()
+ * takes care of a few cases after page cleaning.
+ *
+ * write_cache_pages() already checks for dirty pages
+ * and calls clear_page_dirty_for_io(), which we want,
+ * to write protect the pages.
+ *
+ * However, we may have to redirty a page (see below.)
+ */
+static int ext4_journalled_writepage_callback(struct page *page,
+ struct writeback_control *wbc,
+ void *data)
+{
+ transaction_t *transaction = (transaction_t *) data;
+ struct buffer_head *bh, *head;
+ struct journal_head *jh;
+
+ bh = head = page_buffers(page);
+ do {
+ /*
+ * We have to redirty a page in these cases:
+ * 1) If buffer is dirty, it means the page was dirty because it
+ * contains a buffer that needs checkpointing. So the dirty bit
+ * needs to be preserved so that checkpointing writes the buffer
+ * properly.
+ * 2) If buffer is not part of the committing transaction
+ * (we may have just accidentally come across this buffer because
+ * inode range tracking is not exact) or if the currently running
+ * transaction already contains this buffer as well, dirty bit
+ * needs to be preserved so that the buffer gets writeprotected
+ * properly on running transaction's commit.
+ */
+ jh = bh2jh(bh);
+ if (buffer_dirty(bh) ||
+ (jh && (jh->b_transaction != transaction ||
+ jh->b_next_transaction))) {
+ redirty_page_for_writepage(wbc, page);
+ goto out;
+ }
+ } while ((bh = bh->b_this_page) != head);
+
+out:
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+
+ return write_cache_pages(mapping, &wbc,
+ ext4_journalled_writepage_callback,
+ jinode->i_transaction);
+}
+
+static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret;
+
+ if (ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = ext4_journalled_submit_inode_data_buffers(jinode);
+ else
+ ret = jbd2_journal_submit_inode_data_buffers(jinode);
+
+ return ret;
+}
+
+static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ int ret = 0;
+
+ if (!ext4_should_journal_data(jinode->i_vfs_inode))
+ ret = jbd2_journal_finish_inode_data_buffers(jinode);
+
+ return ret;
+}
+
static bool system_going_down(void)
{
return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
@@ -463,7 +674,7 @@
if (sb_rdonly(sb) || test_opt(sb, ERRORS_CONT))
return;
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (journal)
jbd2_journal_abort(journal, -EIO);
/*
@@ -480,9 +691,6 @@
smp_wmb();
sb->s_flags |= SB_RDONLY;
} else if (test_opt(sb, ERRORS_PANIC)) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
}
@@ -493,7 +701,8 @@
"EXT4-fs error")
void __ext4_error(struct super_block *sb, const char *function,
- unsigned int line, const char *fmt, ...)
+ unsigned int line, int error, __u64 block,
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -511,24 +720,21 @@
sb->s_id, function, line, current->comm, &vaf);
va_end(args);
}
- save_error_info(sb, function, line);
+ save_error_info(sb, error, 0, block, function, line);
ext4_handle_error(sb);
}
void __ext4_error_inode(struct inode *inode, const char *function,
- unsigned int line, ext4_fsblk_t block,
+ unsigned int line, ext4_fsblk_t block, int error,
const char *fmt, ...)
{
va_list args;
struct va_format vaf;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return;
trace_ext4_error(inode->i_sb, function, line);
- es->s_last_error_ino = cpu_to_le32(inode->i_ino);
- es->s_last_error_block = cpu_to_le64(block);
if (ext4_error_ratelimit(inode->i_sb)) {
va_start(args, fmt);
vaf.fmt = fmt;
@@ -545,7 +751,8 @@
current->comm, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, function, line);
+ save_error_info(inode->i_sb, error, inode->i_ino, block,
+ function, line);
ext4_handle_error(inode->i_sb);
}
@@ -555,7 +762,6 @@
{
va_list args;
struct va_format vaf;
- struct ext4_super_block *es;
struct inode *inode = file_inode(file);
char pathname[80], *path;
@@ -563,8 +769,6 @@
return;
trace_ext4_error(inode->i_sb, function, line);
- es = EXT4_SB(inode->i_sb)->s_es;
- es->s_last_error_ino = cpu_to_le32(inode->i_ino);
if (ext4_error_ratelimit(inode->i_sb)) {
path = file_path(file, pathname, sizeof(pathname));
if (IS_ERR(path))
@@ -586,7 +790,8 @@
current->comm, path, &vaf);
va_end(args);
}
- save_error_info(inode->i_sb, function, line);
+ save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block,
+ function, line);
ext4_handle_error(inode->i_sb);
}
@@ -654,7 +859,7 @@
sb->s_id, function, line, errstr);
}
- save_error_info(sb, function, line);
+ save_error_info(sb, -errno, 0, 0, function, line);
ext4_handle_error(sb);
}
@@ -669,7 +874,7 @@
*/
void __ext4_abort(struct super_block *sb, const char *function,
- unsigned int line, const char *fmt, ...)
+ unsigned int line, int error, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -677,7 +882,7 @@
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
- save_error_info(sb, function, line);
+ save_error_info(sb, error, 0, 0, function, line);
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -686,24 +891,20 @@
va_end(args);
if (sb_rdonly(sb) == 0) {
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
+ if (EXT4_SB(sb)->s_journal)
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
- EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
/*
* Make sure updated value of ->s_mount_flags will be visible
* before ->s_flags update
*/
smp_wmb();
sb->s_flags |= SB_RDONLY;
- if (EXT4_SB(sb)->s_journal)
- jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
- save_error_info(sb, function, line);
}
- if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
- if (EXT4_SB(sb)->s_journal &&
- !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
- return;
+ if (test_opt(sb, ERRORS_PANIC) && !system_going_down())
panic("EXT4-fs panic from previous error\n");
- }
}
void __ext4_msg(struct super_block *sb,
@@ -712,6 +913,7 @@
struct va_format vaf;
va_list args;
+ atomic_inc(&EXT4_SB(sb)->s_msg_count);
if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
return;
@@ -722,9 +924,12 @@
va_end(args);
}
-#define ext4_warning_ratelimit(sb) \
- ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
- "EXT4-fs warning")
+static int ext4_warning_ratelimit(struct super_block *sb)
+{
+ atomic_inc(&EXT4_SB(sb)->s_warning_count);
+ return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning");
+}
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
@@ -770,15 +975,12 @@
{
struct va_format vaf;
va_list args;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
return;
trace_ext4_error(sb, function, line);
- es->s_last_error_ino = cpu_to_le32(ino);
- es->s_last_error_block = cpu_to_le64(block);
- __save_error_info(sb, function, line);
+ __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
if (ext4_error_ratelimit(sb)) {
va_start(args, fmt);
@@ -882,7 +1084,6 @@
static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
- char b[BDEVNAME_SIZE];
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
if (IS_ERR(bdev))
@@ -890,8 +1091,9 @@
return bdev;
fail:
- ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
- __bdevname(dev, b), PTR_ERR(bdev));
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
return NULL;
}
@@ -906,10 +1108,10 @@
static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- bdev = sbi->journal_bdev;
+ bdev = sbi->s_journal_bdev;
if (bdev) {
ext4_blkdev_put(bdev);
- sbi->journal_bdev = NULL;
+ sbi->s_journal_bdev = NULL;
}
}
@@ -979,15 +1181,22 @@
destroy_workqueue(sbi->rsv_conversion_wq);
+ /*
+ * Unregister sysfs before destroying jbd2 journal.
+ * Since we could still access attr_journal_task attribute via sysfs
+ * path which could have sbi->s_journal->j_task as NULL
+ */
+ ext4_unregister_sysfs(sb);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
- if ((err < 0) && !aborted)
- ext4_abort(sb, "Couldn't clean up the journal");
+ if ((err < 0) && !aborted) {
+ ext4_abort(sb, -err, "Couldn't clean up the journal");
+ }
}
- ext4_unregister_sysfs(sb);
ext4_es_unregister_shrinker(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
@@ -1034,14 +1243,14 @@
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+ if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
/*
* Invalidate the journal device's buffers. We don't want them
* floating about in memory - the physical journal device may
* hotswapped, and it breaks the `ro-after' testing code.
*/
- sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev);
+ sync_blockdev(sbi->s_journal_bdev);
+ invalidate_bdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -1051,8 +1260,8 @@
ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
sbi->s_ea_block_cache = NULL;
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
+
brelse(sbi->s_sbh);
sb->s_fs_info = NULL;
/*
@@ -1065,8 +1274,9 @@
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
}
@@ -1087,6 +1297,7 @@
inode_set_iversion(&ei->vfs_inode, 1);
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
+ atomic_set(&ei->i_prealloc_active, 0);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
@@ -1095,8 +1306,6 @@
ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
ei->i_reserved_data_blocks = 0;
- ei->i_da_metadata_calc_len = 0;
- ei->i_da_metadata_calc_last_lblock = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
ext4_init_pending_tree(&ei->i_pending_tree);
#ifdef CONFIG_QUOTA
@@ -1110,6 +1319,8 @@
ei->i_datasync_tid = 0;
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+ ext4_fc_init_inode(&ei->vfs_inode);
+ mutex_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1127,6 +1338,10 @@
static void ext4_free_in_core_inode(struct inode *inode)
{
fscrypt_free_inode(inode);
+ if (!list_empty(&(EXT4_I(inode)->i_fc_list))) {
+ pr_warn("%s: inode %ld still in fc list",
+ __func__, inode->i_ino);
+ }
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
@@ -1141,6 +1356,12 @@
true);
dump_stack();
}
+
+ if (EXT4_I(inode)->i_reserved_data_blocks)
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
+ inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
}
static void init_once(void *foo)
@@ -1152,6 +1373,7 @@
init_rwsem(&ei->i_data_sem);
init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
+ ext4_fc_init_inode(&ei->vfs_inode);
}
static int __init init_inodecache(void)
@@ -1180,9 +1402,10 @@
void ext4_clear_inode(struct inode *inode)
{
+ ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode);
+ ext4_discard_preallocations(inode, 0);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -1254,8 +1477,8 @@
if (!page_has_buffers(page))
return 0;
if (journal)
- return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_DIRECT_RECLAIM);
+ return jbd2_journal_try_to_free_buffers(journal, page);
+
return try_to_free_buffers(page);
}
@@ -1284,6 +1507,9 @@
if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
return -EINVAL;
+ if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EOPNOTSUPP;
+
res = ext4_convert_inline_data(inode);
if (res)
return res;
@@ -1309,7 +1535,7 @@
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
}
return res;
}
@@ -1336,7 +1562,7 @@
* Update inode->i_flags - S_ENCRYPTED will be enabled,
* S_DAX may be disabled
*/
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
res = ext4_mark_inode_dirty(handle, inode);
if (res)
EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
@@ -1350,18 +1576,32 @@
return res;
}
-static bool ext4_dummy_context(struct inode *inode)
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
- return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
+}
+
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
+ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
}
static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
- .dummy_context = ext4_dummy_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
+ .has_stable_inodes = ext4_has_stable_inodes,
+ .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
#endif
@@ -1384,7 +1624,6 @@
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1402,7 +1641,7 @@
.destroy_dquot = dquot_destroy,
.get_projid = ext4_get_projid,
.get_inode_usage = ext4_get_inode_usage,
- .get_next_id = ext4_get_next_id,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -1457,10 +1696,12 @@
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+ Opt_inlinecrypt,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
+ Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
Opt_nowarn_on_error, Opt_mblk_io_submit,
Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
@@ -1469,6 +1710,10 @@
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
+ Opt_prefetch_block_bitmaps,
+#ifdef CONFIG_EXT4_DEBUG
+ Opt_fc_debug_max_replay, Opt_fc_debug_force
+#endif
};
static const match_table_t tokens = {
@@ -1527,6 +1772,9 @@
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_dax, "dax"},
+ {Opt_dax_always, "dax=always"},
+ {Opt_dax_inode, "dax=inode"},
+ {Opt_dax_never, "dax=never"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_warn_on_error, "warn_on_error"},
@@ -1545,16 +1793,24 @@
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
{Opt_dioread_nolock, "dioread_nolock"},
+ {Opt_dioread_lock, "nodioread_nolock"},
{Opt_dioread_lock, "dioread_lock"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
+#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_force, "fc_debug_force"},
+ {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"},
+#endif
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
{Opt_test_dummy_encryption, "test_dummy_encryption"},
+ {Opt_inlinecrypt, "inlinecrypt"},
{Opt_nombcache, "nombcache"},
{Opt_nombcache, "no_mbcache"}, /* for backward compatibility */
+ {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1673,6 +1929,8 @@
#define MOPT_NO_EXT3 0x0200
#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3)
#define MOPT_STRING 0x0400
+#define MOPT_SKIP 0x0800
+#define MOPT_2 0x1000
static const struct mount_opts {
int token;
@@ -1722,7 +1980,13 @@
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
- {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
+ {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
+ {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
+ {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -1763,8 +2027,15 @@
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
- {Opt_test_dummy_encryption, 0, MOPT_GTE0},
+ {Opt_test_dummy_encryption, 0, MOPT_STRING},
{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+ {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS,
+ MOPT_SET},
+#ifdef CONFIG_EXT4_DEBUG
+ {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
+ MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+ {Opt_fc_debug_max_replay, 0, MOPT_GTE0},
+#endif
{Opt_err, 0, 0}
};
@@ -1798,6 +2069,49 @@
}
#endif
+static int ext4_set_test_dummy_encryption(struct super_block *sb,
+ const char *opt,
+ const substring_t *arg,
+ bool is_remount)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
+
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (is_remount && !sbi->s_dummy_enc_policy.policy) {
+ ext4_msg(sb, KERN_WARNING,
+ "Can't set test_dummy_encryption on remount");
+ return -1;
+ }
+ err = fscrypt_set_test_dummy_encryption(sb, arg->from,
+ &sbi->s_dummy_enc_policy);
+ if (err) {
+ if (err == -EEXIST)
+ ext4_msg(sb, KERN_WARNING,
+ "Can't change test_dummy_encryption on remount");
+ else if (err == -EINVAL)
+ ext4_msg(sb, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", opt);
+ else
+ ext4_msg(sb, KERN_WARNING,
+ "Error processing option \"%s\" [%d]",
+ opt, err);
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+#else
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mount option ignored");
+#endif
+ return 1;
+}
+
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
substring_t *args, unsigned long *journal_devnum,
unsigned int *journal_ioprio, int is_remount)
@@ -1829,7 +2143,7 @@
ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
return 1;
case Opt_abort:
- sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+ ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
return 1;
case Opt_i_version:
sb->s_flags |= SB_I_VERSION;
@@ -1840,6 +2154,13 @@
case Opt_nolazytime:
sb->s_flags &= ~SB_LAZYTIME;
return 1;
+ case Opt_inlinecrypt:
+#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
+ sb->s_flags |= SB_INLINECRYPT;
+#else
+ ext4_msg(sb, KERN_ERR, "inline encryption not supported");
+#endif
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1924,6 +2245,10 @@
sbi->s_li_wait_mult = arg;
} else if (token == Opt_max_dir_size_kb) {
sbi->s_max_dir_size_kb = arg;
+#ifdef CONFIG_EXT4_DEBUG
+ } else if (token == Opt_fc_debug_max_replay) {
+ sbi->s_fc_debug_max_replay = arg;
+#endif
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (token == Opt_resuid) {
@@ -1994,14 +2319,8 @@
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
} else if (token == Opt_test_dummy_encryption) {
-#ifdef CONFIG_FS_ENCRYPTION
- sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mode enabled");
-#else
- ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
-#endif
+ return ext4_set_test_dummy_encryption(sb, opt, &args[0],
+ is_remount);
} else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
@@ -2031,23 +2350,56 @@
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
- } else if (token == Opt_dax) {
+ } else if (token == Opt_dax || token == Opt_dax_always ||
+ token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
- if (is_remount && test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- return -1;
+ switch (token) {
+ case Opt_dax:
+ case Opt_dax_always:
+ if (is_remount &&
+ (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
+ fail_dax_change_remount:
+ ext4_msg(sb, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -1;
+ }
+ if (is_remount &&
+ (test_opt(sb, DATA_FLAGS) ==
+ EXT4_MOUNT_JOURNAL_DATA)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -1;
+ }
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ break;
+ case Opt_dax_never:
+ if (is_remount &&
+ (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ break;
+ case Opt_dax_inode:
+ if (is_remount &&
+ ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
+ (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
+ !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
+ goto fail_dax_change_remount;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
+ sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
+ /* Strictly for printing options */
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
+ break;
}
- if (is_remount && !(sbi->s_mount_opt & EXT4_MOUNT_DAX)) {
- ext4_msg(sb, KERN_ERR, "can't change "
- "dax mount option while remounting");
- return -1;
- }
- ext4_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- sbi->s_mount_opt |= m->mount_opt;
#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
+ sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
return -1;
#endif
} else if (token == Opt_data_err_abort) {
@@ -2065,10 +2417,17 @@
WARN_ON(1);
return -1;
}
- if (arg != 0)
- sbi->s_mount_opt |= m->mount_opt;
- else
- sbi->s_mount_opt &= ~m->mount_opt;
+ if (m->flags & MOPT_2) {
+ if (arg != 0)
+ sbi->s_mount_opt2 |= m->mount_opt;
+ else
+ sbi->s_mount_opt2 &= ~m->mount_opt;
+ } else {
+ if (arg != 0)
+ sbi->s_mount_opt |= m->mount_opt;
+ else
+ sbi->s_mount_opt &= ~m->mount_opt;
+ }
}
return 1;
}
@@ -2078,7 +2437,7 @@
unsigned int *journal_ioprio,
int is_remount)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -2135,12 +2494,10 @@
if (test_opt(sb, DIOREAD_NOLOCK)) {
int blocksize =
BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- return 0;
- }
+ if (blocksize < PAGE_SIZE)
+ ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
+ "experimental mount option 'dioread_nolock' "
+ "for blocksize < PAGE_SIZE");
}
return 1;
}
@@ -2213,7 +2570,7 @@
for (m = ext4_mount_opts; m->token != Opt_err; m++) {
int want_set = m->flags & MOPT_SET;
if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
- (m->flags & MOPT_CLEAR_ERR))
+ (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
continue;
if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
continue; /* skip if same as the default */
@@ -2270,9 +2627,22 @@
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
if (test_opt(sb, DATA_ERR_ABORT))
SEQ_OPTS_PUTS("data_err=abort");
- if (DUMMY_ENCRYPTION_ENABLED(sbi))
- SEQ_OPTS_PUTS("test_dummy_encryption");
+ fscrypt_show_test_dummy_encryption(seq, sep, sb);
+
+ if (sb->s_flags & SB_INLINECRYPT)
+ SEQ_OPTS_PUTS("inlinecrypt");
+
+ if (test_opt(sb, DAX_ALWAYS)) {
+ if (IS_EXT2_SB(sb))
+ SEQ_OPTS_PUTS("dax");
+ else
+ SEQ_OPTS_PUTS("dax=always");
+ } else if (test_opt2(sb, DAX_NEVER)) {
+ SEQ_OPTS_PUTS("dax=never");
+ } else if (test_opt2(sb, DAX_INODE)) {
+ SEQ_OPTS_PUTS("dax=inode");
+ }
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -2830,17 +3200,17 @@
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- loff_t res = EXT4_NDIR_BLOCKS;
+ unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
- loff_t upper_limit;
- /* This is calculated to be the largest file size for a dense, block
+
+ /*
+ * This is calculated to be the largest file size for a dense, block
* mapped file such that the file's total number of 512-byte sectors,
* including data and all indirect blocks, does not exceed (2^48 - 1).
*
* __u32 i_blocks_lo and _u16 i_blocks_high represent the total
* number of 512-byte sectors of the file.
*/
-
if (!has_huge_files) {
/*
* !has_huge_files or implies that the inode i_block field
@@ -2883,7 +3253,7 @@
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
- return res;
+ return (loff_t)res;
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -3063,15 +3433,34 @@
static int ext4_run_li_request(struct ext4_li_request *elr)
{
struct ext4_group_desc *gdp = NULL;
- ext4_group_t group, ngroups;
- struct super_block *sb;
- unsigned long timeout = 0;
+ struct super_block *sb = elr->lr_super;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t group = elr->lr_next_group;
+ unsigned int prefetch_ios = 0;
int ret = 0;
+ u64 start_time;
- sb = elr->lr_super;
- ngroups = EXT4_SB(sb)->s_groups_count;
+ if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
+ elr->lr_next_group = ext4_mb_prefetch(sb, group,
+ EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
+ if (prefetch_ios)
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group,
+ prefetch_ios);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
+ prefetch_ios);
+ if (group >= elr->lr_next_group) {
+ ret = 1;
+ if (elr->lr_first_not_zeroed != ngroups &&
+ !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ elr->lr_next_group = elr->lr_first_not_zeroed;
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ ret = 0;
+ }
+ }
+ return ret;
+ }
- for (group = elr->lr_next_group; group < ngroups; group++) {
+ for (; group < ngroups; group++) {
gdp = ext4_get_group_desc(sb, group, NULL);
if (!gdp) {
ret = 1;
@@ -3086,13 +3475,13 @@
ret = 1;
if (!ret) {
- timeout = jiffies;
+ start_time = ktime_get_real_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
+ trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- timeout = (jiffies - timeout) *
- elr->lr_sbi->s_li_wait_mult;
- elr->lr_timeout = timeout;
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
elr->lr_next_group = group + 1;
@@ -3106,15 +3495,11 @@
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
- struct ext4_sb_info *sbi;
-
if (!elr)
return;
- sbi = elr->lr_sbi;
-
list_del(&elr->lr_request);
- sbi->s_li_request = NULL;
+ EXT4_SB(elr->lr_super)->s_li_request = NULL;
kfree(elr);
}
@@ -3323,7 +3708,6 @@
static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
ext4_group_t start)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
@@ -3331,8 +3715,13 @@
return NULL;
elr->lr_super = sb;
- elr->lr_sbi = sbi;
- elr->lr_next_group = start;
+ elr->lr_first_not_zeroed = start;
+ if (test_opt(sb, PREFETCH_BLOCK_BITMAPS))
+ elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP;
+ else {
+ elr->lr_mode = EXT4_LI_MODE_ITABLE;
+ elr->lr_next_group = start;
+ }
/*
* Randomize first schedule time of the request to
@@ -3362,8 +3751,9 @@
goto out;
}
- if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
- !test_opt(sb, INIT_INODE_TABLE))
+ if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) &&
+ (first_not_zeroed == ngroups || sb_rdonly(sb) ||
+ !test_opt(sb, INIT_INODE_TABLE)))
goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
@@ -3574,8 +3964,8 @@
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->journal_bdev)
- overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
+ if (sbi->s_journal && !sbi->s_journal_bdev)
+ overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
@@ -3643,7 +4033,7 @@
int blocksize, clustersize;
unsigned int db_count;
unsigned int i;
- int needs_recovery, has_huge_files, has_bigalloc;
+ int needs_recovery, has_huge_files;
__u64 blocks_count;
int err = 0;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -3688,8 +4078,11 @@
logical_sb_block = sb_block;
}
- if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR, "unable to read superblock");
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto out_fail;
}
/*
@@ -3756,6 +4149,8 @@
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
+ if (ext4_has_feature_fast_commit(sb))
+ set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
if (ext4_has_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
@@ -3801,14 +4196,25 @@
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
+ if (le32_to_cpu(es->s_log_block_size) >
+ (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
+ "Invalid log block size: %u",
+ le32_to_cpu(es->s_log_block_size));
goto failed_mount;
}
+ if (le32_to_cpu(es->s_log_cluster_size) >
+ (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid log cluster size: %u",
+ le32_to_cpu(es->s_log_cluster_size));
+ goto failed_mount;
+ }
+
+ blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ if (blocksize == PAGE_SIZE)
+ set_opt(sb, DIOREAD_NOLOCK);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -3893,7 +4299,7 @@
goto failed_mount;
#ifdef CONFIG_UNICODE
- if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
+ if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
const struct ext4_sb_encodings *encoding_info;
struct unicode_map *encoding;
__u16 encoding_flags;
@@ -3924,26 +4330,22 @@
"%s-%s with flags 0x%hx", encoding_info->name,
encoding_info->version?:"\b", encoding_flags);
- sbi->s_encoding = encoding;
- sbi->s_encoding_flags = encoding_flags;
+ sb->s_encoding = encoding;
+ sb->s_encoding_flags = encoding_flags;
}
#endif
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
- "with data=journal disables delayed "
- "allocation and O_DIRECT support!\n");
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n");
+ /* can't mount with both data=journal and dioread_nolock. */
+ clear_opt(sb, DIOREAD_NOLOCK);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
if (test_opt2(sb, EXPLICIT_DELALLOC)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and delalloc");
goto failed_mount;
}
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dioread_nolock");
- goto failed_mount;
- }
- if (test_opt(sb, DAX)) {
+ if (test_opt(sb, DAX_ALWAYS)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"both data=journal and dax");
goto failed_mount;
@@ -4031,21 +4433,6 @@
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
goto failed_mount;
- if (le32_to_cpu(es->s_log_block_size) >
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log block size: %u",
- le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
- if (le32_to_cpu(es->s_log_cluster_size) >
- (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Invalid log cluster size: %u",
- le32_to_cpu(es->s_log_cluster_size));
- goto failed_mount;
- }
-
if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
ext4_msg(sb, KERN_ERR,
"Number of reserved GDT blocks insanely large: %d",
@@ -4053,13 +4440,16 @@
goto failed_mount;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (bdev_dax_supported(sb->s_bdev, blocksize))
+ set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
if (ext4_has_feature_inline_data(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
" that may contain inline data");
goto failed_mount;
}
- if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
+ if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device.");
goto failed_mount;
@@ -4073,20 +4463,28 @@
}
if (sb->s_blocksize != blocksize) {
+ /*
+ * bh must be released before kill_bdev(), otherwise
+ * it won't be freed and its page also. kill_bdev()
+ * is called by sb_set_blocksize().
+ */
+ brelse(bh);
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
ext4_msg(sb, KERN_ERR, "bad block size %d",
blocksize);
+ bh = NULL;
goto failed_mount;
}
- brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
- bh = sb_bread_unmovable(sb, logical_sb_block);
- if (!bh) {
+ bh = ext4_sb_bread_unmovable(sb, logical_sb_block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR,
"Can't read superblock on 2nd try");
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto failed_mount;
}
es = (struct ext4_super_block *)(bh->b_data + offset);
@@ -4159,8 +4557,7 @@
/* Handle clustersize */
clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- has_bigalloc = ext4_has_feature_bigalloc(sb);
- if (has_bigalloc) {
+ if (ext4_has_feature_bigalloc(sb)) {
if (clustersize < blocksize) {
ext4_msg(sb, KERN_ERR,
"cluster size (%d) smaller than "
@@ -4299,18 +4696,20 @@
/* Pre-read the descriptors into the buffer cache */
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sb_breadahead_unmovable(sb, block);
+ ext4_sb_breadahead_unmovable(sb, block);
}
for (i = 0; i < db_count; i++) {
struct buffer_head *bh;
block = descriptor_loc(sb, logical_sb_block, i);
- bh = sb_bread_unmovable(sb, block);
- if (!bh) {
+ bh = ext4_sb_bread_unmovable(sb, block);
+ if (IS_ERR(bh)) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
db_count = i;
+ ret = PTR_ERR(bh);
+ bh = NULL;
goto failed_mount2;
}
rcu_read_lock();
@@ -4358,6 +4757,26 @@
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
+ /* Initialize fast commit stuff */
+ atomic_set(&sbi->s_fc_subtid, 0);
+ atomic_set(&sbi->s_fc_ineligible_updates, 0);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]);
+ INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
+ sbi->s_fc_bytes = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+ spin_lock_init(&sbi->s_fc_lock);
+ memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+ sbi->s_fc_replay_state.fc_regions = NULL;
+ sbi->s_fc_replay_state.fc_regions_size = 0;
+ sbi->s_fc_replay_state.fc_regions_used = 0;
+ sbi->s_fc_replay_state.fc_regions_valid = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes = NULL;
+ sbi->s_fc_replay_state.fc_modified_inodes_size = 0;
+ sbi->s_fc_replay_state.fc_modified_inodes_used = 0;
+
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
@@ -4407,6 +4826,7 @@
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
+ clear_opt2(sb, JOURNAL_FAST_COMMIT);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
@@ -4425,6 +4845,14 @@
goto failed_mount_wq;
}
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "Failed to set fast commit journal feature");
+ goto failed_mount_wq;
+ }
+
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
switch (test_opt(sb, DATA_FLAGS)) {
@@ -4464,7 +4892,10 @@
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
- sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+ sbi->s_journal->j_submit_inode_data_buffers =
+ ext4_journal_submit_inode_data_buffers;
+ sbi->s_journal->j_finish_inode_data_buffers =
+ ext4_journal_finish_inode_data_buffers;
no_journal:
if (!test_opt(sb, NO_MBCACHE)) {
@@ -4485,13 +4916,6 @@
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
- (blocksize != PAGE_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported blocksize for fs encryption");
- goto failed_mount_wq;
- }
-
if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
@@ -4546,7 +4970,7 @@
}
#ifdef CONFIG_UNICODE
- if (sbi->s_encoding)
+ if (sb->s_encoding)
sb->s_d_op = &ext4_dentry_ops;
#endif
@@ -4574,6 +4998,7 @@
goto failed_mount4a;
}
}
+ ext4_fc_replay_cleanup(sb);
ext4_ext_init(sb);
err = ext4_mb_init(sb);
@@ -4583,6 +5008,14 @@
goto failed_mount5;
}
+ /*
+ * We can only set up the journal commit callback once
+ * mballoc is initialized
+ */
+ if (sbi->s_journal)
+ sbi->s_journal->j_commit_callback =
+ ext4_journal_commit_callback;
+
block = ext4_count_free_clusters(sb);
ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
@@ -4639,6 +5072,14 @@
}
#endif /* CONFIG_QUOTA */
+ /*
+ * Save the original bdev mapping's wb_err value which could be
+ * used to detect the metadata async write error.
+ */
+ spin_lock_init(&sbi->s_bdev_wb_lock);
+ errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+ &sbi->s_bdev_wb_err);
+ sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
@@ -4680,6 +5121,8 @@
ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+ atomic_set(&sbi->s_warning_count, 0);
+ atomic_set(&sbi->s_msg_count, 0);
kfree(orig_data);
return 0;
@@ -4735,8 +5178,7 @@
ext4_es_unregister_shrinker(sbi);
failed_mount3:
del_timer_sync(&sbi->s_err_report);
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
+ ext4_stop_mmpd(sbi);
failed_mount2:
rcu_read_lock();
group_desc = rcu_dereference(sbi->s_group_desc);
@@ -4749,15 +5191,17 @@
crypto_free_shash(sbi->s_chksum_driver);
#ifdef CONFIG_UNICODE
- utf8_unload(sbi->s_encoding);
+ utf8_unload(sb->s_encoding);
#endif
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
- ext4_blkdev_remove(sbi);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
+ /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
brelse(bh);
+ ext4_blkdev_remove(sbi);
out_fail:
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
@@ -4780,6 +5224,7 @@
journal->j_commit_interval = sbi->s_commit_interval;
journal->j_min_batch_time = sbi->s_min_batch_time;
journal->j_max_batch_time = sbi->s_max_batch_time;
+ ext4_fc_init(sb, journal);
write_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
@@ -4922,9 +5367,7 @@
goto out_bdev;
}
journal->j_private = sb;
- ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
- wait_on_buffer(journal->j_sb_buffer);
- if (!buffer_uptodate(journal->j_sb_buffer)) {
+ if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
goto out_journal;
}
@@ -4934,7 +5377,7 @@
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
- EXT4_SB(sb)->journal_bdev = bdev;
+ EXT4_SB(sb)->s_journal_bdev = bdev;
ext4_init_journal_params(sb, journal);
return journal;
@@ -5260,7 +5703,7 @@
needs_barrier = true;
if (needs_barrier) {
int err;
- err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+ err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
if (!ret)
ret = err;
}
@@ -5439,8 +5882,8 @@
goto restore_opts;
}
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
- ext4_abort(sb, "Abort forced by user");
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -5453,7 +5896,7 @@
}
if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
+ if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
err = -EROFS;
goto restore_opts;
}
@@ -5488,8 +5931,6 @@
*/
ext4_mark_recovery_complete(sb, es);
}
- if (sbi->s_mmp_tsk)
- kthread_stop(sbi->s_mmp_tsk);
} else {
/* Make sure we can mount this feature set readwrite */
if (ext4_has_feature_readonly(sb) ||
@@ -5574,7 +6015,7 @@
* Releasing of existing data is done when we are sure remount will
* succeed.
*/
- if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) {
+ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) {
err = ext4_setup_system_zone(sb);
if (err)
goto restore_opts;
@@ -5600,9 +6041,12 @@
}
}
#endif
- if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
+
/*
* Some options can be enabled by ext4 and/or by VFS mount flag
* either way we need to make sure it matches in both *flags and
@@ -5623,7 +6067,7 @@
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
- if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
+ if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
@@ -5635,6 +6079,8 @@
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(to_free[i]);
#endif
+ if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
+ ext4_stop_mmpd(sbi);
kfree(orig_data);
return err;
}
@@ -5654,13 +6100,8 @@
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = 0;
- if (dquot->dq_dqb.dqb_bsoftlimit &&
- (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit))
- limit = dquot->dq_dqb.dqb_bsoftlimit;
- if (dquot->dq_dqb.dqb_bhardlimit &&
- (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
- limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
+ dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
@@ -5672,14 +6113,8 @@
(buf->f_blocks - curblock) : 0;
}
- limit = 0;
- if (dquot->dq_dqb.dqb_isoftlimit &&
- (!limit || dquot->dq_dqb.dqb_isoftlimit < limit))
- limit = dquot->dq_dqb.dqb_isoftlimit;
- if (dquot->dq_dqb.dqb_ihardlimit &&
- (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
- limit = dquot->dq_dqb.dqb_ihardlimit;
-
+ limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
+ dquot->dq_dqb.dqb_ihardlimit);
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
@@ -5722,8 +6157,7 @@
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
- buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
- buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+ buf->f_fsid = u64_to_fsid(fsid);
#ifdef CONFIG_QUOTA
if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
@@ -5907,10 +6341,7 @@
lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
err = dquot_quota_on(sb, type, format_id, path);
- if (err) {
- lockdep_set_quota_inode(path->dentry->d_inode,
- I_DATA_SEM_NORMAL);
- } else {
+ if (!err) {
struct inode *inode = d_inode(path->dentry);
handle_t *handle;
@@ -5926,11 +6357,16 @@
EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
S_NOATIME | S_IMMUTABLE);
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
unlock_inode:
inode_unlock(inode);
+ if (err)
+ dquot_quota_off(sb, type);
}
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
return err;
}
@@ -5959,7 +6395,7 @@
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
- err = dquot_enable(qf_inode, type, format_id, flags);
+ err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
if (err)
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
iput(qf_inode);
@@ -5993,8 +6429,19 @@
"Failed to enable quota tracking "
"(type=%d, err=%d). Please run "
"e2fsck to fix.", type, err);
- for (type--; type >= 0; type--)
+ for (type--; type >= 0; type--) {
+ struct inode *inode;
+
+ inode = sb_dqopt(sb)->files[type];
+ if (inode)
+ inode = igrab(inode);
dquot_quota_off(sb, type);
+ if (inode) {
+ lockdep_set_quota_inode(inode,
+ I_DATA_SEM_NORMAL);
+ iput(inode);
+ }
+ }
return err;
}
@@ -6028,12 +6475,14 @@
* this is not a hard failure and quotas are already disabled.
*/
handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
- if (IS_ERR(handle))
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
goto out_unlock;
+ }
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
inode->i_mtime = inode->i_ctime = current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
+ err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
inode_unlock(inode);
@@ -6091,12 +6540,12 @@
{
struct inode *inode = sb_dqopt(sb)->files[type];
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
- int err, offset = off & (sb->s_blocksize - 1);
+ int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
int retries = 0;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
- if (EXT4_SB(sb)->s_journal && !handle) {
+ if (!handle) {
ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
@@ -6117,7 +6566,7 @@
bh = ext4_bread(handle, inode, blk,
EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
- } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
+ } while (PTR_ERR(bh) == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -6139,21 +6588,11 @@
if (inode->i_size < off + len) {
i_size_write(inode, off + len);
EXT4_I(inode)->i_disksize = inode->i_size;
- ext4_mark_inode_dirty(handle, inode);
+ err2 = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err2 && !err))
+ err = err2;
}
- return len;
-}
-
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
-{
- const struct quota_format_ops *ops;
-
- if (!sb_has_quota_loaded(sb, qid->type))
- return -ESRCH;
- ops = sb_dqopt(sb)->ops[qid->type];
- if (!ops || !ops->get_next_id)
- return -ENOSYS;
- return dquot_get_next_id(sb, qid);
+ return err ? err : len;
}
#endif
@@ -6275,6 +6714,11 @@
err = init_inodecache();
if (err)
goto out1;
+
+ err = ext4_fc_init_dentry_cache();
+ if (err)
+ goto out05;
+
register_as_ext3();
register_as_ext2();
err = register_filesystem(&ext4_fs_type);
@@ -6285,6 +6729,8 @@
out:
unregister_as_ext2();
unregister_as_ext3();
+ ext4_fc_destroy_dentry_cache();
+out05:
destroy_inodecache();
out1:
ext4_exit_mballoc();
@@ -6310,6 +6756,7 @@
unregister_as_ext2();
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
+ ext4_fc_destroy_dentry_cache();
destroy_inodecache();
ext4_exit_mballoc();
ext4_exit_sysfs();
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9394360..f24bef3 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -13,6 +13,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+#include <linux/part_stat.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -30,6 +31,10 @@
attr_last_error_time,
attr_feature,
attr_pointer_ui,
+ attr_pointer_ul,
+ attr_pointer_u64,
+ attr_pointer_u8,
+ attr_pointer_string,
attr_pointer_atomic,
attr_journal_task,
} attr_id_t;
@@ -47,6 +52,7 @@
struct attribute attr;
short attr_id;
short attr_ptr;
+ unsigned short attr_size;
union {
int offset;
void *explicit_ptr;
@@ -155,12 +161,38 @@
}, \
}
+#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_pointer_string, \
+ .attr_size = _size, \
+ .attr_ptr = ptr_##_struct##_offset, \
+ .u = { \
+ .offset = offsetof(struct _struct, _elname),\
+ }, \
+}
+
#define EXT4_RO_ATTR_ES_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
+#define EXT4_RO_ATTR_ES_U8(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname)
+
+#define EXT4_RO_ATTR_ES_U64(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname)
+
+#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \
+ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname)
+
#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
+#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname)
+
+#define EXT4_RO_ATTR_SBI_ATOMIC(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_atomic, ext4_sb_info, _elname)
+
#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
static struct ext4_attr ext4_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -188,6 +220,7 @@
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
@@ -196,10 +229,27 @@
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+#ifdef CONFIG_EXT4_DEBUG
+EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
+#endif
+EXT4_RO_ATTR_SBI_ATOMIC(warning_count, s_warning_count);
+EXT4_RO_ATTR_SBI_ATOMIC(msg_count, s_msg_count);
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode);
+EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode);
+EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino);
+EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino);
+EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block);
+EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block);
+EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line);
+EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line);
+EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32);
+EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32);
EXT4_ATTR(first_error_time, 0444, first_error_time);
EXT4_ATTR(last_error_time, 0444, last_error_time);
EXT4_ATTR(journal_task, 0444, journal_task);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
+EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -218,6 +268,7 @@
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(mb_max_inode_prealloc),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
@@ -228,9 +279,26 @@
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
ATTR_LIST(errors_count),
+ ATTR_LIST(warning_count),
+ ATTR_LIST(msg_count),
+ ATTR_LIST(first_error_ino),
+ ATTR_LIST(last_error_ino),
+ ATTR_LIST(first_error_block),
+ ATTR_LIST(last_error_block),
+ ATTR_LIST(first_error_line),
+ ATTR_LIST(last_error_line),
+ ATTR_LIST(first_error_func),
+ ATTR_LIST(last_error_func),
+ ATTR_LIST(first_error_errcode),
+ ATTR_LIST(last_error_errcode),
ATTR_LIST(first_error_time),
ATTR_LIST(last_error_time),
ATTR_LIST(journal_task),
+#ifdef CONFIG_EXT4_DEBUG
+ ATTR_LIST(simulate_fail),
+#endif
+ ATTR_LIST(mb_prefetch),
+ ATTR_LIST(mb_prefetch_limit),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
@@ -241,6 +309,7 @@
EXT4_ATTR_FEATURE(meta_bg_resize);
#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(test_dummy_encryption_v2);
#endif
#ifdef CONFIG_UNICODE
EXT4_ATTR_FEATURE(casefold);
@@ -249,6 +318,7 @@
EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
+EXT4_ATTR_FEATURE(fast_commit);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
@@ -256,6 +326,7 @@
ATTR_LIST(meta_bg_resize),
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
+ ATTR_LIST(test_dummy_encryption_v2),
#endif
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
@@ -264,6 +335,7 @@
ATTR_LIST(verity),
#endif
ATTR_LIST(metadata_csum_seed),
+ ATTR_LIST(fast_commit),
NULL,
};
ATTRIBUTE_GROUPS(ext4_feat);
@@ -283,7 +355,7 @@
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
{
- return snprintf(buf, PAGE_SIZE, "%lld",
+ return snprintf(buf, PAGE_SIZE, "%lld\n",
((time64_t)hi << 32) + le32_to_cpu(lo));
}
@@ -325,6 +397,30 @@
else
return snprintf(buf, PAGE_SIZE, "%u\n",
*((unsigned int *) ptr));
+ case attr_pointer_ul:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ *((unsigned long *) ptr));
+ case attr_pointer_u8:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned char *) ptr));
+ case attr_pointer_u64:
+ if (!ptr)
+ return 0;
+ if (a->attr_ptr == ptr_ext4_super_block_offset)
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ le64_to_cpup(ptr));
+ else
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ *((unsigned long long *) ptr));
+ case attr_pointer_string:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size,
+ (char *) ptr);
case attr_pointer_atomic:
if (!ptr)
return 0;
@@ -368,6 +464,14 @@
else
*((unsigned int *) ptr) = t;
return len;
+ case attr_pointer_ul:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *((unsigned long *) ptr) = t;
+ return len;
case attr_inode_readahead:
return inode_readahead_blks_store(sbi, buf, len);
case attr_trigger_test_error:
@@ -426,6 +530,8 @@
proc_create_single_data("es_shrinker_info", S_IRUGO,
sbi->s_proc, ext4_seq_es_shrinker_info_show,
sb);
+ proc_create_single_data("fc_info", 0444, sbi->s_proc,
+ ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
}
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 6a30e54..00e3cbd 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -113,6 +113,9 @@
handle_t *handle;
int err;
+ if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
+ return -EINVAL;
+
if (ext4_verity_in_progress(inode))
return -EBUSY;
@@ -244,7 +247,7 @@
goto stop_and_cleanup;
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
- ext4_set_inode_flags(inode);
+ ext4_set_inode_flags(inode, false);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (err)
goto stop_and_cleanup;
@@ -364,11 +367,23 @@
}
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
- pgoff_t index)
+ pgoff_t index,
+ unsigned long num_ra_pages)
{
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+ struct page *page;
+
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
- return read_mapping_page(inode->i_mapping, index, NULL);
+ page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
+ if (!page || !PageUptodate(page)) {
+ if (page)
+ put_page(page);
+ else if (num_ra_pages > 1)
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
+ page = read_mapping_page(inode->i_mapping, index, NULL);
+ }
+ return page;
}
static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 20e40ca..5462f26 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -93,6 +93,7 @@
#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
+ [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler,
};
const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -105,6 +106,7 @@
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
+ &ext4_xattr_hurd_handler,
NULL
};
@@ -245,7 +247,7 @@
bh->b_data);
errout:
if (error)
- __ext4_error_inode(inode, function, line, 0,
+ __ext4_error_inode(inode, function, line, 0, -error,
"corrupted xattr block %llu",
(unsigned long long) bh->b_blocknr);
else
@@ -269,7 +271,7 @@
error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header));
errout:
if (error)
- __ext4_error_inode(inode, function, line, 0,
+ __ext4_error_inode(inode, function, line, 0, -error,
"corrupted in-inode xattr");
return error;
}
@@ -967,55 +969,6 @@
return credits;
}
-static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
- int credits, struct buffer_head *bh,
- bool dirty, bool block_csum)
-{
- int error;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- if (handle->h_buffer_credits >= credits)
- return 0;
-
- error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
- if (!error)
- return 0;
- if (error < 0) {
- ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
- return error;
- }
-
- if (bh && dirty) {
- if (block_csum)
- ext4_xattr_block_csum_set(inode, bh);
- error = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (error) {
- ext4_warning(inode->i_sb, "Handle metadata (error %d)",
- error);
- return error;
- }
- }
-
- error = ext4_journal_restart(handle, credits);
- if (error) {
- ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
- return error;
- }
-
- if (bh) {
- error = ext4_journal_get_write_access(handle, bh);
- if (error) {
- ext4_warning(inode->i_sb,
- "Get write access failed (error %d)",
- error);
- return error;
- }
- }
- return 0;
-}
-
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
@@ -1149,6 +1102,24 @@
return saved_err;
}
+static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, bool block_csum, bool dirty)
+{
+ int error;
+
+ if (bh && dirty) {
+ if (block_csum)
+ ext4_xattr_block_csum_set(inode, bh);
+ error = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (error) {
+ ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+ error);
+ return error;
+ }
+ }
+ return 0;
+}
+
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
struct buffer_head *bh,
@@ -1185,13 +1156,24 @@
continue;
}
- err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
- dirty, block_csum);
- if (err) {
+ err = ext4_journal_ensure_credits_fn(handle, credits, credits,
+ ext4_free_metadata_revoke_credits(parent->i_sb, 1),
+ ext4_xattr_restart_fn(handle, parent, bh, block_csum,
+ dirty));
+ if (err < 0) {
ext4_warning_inode(ea_inode, "Ensure credits err=%d",
err);
continue;
}
+ if (err > 0) {
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ ext4_warning_inode(ea_inode,
+ "Re-get write access err=%d",
+ err);
+ continue;
+ }
+ }
err = ext4_xattr_inode_dec_ref(handle, ea_inode);
if (err) {
@@ -1347,7 +1329,7 @@
int blocksize = ea_inode->i_sb->s_blocksize;
int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
int csize, wsize = 0;
- int ret = 0;
+ int ret = 0, ret2 = 0;
int retries = 0;
retry:
@@ -1374,8 +1356,7 @@
block = 0;
while (wsize < bufsize) {
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
csize = (bufsize - wsize) > blocksize ? blocksize :
bufsize - wsize;
bh = ext4_getblk(handle, ea_inode, block, 0);
@@ -1405,7 +1386,9 @@
ext4_update_i_disksize(ea_inode, wsize);
inode_unlock(ea_inode);
- ext4_mark_inode_dirty(handle, ea_inode);
+ ret2 = ext4_mark_inode_dirty(handle, ea_inode);
+ if (unlikely(ret2 && !ret))
+ ret = ret2;
out:
brelse(bh);
@@ -1479,7 +1462,7 @@
WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
!(current->flags & PF_MEMALLOC_NOFS));
- ea_data = ext4_kvmalloc(value_len, GFP_NOFS);
+ ea_data = kvmalloc(value_len, GFP_KERNEL);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
return NULL;
@@ -2341,7 +2324,7 @@
flags & XATTR_CREATE);
brelse(bh);
- if (!ext4_handle_has_enough_credits(handle, credits)) {
+ if (jbd2_handle_buffer_credits(handle) < credits) {
error = -ENOSPC;
goto cleanup;
}
@@ -2440,6 +2423,7 @@
if (IS_SYNC(inode))
ext4_handle_sync(handle);
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
cleanup:
brelse(is.iloc.bh);
@@ -2517,6 +2501,7 @@
if (error == 0)
error = error2;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
return error;
}
@@ -2869,11 +2854,9 @@
struct inode *ea_inode;
int error;
- error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
- NULL /* bh */,
- false /* dirty */,
- false /* block_csum */);
- if (error) {
+ error = ext4_journal_ensure_credits(handle, extra_credits,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (error < 0) {
EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
goto cleanup;
}
@@ -2908,9 +2891,11 @@
bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
if (IS_ERR(bh)) {
error = PTR_ERR(bh);
- if (error == -EIO)
- EXT4_ERROR_INODE(inode, "block %llu read error",
- EXT4_I(inode)->i_file_acl);
+ if (error == -EIO) {
+ EXT4_ERROR_INODE_ERR(inode, EIO,
+ "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
+ }
bh = NULL;
goto cleanup;
}
@@ -2949,6 +2934,7 @@
error);
goto cleanup;
}
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
}
error = 0;
cleanup:
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index f39cad2..730b91f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -48,7 +48,7 @@
__le32 e_value_inum; /* inode in which the value is stored */
__le32 e_value_size; /* size of attribute value */
__le32 e_hash; /* hash value of name and value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define EXT4_XATTR_PAD_BITS 2
@@ -118,12 +118,13 @@
struct ext4_xattr_inode_array {
unsigned int count; /* # of used items in the array */
- struct inode *inodes[0];
+ struct inode *inodes[];
};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_hurd_handler;
#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c
new file mode 100644
index 0000000..8cfa74a
--- /dev/null
+++ b/fs/ext4/xattr_hurd.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/fs/ext4/xattr_hurd.c
+ * Handler for extended gnu attributes for the Hurd.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org>
+ */
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include "ext4.h"
+#include "xattr.h"
+
+static bool
+ext4_xattr_hurd_list(struct dentry *dentry)
+{
+ return test_opt(dentry->d_sb, XATTR_USER);
+}
+
+static int
+ext4_xattr_hurd_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD,
+ name, buffer, size);
+}
+
+static int
+ext4_xattr_hurd_set(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD,
+ name, value, size, flags);
+}
+
+const struct xattr_handler ext4_xattr_hurd_handler = {
+ .prefix = XATTR_HURD_PREFIX,
+ .list = ext4_xattr_hurd_list,
+ .get = ext4_xattr_hurd_get,
+ .set = ext4_xattr_hurd_set,
+};