Update Linux to v5.10.109

Sourced from [1]

[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz

Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index f4f531c..bacee09 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -9,136 +9,127 @@
 #include "qgroup.h"
 #include "block-group.h"
 
+/*
+ * HOW DOES THIS WORK
+ *
+ * There are two stages to data reservations, one for data and one for metadata
+ * to handle the new extents and checksums generated by writing data.
+ *
+ *
+ * DATA RESERVATION
+ *   The general flow of the data reservation is as follows:
+ *
+ *   -> Reserve
+ *     We call into btrfs_reserve_data_bytes() for the user request bytes that
+ *     they wish to write.  We make this reservation and add it to
+ *     space_info->bytes_may_use.  We set EXTENT_DELALLOC on the inode io_tree
+ *     for the range and carry on if this is buffered, or follow up trying to
+ *     make a real allocation if we are pre-allocating or doing O_DIRECT.
+ *
+ *   -> Use
+ *     At writepages()/prealloc/O_DIRECT time we will call into
+ *     btrfs_reserve_extent() for some part or all of this range of bytes.  We
+ *     will make the allocation, decrease space_info->bytes_may_use by the
+ *     original requested length, and increase space_info->bytes_reserved by
+ *     the allocated length.  This distinction is important because compression
+ *     may allocate a smaller on-disk extent than we previously reserved.
+ *
+ *   -> Allocation
+ *     finish_ordered_io() will insert the new file extent item for this range,
+ *     and then add a delayed ref update for the extent tree.  Once that delayed
+ *     ref is written the extent size is subtracted from
+ *     space_info->bytes_reserved and added to space_info->bytes_used.
+ *
+ *   Error handling
+ *
+ *   -> By the reservation maker
+ *     This is the simplest case: we haven't completed our operation and we
+ *     know how much we reserved, so we can simply call
+ *     btrfs_free_reserved_data_space*() and it will be removed from
+ *     space_info->bytes_may_use.
+ *
+ *   -> After the reservation has been made, but before cow_file_range()
+ *     This is specifically for the delalloc case.  You must clear
+ *     EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
+ *     be subtracted from space_info->bytes_may_use.
+ *
+ * METADATA RESERVATION
+ *   The general metadata reservation lifetimes are discussed elsewhere; this
+ *   will just focus on how it is used for delalloc space.
+ *
+ *   We keep track of two things on a per-inode basis:
+ *
+ *   ->outstanding_extents
+ *     This is the number of file extent items we'll need to handle all of the
+ *     outstanding DELALLOC space we have in this inode.  We limit the maximum
+ *     size of an extent, so a large contiguous dirty area may require more than
+ *     one outstanding_extent, which is why count_max_extents() is used to
+ *     determine how many outstanding_extents get added.
+ *
+ *   ->csum_bytes
+ *     This is essentially how many dirty bytes we have for this inode, so we
+ *     can calculate the number of checksum items we would have to add in order
+ *     to checksum our outstanding data.
+ *
+ *   We keep a per-inode block_rsv in order to make it easier to keep track of
+ *   our reservation.  We use btrfs_calculate_inode_block_rsv_size() to
+ *   calculate the current theoretical maximum reservation we would need for the
+ *   metadata for this inode.  We call this and then adjust our reservation as
+ *   necessary, either by attempting to reserve more space, or freeing up excess
+ *   space.
+ *
+ * OUTSTANDING_EXTENTS HANDLING
+ *
+ *  ->outstanding_extents is used for keeping track of how many extents we will
+ *  need to use for this inode, and it will fluctuate depending on where you are
+ *  in the life cycle of the dirty data.  Consider this normal case for a
+ *  completely clean inode, with num_bytes < our maximum allowed extent size:
+ *
+ *  -> reserve
+ *    ->outstanding_extents += 1 (current value is 1)
+ *
+ *  -> set_delalloc
+ *    ->outstanding_extents += 1 (current value is 2)
+ *
+ *  -> btrfs_delalloc_release_extents()
+ *    ->outstanding_extents -= 1 (current value is 1)
+ *
+ *    We must call this once we are done, as we hold our reservation for the
+ *    duration of our operation, and then assume set_delalloc will update the
+ *    counter appropriately.
+ *
+ *  -> add ordered extent
+ *    ->outstanding_extents += 1 (current value is 2)
+ *
+ *  -> btrfs_clear_delalloc_extent
+ *    ->outstanding_extents -= 1 (current value is 1)
+ *
+ *  -> finish_ordered_io/btrfs_remove_ordered_extent
+ *    ->outstanding_extents -= 1 (current value is 0)
+ *
+ *  Each stage is responsible for its own accounting of the extent, thus
+ *  making error handling and cleanup easier (see the sketch just below).
+ */
+
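As an aside, the counting described above reduces to simple arithmetic. Below is a minimal standalone sketch (hypothetical, for illustration): count_max_extents() mirrors the kernel's round-up division and the 128 MiB cap on a single delalloc extent, and main() walks the reserve/set_delalloc/release steps for a dirty range spanning three extents.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the kernel's cap on a single delalloc extent (SZ_128M). */
#define MAX_EXTENT_SIZE (128ULL * 1024 * 1024)

/* Round-up division: how many file extent items a dirty range needs. */
static unsigned int count_max_extents(uint64_t size)
{
	return (size + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

int main(void)
{
	uint64_t dirty = 300ULL * 1024 * 1024;	/* 300 MiB of dirty data */
	unsigned int outstanding = 0;

	outstanding += count_max_extents(dirty);	/* reserve:       3 */
	outstanding += count_max_extents(dirty);	/* set_delalloc:  6 */
	outstanding -= count_max_extents(dirty);	/* release:       3 */
	printf("outstanding_extents: %u\n", outstanding);
	return 0;
}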
 int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
 {
 	struct btrfs_root *root = inode->root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
-	u64 used;
-	int ret = 0;
-	int need_commit = 2;
-	int have_pinned_space;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
 
 	/* Make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, fs_info->sectorsize);
 
-	if (btrfs_is_free_space_inode(inode)) {
-		need_commit = 0;
-		ASSERT(current->journal_info);
-	}
+	if (btrfs_is_free_space_inode(inode))
+		flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
 
-again:
-	/* Make sure we have enough space to handle the data first */
-	spin_lock(&data_sinfo->lock);
-	used = btrfs_space_info_used(data_sinfo, true);
-
-	if (used + bytes > data_sinfo->total_bytes) {
-		struct btrfs_trans_handle *trans;
-
-		/*
-		 * If we don't have enough free bytes in this space then we need
-		 * to alloc a new chunk.
-		 */
-		if (!data_sinfo->full) {
-			u64 alloc_target;
-
-			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
-			spin_unlock(&data_sinfo->lock);
-
-			alloc_target = btrfs_data_alloc_profile(fs_info);
-			/*
-			 * It is ugly that we don't call nolock join
-			 * transaction for the free space inode case here.
-			 * But it is safe because we only do the data space
-			 * reservation for the free space cache in the
-			 * transaction context, the common join transaction
-			 * just increase the counter of the current transaction
-			 * handler, doesn't try to acquire the trans_lock of
-			 * the fs.
-			 */
-			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
-
-			ret = btrfs_chunk_alloc(trans, alloc_target,
-						CHUNK_ALLOC_NO_FORCE);
-			btrfs_end_transaction(trans);
-			if (ret < 0) {
-				if (ret != -ENOSPC)
-					return ret;
-				else {
-					have_pinned_space = 1;
-					goto commit_trans;
-				}
-			}
-
-			goto again;
-		}
-
-		/*
-		 * If we don't have enough pinned space to deal with this
-		 * allocation, and no removed chunk in current transaction,
-		 * don't bother committing the transaction.
-		 */
-		have_pinned_space = __percpu_counter_compare(
-			&data_sinfo->total_bytes_pinned,
-			used + bytes - data_sinfo->total_bytes,
-			BTRFS_TOTAL_BYTES_PINNED_BATCH);
-		spin_unlock(&data_sinfo->lock);
-
-		/* Commit the current transaction and try again */
-commit_trans:
-		if (need_commit) {
-			need_commit--;
-
-			if (need_commit > 0) {
-				btrfs_start_delalloc_roots(fs_info, -1);
-				btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
-							 (u64)-1);
-			}
-
-			trans = btrfs_join_transaction(root);
-			if (IS_ERR(trans))
-				return PTR_ERR(trans);
-			if (have_pinned_space >= 0 ||
-			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
-				     &trans->transaction->flags) ||
-			    need_commit > 0) {
-				ret = btrfs_commit_transaction(trans);
-				if (ret)
-					return ret;
-				/*
-				 * The cleaner kthread might still be doing iput
-				 * operations. Wait for it to finish so that
-				 * more space is released.  We don't need to
-				 * explicitly run the delayed iputs here because
-				 * the commit_transaction would have woken up
-				 * the cleaner.
-				 */
-				ret = btrfs_wait_on_delayed_iputs(fs_info);
-				if (ret)
-					return ret;
-				goto again;
-			} else {
-				btrfs_end_transaction(trans);
-			}
-		}
-
-		trace_btrfs_space_reservation(fs_info,
-					      "space_info:enospc",
-					      data_sinfo->flags, bytes, 1);
-		return -ENOSPC;
-	}
-	btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
-	spin_unlock(&data_sinfo->lock);
-
-	return 0;
+	return btrfs_reserve_data_bytes(fs_info, bytes, flush);
 }
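For orientation, the space_info counters named in the comment at the top move through a fixed pattern over a write's lifetime. A toy standalone model of that bookkeeping (numbers are made up; the smaller allocated size stands in for compression):

#include <stdio.h>
#include <stdint.h>

/* Toy model of the three space_info counters described above. */
struct space_info {
	uint64_t bytes_may_use;		/* reserved, no extent allocated yet */
	uint64_t bytes_reserved;	/* extent allocated, not yet on disk */
	uint64_t bytes_used;		/* extent committed via delayed ref  */
};

int main(void)
{
	struct space_info si = {0, 0, 0};
	uint64_t requested = 65536;	/* what the writer asked for      */
	uint64_t allocated = 16384;	/* smaller: compression kicked in */

	si.bytes_may_use += requested;			/* -> Reserve    */

	si.bytes_may_use -= requested;			/* -> Use        */
	si.bytes_reserved += allocated;

	si.bytes_reserved -= allocated;			/* -> Allocation */
	si.bytes_used += allocated;

	printf("may_use=%llu reserved=%llu used=%llu\n",
	       (unsigned long long)si.bytes_may_use,
	       (unsigned long long)si.bytes_reserved,
	       (unsigned long long)si.bytes_used);
	return 0;
}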
 
-int btrfs_check_data_free_space(struct inode *inode,
+int btrfs_check_data_free_space(struct btrfs_inode *inode,
 			struct extent_changeset **reserved, u64 start, u64 len)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	int ret;
 
 	/* align the range */
@@ -146,14 +137,14 @@
 	      round_down(start, fs_info->sectorsize);
 	start = round_down(start, fs_info->sectorsize);
 
-	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
+	ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 	if (ret < 0)
 		return ret;
 
 	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
-	ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len);
+	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
 	if (ret < 0)
-		btrfs_free_reserved_data_space_noquota(inode, start, len);
+		btrfs_free_reserved_data_space_noquota(fs_info, len);
 	else
 		ret = 0;
 	return ret;
@@ -167,21 +158,15 @@
  * which we can't sleep and is sure it won't affect qgroup reserved space.
  * Like clear_bit_hook().
  */
-void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
 					    u64 len)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_space_info *data_sinfo;
 
-	/* Make sure the range is aligned to sectorsize */
-	len = round_up(start + len, fs_info->sectorsize) -
-	      round_down(start, fs_info->sectorsize);
-	start = round_down(start, fs_info->sectorsize);
+	ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
 
 	data_sinfo = fs_info->data_sinfo;
-	spin_lock(&data_sinfo->lock);
-	btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
-	spin_unlock(&data_sinfo->lock);
+	btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
 }
 
 /*
@@ -191,17 +176,17 @@
  * This one will handle the per-inode data rsv map for accurate reserved
  * space framework.
  */
-void btrfs_free_reserved_data_space(struct inode *inode,
+void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
 			struct extent_changeset *reserved, u64 start, u64 len)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 
 	/* Make sure the range is aligned to sectorsize */
-	len = round_up(start + len, root->fs_info->sectorsize) -
-	      round_down(start, root->fs_info->sectorsize);
-	start = round_down(start, root->fs_info->sectorsize);
+	len = round_up(start + len, fs_info->sectorsize) -
+	      round_down(start, fs_info->sectorsize);
+	start = round_down(start, fs_info->sectorsize);
 
-	btrfs_free_reserved_data_space_noquota(inode, start, len);
+	btrfs_free_reserved_data_space_noquota(fs_info, len);
 	btrfs_qgroup_free_data(inode, reserved, start, len);
 }
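The round_up/round_down pair above expands an arbitrary byte range to whole sectors before freeing. A minimal standalone sketch of the same arithmetic, assuming a 4096-byte sectorsize and power-of-two alignment:

#include <stdio.h>
#include <stdint.h>

/* Power-of-two rounding, same arithmetic as the kernel macros. */
#define round_down(x, a)	((x) & ~((uint64_t)(a) - 1))
#define round_up(x, a)		round_down((x) + (a) - 1, (a))

int main(void)
{
	uint64_t sectorsize = 4096, start = 6000, len = 3000;

	/* Expand [6000, 9000) to whole sectors: [4096, 12288). */
	len = round_up(start + len, sectorsize) -
	      round_down(start, sectorsize);
	start = round_down(start, sectorsize);
	printf("start=%llu len=%llu\n",
	       (unsigned long long)start, (unsigned long long)len);
	return 0;
}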
 
@@ -228,8 +213,8 @@
 	 * are releasing 0 bytes, and then we'll just get the reservation over
 	 * the size free'd.
 	 */
-	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
-					     &qgroup_to_release);
+	released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
+					   &qgroup_to_release);
 	if (released > 0)
 		trace_btrfs_space_reservation(fs_info, "delalloc",
 					      btrfs_ino(inode), released, 0);
@@ -307,7 +292,6 @@
 	unsigned nr_extents;
 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
 	int ret = 0;
-	bool delalloc_lock = true;
 
 	/*
 	 * If we are a free space inode we need to not flush since we will be in
@@ -320,7 +304,6 @@
 	 */
 	if (btrfs_is_free_space_inode(inode)) {
 		flush = BTRFS_RESERVE_NO_FLUSH;
-		delalloc_lock = false;
 	} else {
 		if (current->journal_info)
 			flush = BTRFS_RESERVE_FLUSH_LIMIT;
@@ -329,9 +312,6 @@
 			schedule_timeout(1);
 	}
 
-	if (delalloc_lock)
-		mutex_lock(&inode->delalloc_mutex);
-
 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 
 	/*
@@ -348,10 +328,12 @@
 				&qgroup_reserve);
 	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
 	if (ret)
-		goto out_fail;
+		return ret;
 	ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
-	if (ret)
-		goto out_qgroup;
+	if (ret) {
+		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
+		return ret;
+	}
 
 	/*
 	 * Now we need to update our outstanding extents and csum bytes _first_
@@ -375,15 +357,7 @@
 	block_rsv->qgroup_rsv_reserved += qgroup_reserve;
 	spin_unlock(&block_rsv->lock);
 
-	if (delalloc_lock)
-		mutex_unlock(&inode->delalloc_mutex);
 	return 0;
-out_qgroup:
-	btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
-out_fail:
-	if (delalloc_lock)
-		mutex_unlock(&inode->delalloc_mutex);
-	return ret;
 }
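One note on the error-path rework above: with the delalloc_mutex pairs gone, the first failure point has nothing to unwind and the second has exactly one prior step to undo, so the goto ladder collapses into direct returns. A standalone skeleton of the pattern (stub names are hypothetical):

#include <stdio.h>

/* Hypothetical stubs for the two reservation steps. */
static int reserve_first(void)	{ return 0; }	/* e.g. qgroup prealloc */
static int reserve_second(void)	{ return -1; }	/* e.g. metadata bytes  */
static void free_first(void)	{ puts("unwound first reservation"); }

static int reserve_two_resources(void)
{
	int ret;

	ret = reserve_first();
	if (ret)
		return ret;		/* nothing to unwind yet */

	ret = reserve_second();
	if (ret) {
		free_first();		/* undo the only prior step */
		return ret;
	}
	return 0;			/* both reservations held */
}

int main(void)
{
	printf("ret=%d\n", reserve_two_resources());
	return 0;
}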
 
 /**
@@ -466,7 +440,7 @@
  * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
  */
-int btrfs_delalloc_reserve_space(struct inode *inode,
+int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
 			struct extent_changeset **reserved, u64 start, u64 len)
 {
 	int ret;
@@ -474,7 +448,7 @@
 	ret = btrfs_check_data_free_space(inode, reserved, start, len);
 	if (ret < 0)
 		return ret;
-	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+	ret = btrfs_delalloc_reserve_metadata(inode, len);
 	if (ret < 0)
 		btrfs_free_reserved_data_space(inode, *reserved, start, len);
 	return ret;
@@ -492,10 +466,10 @@
  * list if there are no delalloc bytes left.
  * Also it will handle the qgroup reserved space.
  */
-void btrfs_delalloc_release_space(struct inode *inode,
+void btrfs_delalloc_release_space(struct btrfs_inode *inode,
 				  struct extent_changeset *reserved,
 				  u64 start, u64 len, bool qgroup_free)
 {
-	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
+	btrfs_delalloc_release_metadata(inode, len, qgroup_free);
 	btrfs_free_reserved_data_space(inode, reserved, start, len);
 }
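Taken together, the last two functions form a symmetric pair: the reserve side takes the data half first and the metadata half second, unwinding the data half if metadata fails, while the release side undoes both. A toy standalone ledger of that pairing (names and numbers are illustrative):

#include <stdio.h>
#include <stdint.h>

/* Toy ledger for the two halves of a delalloc reservation. */
struct ledger { uint64_t data; uint64_t metadata; };

/* Mirrors btrfs_delalloc_reserve_space(): data first, then metadata;
 * on metadata failure the data half is handed back. */
static int reserve_space(struct ledger *l, uint64_t len, int meta_fails)
{
	l->data += len;
	if (meta_fails) {
		l->data -= len;
		return -1;
	}
	l->metadata += len;
	return 0;
}

/* Mirrors btrfs_delalloc_release_space(): undoes both halves. */
static void release_space(struct ledger *l, uint64_t len)
{
	l->metadata -= len;
	l->data -= len;
}

int main(void)
{
	struct ledger l = {0, 0};

	if (reserve_space(&l, 4096, 0) == 0)
		release_space(&l, 4096);
	printf("data=%llu metadata=%llu\n",
	       (unsigned long long)l.data, (unsigned long long)l.metadata);
	return 0;
}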